diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 7bb822ad9..3145e136e 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -29,6 +29,7 @@ jobs: - NotGradients - Gradients steps: + - run: echo "UCX_ERROR_SIGNALS=SIGILL,SIGBUS,SIGFPE" >> "$GITHUB_ENV" - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@v2 with: diff --git a/.gitignore b/.gitignore index 293442edd..697b70410 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ *.jl.*.cov *.jl.mem docs/build -/Manifest.toml +*Manifest.toml benchmark/tune.json benchmark/results .vscode/settings.json diff --git a/Project.toml b/Project.toml index 5654df3c5..469f820ea 100644 --- a/Project.toml +++ b/Project.toml @@ -8,7 +8,6 @@ Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a" AtomsCalculators = "a3e0e189-c65a-42c1-833c-339540406eb1" BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" CellListMap = "69e1c6dd-3888-40e6-b3c8-31ac5f578864" Chemfiles = "46823bd8-5fb3-5f92-9aa0-96921f3dd015" Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" @@ -17,7 +16,9 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" FLoops = "cc61a311-1640-44b5-9fba-1b764f453329" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce" PeriodicTable = "7b2266bf-644c-5ea3-82d8-af4bbd25a884" @@ -32,6 +33,7 @@ UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a" UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249" [weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" GLMakie = 
"e9467ef8-e4e7-5192-8a1a-b1aee30e663a" @@ -39,6 +41,7 @@ KernelDensity = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b" PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d" [extensions] +MollyCUDAExt = "CUDA" MollyEnzymeExt = "Enzyme" MollyGLMakieExt = ["GLMakie", "Colors"] MollyKernelDensityExt = "KernelDensity" @@ -61,7 +64,9 @@ Enzyme = "0.13.20" EzXML = "1" FLoops = "0.2" GLMakie = "0.8, 0.9, 0.10, 0.11" +GPUArrays = "11" Graphs = "1.8" +KernelAbstractions = "0.9" KernelDensity = "0.5, 0.6" LinearAlgebra = "1.9" NearestNeighbors = "0.4" diff --git a/README.md b/README.md index 2db2fa5d8..182d52f52 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ Implemented features include: - [Unitful.jl](https://github.com/PainterQubits/Unitful.jl) compatibility so numbers have physical meaning. - Set up crystal systems using [SimpleCrystals.jl](https://github.com/ejmeitz/SimpleCrystals.jl). - Automatic multithreading. -- GPU acceleration on CUDA-enabled devices. +- GPU acceleration on all backends supported by [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl), with better performance on CUDA-enabled devices. - Run with Float64, Float32 or other float types. - Some analysis functions, e.g. RDF. - Visualise simulations as animations with [Makie.jl](https://makie.juliaplots.org/stable). 
diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 08e6c5b4a..c790b4463 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -17,15 +17,15 @@ else @warn "The parallel benchmarks will not be run as Julia is running on 1 thread" end -# Allow CUDA device to be specified -const DEVICE = get(ENV, "DEVICE", "0") +# Allow GPU device to be specified +const DEVICE = parse(Int, get(ENV, "DEVICE", "0")) -const run_gpu_tests = CUDA.functional() -if run_gpu_tests - device!(parse(Int, DEVICE)) - @info "The GPU benchmarks will be run on device $DEVICE" +const run_cuda_tests = CUDA.functional() +if run_cuda_tests + device!(DEVICE) + @info "The CUDA benchmarks will be run on device $DEVICE" else - @warn "The GPU benchmarks will not be run as a CUDA-enabled device is not available" + @warn "The CUDA benchmarks will not be run as a CUDA-enabled device is not available" end const SUITE = BenchmarkGroup( @@ -62,7 +62,7 @@ const starting_velocities = [random_velocity(atom_mass, 1.0u"K") for i in 1:n_at const starting_coords_f32 = [Float32.(c) for c in starting_coords] const starting_velocities_f32 = [Float32.(c) for c in starting_velocities] -function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) +function test_sim(nl::Bool, parallel::Bool, f32::Bool, ::Type{AT}) where AT n_atoms = 400 n_steps = 200 atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol" @@ -72,9 +72,9 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) r0 = f32 ? 0.2f0u"nm" : 0.2u"nm" bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)] specific_inter_lists = (InteractionList2Atoms( - gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)), - gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)), - gpu ? 
CuArray(bonds) : bonds, + AT(Int32.(collect(1:2:n_atoms))), + AT(Int32.(collect(2:2:n_atoms))), + AT(bonds), ),) neighbor_finder = NoNeighborFinder() @@ -82,24 +82,17 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),) if nl neighbor_finder = DistanceNeighborFinder( - eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms), + eligible=AT(trues(n_atoms, n_atoms)), n_steps=10, dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", ) pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),) end - if gpu - coords = CuArray(copy(f32 ? starting_coords_f32 : starting_coords)) - velocities = CuArray(copy(f32 ? starting_velocities_f32 : starting_velocities)) - atoms = CuArray([Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", - ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) - else - coords = copy(f32 ? starting_coords_f32 : starting_coords) - velocities = copy(f32 ? starting_velocities_f32 : starting_velocities) - atoms = [Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", - ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms] - end + coords = AT(copy(f32 ? starting_coords_f32 : starting_coords)) + velocities = AT(copy(f32 ? starting_velocities_f32 : starting_velocities)) + atoms = AT([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", + ϵ=f32 ? 
0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) sys = System( atoms=atoms, @@ -117,22 +110,22 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) end runs = [ - ("CPU" , [false, false, false, false]), - ("CPU f32" , [false, false, true , false]), - ("CPU NL" , [true , false, false, false]), - ("CPU f32 NL", [true , false, true , false]), + ("CPU" , [false, false, false, Array]), + ("CPU f32" , [false, false, true , Array]), + ("CPU NL" , [true , false, false, Array]), + ("CPU f32 NL", [true , false, true , Array]), ] if run_parallel_tests - push!(runs, ("CPU parallel" , [false, true , false, false])) - push!(runs, ("CPU parallel f32" , [false, true , true , false])) - push!(runs, ("CPU parallel NL" , [true , true , false, false])) - push!(runs, ("CPU parallel f32 NL", [true , true , true , false])) + push!(runs, ("CPU parallel" , [false, true , false, Array])) + push!(runs, ("CPU parallel f32" , [false, true , true , Array])) + push!(runs, ("CPU parallel NL" , [true , true , false, Array])) + push!(runs, ("CPU parallel f32 NL", [true , true , true , Array])) end -if run_gpu_tests - push!(runs, ("GPU" , [false, false, false, true])) - push!(runs, ("GPU f32" , [false, false, true , true])) - push!(runs, ("GPU NL" , [true , false, false, true])) - push!(runs, ("GPU f32 NL", [true , false, true , true])) +if run_cuda_tests + push!(runs, ("GPU" , [false, false, false, CuArray])) + push!(runs, ("GPU f32" , [false, false, true , CuArray])) + push!(runs, ("GPU NL" , [true , false, false, CuArray])) + push!(runs, ("GPU f32 NL", [true , false, true , CuArray])) end for (name, args) in runs diff --git a/benchmark/protein.jl b/benchmark/protein.jl index 30f512c07..7ff549c22 100644 --- a/benchmark/protein.jl +++ b/benchmark/protein.jl @@ -11,7 +11,7 @@ const data_dir = normpath(dirname(pathof(Molly)), "..", "data") const ff_dir = joinpath(data_dir, "force_fields") const openmm_dir = joinpath(data_dir, "openmm_6mrr") -function 
setup_system(gpu::Bool, f32::Bool, units::Bool) +function setup_system(::Type{AT}, f32::Bool, units::Bool) where AT T = f32 ? Float32 : Float64 ff = MolecularForceField( T, @@ -27,9 +27,9 @@ function setup_system(gpu::Bool, f32::Bool, units::Bool) sys = System( joinpath(data_dir, "6mrr_equil.pdb"), ff; - velocities=gpu ? CuArray(velocities) : velocities, + velocities=AT(velocities), units=units, - gpu=gpu, + array_type=AT, dist_cutoff=(units ? dist_cutoff * u"nm" : dist_cutoff), dist_neighbors=(units ? dist_neighbors * u"nm" : dist_neighbors), ) @@ -41,21 +41,21 @@ function setup_system(gpu::Bool, f32::Bool, units::Bool) end runs = [ - # run_name gpu parr f32 units - ("CPU 1 thread" , false, false, false, true ), - ("CPU 1 thread f32" , false, false, true , true ), - ("CPU 1 thread f32 nounits" , false, false, true , false), - ("CPU $n_threads threads" , false, true , false, true ), - ("CPU $n_threads threads f32" , false, true , true , true ), - ("CPU $n_threads threads f32 nounits", false, true , true , false), - ("GPU" , true , false, false, true ), - ("GPU f32" , true , false, true , true ), - ("GPU f32 nounits" , true , false, true , false), + # run_name gpu parr f32 units + ("CPU 1 thread" , Array , false, false, true ), + ("CPU 1 thread f32" , Array , false, true , true ), + ("CPU 1 thread f32 nounits" , Array , false, true , false), + ("CPU $n_threads threads" , Array , true , false, true ), + ("CPU $n_threads threads f32" , Array , true , true , true ), + ("CPU $n_threads threads f32 nounits", Array , true , true , false), + ("GPU" , CuArray, false, false, true ), + ("GPU f32" , CuArray, false, true , true ), + ("GPU f32 nounits" , CuArray, false, true , false), ] -for (run_name, gpu, parallel, f32, units) in runs +for (run_name, AT, parallel, f32, units) in runs n_threads_used = parallel ? 
n_threads : 1 - sys, sim = setup_system(gpu, f32, units) + sys, sim = setup_system(AT, f32, units) simulate!(deepcopy(sys), sim, 20; n_threads=n_threads_used) println(run_name) @time simulate!(sys, sim, n_steps; n_threads=n_threads_used) diff --git a/docs/src/documentation.md b/docs/src/documentation.md index 73521b51e..ce6d7c242 100644 --- a/docs/src/documentation.md +++ b/docs/src/documentation.md @@ -135,11 +135,21 @@ visualize(sys.loggers.coords, boundary, "sim_lj.mp4") ## GPU acceleration -To run simulations on the GPU you will need to have a CUDA-compatible device. -[CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) is used to run on the device. +To run simulations on the GPU you will need to have a GPU available and then load the appropriate package: + +| Hardware Available | Necessary Package | Array Type | +| ------------------ | ----------------- | ---------- | +| Parallel CPU | none | `Array` | +| NVIDIA GPU | CUDA | `CuArray` | +| AMD GPU | AMDGPU | `ROCArray` | +| Intel GPU | oneAPI | `oneArray` | +| Apple Silicon | Metal | `MtlArray` | + +As an important note, Metal/Apple Silicon devices can only run with 32 bit precision, so be sure to use `Float32` (for example) where necessary. Simulation setup is similar to above, but with the coordinates, velocities and atoms moved to the GPU. This example also shows setting up a simulation to run with `Float32`, which gives much better performance on GPUs. Of course, you will need to determine whether this level of numerical accuracy is appropriate in your case. +Here is an example script for an NVIDIA GPU using CUDA: ```julia using Molly using CUDA @@ -168,6 +178,7 @@ sys = System( simulate!(deepcopy(sys), simulator, 20) # Compile function simulate!(sys, simulator, 1_000) ``` +To use another GPU package, just swap out `CUDA` for your desired package and `CuArray` for your desired array type. The device to run on can be changed with `device!`, e.g. `device!(1)`. 
The GPU code path is currently designed to be compatible with differentiable simulation and runs slower than related software, but this is an active area of development. Nonetheless, GPU performance is significantly better than CPU performance and is good enough for many applications. @@ -316,7 +327,7 @@ sys = System( energy=TotalEnergyLogger(10), writer=StructureWriter(10, "traj_6mrr_1ps.pdb", ["HOH"]), ), - gpu=false, + array_type=Array, ) minimizer = SteepestDescentMinimizer() @@ -352,7 +363,7 @@ Residue patches, virtual sites, file includes and any force types other than `Ha Some PDB files that read in fine can be found [here](https://github.com/greener-group/GB99dms/tree/main/structures/training/conf_1). -To run on the GPU, set `gpu=true`. +To run on the GPU, set `array_type=GPUArrayType`, where `GPUArrayType` is the array type for your GPU backend (for example `CuArray` for NVIDIA or `ROCArray` for AMD). You can use an implicit solvent method by giving the `implicit_solvent` keyword argument to [`System`](@ref). The options are `"obc1"`, `"obc2"` and `"gbn2"`, corresponding to the Onufriev-Bashford-Case GBSA model with parameter set I or II and the GB-Neck2 model. Other options include overriding the boundary dimensions in the file (`boundary`) and modifying the non-bonded interaction and neighbor list cutoff distances (`dist_cutoff` and `dist_neighbors`). @@ -1017,10 +1028,10 @@ function Molly.simulate!(sys::ReplicaSystem, end ``` -Under the hood there are two implementations for the [`forces`](@ref) function, used by [`accelerations`](@ref), and for [`potential_energy`](@ref): a version geared towards CPUs and parallelism, and a version geared towards GPUs. -You can define different versions of a simulator for CPU and GPU systems by dispatching on `System{D, false}` or `System{D, true}` respectively. 
+Under the hood there are multiple implementations for the [`forces`](@ref) function, used by [`accelerations`](@ref), and for [`potential_energy`](@ref): a version geared towards CPUs and parallelism, a CUDA version, and a version for other GPU backends. +You can define different versions of a simulator for CPU, CUDA and generic GPU systems by dispatching on `System{D, Array}`, `System{D, CuArray}` or `System{D, AT} where AT <: AbstractGPUArray` respectively. This also applies to coupling methods, neighbor finders and analysis functions. -You do not have to define two versions though: you may only intend to use the simulator one way, or one version may be performant in all cases. +You do not have to define different versions though: you may only intend to use the simulator one way, or one version may be performant in all cases. ## Coupling @@ -1321,7 +1332,7 @@ The available neighbor finders are: - [`DistanceNeighborFinder`](@ref) - [`TreeNeighborFinder`](@ref) -The recommended neighbor finder is [`CellListMapNeighborFinder`](@ref) on CPU and [`GPUNeighborFinder`](@ref) on GPU. +The recommended neighbor finder is [`CellListMapNeighborFinder`](@ref) on CPU, [`GPUNeighborFinder`](@ref) on NVIDIA GPUs and [`DistanceNeighborFinder`](@ref) on other GPUs. When using a neighbor finder you should in general also use an interaction cutoff (see [Cutoffs](@ref)) with a cutoff distance less than the neighbor finder distance. The difference between the two should be larger than an atom can move in the time of the `n_steps` defined by the neighbor finder. The exception is [`GPUNeighborFinder`](@ref), which uses the algorithm from [Eastman and Pande 2010](https://doi.org/10.1002/jcc.21413) to avoid calculating a neighbor list and should have `dist_cutoff` set to the interaction cutoff distance. 
diff --git a/src/cuda.jl b/ext/MollyCUDAExt.jl similarity index 61% rename from src/cuda.jl rename to ext/MollyCUDAExt.jl index e751f51a5..de68d0807 100644 --- a/src/cuda.jl +++ b/ext/MollyCUDAExt.jl @@ -1,6 +1,16 @@ -# CUDA.jl kernels +module MollyCUDAExt + +using Molly +using CUDA +using Atomix +using KernelAbstractions + const WARPSIZE = UInt32(32) +Molly.uses_gpu_neighbor_finder(::Type{AT}) where {AT <: CuArray} = true + +CUDA.Const(nl::Molly.NoNeighborList) = nl + macro shfl_multiple_sync(mask, target, width, vars...) all_lines = map(vars) do v Expr(:(=), v, @@ -29,71 +39,69 @@ function cuda_threads_blocks_specific(n_inters) return n_threads_gpu, n_blocks end -function pairwise_force_gpu!(buffers, sys::System{D, true, T}, pairwise_inters, nbs, step_n) where {D, T} - if typeof(nbs) == NoNeighborList - kernel = @cuda launch=false pairwise_force_kernel_nonl!( - buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, step_n, - Val(D), Val(sys.force_units)) - conf = launch_configuration(kernel.fun) - threads_basic = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512")) - nthreads = min(length(sys.atoms), threads_basic, conf.threads) - nthreads = cld(nthreads, WARPSIZE) * WARPSIZE - n_blocks_i = cld(length(sys.atoms), WARPSIZE) - n_blocks_j = cld(length(sys.atoms), nthreads) - kernel(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, - step_n, Val(D), Val(sys.force_units); threads=nthreads, - blocks=(n_blocks_i, n_blocks_j)) - else - N = length(sys.coords) - n_blocks = cld(N, WARPSIZE) - r_cut = sys.neighbor_finder.dist_cutoff - if step_n % sys.neighbor_finder.n_steps_reorder == 0 || !sys.neighbor_finder.initialized - Morton_bits = 4 - w = r_cut - typeof(ustrip(r_cut))(0.1) * unit(r_cut) - Morton_seq_cpu = sorted_Morton_seq(Array(sys.coords), w, Morton_bits) - copyto!(buffers.Morton_seq, Morton_seq_cpu) - CUDA.@sync @cuda blocks=(cld(N, WARPSIZE),) threads=(32,) kernel_min_max!( - 
buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, Val(N), - sys.boundary, Val(D)) - sys.neighbor_finder.initialized = true - CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true compress_boolean_matrices!( - buffers.Morton_seq, sys.neighbor_finder.eligible, sys.neighbor_finder.special, - buffers.compressed_eligible, buffers.compressed_special, Val(N)) - end - CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true force_kernel!( - buffers.Morton_seq, buffers.fs_mat, buffers.box_mins, buffers.box_maxs, sys.coords, - sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.force_units), pairwise_inters, - sys.boundary, step_n, buffers.compressed_special, buffers.compressed_eligible, - Val(T), Val(D)) - end +function Molly.pairwise_force_gpu!(buffers, sys::System{D, AT, T}, pairwise_inters, + nbs::Molly.NoNeighborList, step_n) where {D, AT <: CuArray, T} + kernel = @cuda launch=false pairwise_force_kernel_nonl!( + buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, step_n, + Val(D), Val(sys.force_units)) + conf = launch_configuration(kernel.fun) + threads_basic = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512")) + nthreads = min(length(sys.atoms), threads_basic, conf.threads) + nthreads = cld(nthreads, WARPSIZE) * WARPSIZE + n_blocks_i = cld(length(sys.atoms), WARPSIZE) + n_blocks_j = cld(length(sys.atoms), nthreads) + kernel(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, + step_n, Val(D), Val(sys.force_units); threads=nthreads, + blocks=(n_blocks_i, n_blocks_j)) return buffers end -function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, true, T}, pairwise_inters, nbs, step_n) where {D, T} - if typeof(nbs) == NoNeighborList - n_threads_gpu, n_blocks = cuda_threads_blocks_pairwise(length(nbs)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks pairwise_pe_kernel!( - pe_vec_nounits, sys.coords, sys.velocities, 
sys.atoms, sys.boundary, pairwise_inters, nbs, step_n, Val(sys.energy_units)) - else - # The ordering is always recomputed for potential energy - # Different buffers are used to the forces case, so sys.neighbor_finder.initialized - # is not updated - N = length(sys.coords) - n_blocks = cld(N, WARPSIZE) - r_cut = sys.neighbor_finder.dist_cutoff +function Molly.pairwise_force_gpu!(buffers, sys::System{D, AT, T}, pairwise_inters, nbs::Nothing, + step_n) where {D, AT <: CuArray, T} + N = length(sys.coords) + n_blocks = cld(N, WARPSIZE) + r_cut = sys.neighbor_finder.dist_cutoff + if step_n % sys.neighbor_finder.n_steps_reorder == 0 || !sys.neighbor_finder.initialized Morton_bits = 4 w = r_cut - typeof(ustrip(r_cut))(0.1) * unit(r_cut) Morton_seq_cpu = sorted_Morton_seq(Array(sys.coords), w, Morton_bits) copyto!(buffers.Morton_seq, Morton_seq_cpu) CUDA.@sync @cuda blocks=(cld(N, WARPSIZE),) threads=(32,) kernel_min_max!( - buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, - Val(N), sys.boundary, Val(D)) - CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true energy_kernel!( - buffers.Morton_seq, pe_vec_nounits, buffers.box_mins, buffers.box_maxs, sys.coords, - sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.energy_units), pairwise_inters, - sys.boundary, step_n, sys.neighbor_finder.special, sys.neighbor_finder.eligible, - Val(T), Val(D)) - end + buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, Val(N), + sys.boundary, Val(D)) + sys.neighbor_finder.initialized = true + CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true compress_boolean_matrices!( + buffers.Morton_seq, sys.neighbor_finder.eligible, sys.neighbor_finder.special, + buffers.compressed_eligible, buffers.compressed_special, Val(N)) + end + CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true force_kernel!( + buffers.Morton_seq, buffers.fs_mat, buffers.box_mins, buffers.box_maxs, sys.coords, + 
sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.force_units), pairwise_inters, + sys.boundary, step_n, buffers.compressed_special, buffers.compressed_eligible, + Val(T), Val(D)) + return buffers +end + +function Molly.pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT, T}, pairwise_inters, + nbs::Nothing, step_n) where {D, AT <: CuArray, T} + # The ordering is always recomputed for potential energy + # Different buffers are used to the forces case, so sys.neighbor_finder.initialized + # is not updated + N = length(sys.coords) + n_blocks = cld(N, WARPSIZE) + r_cut = sys.neighbor_finder.dist_cutoff + Morton_bits = 4 + w = r_cut - typeof(ustrip(r_cut))(0.1) * unit(r_cut) + Morton_seq_cpu = sorted_Morton_seq(Array(sys.coords), w, Morton_bits) + copyto!(buffers.Morton_seq, Morton_seq_cpu) + CUDA.@sync @cuda blocks=(cld(N, WARPSIZE),) threads=(32,) kernel_min_max!( + buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, + Val(N), sys.boundary, Val(D)) + CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true energy_kernel!( + buffers.Morton_seq, pe_vec_nounits, buffers.box_mins, buffers.box_maxs, sys.coords, + sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.energy_units), pairwise_inters, + sys.boundary, step_n, sys.neighbor_finder.special, sys.neighbor_finder.eligible, + Val(T), Val(D)) return pe_vec_nounits end @@ -276,7 +284,6 @@ function compress_boolean_matrices!(sorted_seq, eligible_matrix, special_matrix, return nothing end - #= **The No-neighborlist pairwise force summation kernel (algorithm by Eastman, see https://onlinelibrary.wiley.com/doi/full/10.1002/jcc.21413)**: 1. 
Case j < n_blocks && i < j, i.e., `WARPSIZE`×`WARPSIZE` tiles: For such tiles each row is assiged to a different thread in a warp which calculates the @@ -406,7 +413,7 @@ function force_kernel!( spec = (special_bitmask >> (warpsize() - shuffle_idx)) | (special_bitmask << shuffle_idx) condition = (excl & 0x1) == true && r2 <= r_cut * r_cut - f = condition ? sum_pairwise_forces( + f = condition ? Molly.sum_pairwise_forces( inters_tuple, atoms_i, atoms_j_shuffle, Val(force_units), @@ -471,7 +478,7 @@ function force_kernel!( spec = (special_bitmask >> (warpsize() - m)) | (special_bitmask << m) condition = (excl & 0x1) == true && r2 <= r_cut * r_cut - f = condition ? sum_pairwise_forces( + f = condition ? Molly.sum_pairwise_forces( inters_tuple, atoms_i, atoms_j, Val(force_units), @@ -521,7 +528,7 @@ function force_kernel!( spec = (special_bitmask >> (warpsize() - m)) | (special_bitmask << m) condition = (excl & 0x1) == true && r2 <= r_cut * r_cut - f = condition ? sum_pairwise_forces( + f = condition ? Molly.sum_pairwise_forces( inters_tuple, atoms_i, atoms_j, Val(force_units), @@ -568,7 +575,7 @@ function force_kernel!( spec = (special_bitmask >> (warpsize() - m)) | (special_bitmask << m) condition = (excl & 0x1) == true && r2 <= r_cut * r_cut - f = condition ? sum_pairwise_forces( + f = condition ? 
Molly.sum_pairwise_forces( inters_tuple, atoms_i, atoms_j, Val(force_units), @@ -595,7 +602,6 @@ function force_kernel!( return nothing end - function energy_kernel!( sorted_seq, energy_nounits, @@ -828,8 +834,6 @@ function energy_kernel!( return nothing end - - function pairwise_force_kernel_nonl!(forces::AbstractArray{T}, coords_var, velocities_var, atoms_var, boundary, inters, step_n, ::Val{D}, ::Val{F}) where {T, D, F} coords = CUDA.Const(coords_var) @@ -857,8 +861,8 @@ function pairwise_force_kernel_nonl!(forces::AbstractArray{T}, coords_var, veloc j = j_0_tile + del_j if i != j atom_j, coord_j, vel_j = atoms[j], coords[j], velocities[j] - f = sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, coord_j, - boundary, vel_i, vel_j, step_n) + f = Molly.sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, + coord_j, boundary, vel_i, vel_j, step_n) for dim in 1:D forces_shmem[dim, tidx] += -ustrip(f[dim]) end @@ -882,7 +886,7 @@ function pairwise_force_kernel_nonl!(forces::AbstractArray{T}, coords_var, veloc @inbounds for _ in 1:tilesteps sync_warp() atom_j = atoms[j] - f = sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, coord_j, + f = Molly.sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, coord_j, boundary, vel_i, vel_j, step_n) for dim in 1:D forces_shmem[dim, tidx] += -ustrip(f[dim]) @@ -898,58 +902,14 @@ function pairwise_force_kernel_nonl!(forces::AbstractArray{T}, coords_var, veloc return nothing end -function pairwise_pe_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, inters, - neighbors_var, step_n, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - neighbors = CUDA.Const(neighbors_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(neighbors) - i, j, special = neighbors[inter_i] - coord_i, coord_j, vel_i, vel_j = coords[i], coords[j], 
velocities[i], velocities[j] - dr = vector(coord_i, coord_j, boundary) - pe = potential_energy_gpu(inters[1], dr, atoms[i], atoms[j], E, special, coord_i, coord_j, - boundary, vel_i, vel_j, step_n) - for inter in inters[2:end] - pe += potential_energy_gpu(inter, dr, atoms[i], atoms[j], E, special, coord_i, coord_j, - boundary, vel_i, vel_j, step_n) - end - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing -end - -@inline function sum_pairwise_forces(inters, atom_i, atom_j, ::Val{F}, special, coord_i, coord_j, - boundary, vel_i, vel_j, step_n) where F - dr = vector(coord_i, coord_j, boundary) - f_tuple = ntuple(length(inters)) do inter_type_i - force_gpu(inters[inter_type_i], dr, atom_i, atom_j, F, special, coord_i, coord_j, boundary, - vel_i, vel_j, step_n) - end - f = sum(f_tuple) - if unit(f[1]) != F - # This triggers an error but it isn't printed - # See https://discourse.julialang.org/t/error-handling-in-cuda-kernels/79692 - # for how to throw a more meaningful error - error("wrong force unit returned, was expecting $F but got $(unit(f[1]))") - end - return f -end - @inline function sum_pairwise_potentials(inters, atom_i, atom_j, ::Val{E}, special, coord_i, coord_j, boundary, vel_i, vel_j, step_n) where E dr = vector(coord_i, coord_j, boundary) pe_tuple = ntuple(length(inters)) do inter_type_i - SVector(potential_energy_gpu(inters[inter_type_i], dr, atom_i, atom_j, E, special, coord_i, coord_j, boundary, - vel_i, vel_j, step_n)) - # SVector was required to avoid a GPU error occurring with scalars (like the quantity returned by potential_energy_gpu) + # SVector was required to avoid a GPU error occurring with scalars + SVector(Molly.potential_energy_gpu(inters[inter_type_i], dr, atom_i, atom_j, E, special, + coord_i, coord_j, boundary, vel_i, vel_j, step_n)) end pe = sum(pe_tuple) if unit(pe[1]) != E @@ -961,281 +921,4 @@ end return pe end 
-function specific_force_gpu!(fs_mat, inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_1_atoms_kernel!(fs_mat, - coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.inters, - Val(D), Val(force_units)) - return fs_mat -end - -function specific_force_gpu!(fs_mat, inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_2_atoms_kernel!(fs_mat, - coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.js, - inter_list.inters, Val(D), Val(force_units)) - return fs_mat -end - -function specific_force_gpu!(fs_mat, inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_3_atoms_kernel!(fs_mat, - coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.js, - inter_list.ks, inter_list.inters, Val(D), Val(force_units)) - return fs_mat -end - -function specific_force_gpu!(fs_mat, inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_4_atoms_kernel!(fs_mat, - coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.js, - inter_list.ks, inter_list.ls, 
inter_list.inters, Val(D), Val(force_units)) - return fs_mat -end - -function specific_force_1_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, inters_var, ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i = is[inter_i] - fs = force_gpu(inters[inter_i], coords[i], boundary, atoms[i], F, velocities[i], step_n) - if unit(fs.f1[1]) != F - error("wrong force unit returned, was expecting $F") - end - for dim in 1:D - Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) - end - end - return nothing -end - -function specific_force_2_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, inters_var, ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j = is[inter_i], js[inter_i] - fs = force_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], atoms[j], F, - velocities[i], velocities[j], step_n) - if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F - error("wrong force unit returned, was expecting $F") - end - for dim in 1:D - Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) - Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim]) - end - end - return nothing -end - -function specific_force_3_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, ks_var, inters_var, ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = 
CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - ks = CUDA.Const(ks_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j, k = is[inter_i], js[inter_i], ks[inter_i] - fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, atoms[i], - atoms[j], atoms[k], F, velocities[i], velocities[j], velocities[k], step_n) - if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F - error("wrong force unit returned, was expecting $F") - end - for dim in 1:D - Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) - Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim]) - Atomix.@atomic :monotonic forces[dim, k] += ustrip(fs.f3[dim]) - end - end - return nothing -end - -function specific_force_4_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, ks_var, ls_var, inters_var, - ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - ks = CUDA.Const(ks_var) - ls = CUDA.Const(ls_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] - fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], boundary, - atoms[i], atoms[j], atoms[k], atoms[l], F, velocities[i], velocities[j], - velocities[k], velocities[l], step_n) - if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F || unit(fs.f4[1]) != F - error("wrong force unit returned, was expecting $F") - end - for dim in 1:D - Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) - Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim]) - Atomix.@atomic :monotonic forces[dim, k] += 
ustrip(fs.f3[dim]) - Atomix.@atomic :monotonic forces[dim, l] += ustrip(fs.f4[dim]) - end - end - return nothing -end - - -function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_1_atoms_kernel!( - pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, - inter_list.inters, Val(energy_units)) - return pe_vec_nounits -end - -function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_2_atoms_kernel!( - pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, - inter_list.js, inter_list.inters, Val(energy_units)) - return pe_vec_nounits -end - -function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_3_atoms_kernel!( - pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, - inter_list.js, inter_list.ks, inter_list.inters, Val(energy_units)) - return pe_vec_nounits -end - -function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda 
threads=n_threads_gpu blocks=n_blocks specific_pe_4_atoms_kernel!( - pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, - inter_list.js, inter_list.ks, inter_list.ls, inter_list.inters, Val(energy_units)) - return pe_vec_nounits -end - -function specific_pe_1_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, inters_var, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i = is[inter_i] - pe = potential_energy_gpu(inters[inter_i], coords[i], boundary, atoms[i], E, - velocities[i], step_n) - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing -end - -function specific_pe_2_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, inters_var, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j = is[inter_i], js[inter_i] - pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], - atoms[j], E, velocities[i], velocities[j], step_n) - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing -end - -function specific_pe_3_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, ks_var, inters_var, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = 
CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - ks = CUDA.Const(ks_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j, k = is[inter_i], js[inter_i], ks[inter_i] - pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, - atoms[i], atoms[j], atoms[k], E, velocities[i], velocities[j], - velocities[k], step_n) - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing -end - -function specific_pe_4_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, ks_var, ls_var, inters_var, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - ks = CUDA.Const(ks_var) - ls = CUDA.Const(ls_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] - pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], - boundary, atoms[i], atoms[j], atoms[k], atoms[l], E, - velocities[i], velocities[j], velocities[k], velocities[l], - step_n) - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing end diff --git a/ext/MollyEnzymeExt.jl b/ext/MollyEnzymeExt.jl index 90e015390..26fd0e882 100644 --- a/ext/MollyEnzymeExt.jl +++ b/ext/MollyEnzymeExt.jl @@ -11,13 +11,10 @@ EnzymeRules.inactive(::typeof(Molly.n_infinite_dims), args...) = nothing EnzymeRules.inactive(::typeof(random_velocity), args...) 
= nothing EnzymeRules.inactive(::typeof(random_velocities), args...) = nothing EnzymeRules.inactive(::typeof(random_velocities!), args...) = nothing -EnzymeRules.inactive(::typeof(Molly.cuda_threads_blocks_pairwise), args...) = nothing -EnzymeRules.inactive(::typeof(Molly.cuda_threads_blocks_specific), args...) = nothing EnzymeRules.inactive(::typeof(Molly.check_force_units), args...) = nothing EnzymeRules.inactive(::typeof(Molly.check_energy_units), args...) = nothing EnzymeRules.inactive(::typeof(Molly.atoms_bonded_to_N), args...) = nothing EnzymeRules.inactive(::typeof(Molly.lookup_table), args...) = nothing -EnzymeRules.inactive(::typeof(Molly.cuda_threads_blocks_gbsa), args...) = nothing EnzymeRules.inactive(::typeof(find_neighbors), args...) = nothing EnzymeRules.inactive_type(::Type{DistanceNeighborFinder}) = nothing EnzymeRules.inactive(::typeof(visualize), args...) = nothing diff --git a/ext/MollyPythonCallExt.jl b/ext/MollyPythonCallExt.jl index e1afaeb82..acbb6c675 100644 --- a/ext/MollyPythonCallExt.jl +++ b/ext/MollyPythonCallExt.jl @@ -6,7 +6,7 @@ module MollyPythonCallExt using Molly using PythonCall import AtomsCalculators -using CUDA +using GPUArrays using StaticArrays using Unitful @@ -91,9 +91,9 @@ end uconvert_vec(x...) = uconvert.(x...) -function AtomsCalculators.forces(sys::System{D, G, T}, +function AtomsCalculators.forces(sys::System{D, AT, T}, ase_calc::ASECalculator; - kwargs...) where {D, G, T} + kwargs...) where {D, AT, T} update_ase_calc!(ase_calc, sys) forces_py = ase_calc.ase_atoms.get_forces() forces_flat = reshape(transpose(pyconvert(Matrix{T}, forces_py)), length(sys) * D) @@ -105,12 +105,12 @@ function AtomsCalculators.forces(sys::System{D, G, T}, else fs_unit = uconvert_vec.(sys.force_units, fs * u"eV/Å") end - return G ? 
CuArray(fs_unit) : fs_unit + return AT(fs_unit) end -function AtomsCalculators.potential_energy(sys::System{D, G, T}, +function AtomsCalculators.potential_energy(sys::System{D, AT, T}, ase_calc::ASECalculator; - kwargs...) where {D, G, T} + kwargs...) where {D, AT, T} update_ase_calc!(ase_calc, sys) pe_py = ase_calc.ase_atoms.get_potential_energy() pe = pyconvert(T, pe_py) diff --git a/src/Molly.jl b/src/Molly.jl index 19664debc..e57e17ced 100644 --- a/src/Molly.jl +++ b/src/Molly.jl @@ -11,13 +11,14 @@ import BioStructures # Imported to avoid clashing names using CellListMap import Chemfiles using Combinatorics -using CUDA using DataStructures using Distances using Distributions using EzXML using FLoops +using GPUArrays using Graphs +using KernelAbstractions using NearestNeighbors using PeriodicTable using SimpleCrystals @@ -34,7 +35,7 @@ include("types.jl") include("units.jl") include("spatial.jl") include("cutoffs.jl") -include("cuda.jl") +include("kernels.jl") include("force.jl") include("interactions/lennard_jones.jl") include("interactions/soft_sphere.jl") diff --git a/src/analysis.jl b/src/analysis.jl index 01429b5a2..5ad057b97 100644 --- a/src/analysis.jl +++ b/src/analysis.jl @@ -88,8 +88,7 @@ Calculate the hydrodynamic radius of a set of coordinates. """ function hydrodynamic_radius(coords::AbstractArray{SVector{D, T}}, boundary) where {D, T} n_atoms = length(coords) - diag_cpu = Diagonal(ones(T, n_atoms)) - diag = isa(coords, CuArray) ? 
CuArray(diag_cpu) : diag_cpu + diag = array_type(coords)(Diagonal(ones(T, n_atoms))) dists = distances(coords, boundary) .+ diag sum_inv_dists = sum(inv.(dists)) - sum(inv(diag)) inv_R_hyd = sum_inv_dists / (2 * n_atoms^2) diff --git a/src/coupling.jl b/src/coupling.jl index da30dfae1..ae4fc7f55 100644 --- a/src/coupling.jl +++ b/src/coupling.jl @@ -58,10 +58,10 @@ struct AndersenThermostat{T, C} coupling_const::C end -function apply_coupling!(sys::System{D, false}, thermostat::AndersenThermostat, sim, +function apply_coupling!(sys::System, thermostat::AndersenThermostat, sim, neighbors=nothing, step_n::Integer=0; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where D + rng=Random.default_rng()) for i in eachindex(sys) if rand(rng) < (sim.dt / thermostat.coupling_const) sys.velocities[i] = random_velocity(mass(sys.atoms[i]), thermostat.temperature, sys.k; @@ -71,14 +71,14 @@ function apply_coupling!(sys::System{D, false}, thermostat::AndersenThermostat, return false end -function apply_coupling!(sys::System{D, true, T}, thermostat::AndersenThermostat, sim, +function apply_coupling!(sys::System{D, AT, T}, thermostat::AndersenThermostat, sim, neighbors=nothing, step_n::Integer=0; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, T} + rng=Random.default_rng()) where {D, AT <: AbstractGPUArray, T} atoms_to_bump = T.(rand(rng, length(sys)) .< (sim.dt / thermostat.coupling_const)) atoms_to_leave = one(T) .- atoms_to_bump - atoms_to_bump_dev = move_array(atoms_to_bump, sys) - atoms_to_leave_dev = move_array(atoms_to_leave, sys) + atoms_to_bump_dev = AT(atoms_to_bump) + atoms_to_leave_dev = AT(atoms_to_leave) vs = random_velocities(sys, thermostat.temperature; rng=rng) sys.velocities .= sys.velocities .* atoms_to_leave_dev .+ vs .* atoms_to_bump_dev return false @@ -231,9 +231,9 @@ function MonteCarloBarostat(P, T, boundary; n_steps=30, n_iterations=1, scale_fa max_volume_frac, trial_find_neighbors, 0, 0) end -function 
apply_coupling!(sys::System{D, G, T}, barostat::MonteCarloBarostat, sim, neighbors=nothing, +function apply_coupling!(sys::System{D, AT, T}, barostat::MonteCarloBarostat, sim, neighbors=nothing, step_n::Integer=0; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} if !iszero(step_n % barostat.n_steps) return false end @@ -371,13 +371,13 @@ function MonteCarloAnisotropicBarostat(pressure::SVector{D}, ) end -function apply_coupling!(sys::System{D, G, T}, +function apply_coupling!(sys::System{D, AT, T}, barostat::MonteCarloAnisotropicBarostat{D}, sim, neighbors=nothing, step_n::Integer=0; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} !iszero(step_n % barostat.n_steps) && return false all(isnothing, barostat.pressure) && return false @@ -546,13 +546,13 @@ function MonteCarloMembraneBarostat(pressure, ) end -function apply_coupling!(sys::System{D, G, T}, +function apply_coupling!(sys::System{D, AT, T}, barostat::MonteCarloMembraneBarostat, sim, neighbors=nothing, step_n::Integer=0; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} !iszero(step_n % barostat.n_steps) && return false kT = energy_remove_mol(sys.k * barostat.temperature) diff --git a/src/energy.jl b/src/energy.jl index 6fdd265c2..b257c6fa7 100644 --- a/src/energy.jl +++ b/src/energy.jl @@ -33,7 +33,7 @@ E_k = \frac{1}{2} \sum_{i} m_i v_i^2 ``` where ``m_i`` is the mass and ``v_i`` is the velocity of atom ``i``. 
""" -function kinetic_energy(sys::System{D, G, T}) where {D, G, T} +function kinetic_energy(sys::System) ke = kinetic_energy_noconvert(sys) return uconvert(sys.energy_units, ke) end @@ -78,8 +78,8 @@ function potential_energy(sys; n_threads::Integer=Threads.nthreads()) return potential_energy(sys, find_neighbors(sys; n_threads=n_threads); n_threads=n_threads) end -function potential_energy(sys::System{D, false, T}, neighbors, step_n::Integer=0; - n_threads::Integer=Threads.nthreads()) where {D, T} +function potential_energy(sys::System{D, AT, T}, neighbors, step_n::Integer=0; + n_threads::Integer=Threads.nthreads()) where {D, AT, T} pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters)) pairwise_inters_nl = filter( use_neighbors, values(sys.pairwise_inters)) sils_1_atoms = filter(il -> il isa InteractionList1Atoms, values(sys.specific_inter_lists)) @@ -253,11 +253,11 @@ function specific_pe(atoms, coords, velocities, boundary, energy_units, sils_1_a return pe end -function potential_energy(sys::System{D, true, T}, neighbors, step_n::Integer=0; - n_threads::Integer=Threads.nthreads()) where {D, T} - pe_vec_nounits = CUDA.zeros(T, 1) +function potential_energy(sys::System{D, AT, T}, neighbors, step_n::Integer=0; + n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T} val_ft = Val(T) - buffers = init_forces_buffer!(sys, ustrip_vec.(zero(sys.coords)), 1) + pe_vec_nounits = KernelAbstractions.zeros(get_backend(sys.coords), T, 1) + buffers = init_forces_buffer!(sys, ustrip_vec.(zero(sys.coords)), 1, true) pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters)) if length(pairwise_inters_nonl) > 0 @@ -267,7 +267,7 @@ function potential_energy(sys::System{D, true, T}, neighbors, step_n::Integer=0; pairwise_inters_nl = filter(use_neighbors, values(sys.pairwise_inters)) if length(pairwise_inters_nl) > 0 - pairwise_pe_gpu!(pe_vec_nounits, buffers, sys, pairwise_inters_nl, nothing, step_n) + 
pairwise_pe_gpu!(pe_vec_nounits, buffers, sys, pairwise_inters_nl, neighbors, step_n) end for inter_list in values(sys.specific_inter_lists) diff --git a/src/force.jl b/src/force.jl index 0b54db195..bbb67e38c 100644 --- a/src/force.jl +++ b/src/force.jl @@ -132,17 +132,19 @@ struct ForcesBuffer{F, C, M, R} compressed_special::R end -function init_forces_buffer!(sys, forces_nounits::CuArray{SVector{D, T}}, n_threads) where {D, T} +function init_forces_buffer!(sys, forces_nounits::AbstractGPUArray{SVector{D, T}}, n_threads, + for_pe::Bool=false) where {D, T} N = length(forces_nounits) C = eltype(eltype(sys.coords)) n_blocks = cld(N, 32) - fs_mat = CUDA.zeros(T, D, N) - box_mins = CUDA.zeros(C, n_blocks, D) - box_maxs = CUDA.zeros(C, n_blocks, D) - Morton_seq = CUDA.zeros(Int32, N) - compressed_eligible = CUDA.zeros(UInt32, 32, n_blocks, n_blocks) - compressed_special = CUDA.zeros(UInt32, 32, n_blocks, n_blocks) - if sys.neighbor_finder isa GPUNeighborFinder + backend = get_backend(forces_nounits) + fs_mat = KernelAbstractions.zeros(backend, T, D, N) + box_mins = KernelAbstractions.zeros(backend, C, n_blocks, D) + box_maxs = KernelAbstractions.zeros(backend, C, n_blocks, D) + Morton_seq = KernelAbstractions.zeros(backend, Int32, N) + compressed_eligible = KernelAbstractions.zeros(backend, UInt32, 32, n_blocks, n_blocks) + compressed_special = KernelAbstractions.zeros(backend, UInt32, 32, n_blocks, n_blocks) + if !for_pe && sys.neighbor_finder isa GPUNeighborFinder sys.neighbor_finder.initialized = false end return ForcesBuffer(fs_mat, box_mins, box_maxs, Morton_seq, compressed_eligible, compressed_special) @@ -165,8 +167,8 @@ function forces(sys, neighbors, step_n::Integer=0; n_threads::Integer=Threads.nt return forces_nounits .* sys.force_units end -function forces_nounits!(fs_nounits, sys::System{D, false}, neighbors, fs_chunks=nothing, - step_n::Integer=0; n_threads::Integer=Threads.nthreads()) where D +function forces_nounits!(fs_nounits, sys::System, neighbors, 
fs_chunks=nothing, + step_n::Integer=0; n_threads::Integer=Threads.nthreads()) pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters)) pairwise_inters_nl = filter( use_neighbors, values(sys.pairwise_inters)) sils_1_atoms = filter(il -> il isa InteractionList1Atoms, values(sys.specific_inter_lists)) @@ -367,9 +369,9 @@ function specific_forces!(fs_nounits, atoms, coords, velocities, boundary, force return fs_nounits end -function forces_nounits!(fs_nounits, sys::System{D, true, T}, neighbors, +function forces_nounits!(fs_nounits, sys::System{D, AT, T}, neighbors, buffers, step_n::Integer=0; - n_threads::Integer=Threads.nthreads()) where {D, T} + n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T} fill!(buffers.fs_mat, zero(T)) val_ft = Val(T) @@ -382,7 +384,7 @@ function forces_nounits!(fs_nounits, sys::System{D, true, T}, neighbors, pairwise_inters_nl = filter(use_neighbors, values(sys.pairwise_inters)) if length(pairwise_inters_nl) > 0 - pairwise_force_gpu!(buffers, sys, pairwise_inters_nl, nothing, step_n) + pairwise_force_gpu!(buffers, sys, pairwise_inters_nl, neighbors, step_n) end for inter_list in values(sys.specific_inter_lists) @@ -401,5 +403,3 @@ function forces_nounits!(fs_nounits, sys::System{D, true, T}, neighbors, return fs_nounits end - - diff --git a/src/interactions/implicit_solvent.jl b/src/interactions/implicit_solvent.jl index 44e81da85..860314a91 100644 --- a/src/interactions/implicit_solvent.jl +++ b/src/interactions/implicit_solvent.jl @@ -411,10 +411,11 @@ function ImplicitSolventOBC(atoms::AbstractArray{Atom{TY, M, T, D, E}}, factor_solvent = zero(T(coulomb_const_units)) end - if isa(atoms, CuArray) - or = CuArray(offset_radii) - sor = CuArray(scaled_offset_radii) - is, js = CuArray(inds_i), CuArray(inds_j) + if isa(atoms, AbstractGPUArray) + AT = array_type(atoms) + or = AT(offset_radii) + sor = AT(scaled_offset_radii) + is, js = AT(inds_i), AT(inds_j) else or = offset_radii sor = 
scaled_offset_radii @@ -563,12 +564,13 @@ function ImplicitSolventGBN2(atoms::AbstractArray{Atom{TY, M, T, D, E}}, factor_solvent = zero(T(coulomb_const_units)) end - if isa(atoms, CuArray) - or = CuArray(offset_radii) - sor = CuArray(scaled_offset_radii) - is, js = CuArray(inds_i), CuArray(inds_j) - d0s, m0s = CuArray(table_d0), CuArray(table_m0) - αs, βs, γs = CuArray(αs_cpu), CuArray(βs_cpu), CuArray(γs_cpu) + if isa(atoms, AbstractGPUArray) + AT = array_type(atoms) + or = AT(offset_radii) + sor = AT(scaled_offset_radii) + is, js = AT(inds_i), AT(inds_j) + d0s, m0s = AT(table_d0), AT(table_m0) + αs, βs, γs = AT(αs_cpu), AT(βs_cpu), AT(γs_cpu) else or = offset_radii sor = scaled_offset_radii @@ -694,7 +696,7 @@ function born_radii_and_grad(inter::ImplicitSolventOBC{T}, coords, boundary) whe return Bs, B_grads, I_grads end -function born_radii_and_grad(inter::ImplicitSolventOBC, coords::CuArray, boundary) +function born_radii_and_grad(inter::ImplicitSolventOBC, coords::AbstractGPUArray, boundary) coords_i = @view coords[inter.is] coords_j = @view coords[inter.js] loop_res = born_radii_loop_OBC.(coords_i, coords_j, inter.oris, inter.srjs, @@ -766,7 +768,7 @@ function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords, boundary) wh return Bs, B_grads, I_grads end -function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords::CuArray, boundary) where T +function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords::AbstractGPUArray, boundary) where T Is, I_grads = gbsa_born_gpu(coords, inter.offset_radii, inter.scaled_offset_radii, inter.dist_cutoff, inter.offset, inter.neck_scale, inter.neck_cut, inter.d0s, inter.m0s, boundary, Val(T)) @@ -778,42 +780,41 @@ function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords::CuArray, bou return Bs, B_grads, I_grads end -function cuda_threads_blocks_gbsa(n_inters) +function gpu_threads_blocks_gbsa(n_inters) n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_IMPLICIT", "512")) - n_blocks = 
cld(n_inters, n_threads_gpu) - return n_threads_gpu, n_blocks + return n_threads_gpu end function gbsa_born_gpu(coords::AbstractArray{SVector{D, C}}, offset_radii, scaled_offset_radii, dist_cutoff, offset, neck_scale, neck_cut, d0s, m0s, boundary, ::Val{T}) where {D, C, T} + backend = get_backend(coords) n_atoms = length(coords) - Is_nounits = CUDA.zeros(T, n_atoms) - I_grads_nounits = CUDA.zeros(T, n_atoms, n_atoms) + Is_nounits = KernelAbstractions.zeros(backend, T, n_atoms) + I_grads_nounits = KernelAbstractions.zeros(backend, T, n_atoms, n_atoms) n_inters = n_atoms ^ 2 - n_threads_gpu, n_blocks = cuda_threads_blocks_gbsa(n_inters) + n_threads_gpu = gpu_threads_blocks_gbsa(n_inters) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks gbsa_born_kernel!( - Is_nounits, I_grads_nounits, coords, offset_radii, scaled_offset_radii, - dist_cutoff, offset, neck_scale, neck_cut, d0s, m0s, boundary, Val(C)) + kernel! = gbsa_born_kernel!(backend, n_threads_gpu) + kernel!(Is_nounits, I_grads_nounits, coords, offset_radii, + scaled_offset_radii, dist_cutoff, offset, neck_scale, + neck_cut, d0s, m0s, boundary, Val(C), ndrange=n_inters) Is = Is_nounits * unit(dist_cutoff)^-1 I_grads = I_grads_nounits * unit(dist_cutoff)^-2 return Is, I_grads end -function gbsa_born_kernel!(Is, I_grads, coords_var, offset_radii_var, scaled_offset_radii_var, - dist_cutoff, offset, neck_scale, neck_cut, d0s_var, m0s_var, boundary, - ::Val{C}) where C - coords = CUDA.Const(coords_var) - offset_radii = CUDA.Const(offset_radii_var) - scaled_offset_radii = CUDA.Const(scaled_offset_radii_var) - d0s = CUDA.Const(d0s_var) - m0s = CUDA.Const(m0s_var) +@kernel function gbsa_born_kernel!(Is, I_grads, @Const(coords), + @Const(offset_radii), + @Const(scaled_offset_radii), + dist_cutoff, offset, neck_scale, neck_cut, + @Const(d0s), @Const(m0s), boundary, + ::Val{C}) where C n_atoms = length(coords) n_inters = n_atoms ^ 2 - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + inter_i = 
@index(Global, Linear) @inbounds if inter_i <= n_inters i = cld(inter_i, n_atoms) @@ -849,12 +850,11 @@ function gbsa_born_kernel!(Is, I_grads, coords_var, offset_radii_var, scaled_off numer = 2 * r_d0_strip + 9 * r_d0_strip^5 / 5 I_grad -= 10 * neck_scale * m0 * numer / (denom^2 * unit(dist_cutoff)) end - Atomix.@atomic :monotonic Is[i] += ustrip(unit(dist_cutoff)^-1, I) + Atomix.@atomic Is[i] += ustrip(unit(dist_cutoff)^-1, I) I_grads[i, j] += ustrip(unit(dist_cutoff)^-2, I_grad) end end end - return nothing end function gb_force_loop_1(coord_i, coord_j, i, j, charge_i, charge_j, Bi, Bj, dist_cutoff, @@ -948,8 +948,8 @@ function forces_gbsa(sys, inter, Bs, B_grads, I_grads, born_forces, atom_charges return fs end -function forces_gbsa(sys::System{D, true, T}, inter, Bs, B_grads, I_grads, born_forces, - atom_charges) where {D, T} +function forces_gbsa(sys::System{D, AT, T}, inter, Bs, B_grads, I_grads, born_forces, + atom_charges) where {D, AT <: AbstractGPUArray, T} fs_mat_1, born_forces_mod_ustrip = gbsa_force_1_gpu(sys.coords, sys.boundary, inter.dist_cutoff, inter.factor_solute, inter.factor_solvent, inter.kappa, Bs, atom_charges, sys.force_units) @@ -965,16 +965,17 @@ end function gbsa_force_1_gpu(coords::AbstractArray{SVector{D, C}}, boundary, dist_cutoff, factor_solute, factor_solvent, kappa, Bs, atom_charges::AbstractArray{T}, force_units) where {D, C, T} + backend = get_backend(coords) n_atoms = length(coords) - fs_mat = CUDA.zeros(T, D, n_atoms) - born_forces_mod_ustrip = CUDA.zeros(T, n_atoms) + fs_mat = KernelAbstractions.zeros(backend, T, D, n_atoms) + born_forces_mod_ustrip = KernelAbstractions.zeros(backend, T, n_atoms) n_inters = n_atoms_to_n_pairs(n_atoms) + n_atoms - n_threads_gpu, n_blocks = cuda_threads_blocks_gbsa(n_inters) + n_threads_gpu = gpu_threads_blocks_gbsa(n_inters) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks gbsa_force_1_kernel!( - fs_mat, born_forces_mod_ustrip, coords, boundary, dist_cutoff, - factor_solute, 
factor_solvent, kappa, Bs, atom_charges, - Val(D), Val(force_units)) + kernel! = gbsa_force_1_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, born_forces_mod_ustrip, coords, boundary, dist_cutoff, + factor_solute, factor_solvent, kappa, Bs, atom_charges, + Val(D), Val(force_units), ndrange=n_inters) return fs_mat, born_forces_mod_ustrip end @@ -982,29 +983,30 @@ end function gbsa_force_2_gpu(coords::AbstractArray{SVector{D, C}}, boundary, dist_cutoff, Bs, B_grads, I_grads, born_forces, offset_radii, scaled_offset_radii, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) n_atoms = length(coords) - fs_mat = CUDA.zeros(T, D, n_atoms) + fs_mat = KernelAbstractions.zeros(backend, T, D, n_atoms) n_inters = n_atoms ^ 2 - n_threads_gpu, n_blocks = cuda_threads_blocks_gbsa(n_inters) + n_threads_gpu = gpu_threads_blocks_gbsa(n_inters) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks gbsa_force_2_kernel!( - fs_mat, born_forces, coords, boundary, dist_cutoff, offset_radii, - scaled_offset_radii, Bs, B_grads, I_grads, Val(D), Val(force_units)) + kernel! 
= gbsa_force_2_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, born_forces, coords, boundary, dist_cutoff, offset_radii, + scaled_offset_radii, Bs, B_grads, I_grads, Val(D), Val(force_units), + ndrange=n_inters) return fs_mat end -function gbsa_force_1_kernel!(forces, born_forces_mod_ustrip, coords_var, boundary, dist_cutoff, - factor_solute, factor_solvent, kappa, Bs_var, atom_charges_var, - ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - Bs = CUDA.Const(Bs_var) - atom_charges = CUDA.Const(atom_charges_var) +@kernel function gbsa_force_1_kernel!(forces, born_forces_mod_ustrip, + @Const(coords), boundary, dist_cutoff, + factor_solute, factor_solvent, kappa, + @Const(Bs), @Const(atom_charges), + ::Val{D}, ::Val{F}) where {D, F} n_atoms = length(coords) n_inters_not_self = n_atoms_to_n_pairs(n_atoms) n_inters = n_inters_not_self + n_atoms - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + inter_i = @index(Global, Linear) @inbounds if inter_i <= n_inters if inter_i <= n_inters_not_self @@ -1034,38 +1036,33 @@ function gbsa_force_1_kernel!(forces, born_forces_mod_ustrip, coords_var, bounda dGpol_dalpha2_ij = -Gpol * exp_term * (1 + D_term) / (2 * denominator2) change_born_force_i = dGpol_dalpha2_ij * Bj - Atomix.@atomic :monotonic born_forces_mod_ustrip[i] += ustrip(change_born_force_i) + Atomix.@atomic born_forces_mod_ustrip[i] += ustrip(change_born_force_i) if i != j change_born_force_j = dGpol_dalpha2_ij * Bi - Atomix.@atomic :monotonic born_forces_mod_ustrip[j] += ustrip(change_born_force_j) + Atomix.@atomic born_forces_mod_ustrip[j] += ustrip(change_born_force_j) fdr = dr * dGpol_dr if unit(fdr[1]) != F error("wrong force unit returned, was expecting $F but got $(unit(fdr[1]))") end for dim in 1:D fval = ustrip(fdr[dim]) - Atomix.@atomic :monotonic forces[dim, i] += fval - Atomix.@atomic :monotonic forces[dim, j] += -fval + Atomix.@atomic forces[dim, i] += fval + Atomix.@atomic forces[dim, j] += -fval end end end end - return 
nothing end -function gbsa_force_2_kernel!(forces, born_forces, coords_var, boundary, dist_cutoff, or_var, - sor_var, Bs_var, B_grads_var, I_grads_var, ::Val{D}, - ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - or = CUDA.Const(or_var) - sor = CUDA.Const(sor_var) - Bs = CUDA.Const(Bs_var) - B_grads = CUDA.Const(B_grads_var) - I_grads = CUDA.Const(I_grads_var) +@kernel function gbsa_force_2_kernel!(forces, born_forces, @Const(coords), + boundary, dist_cutoff, @Const(or), + @Const(sor), @Const(Bs), + @Const(B_grads), @Const(I_grads), + ::Val{D}, ::Val{F}) where {D, F} n_atoms = length(coords) n_inters = n_atoms ^ 2 - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + inter_i = @index(Global, Linear) @inbounds if inter_i <= n_inters i = cld(inter_i, n_atoms) @@ -1091,14 +1088,13 @@ function gbsa_force_2_kernel!(forces, born_forces, coords_var, boundary, dist_cu end for dim in 1:D fval = ustrip(fdr[dim]) - Atomix.@atomic :monotonic forces[dim, i] += fval - Atomix.@atomic :monotonic forces[dim, j] += -fval + Atomix.@atomic forces[dim, i] += fval + Atomix.@atomic forces[dim, j] += -fval end end end end end - return nothing end function AtomsCalculators.forces(sys, inter::AbstractGBSA; kwargs...) @@ -1153,7 +1149,7 @@ function gb_energy_loop(coord_i, coord_j, i, j, charge_i, charge_j, Bi, Bj, ori, end end -function AtomsCalculators.potential_energy(sys::System{<:Any, false, T}, inter::AbstractGBSA; +function AtomsCalculators.potential_energy(sys::System{<:Any, <:Any, T}, inter::AbstractGBSA; kwargs...) where T coords, boundary = sys.coords, sys.boundary Bs, B_grads, I_grads = born_radii_and_grad(inter, coords, boundary) @@ -1173,7 +1169,8 @@ function AtomsCalculators.potential_energy(sys::System{<:Any, false, T}, inter:: return E end -function AtomsCalculators.potential_energy(sys::System{<:Any, true}, inter::AbstractGBSA; kwargs...) +function AtomsCalculators.potential_energy(sys::System{<:Any, AT}, inter::AbstractGBSA; + kwargs...) 
where AT <: AbstractGPUArray coords, atoms, boundary = sys.coords, sys.atoms, sys.boundary Bs, B_grads, I_grads = born_radii_and_grad(inter, coords, boundary) diff --git a/src/kernels.jl b/src/kernels.jl new file mode 100644 index 000000000..92cb16f50 --- /dev/null +++ b/src/kernels.jl @@ -0,0 +1,365 @@ +# KernelAbstractions.jl kernels + +@inline function sum_pairwise_forces(inters, atom_i, atom_j, ::Val{F}, special, coord_i, coord_j, + boundary, vel_i, vel_j, step_n) where F + dr = vector(coord_i, coord_j, boundary) + f_tuple = ntuple(length(inters)) do inter_type_i + force_gpu(inters[inter_type_i], dr, atom_i, atom_j, F, special, coord_i, coord_j, boundary, + vel_i, vel_j, step_n) + end + f = sum(f_tuple) + if unit(f[1]) != F + error("wrong force unit returned, was expecting $F but got $(unit(f[1]))") + end + return f +end + +function gpu_threads_pairwise(n_neighbors) + n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512")) + return n_threads_gpu +end + +function gpu_threads_specific(n_inters) + n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_SPECIFIC", "128")) + return n_threads_gpu +end + +function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, + pairwise_inters, neighbors, step_n) where {D, AT <: AbstractGPUArray, T} + if isnothing(neighbors) + error("neighbors is nothing, if you are using GPUNeighborFinder on a non-NVIDIA GPU you " * + "should use DistanceNeighborFinder instead") + end + if typeof(neighbors) == NoNeighborList + nbs = neighbors + else + nbs = @view neighbors.list[1:neighbors.n] + end + if length(neighbors) > 0 + backend = get_backend(sys.coords) + n_threads_gpu = gpu_threads_pairwise(length(nbs)) + kernel! 
= pairwise_force_kernel_nl!(backend, n_threads_gpu) + kernel!(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, + nbs, step_n, Val(D), Val(sys.force_units); ndrange=length(nbs)) + end + return buffers +end + +@kernel inbounds=true function pairwise_force_kernel_nl!(forces, @Const(coords), + @Const(velocities), @Const(atoms), + boundary, inters, + @Const(neighbors), step_n, ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + if inter_i <= length(neighbors) + i, j, special = neighbors[inter_i] + f = sum_pairwise_forces(inters, atoms[i], atoms[j], Val(F), special, coords[i], coords[j], + boundary, velocities[i], velocities[j], step_n) + for dim in 1:D + fval = ustrip(f[dim]) + Atomix.@atomic forces[dim, i] += -fval + Atomix.@atomic forces[dim, j] += fval + end + end +end + +function specific_force_gpu!(fs_mat, inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! = specific_force_1_atoms_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.inters, Val(D), Val(force_units); + ndrange = length(inter_list)) + return fs_mat +end + +function specific_force_gpu!(fs_mat, inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! 
= specific_force_2_atoms_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.inters, Val(D), Val(force_units); + ndrange = length(inter_list)) + return fs_mat +end + +function specific_force_gpu!(fs_mat, inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! = specific_force_3_atoms_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.ks, inter_list.inters, Val(D), + Val(force_units); ndrange = length(inter_list)) + return fs_mat +end + +function specific_force_gpu!(fs_mat, inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! 
= specific_force_4_atoms_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.ks, inter_list.ls, inter_list.inters, + Val(D), Val(force_units); ndrange = length(inter_list)) + return fs_mat +end + +@kernel inbounds=true function specific_force_1_atoms_kernel!(forces, @Const(coords), + @Const(velocities), + @Const(atoms), boundary, + step_n, @Const(is), + @Const(inters), ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + if inter_i <= length(is) + i = is[inter_i] + fs = force_gpu(inters[inter_i], coords[i], boundary, atoms[i], F, velocities[i], step_n) + if unit(fs.f1[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic forces[dim, i] += ustrip(fs.f1[dim]) + end + end +end + +@kernel inbounds=true function specific_force_2_atoms_kernel!(forces, @Const(coords), + @Const(velocities), + @Const(atoms), boundary, + step_n, @Const(is), @Const(js), + @Const(inters), ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + if inter_i <= length(is) + i, j = is[inter_i], js[inter_i] + fs = force_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], atoms[j], F, + velocities[i], velocities[j], step_n) + if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic forces[dim, i] += ustrip(fs.f1[dim]) + Atomix.@atomic forces[dim, j] += ustrip(fs.f2[dim]) + end + end +end + +@kernel inbounds=true function specific_force_3_atoms_kernel!(forces, @Const(coords), + @Const(velocities), + @Const(atoms), boundary, + step_n, @Const(is), + @Const(js), @Const(ks), + @Const(inters), ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + if inter_i <= length(is) + i, j, k = is[inter_i], js[inter_i], ks[inter_i] + fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, atoms[i], + 
atoms[j], atoms[k], F, velocities[i], velocities[j], velocities[k], step_n) + if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic forces[dim, i] += ustrip(fs.f1[dim]) + Atomix.@atomic forces[dim, j] += ustrip(fs.f2[dim]) + Atomix.@atomic forces[dim, k] += ustrip(fs.f3[dim]) + end + end +end + +@kernel inbounds=true function specific_force_4_atoms_kernel!(forces, @Const(coords), + @Const(velocities), + @Const(atoms), boundary, + step_n, @Const(is), + @Const(js), @Const(ks), + @Const(ls), + @Const(inters), ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + if inter_i <= length(is) + i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] + fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], boundary, + atoms[i], atoms[j], atoms[k], atoms[l], F, velocities[i], velocities[j], + velocities[k], velocities[l], step_n) + if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F || unit(fs.f4[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic forces[dim, i] += ustrip(fs.f1[dim]) + Atomix.@atomic forces[dim, j] += ustrip(fs.f2[dim]) + Atomix.@atomic forces[dim, k] += ustrip(fs.f3[dim]) + Atomix.@atomic forces[dim, l] += ustrip(fs.f4[dim]) + end + end +end + +function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT}, + pairwise_inters, neighbors, step_n) where {D, AT <: AbstractGPUArray} + if isnothing(neighbors) + error("neighbors is nothing, if you are using GPUNeighborFinder on a non-NVIDIA GPU you " * + "should use DistanceNeighborFinder instead") + end + if typeof(neighbors) == NoNeighborList + nbs = neighbors + else + nbs = @view neighbors.list[1:neighbors.n] + end + if length(neighbors) > 0 + backend = get_backend(sys.coords) + n_threads_gpu = gpu_threads_pairwise(length(nbs)) + kernel! 
= pairwise_pe_kernel!(backend, n_threads_gpu) + kernel!(pe_vec_nounits, sys.coords, sys.velocities, sys.atoms, sys.boundary, + pairwise_inters, nbs, step_n, Val(sys.energy_units); ndrange=length(nbs)) + end + return pe_vec_nounits +end + +@kernel inbounds=true function pairwise_pe_kernel!(energy, @Const(coords), @Const(velocities), + @Const(atoms), boundary, inters, + @Const(neighbors), step_n, ::Val{E}) where E + + inter_i = @index(Global, Linear) + + if inter_i <= length(neighbors) + i, j, special = neighbors[inter_i] + coord_i, coord_j, vel_i, vel_j = coords[i], coords[j], velocities[i], velocities[j] + dr = vector(coord_i, coord_j, boundary) + pe = potential_energy_gpu(inters[1], dr, atoms[i], atoms[j], E, special, coord_i, coord_j, + boundary, vel_i, vel_j, step_n) + for inter in inters[2:end] + pe += potential_energy_gpu(inter, dr, atoms[i], atoms[j], E, special, coord_i, coord_j, + boundary, vel_i, vel_j, step_n) + end + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic energy[1] += ustrip(pe) + end +end + +function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! = specific_pe_1_atoms_kernel!(backend, n_threads_gpu) + kernel!(pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.inters, Val(energy_units); ndrange = length(inter_list)) + return pe_vec_nounits +end + +function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} + + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! 
= specific_pe_2_atoms_kernel!(backend, n_threads_gpu) + kernel!(pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.inters, Val(energy_units); ndrange = length(inter_list)) + return pe_vec_nounits +end + +function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! = specific_pe_3_atoms_kernel!(backend, n_threads_gpu) + kernel!(pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.ks, inter_list.inters, Val(energy_units); + ndrange = length(inter_list)) + return pe_vec_nounits +end + +function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! 
= specific_pe_4_atoms_kernel!(backend, n_threads_gpu) + kernel!(pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.ks, inter_list.ls, inter_list.inters, Val(energy_units); + ndrange = length(inter_list)) + return pe_vec_nounits +end + +@kernel inbounds=true function specific_pe_1_atoms_kernel!(energy, @Const(coords), @Const(velocities), + @Const(atoms), boundary, step_n, @Const(is), @Const(inters), ::Val{E}) where E + + inter_i = @index(Global, Linear) + + if inter_i <= length(is) + i = is[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], boundary, atoms[i], E, + velocities[i], step_n) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic energy[1] += ustrip(pe) + end +end + +@kernel inbounds=true function specific_pe_2_atoms_kernel!(energy, @Const(coords), @Const(velocities), + @Const(atoms), boundary, step_n, @Const(is), @Const(js), @Const(inters), + ::Val{E}) where E + + + inter_i = @index(Global, Linear) + + if inter_i <= length(is) + i, j = is[inter_i], js[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], + atoms[j], E, velocities[i], velocities[j], step_n) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic energy[1] += ustrip(pe) + end +end + +@kernel inbounds=true function specific_pe_3_atoms_kernel!(energy, @Const(coords), @Const(velocities), + @Const(atoms), boundary, step_n, @Const(is), @Const(js), @Const(ks), + @Const(inters), ::Val{E}) where E + + inter_i = @index(Global, Linear) + + if inter_i <= length(is) + i, j, k = is[inter_i], js[inter_i], ks[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, + atoms[i], atoms[j], atoms[k], E, velocities[i], velocities[j], + velocities[k], step_n) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got 
$(unit(pe))") + end + Atomix.@atomic energy[1] += ustrip(pe) + end +end + +@kernel inbounds=true function specific_pe_4_atoms_kernel!(energy, @Const(coords), @Const(velocities), + @Const(atoms), boundary, step_n, @Const(is), @Const(js), @Const(ks), + @Const(ls), @Const(inters), ::Val{E}) where E + + inter_i = @index(Global, Linear) + + if inter_i <= length(is) + i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], + boundary, atoms[i], atoms[j], atoms[k], atoms[l], E, + velocities[i], velocities[j], velocities[k], velocities[l], + step_n) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic energy[1] += ustrip(pe) + end +end diff --git a/src/neighbors.jl b/src/neighbors.jl index 7f3625a88..dbece0938 100644 --- a/src/neighbors.jl +++ b/src/neighbors.jl @@ -43,13 +43,16 @@ find_neighbors(sys::System; kwargs...) = find_neighbors(sys, sys.neighbor_finder find_neighbors(sys::System, nf::NoNeighborFinder, args...; kwargs...) = nothing +# Indicates whether an array type is compatible with GPUNeighborFinder +uses_gpu_neighbor_finder(AT) = false + """ GPUNeighborFinder(; eligible, dist_cutoff, special, n_steps_reorder, initialized) Use the non-bonded forces/potential energy algorithm from [Eastman and Pande 2010](https://doi.org/10.1002/jcc.21413) to avoid calculating a neighbor list. -This is the recommended neighbor finder on GPU. +This is the recommended neighbor finder on NVIDIA GPUs. """ mutable struct GPUNeighborFinder{B, D} eligible::B @@ -75,6 +78,8 @@ find_neighbors(sys::System, nf::GPUNeighborFinder, args...; kwargs...) = nothing DistanceNeighborFinder(; eligible, dist_cutoff, special, n_steps) Find close atoms by distance. + +This is the recommended neighbor finder on non-NVIDIA GPUs. 
""" struct DistanceNeighborFinder{B, D} eligible::B @@ -93,12 +98,12 @@ function DistanceNeighborFinder(; eligible, dist_cutoff, special, n_steps, zero(eligible)) end -function find_neighbors(sys::System{D, false}, +function find_neighbors(sys::System, nf::DistanceNeighborFinder, current_neighbors=nothing, step_n::Integer=0, force_recompute::Bool=false; - n_threads::Integer=Threads.nthreads()) where D + n_threads::Integer=Threads.nthreads()) if !force_recompute && !iszero(step_n % nf.n_steps) return current_neighbors end @@ -121,20 +126,18 @@ function find_neighbors(sys::System{D, false}, return NeighborList(length(neighbors_list), neighbors_list) end -function cuda_threads_blocks_dnf(n_inters) +function gpu_threads_blocks_dnf(n_inters) n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_DISTANCENF", "512")) - n_blocks = cld(n_inters, n_threads_gpu) - return n_threads_gpu, n_blocks + return n_threads_gpu end -function distance_neighbor_finder_kernel!(neighbors, coords_var, eligible_var, - boundary, sq_dist_neighbors) - coords = CUDA.Const(coords_var) - eligible = CUDA.Const(eligible_var) - +@kernel function distance_neighbor_finder_kernel!(neighbors, + @Const(coords), + @Const(eligible), + boundary, sq_dist_neighbors) n_atoms = length(coords) n_inters = n_atoms_to_n_pairs(n_atoms) - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + inter_i = @index(Global, Linear) @inbounds if inter_i <= n_inters i, j = pair_index(n_atoms, inter_i) @@ -146,28 +149,28 @@ function distance_neighbor_finder_kernel!(neighbors, coords_var, eligible_var, end end end - return nothing end lists_to_tuple_list(i, j, w) = (Int32(i), Int32(j), w) -function find_neighbors(sys::System{D, true}, +function find_neighbors(sys::System{D, AT}, nf::DistanceNeighborFinder, current_neighbors=nothing, step_n::Integer=0, force_recompute::Bool=false; - kwargs...) where D + kwargs...) 
where {D, AT <: AbstractGPUArray} if !force_recompute && !iszero(step_n % nf.n_steps) return current_neighbors end nf.neighbors .= false n_inters = n_atoms_to_n_pairs(length(sys)) - n_threads_gpu, n_blocks = cuda_threads_blocks_dnf(n_inters) + n_threads_gpu = gpu_threads_blocks_dnf(n_inters) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks distance_neighbor_finder_kernel!( - nf.neighbors, sys.coords, nf.eligible, sys.boundary, nf.dist_cutoff^2, - ) + backend = get_backend(sys.coords) + kernel! = distance_neighbor_finder_kernel!(backend, n_threads_gpu) + kernel!(nf.neighbors, sys.coords, nf.eligible, sys.boundary, + nf.dist_cutoff^2, ndrange=n_inters) pairs = findall(nf.neighbors) nbsi, nbsj = getindex.(pairs, 1), getindex.(pairs, 2) @@ -199,12 +202,12 @@ function TreeNeighborFinder(; return TreeNeighborFinder{typeof(dist_cutoff)}(eligible, dist_cutoff, special, n_steps) end -function find_neighbors(sys::System, +function find_neighbors(sys::System{<:Any, AT}, nf::TreeNeighborFinder, current_neighbors=nothing, step_n::Integer=0, force_recompute::Bool=false; - n_threads::Integer=Threads.nthreads()) + n_threads::Integer=Threads.nthreads()) where AT if !force_recompute && !iszero(step_n % nf.n_steps) return current_neighbors end @@ -227,7 +230,7 @@ function find_neighbors(sys::System, end end - return NeighborList(length(neighbors_list), move_array(neighbors_list, sys)) + return NeighborList(length(neighbors_list), AT(neighbors_list)) end """ @@ -337,19 +340,19 @@ function reduce_pairs(neighbors::NeighborList, neighbors_threaded::Vector{Neighb return neighbors end -function find_neighbors(sys::System{D, G}, +function find_neighbors(sys::System{D, AT}, nf::CellListMapNeighborFinder, current_neighbors=nothing, step_n::Integer=0, force_recompute::Bool=false; - n_threads::Integer=Threads.nthreads()) where {D, G} + n_threads::Integer=Threads.nthreads()) where {D, AT} if !force_recompute && !iszero(step_n % nf.n_steps) return current_neighbors end if 
isnothing(current_neighbors) neighbors = NeighborList() - elseif G + elseif AT <: AbstractGPUArray neighbors = NeighborList(current_neighbors.n, Array(current_neighbors.list)) else neighbors = current_neighbors @@ -381,8 +384,8 @@ function find_neighbors(sys::System{D, G}, ) nf.cl = cl - if G - return NeighborList(neighbors.n, CuArray(neighbors.list)) + if AT <: AbstractGPUArray + return NeighborList(neighbors.n, AT(neighbors.list)) else return neighbors end diff --git a/src/setup.jl b/src/setup.jl index e0b5efbe8..35b151f45 100644 --- a/src/setup.jl +++ b/src/setup.jl @@ -428,8 +428,8 @@ are not available when reading Gromacs files. - `loggers=()`: the loggers that record properties of interest during a simulation. - `units::Bool=true`: whether to use Unitful quantities. -- `gpu::Bool=false`: whether to move the relevant parts of the system onto - the GPU. +- `array_type=Array`: the array type for the simulation, for example + use `CuArray` or `ROCArray` for GPU support. - `dist_cutoff=1.0u"nm"`: cutoff distance for long-range interactions. - `dist_neighbors=1.2u"nm"`: cutoff distance for the neighbor list, should be greater than `dist_cutoff`. @@ -452,7 +452,7 @@ function System(coord_file::AbstractString, velocities=nothing, loggers=(), units::Bool=true, - gpu::Bool=false, + array_type::Type{AT}=Array, dist_cutoff=units ? 1.0u"nm" : 1.0, dist_neighbors=units ? 1.2u"nm" : 1.2, center_coords::Bool=true, @@ -460,7 +460,7 @@ function System(coord_file::AbstractString, data=nothing, implicit_solvent=nothing, kappa=0.0u"nm^-1", - rename_terminal_res::Bool=true) + rename_terminal_res::Bool=true) where AT <: AbstractArray T = typeof(force_field.weight_14_coulomb) # Chemfiles uses zero-based indexing, be careful @@ -824,9 +824,9 @@ function System(coord_file::AbstractString, specific_inter_array = [] if length(bonds.is) > 0 push!(specific_inter_array, InteractionList2Atoms( - gpu ? CuArray(bonds.is) : bonds.is, - gpu ? CuArray(bonds.js) : bonds.js, - gpu ? 
CuArray([bonds.inters...]) : [bonds.inters...], + AT(bonds.is), + AT(bonds.js), + AT([bonds.inters...]), bonds.types, )) topology = MolecularTopology(bonds.is, bonds.js, n_atoms) @@ -835,30 +835,30 @@ function System(coord_file::AbstractString, end if length(angles.is) > 0 push!(specific_inter_array, InteractionList3Atoms( - gpu ? CuArray(angles.is) : angles.is, - gpu ? CuArray(angles.js) : angles.js, - gpu ? CuArray(angles.ks) : angles.ks, - gpu ? CuArray([angles.inters...]) : [angles.inters...], + AT(angles.is), + AT(angles.js), + AT(angles.ks), + AT([angles.inters...]), angles.types, )) end if length(torsions.is) > 0 push!(specific_inter_array, InteractionList4Atoms( - gpu ? CuArray(torsions.is) : torsions.is, - gpu ? CuArray(torsions.js) : torsions.js, - gpu ? CuArray(torsions.ks) : torsions.ks, - gpu ? CuArray(torsions.ls) : torsions.ls, - gpu ? CuArray(torsion_inters_pad) : torsion_inters_pad, + AT(torsions.is), + AT(torsions.js), + AT(torsions.ks), + AT(torsions.ls), + AT(torsion_inters_pad), torsions.types, )) end if length(impropers.is) > 0 push!(specific_inter_array, InteractionList4Atoms( - gpu ? CuArray(impropers.is) : impropers.is, - gpu ? CuArray(impropers.js) : impropers.js, - gpu ? CuArray(impropers.ks) : impropers.ks, - gpu ? CuArray(impropers.ls) : impropers.ls, - gpu ? 
CuArray(improper_inters_pad) : improper_inters_pad, + AT(impropers.is), + AT(impropers.js), + AT(impropers.ks), + AT(impropers.ls), + AT(improper_inters_pad), impropers.types, )) end @@ -887,15 +887,14 @@ function System(coord_file::AbstractString, end coords = wrap_coords.(coords, (boundary_used,)) - if gpu + if uses_gpu_neighbor_finder(AT) neighbor_finder = GPUNeighborFinder( - eligible=CuArray(eligible), + eligible=AT(eligible), dist_cutoff=T(dist_neighbors), - special=CuArray(special), + special=AT(special), n_steps_reorder=10, - initialized=false, ) - elseif use_cell_list + elseif use_cell_list && !(AT <: AbstractGPUArray) neighbor_finder = CellListMapNeighborFinder( eligible=eligible, special=special, @@ -906,19 +905,15 @@ function System(coord_file::AbstractString, ) else neighbor_finder = DistanceNeighborFinder( - eligible=eligible, - special=special, + eligible=AT(eligible), + special=AT(special), n_steps=10, dist_cutoff=T(dist_neighbors), ) end - if gpu - atoms = CuArray([atoms_abst...]) - coords_dev = CuArray(coords) - else - atoms = [atoms_abst...] - coords_dev = coords - end + + atoms = AT([atoms_abst...]) + coords_dev = AT(coords) if isnothing(velocities) if units @@ -973,12 +968,12 @@ function System(T::Type, velocities=nothing, loggers=(), units::Bool=true, - gpu::Bool=false, + array_type::Type{AT}=Array, dist_cutoff=units ? 1.0u"nm" : 1.0, dist_neighbors=units ? 1.2u"nm" : 1.2, center_coords::Bool=true, use_cell_list::Bool=true, - data=nothing) + data=nothing) where AT <: AbstractArray # Read force field and topology file atomtypes = Dict{String, Atom}() bondtypes = Dict{String, HarmonicBond}() @@ -1254,9 +1249,9 @@ function System(T::Type, specific_inter_array = [] if length(bonds.is) > 0 push!(specific_inter_array, InteractionList2Atoms( - gpu ? CuArray(bonds.is) : bonds.is, - gpu ? CuArray(bonds.js) : bonds.js, - gpu ? 
CuArray([bonds.inters...]) : [bonds.inters...], + AT(bonds.is), + AT(bonds.js), + AT([bonds.inters...]), bonds.types, )) topology = MolecularTopology(bonds.is, bonds.js, n_atoms) @@ -1265,34 +1260,33 @@ function System(T::Type, end if length(angles.is) > 0 push!(specific_inter_array, InteractionList3Atoms( - gpu ? CuArray(angles.is) : angles.is, - gpu ? CuArray(angles.js) : angles.js, - gpu ? CuArray(angles.ks) : angles.ks, - gpu ? CuArray([angles.inters...]) : [angles.inters...], + AT(angles.is), + AT(angles.js), + AT(angles.ks), + AT([angles.inters...]), angles.types, )) end if length(torsions.is) > 0 push!(specific_inter_array, InteractionList4Atoms( - gpu ? CuArray(torsions.is) : torsions.is, - gpu ? CuArray(torsions.js) : torsions.js, - gpu ? CuArray(torsions.ks) : torsions.ks, - gpu ? CuArray(torsions.ls) : torsions.ls, - gpu ? CuArray([torsions.inters...]) : [torsions.inters...], + AT(torsions.is), + AT(torsions.js), + AT(torsions.ks), + AT(torsions.ls), + AT([torsions.inters...]), torsions.types, )) end specific_inter_lists = tuple(specific_inter_array...) - if gpu + if uses_gpu_neighbor_finder(AT) neighbor_finder = GPUNeighborFinder( - eligible=CuArray(eligible), + eligible=AT(eligible), dist_cutoff=T(dist_neighbors), - special=CuArray(special), + special=AT(special), n_steps_reorder=10, - initialized=false, ) - elseif use_cell_list + elseif use_cell_list && !(AT <: AbstractGPUArray) neighbor_finder = CellListMapNeighborFinder( eligible=eligible, special=special, @@ -1303,19 +1297,15 @@ function System(T::Type, ) else neighbor_finder = DistanceNeighborFinder( - eligible=eligible, - special=special, + eligible=AT(eligible), + special=AT(special), n_steps=10, dist_cutoff=T(dist_neighbors), ) end - if gpu - atoms = CuArray([atoms_abst...]) - coords_dev = CuArray(coords) - else - atoms = [atoms_abst...] 
- coords_dev = coords - end + + atoms = AT([atoms_abst...]) + coords_dev = AT(coords) if isnothing(velocities) if units @@ -1382,10 +1372,10 @@ The `atom_selector` function takes in each atom and atom data and determines whe that atom. For example, [`is_heavy_atom`](@ref) means non-hydrogen atoms are restrained. """ -function add_position_restraints(sys, +function add_position_restraints(sys::System{<:Any, AT}, k; atom_selector::Function=is_any_atom, - restrain_coords=sys.coords) + restrain_coords=sys.coords) where AT k_array = isa(k, AbstractArray) ? k : fill(k, length(sys)) if length(k_array) != length(sys) throw(ArgumentError("the system has $(length(sys)) atoms but there are $(length(k_array)) k values")) @@ -1402,7 +1392,7 @@ function add_position_restraints(sys, push!(inters, HarmonicPositionRestraint(k_res, x0)) end end - restraints = InteractionList1Atoms(move_array(is, sys), move_array([inters...], sys), types) + restraints = InteractionList1Atoms(AT(is), AT([inters...]), types) sis = (sys.specific_inter_lists..., restraints) return System( atoms=deepcopy(sys.atoms), diff --git a/src/simulators.jl b/src/simulators.jl index ad77d1217..141fc0a47 100644 --- a/src/simulators.jl +++ b/src/simulators.jl @@ -831,12 +831,12 @@ Attempt an exchange of replicas `n` and `m` in a [`ReplicaSystem`](@ref) during Successful exchanges should exchange coordinates and velocities as appropriate. Returns acceptance quantity `Δ` and a `Bool` indicating whether the exchange was successful. 
""" -function remd_exchange!(sys::ReplicaSystem{D, G, T}, +function remd_exchange!(sys::ReplicaSystem, sim::TemperatureREMD, n::Integer, m::Integer; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) T_n, T_m = sim.temperatures[n], sim.temperatures[m] β_n, β_m = inv(sys.k * T_n), inv(sys.k * T_m) neighbors_n = find_neighbors(sys.replicas[n], sys.replicas[n].neighbor_finder; @@ -922,12 +922,12 @@ function simulate!(sys::ReplicaSystem, return simulate_remd!(sys, sim, n_steps; n_threads=n_threads, run_loggers=run_loggers, rng=rng) end -function remd_exchange!(sys::ReplicaSystem{D, G, T}, +function remd_exchange!(sys::ReplicaSystem, sim::HamiltonianREMD, n::Integer, m::Integer; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) T_sim = sim.temperature β_sim = inv(sys.k * T_sim) neighbors_n = find_neighbors(sys.replicas[n], sys.replicas[n].neighbor_finder; @@ -1047,12 +1047,12 @@ function MetropolisMonteCarlo(; temperature, trial_moves, trial_args=Dict()) return MetropolisMonteCarlo(temperature, trial_moves, trial_args) end -@inline function simulate!(sys::System{D, G, T}, +@inline function simulate!(sys::System, sim::MetropolisMonteCarlo, n_steps::Integer; n_threads::Integer=Threads.nthreads(), run_loggers=true, - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) neighbors = find_neighbors(sys, sys.neighbor_finder; n_threads=n_threads) E_old = potential_energy(sys, neighbors; n_threads=n_threads) coords_old = similar(sys.coords) @@ -1090,9 +1090,9 @@ Performs a random translation of the coordinates of a randomly selected atom in The translation is generated using a uniformly selected direction and uniformly selected length in range [0, 1) scaled by `shift_size` which should have appropriate length units. 
""" -function random_uniform_translation!(sys::System{D, G, T}; +function random_uniform_translation!(sys::System{D, AT, T}; shift_size=oneunit(eltype(eltype(sys.coords))), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} rand_idx = rand(rng, eachindex(sys)) direction = random_unit_vector(T, D, rng) magnitude = rand(rng, T) * shift_size @@ -1110,9 +1110,9 @@ The translation is generated using a uniformly chosen direction and length selec the standard normal distribution i.e. with mean 0 and standard deviation 1, scaled by `shift_size` which should have appropriate length units. """ -function random_normal_translation!(sys::System{D, G, T}; +function random_normal_translation!(sys::System{D, AT, T}; shift_size=oneunit(eltype(eltype(sys.coords))), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} rand_idx = rand(rng, eachindex(sys)) direction = random_unit_vector(T, D, rng) magnitude = randn(rng, T) * shift_size diff --git a/src/spatial.jl b/src/spatial.jl index f918a827a..728577f48 100644 --- a/src/spatial.jl +++ b/src/spatial.jl @@ -613,12 +613,12 @@ function random_velocities(sys::AtomsBase.AbstractSystem{2}, temp; rng=Random.de return random_velocity_2D.(masses(sys), temp, sys.k, rng) end -function random_velocities(sys::System{3, true}, temp; rng=Random.default_rng()) - return CuArray(random_velocity_3D.(Array(masses(sys)), temp, sys.k, rng)) +function random_velocities(sys::System{3, AT}, temp; rng=Random.default_rng()) where AT <: AbstractGPUArray + return AT(random_velocity_3D.(Array(masses(sys)), temp, sys.k, rng)) end -function random_velocities(sys::System{2, true}, temp; rng=Random.default_rng()) - return CuArray(random_velocity_2D.(Array(masses(sys)), temp, sys.k, rng)) +function random_velocities(sys::System{2, AT}, temp; rng=Random.default_rng()) where AT <: AbstractGPUArray + return AT(random_velocity_2D.(Array(masses(sys)), temp, sys.k, rng)) end """ @@ -738,9 +738,9 @@ 
function virial(sys, neighbors, step_n::Integer=0; n_threads::Integer=Threads.nt return v end -function virial(sys::System{D, G, T}, neighbors_dev, step_n, pairwise_inters_nonl, - pairwise_inters_nl) where {D, G, T} - if G +function virial(sys::System{D, AT, T}, neighbors_dev, step_n, pairwise_inters_nonl, + pairwise_inters_nl) where {D, AT, T} + if AT <: AbstractGPUArray coords, velocities, atoms = Array(sys.coords), Array(sys.velocities), Array(sys.atoms) if isnothing(neighbors_dev) neighbors = neighbors_dev @@ -792,7 +792,7 @@ function virial(sys::System{D, G, T}, neighbors_dev, step_n, pairwise_inters_non end # Default for general interactions -function virial(inter, sys::System{D, G, T}, args...; kwargs...) where {D, G, T} +function virial(inter, sys::System{D, AT, T}, args...; kwargs...) where {D, AT, T} return zero(T) * sys.energy_units end @@ -874,8 +874,9 @@ function molecule_centers(coords::AbstractArray{SVector{D, C}}, boundary, topolo end end -function molecule_centers(coords::CuArray, boundary, topology) - return CuArray(molecule_centers(Array(coords), boundary, topology)) +function molecule_centers(coords::AbstractGPUArray, boundary, topology) + AT = array_type(coords) + return AT(molecule_centers(Array(coords), boundary, topology)) end # Allows scaling multiple vectors at once by broadcasting this function @@ -895,7 +896,7 @@ This can be disabled with `ignore_molecules=true`. Not currently compatible with [`TriclinicBoundary`](@ref) if the topology is set. 
""" -function scale_coords!(sys, scale_factor; ignore_molecules=false) +function scale_coords!(sys::System{<:Any, AT}, scale_factor; ignore_molecules=false) where AT if ignore_molecules || isnothing(sys.topology) sys.boundary = scale_boundary(sys.boundary, scale_factor) sys.coords .= scale_vec.(sys.coords, Ref(scale_factor)) @@ -926,7 +927,7 @@ function scale_coords!(sys, scale_factor; ignore_molecules=false) coords_nounits[i] = wrap_coords( coords_nounits[i] .+ shift_vecs[mi] .- center_shifts[mi], boundary_nounits) end - sys.coords .= move_array(coords_nounits .* coord_units, sys) + sys.coords .= AT(coords_nounits .* coord_units) end return sys end diff --git a/src/types.jl b/src/types.jl index 225d9cff3..023883d33 100644 --- a/src/types.jl +++ b/src/types.jl @@ -15,6 +15,7 @@ export inject_gradients, extract_parameters, ReplicaSystem, + array_type, is_on_gpu, float_type, masses, @@ -182,39 +183,23 @@ function Base.:+(il1::InteractionList4Atoms{I, T}, il2::InteractionList4Atoms{I, ) end -function inject_interaction_list(inter::InteractionList1Atoms, params_dic, gpu) - if gpu - inters_grad = CuArray(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) - else - inters_grad = inject_interaction.(inter.inters, inter.types, (params_dic,)) - end +function inject_interaction_list(inter::InteractionList1Atoms, params_dic, AT) + inters_grad = AT(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) InteractionList1Atoms(inter.is, inters_grad, inter.types) end -function inject_interaction_list(inter::InteractionList2Atoms, params_dic, gpu) - if gpu - inters_grad = CuArray(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) - else - inters_grad = inject_interaction.(inter.inters, inter.types, (params_dic,)) - end +function inject_interaction_list(inter::InteractionList2Atoms, params_dic, AT) + inters_grad = AT(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) InteractionList2Atoms(inter.is, inter.js, 
inters_grad, inter.types) end -function inject_interaction_list(inter::InteractionList3Atoms, params_dic, gpu) - if gpu - inters_grad = CuArray(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) - else - inters_grad = inject_interaction.(inter.inters, inter.types, (params_dic,)) - end +function inject_interaction_list(inter::InteractionList3Atoms, params_dic, AT) + inters_grad = AT(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) InteractionList3Atoms(inter.is, inter.js, inter.ks, inters_grad, inter.types) end -function inject_interaction_list(inter::InteractionList4Atoms, params_dic, gpu) - if gpu - inters_grad = CuArray(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) - else - inters_grad = inject_interaction.(inter.inters, inter.types, (params_dic,)) - end +function inject_interaction_list(inter::InteractionList4Atoms, params_dic, AT) + inters_grad = AT(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) InteractionList4Atoms(inter.is, inter.js, inter.ks, inter.ls, inters_grad, inter.types) end @@ -431,8 +416,6 @@ Base.firstindex(::NoNeighborList) = 1 Base.lastindex(nl::NoNeighborList) = length(nl) Base.eachindex(nl::NoNeighborList) = Base.OneTo(length(nl)) -CUDA.Const(nl::NoNeighborList) = nl - """ System(; ) @@ -481,7 +464,7 @@ interface described there. modified in some simulations. `k` is chosen based on the `energy_units` given. - `data::DA=nothing`: arbitrary data associated with the system. 
""" -mutable struct System{D, G, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, +mutable struct System{D, AT, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, L, F, E, K, M, DA} <: AtomsBase.AbstractSystem{D} atoms::A coords::C @@ -521,7 +504,7 @@ function System(; k=default_k(energy_units), data=nothing) D = AtomsBase.n_dimensions(boundary) - G = isa(coords, CuArray) + AT = array_type(coords) T = float_type(boundary) A = typeof(atoms) C = typeof(coords) @@ -567,19 +550,19 @@ function System(; end end - if isa(atoms, CuArray) && !isa(coords, CuArray) + if isa(atoms, AbstractGPUArray) && !isa(coords, AbstractGPUArray) throw(ArgumentError("the atoms are on the GPU but the coordinates are not")) end - if isa(coords, CuArray) && !isa(atoms, CuArray) + if isa(coords, AbstractGPUArray) && !isa(atoms, AbstractGPUArray) throw(ArgumentError("the coordinates are on the GPU but the atoms are not")) end - if isa(atoms, CuArray) && !isa(vels, CuArray) + if isa(atoms, AbstractGPUArray) && !isa(vels, AbstractGPUArray) throw(ArgumentError("the atoms are on the GPU but the velocities are not")) end - if isa(vels, CuArray) && !isa(atoms, CuArray) + if isa(vels, AbstractGPUArray) && !isa(atoms, AbstractGPUArray) throw(ArgumentError("the velocities are on the GPU but the atoms are not")) end - if isa(atoms, CuArray) && length(constraints) > 0 + if isa(atoms, AbstractGPUArray) && length(constraints) > 0 @warn "Constraints are not currently compatible with simulation on the GPU" end @@ -596,7 +579,7 @@ function System(; check_units(atoms, coords, vels, energy_units, force_units, pairwise_inters, specific_inter_lists, general_inters, boundary) - return System{D, G, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, L, F, E, K, M, DA}( + return System{D, AT, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, L, F, E, K, M, DA}( atoms, coords, boundary, vels, atoms_data, topology, pairwise_inters, specific_inter_lists, general_inters, constraints, neighbor_finder, loggers, df, force_units, energy_units, 
k_converted, atom_masses, data) @@ -653,7 +636,7 @@ Construct a `System` from a SimpleCrystals.jl `Crystal` struct. Properties unused in the simulation or in analysis can be left with their default values. -`atoms`, `atoms_data`, `coords` and `boundary` are automatically calcualted from +`atoms`, `atoms_data`, `coords` and `boundary` are automatically calculated from -the `Crystal` struct. Extra atom paramaters like `σ` have to be added manually +the `Crystal` struct. Extra atom parameters like `σ` have to be added manually after construction using the convenience constructor `System(sys; )`. @@ -721,19 +704,15 @@ Allows gradients for individual parameters to be tracked. Returns atoms, pairwise interactions, specific interaction lists and general interactions. """ -function inject_gradients(sys::System{D, G}, params_dic) where {D, G} - if G - atoms_grad = CuArray(inject_atom.(Array(sys.atoms), sys.atoms_data, (params_dic,))) - else - atoms_grad = inject_atom.(sys.atoms, sys.atoms_data, (params_dic,)) - end +function inject_gradients(sys::System{D, AT}, params_dic) where {D, AT} + atoms_grad = AT(inject_atom.(Array(sys.atoms), sys.atoms_data, (params_dic,))) if length(sys.pairwise_inters) > 0 pis_grad = inject_interaction.(sys.pairwise_inters, (params_dic,)) else pis_grad = sys.pairwise_inters end if length(sys.specific_inter_lists) > 0 - sis_grad = inject_interaction_list.(sys.specific_inter_lists, (params_dic,), G) + sis_grad = inject_interaction_list.(sys.specific_inter_lists, (params_dic,), AT) else sis_grad = sys.specific_inter_lists end @@ -847,7 +826,7 @@ construction where `n` is the number of threads to be used per replica. modified in some simulations. `k` is chosen based on the `energy_units` given. - `data::DA=nothing`: arbitrary data associated with the replica system. 
""" -mutable struct ReplicaSystem{D, G, T, A, AD, EL, F, E, K, R, DA} <: AtomsBase.AbstractSystem{D} +mutable struct ReplicaSystem{D, AT, T, A, AD, EL, F, E, K, R, DA} <: AtomsBase.AbstractSystem{D} atoms::A n_replicas::Int atoms_data::AD @@ -884,7 +863,7 @@ function ReplicaSystem(; k=default_k(energy_units), data=nothing) D = AtomsBase.n_dimensions(boundary) - G = isa(replica_coords[1], CuArray) + AT = array_type(replica_coords[1]) T = float_type(boundary) A = typeof(atoms) AD = typeof(atoms_data) @@ -995,25 +974,25 @@ function ReplicaSystem(; throw(ArgumentError("there are $(length(atoms)) atoms but $(length(atoms_data)) atom data entries")) end - n_cuarray = sum(y -> isa(y, CuArray), replica_coords) - if !(n_cuarray == n_replicas || n_cuarray == 0) - throw(ArgumentError("the coordinates for $n_cuarray out of $n_replicas replicas are on GPU")) + n_gpu_array = sum(y -> isa(y, AbstractGPUArray), replica_coords) + if !(n_gpu_array == n_replicas || n_gpu_array == 0) + throw(ArgumentError("the coordinates for $n_gpu_array out of $n_replicas replicas are on GPU")) end - if isa(atoms, CuArray) && n_cuarray != n_replicas + if isa(atoms, AbstractGPUArray) && n_gpu_array != n_replicas throw(ArgumentError("the atoms are on the GPU but the coordinates are not")) end - if n_cuarray == n_replicas && !isa(atoms, CuArray) + if n_gpu_array == n_replicas && !isa(atoms, AbstractGPUArray) throw(ArgumentError("the coordinates are on the GPU but the atoms are not")) end - n_cuarray = sum(y -> isa(y, CuArray), replica_velocities) - if !(n_cuarray == n_replicas || n_cuarray == 0) - throw(ArgumentError("the velocities for $n_cuarray out of $n_replicas replicas are on GPU")) + n_gpu_array = sum(y -> isa(y, AbstractGPUArray), replica_velocities) + if !(n_gpu_array == n_replicas || n_gpu_array == 0) + throw(ArgumentError("the velocities for $n_gpu_array out of $n_replicas replicas are on GPU")) end - if isa(atoms, CuArray) && n_cuarray != n_replicas + if isa(atoms, AbstractGPUArray) && 
n_gpu_array != n_replicas throw(ArgumentError("the atoms are on the GPU but the velocities are not")) end - if n_cuarray == n_replicas && !isa(atoms, CuArray) + if n_gpu_array == n_replicas && !isa(atoms, AbstractGPUArray) throw(ArgumentError("the velocities are on the GPU but the atoms are not")) end @@ -1023,7 +1002,7 @@ function ReplicaSystem(; k_converted = convert_k_units(T, k, energy_units) K = typeof(k_converted) - replicas = Tuple(System{D, G, T, A, C, B, V, AD, TO, typeof(replica_pairwise_inters[i]), + replicas = Tuple(System{D, AT, T, A, C, B, V, AD, TO, typeof(replica_pairwise_inters[i]), typeof(replica_specific_inter_lists[i]), typeof(replica_general_inters[i]), typeof(replica_constraints[i]), NF, typeof(replica_loggers[i]), F, E, K, M, Nothing}( @@ -1034,17 +1013,27 @@ function ReplicaSystem(; force_units, energy_units, k_converted, atom_masses, nothing) for i in 1:n_replicas) R = typeof(replicas) - return ReplicaSystem{D, G, T, A, AD, EL, F, E, K, R, DA}( + return ReplicaSystem{D, AT, T, A, AD, EL, F, E, K, R, DA}( atoms, n_replicas, atoms_data, exchange_logger, force_units, energy_units, k_converted, replicas, data) end +""" + array_type(sys) + array_type(arr) + +The array type of a [`System`](@ref), [`ReplicaSystem`](@ref) or array, for example +`Array` for systems on CPU or `CuArray` for systems on a NVIDIA GPU. +""" +array_type(::AT) where AT = AT.name.wrapper +array_type(::Union{System{D, AT}, ReplicaSystem{D, AT}}) where {D, AT} = AT + """ is_on_gpu(sys) Whether a [`System`](@ref) or [`ReplicaSystem`](@ref) is on the GPU. """ -is_on_gpu(::Union{System{D, G}, ReplicaSystem{D, G}}) where {D, G} = G +is_on_gpu(::Union{System{D, AT}, ReplicaSystem{D, AT}}) where {D, AT} = AT <: AbstractGPUArray """ float_type(sys) @@ -1052,7 +1041,7 @@ is_on_gpu(::Union{System{D, G}, ReplicaSystem{D, G}}) where {D, G} = G The float type a [`System`](@ref), [`ReplicaSystem`](@ref) or bounding box uses. 
""" -float_type(::Union{System{D, G, T}, ReplicaSystem{D, G, T}}) where {D, G, T} = T +float_type(::Union{System{D, AT, T}, ReplicaSystem{D, AT, T}}) where {D, AT, T} = T """ masses(sys) @@ -1070,10 +1059,6 @@ The partial charges of the atoms in a [`System`](@ref) or [`ReplicaSystem`](@ref charges(s::Union{System, ReplicaSystem}) = charge.(s.atoms) charge(s::Union{System, ReplicaSystem}, i::Integer) = charge(s.atoms[i]) -# Move an array to the GPU depending on whether the system is on the GPU -move_array(arr, ::System{D, false}) where {D} = arr -move_array(arr, ::System{D, true }) where {D} = CuArray(arr) - Base.getindex(s::Union{System, ReplicaSystem}, i::Union{Integer, AbstractVector}) = s.atoms[i] Base.length(s::Union{System, ReplicaSystem}) = length(s.atoms) Base.eachindex(s::Union{System, ReplicaSystem}) = Base.OneTo(length(s)) diff --git a/test/Project.toml b/test/Project.toml index 3901cc98f..c183b5756 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,5 @@ [deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a" AtomsBaseTesting = "ed7c10db-df7e-4efa-a7be-4f4190f7f227" @@ -9,17 +10,23 @@ DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" KernelDensity = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Metal = "dde4c033-4e86-420c-a63e-0dd931031962" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SimpleCrystals = "64031d72-e220-11ed-1a7e-43a2532b2fa8" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" [compat] +AMDGPU = "1" Aqua = "0.8" AtomsBaseTesting = "0.4" +CUDA = "5" DelimitedFiles = "1.9" 
FiniteDifferences = "0.12" -GLMakie = "0.9, 0.10" +Metal = "1.5" Test = "1.9" +oneAPI = "2" diff --git a/test/basic.jl b/test/basic.jl index b9273cf0f..e510bf6e3 100644 --- a/test/basic.jl +++ b/test/basic.jl @@ -176,22 +176,22 @@ @test mcs == [SVector(0.05, 0.0), SVector(1.0, 1.0)] ff = MolecularForceField(joinpath.(ff_dir, ["ff99SBildn.xml", "tip3p_standard.xml", "his.xml"])...) - for gpu in gpu_list - sys = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; gpu=gpu, use_cell_list=false) + for AT in array_list + sys = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; array_type=AT, use_cell_list=false) mcs = molecule_centers(sys.coords, sys.boundary, sys.topology) - @test isapprox(Array(mcs)[1], mean(sys.coords[1:1170]); atol=0.04u"nm") + @test isapprox(Array(mcs)[1], mean(sys.coords[1:1170]); atol=0.08u"nm") # Mark all pairs as ineligible for pairwise interactions and check that the # potential energy from the specific interactions does not change on scaling no_nbs = falses(length(sys), length(sys)) - if gpu + if AT <: CuArray sys.neighbor_finder = GPUNeighborFinder( - eligible=(gpu ? CuArray(no_nbs) : no_nbs), + eligible=AT(no_nbs), dist_cutoff=1.0u"nm", ) - else + else sys.neighbor_finder = DistanceNeighborFinder( - eligible=(gpu ? 
CuArray(no_nbs) : no_nbs), + eligible=AT(no_nbs), dist_cutoff=1.0u"nm", ) end @@ -317,8 +317,8 @@ end end - if run_gpu_tests - sys_gpu = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; gpu=true) + for AT in array_list[2:end] + sys_gpu = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; array_type=AT) for neighbor_finder in (DistanceNeighborFinder,) nf_gpu = neighbor_finder( eligible=sys_gpu.neighbor_finder.eligible, @@ -327,7 +327,7 @@ end ) neighbors_gpu = find_neighbors(sys_gpu, nf_gpu) @test length(neighbors_gpu) == n_neighbors_ref - CUDA.allowscalar() do + GPUArrays.allowscalar() do @test neighbors_gpu[10] isa Tuple{Int32, Int32, Bool} end @test identical_neighbors(neighbors_gpu, neighbors_ref) @@ -343,8 +343,8 @@ end coords_1 = SVector{3, Float64}.(eachcol(cm_1)) / 10 * u"nm" coords_2 = SVector{3, Float64}.(eachcol(cm_2)) / 10 * u"nm" @test rmsd(coords_1, coords_2) ≈ 2.54859467758795u"Å" - if run_gpu_tests - @test rmsd(CuArray(coords_1), CuArray(coords_2)) ≈ 2.54859467758795u"Å" + for AT in array_list[2:end] + @test rmsd(AT(coords_1), AT(coords_2)) ≈ 2.54859467758795u"Å" end bb_atoms = BioStructures.collectatoms(struc[1], BioStructures.backboneselector) diff --git a/test/energy_conservation.jl b/test/energy_conservation.jl index f29aca2b4..22d020c4b 100644 --- a/test/energy_conservation.jl +++ b/test/energy_conservation.jl @@ -1,12 +1,15 @@ # Energy conservation test using Molly +using GPUArrays +using AMDGPU using CUDA using Test @testset "Lennard-Jones energy conservation" begin - function test_energy_conservation(nl::Bool, gpu::Bool, n_threads::Integer, n_steps::Integer) + function test_energy_conservation(nl::Bool, ::Type{AT}, n_threads::Integer, + n_steps::Integer) where AT n_atoms = 2_000 atom_mass = 40.0u"g/mol" temp = 1.0u"K" @@ -24,25 +27,27 @@ using Test for cutoff in cutoffs coords = place_atoms(n_atoms, boundary; min_dist=0.1u"nm") - neighbor_finder = NoNeighborFinder() - if nl && gpu - neighbor_finder=GPUNeighborFinder( -
eligible=CuArray(trues(n_atoms, n_atoms)), - n_steps_reorder=10, - dist_cutoff=dist_cutoff, - ) - end - if nl && !gpu - neighbor_finder=DistanceNeighborFinder( - eligible=trues(n_atoms, n_atoms), - n_steps=10, - dist_cutoff=dist_cutoff, - ) + if nl + if AT <: CuArray + neighbor_finder=GPUNeighborFinder( + eligible=AT(trues(n_atoms, n_atoms)), + n_steps_reorder=10, + dist_cutoff=dist_cutoff, + ) + else + neighbor_finder=DistanceNeighborFinder( + eligible=AT(trues(n_atoms, n_atoms)), + n_steps=10, + dist_cutoff=dist_cutoff, + ) + end + else + neighbor_finder = NoNeighborFinder() end sys = System( - atoms=(gpu ? CuArray(atoms) : atoms), - coords=(gpu ? CuArray(coords) : coords), + atoms=AT(atoms), + coords=AT(coords), boundary=boundary, pairwise_inters=(LennardJones(cutoff=cutoff, use_neighbors=ifelse(nl, true, false)),), neighbor_finder=neighbor_finder, @@ -61,7 +66,7 @@ using Test @test isapprox(Es[1], E0; atol=1e-7u"kJ * mol^-1") max_ΔE = maximum(abs.(Es .- E0)) - platform_str = gpu ? "GPU" : "CPU $n_threads thread(s)" + platform_str = (AT <: AbstractGPUArray ? 
"$AT" : "CPU $n_threads thread(s)") cutoff_str = Base.typename(typeof(cutoff)).wrapper @info "$platform_str - $cutoff_str - max energy difference $max_ΔE" @test max_ΔE < 5e-4u"kJ * mol^-1" @@ -72,16 +77,18 @@ using Test end end - test_energy_conservation(true, false, 1, 10_000) - test_energy_conservation(false, false, 1, 10_000) + test_energy_conservation(true , Array, 1, 10_000) + test_energy_conservation(false, Array, 1, 10_000) if Threads.nthreads() > 1 - test_energy_conservation(true, false, Threads.nthreads(), 50_000) - test_energy_conservation(false, false, Threads.nthreads(), 50_000) + test_energy_conservation(true , Array, Threads.nthreads(), 50_000) + test_energy_conservation(false, Array, Threads.nthreads(), 50_000) end if CUDA.functional() - test_energy_conservation(true, true, 1, 100_000) - test_energy_conservation(false, true, 1, 100_000) + test_energy_conservation(true , CuArray, 1, 100_000) + test_energy_conservation(false, CuArray, 1, 100_000) + end + if AMDGPU.functional() + test_energy_conservation(true , ROCArray, 1, 100_000) + test_energy_conservation(false, ROCArray, 1, 100_000) end end - - diff --git a/test/gradients.jl b/test/gradients.jl index cce785a6b..bdda204aa 100644 --- a/test/gradients.jl +++ b/test/gradients.jl @@ -36,24 +36,24 @@ end @testset "Differentiable simulation" begin runs = [ # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 - ("CPU" , false, false, false, false, false, false, 1e-4, 1e-4), - ("CPU forward" , false, false, true , false, false, false, 0.5 , 0.1 ), - ("CPU f32" , false, false, false, true , false, false, 0.01, 5e-4), - ("CPU obc2" , false, false, false, false, true , false, 1e-4, 1e-4), - ("CPU gbn2" , false, false, false, false, false, true , 1e-4, 1e-4), - ("CPU gbn2 forward", false, false, true , false, false, true , 0.5 , 0.1 ), + ("CPU" , Array, false, false, false, false, false, 1e-4, 1e-4), + ("CPU forward" , Array, false, true , false, false, false, 0.5 , 0.1 ), + ("CPU f32" , Array, false, false, true , false, 
false, 0.01, 5e-4), + ("CPU obc2" , Array, false, false, false, true , false, 1e-4, 1e-4), + ("CPU gbn2" , Array, false, false, false, false, true , 1e-4, 1e-4), + ("CPU gbn2 forward", Array, false, true , false, false, true , 0.5 , 0.1 ), ] - if run_parallel_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 - push!(runs, ("CPU parallel" , false, true , false, false, false, false, 1e-4, 1e-4)) - push!(runs, ("CPU parallel forward", false, true , true , false, false, false, 0.5 , 0.1 )) - push!(runs, ("CPU parallel f32" , false, true , false, true , false, false, 0.01, 5e-4)) + if run_parallel_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 + push!(runs, ("CPU parallel" , Array, true , false, false, false, false, 1e-4, 1e-4)) + push!(runs, ("CPU parallel forward", Array, true , true , false, false, false, 0.5 , 0.1 )) + push!(runs, ("CPU parallel f32" , Array, true , false, true , false, false, 0.01, 5e-4)) end - if run_gpu_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 - push!(runs, ("GPU" , true , false, false, false, false, false, 0.25, 20.0)) - push!(runs, ("GPU forward" , true , false, true , false, false, false, 0.25, 20.0)) - push!(runs, ("GPU f32" , true , false, false, true , false, false, 0.5 , 50.0)) - push!(runs, ("GPU obc2" , true , false, false, false, true , false, 0.25, 20.0)) - push!(runs, ("GPU gbn2" , true , false, false, false, false, true , 0.25, 20.0)) + for AT in array_list[2:end] # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 + push!(runs, ("$AT" , AT , false, false, false, false, false, 0.25, 20.0)) + push!(runs, ("$AT forward" , AT , false, true , false, false, false, 0.25, 20.0)) + push!(runs, ("$AT f32" , AT , false, false, true , false, false, 0.5 , 50.0)) + push!(runs, ("$AT obc2" , AT , false, false, false, true , false, 0.25, 20.0)) + push!(runs, ("$AT gbn2" , AT , false, false, false, false, true , 0.25, 20.0)) end function mean_min_separation(coords, boundary, ::Val{T}) where T @@ -103,9 +103,8 @@ end return mean_min_separation(sys.coords, 
boundary, Val(T)) end - for (name, gpu, parallel, forward, f32, obc2, gbn2, tol_σ, tol_r0) in runs + for (name, AT, parallel, forward, f32, obc2, gbn2, tol_σ, tol_r0) in runs T = f32 ? Float32 : Float64 - AT = gpu ? CuArray : Array σ = T(0.4) r0 = T(1.0) n_atoms = 50 @@ -245,13 +244,13 @@ end end @testset "Differentiable protein" begin - function create_sys(gpu::Bool) + function create_sys(AT) ff = MolecularForceField(joinpath.(ff_dir, ["ff99SBildn.xml", "his.xml"])...; units=false) return System( joinpath(data_dir, "6mrr_nowater.pdb"), ff; units=false, - gpu=gpu, + array_type=AT, implicit_solvent="gbn2", kappa=0.7, ) @@ -402,10 +401,10 @@ end - platform_runs = [("CPU", false, false)] + platform_runs = [("CPU", Array, false)] if run_parallel_tests - push!(platform_runs, ("CPU parallel", false, true)) + push!(platform_runs, ("CPU parallel", Array, true)) end - if run_gpu_tests - push!(platform_runs, ("GPU", true, false)) + for AT in array_list[2:end] + push!(platform_runs, ("$AT", AT, false)) end test_runs = [ ("Energy", test_energy_grad, 1e-8), @@ -423,8 +422,8 @@ ) for (test_name, test_fn, test_tol) in test_runs - for (platform, gpu, parallel) in platform_runs - sys_ref = create_sys(gpu) + for (platform, AT, parallel) in platform_runs + sys_ref = create_sys(AT) n_threads = parallel ? 
Threads.nthreads() : 1 grads_enzyme = Dict(k => 0.0 for k in keys(params_dic)) autodiff( diff --git a/test/minimization.jl b/test/minimization.jl index 83a10f0e8..c3baa0826 100644 --- a/test/minimization.jl +++ b/test/minimization.jl @@ -42,14 +42,14 @@ @test isapprox(potential_energy(sys; n_threads=1) * u"kJ * mol^-1", -3.0u"kJ * mol^-1"; atol=1e-4u"kJ * mol^-1") - if run_gpu_tests - coords = CuArray([ + for AT in array_list[2:end] + coords = AT([ SVector(1.0, 1.0, 1.0)u"nm", SVector(1.6, 1.0, 1.0)u"nm", SVector(1.4, 1.6, 1.0)u"nm", ]) sys = System( - atoms=CuArray([Atom(σ=(0.4 / (2 ^ (1 / 6)))u"nm", ϵ=1.0u"kJ * mol^-1") for i in 1:3]), + atoms=AT([Atom(σ=(0.4 / (2 ^ (1 / 6)))u"nm", ϵ=1.0u"kJ * mol^-1") for i in 1:3]), coords=coords, boundary=CubicBoundary(5.0u"nm"), pairwise_inters=(LennardJones(),), @@ -57,10 +57,10 @@ sim = SteepestDescentMinimizer(tol=1.0u"kJ * mol^-1 * nm^-1") simulate!(sys, sim) - dists = distances(sys.coords, sys.boundary) + dists = Array(distances(sys.coords, sys.boundary)) dists_flat = dists[triu(trues(3, 3), 1)] - @test all(x -> isapprox(x, 0.4u"nm"; atol=1e-3u"nm"), dists_flat) + @test all(x -> isapprox(x, 0.4u"nm"; atol=1e-2u"nm"), dists_flat) @test isapprox(potential_energy(sys), -3.0u"kJ * mol^-1"; - atol=1e-4u"kJ * mol^-1") + atol=1e-2u"kJ * mol^-1") end end diff --git a/test/protein.jl b/test/protein.jl index 9d7ab007e..c68016527 100644 --- a/test/protein.jl +++ b/test/protein.jl @@ -179,12 +179,12 @@ end @test pis_grad == sys_nounits.pairwise_inters # Test the same simulation on the GPU - if run_gpu_tests + for AT in array_list[2:end] sys = System( joinpath(data_dir, "6mrr_equil.pdb"), ff; - velocities=CuArray(copy(velocities_start)), - gpu=true, + velocities=AT(copy(velocities_start)), + array_type=AT, center_coords=false, ) @test kinetic_energy(sys) ≈ 65521.87288132431u"kJ * mol^-1" @@ -211,9 +211,9 @@ end sys_nounits = System( joinpath(data_dir, "6mrr_equil.pdb"), ff_nounits; - 
velocities=CuArray(copy(ustrip_vec.(velocities_start))), + velocities=AT(copy(ustrip_vec.(velocities_start))), units=false, - gpu=true, + array_type=AT, center_coords=false, ) @test kinetic_energy(sys_nounits)u"kJ * mol^-1" ≈ 65521.87288132431u"kJ * mol^-1" @@ -248,13 +248,13 @@ end @testset "Implicit solvent" begin ff = MolecularForceField(joinpath.(ff_dir, ["ff99SBildn.xml", "his.xml"])...) - for gpu in gpu_list + for AT in array_list for solvent_model in ("obc2", "gbn2") sys = System( joinpath(data_dir, "6mrr_nowater.pdb"), ff; boundary=CubicBoundary(100.0u"nm"), - gpu=gpu, + array_type=AT, dist_cutoff=5.0u"nm", dist_neighbors=5.0u"nm", implicit_solvent=solvent_model, diff --git a/test/runtests.jl b/test/runtests.jl index b18d4d73c..3c602a14c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,4 +1,5 @@ using Molly +using AMDGPU using Aqua import AtomsBase using AtomsBaseTesting @@ -8,7 +9,10 @@ import BioStructures # Imported to avoid clashing names using CUDA using Enzyme using FiniteDifferences +using GPUArrays using KernelDensity +using Metal +using oneAPI import SimpleCrystals using DelimitedFiles @@ -49,18 +53,45 @@ else @warn "The parallel tests will not be run as Julia is running on 1 thread" end -# Allow CUDA device to be specified +const run_gpu_tests = get(ENV, "GPUTESTS", "1") != "0" +# Allow GPU device to be specified const DEVICE = parse(Int, get(ENV, "DEVICE", "0")) -const run_gpu_tests = get(ENV, "GPUTESTS", "1") != "0" && CUDA.functional() -const gpu_list = (run_gpu_tests ? 
(false, true) : (false,)) -if run_gpu_tests - device!(DEVICE) - @info "The GPU tests will be run on device $DEVICE" -elseif get(ENV, "GPUTESTS", "1") == "0" - @warn "The GPU tests will not be run as GPUTESTS is set to 0" +const run_cuda_tests = run_gpu_tests && CUDA.functional() +const run_rocm_tests = run_gpu_tests && AMDGPU.functional() +const run_oneapi_tests = run_gpu_tests && oneAPI.functional() +const run_metal_tests = run_gpu_tests && Metal.functional() + +array_list = (Array,) + +if run_cuda_tests + array_list = (array_list..., CuArray) + CUDA.device!(DEVICE) + @info "The CUDA tests will be run on device $DEVICE" +else + @warn "The CUDA tests will not be run as a CUDA-enabled device is not available" +end + +if run_rocm_tests + array_list = (array_list..., ROCArray) + AMDGPU.device!(AMDGPU.device(DEVICE+1)) + @info "The AMDGPU tests will be run on device $DEVICE" +else + @warn "The AMDGPU tests will not be run as an AMDGPU-enabled device is not available" +end + +if run_oneapi_tests + array_list = (array_list..., oneArray) + oneAPI.device!(DEVICE) + @info "The oneAPI tests will be run on device $DEVICE" +else + @warn "The oneAPI tests will not be run as a oneAPI-enabled device is not available" +end + +if run_metal_tests + @info "The Metal tests will be run" else - @warn "The GPU tests will not be run as a CUDA-enabled device is not available" + @warn "The Metal tests will not be run as a Metal-enabled device is not available" end const data_dir = normpath(@__DIR__, "..", "data") diff --git a/test/simulation.jl b/test/simulation.jl index 8ecbab863..6b277609e 100644 --- a/test/simulation.jl +++ b/test/simulation.jl @@ -1,6 +1,5 @@ @testset "Lennard-Jones 2D" begin - for gpu in gpu_list - AT = gpu ? CuArray : Array + for AT in array_list n_atoms = 10 n_steps = 20_000 temp = 100.0u"K" @@ -8,7 +7,7 @@ simulator = VelocityVerlet(dt=0.001u"ps", coupling=AndersenThermostat(temp, 10.0u"ps")) gen_temp_wrapper(s, args...; kwargs...) 
= temperature(s) - if gpu + if AT <: CuArray neighbor_finder = GPUNeighborFinder( eligible=eligible=AT(trues(n_atoms, n_atoms)), n_steps_reorder=10, @@ -16,7 +15,7 @@ ) else neighbor_finder = DistanceNeighborFinder( - eligible=trues(n_atoms, n_atoms), + eligible=AT(trues(n_atoms, n_atoms)), n_steps=10, dist_cutoff=2.0u"nm", ) @@ -221,39 +220,32 @@ end OverdampedLangevin(dt=0.002u"ps", temperature=temp, friction=10.0u"ps^-1"), ] - s = System( - atoms=[Atom(mass=10.0u"g/mol", charge=0.0, σ=0.3u"nm", ϵ=0.2u"kJ * mol^-1") for i in 1:n_atoms], - coords=coords, - boundary=boundary, - pairwise_inters=(LennardJones(use_neighbors=true),), - neighbor_finder=DistanceNeighborFinder( - eligible=trues(n_atoms, n_atoms), - n_steps=10, - dist_cutoff=2.0u"nm", - ), - loggers=(coords=CoordinatesLogger(100),), - ) - random_velocities!(s, temp) - - if run_gpu_tests - s_gpu = System( - atoms=CuArray([Atom(mass=10.0u"g/mol", charge=0.0, σ=0.3u"nm", ϵ=0.2u"kJ * mol^-1") for i in 1:n_atoms]), - coords=CuArray(coords), - boundary=boundary, - pairwise_inters=(LennardJones(use_neighbors=true),), - neighbor_finder=GPUNeighborFinder( - eligible=CuArray(trues(n_atoms, n_atoms)), + for AT in array_list + if AT <: CuArray + neighbor_finder = GPUNeighborFinder( + eligible=AT(trues(n_atoms, n_atoms)), n_steps_reorder=10, dist_cutoff=2.0u"nm", - ), + ) + else + neighbor_finder = DistanceNeighborFinder( + eligible=AT(trues(n_atoms, n_atoms)), + n_steps=10, + dist_cutoff=2.0u"nm", + ) + end + s = System( + atoms=AT([Atom(mass=10.0u"g/mol", charge=0.0, σ=0.3u"nm", ϵ=0.2u"kJ * mol^-1") + for i in 1:n_atoms]), + coords=AT(coords), + boundary=boundary, + pairwise_inters=(LennardJones(use_neighbors=true),), + neighbor_finder=neighbor_finder, loggers=(coords=CoordinatesLogger(100),), ) - end - - for simulator in simulators - @time simulate!(s, simulator, n_steps; n_threads=1) - if run_gpu_tests - @time simulate!(s_gpu, simulator, n_steps; n_threads=1) + random_velocities!(s, temp) + for simulator in 
simulators + @time simulate!(s, simulator, n_steps; n_threads=1) end end end @@ -285,7 +277,7 @@ end loggers=(coords=CoordinatesLogger(100),), ) - if run_gpu_tests + if run_cuda_tests s_gpu = System( atoms=CuArray([Atom(mass=10.0u"g/mol", charge=0.0, σ=0.1u"nm", ϵ=0.2u"kJ * mol^-1") for i in 1:n_atoms]), coords=CuArray(coords), @@ -303,7 +295,7 @@ end for simulator in simulators @time simulate!(s, simulator, n_steps; n_threads=1) - if run_gpu_tests + if run_cuda_tests @time simulate!(s_gpu, simulator, n_steps; n_threads=1) coord_diff = sum(sum(map(x -> abs.(x), s.coords .- Array(s_gpu.coords)))) / (3 * n_atoms) E_diff = abs(potential_energy(s) - potential_energy(s_gpu)) @@ -437,7 +429,7 @@ end neighbor_finder = NoNeighborFinder() end - if run_gpu_tests + if run_cuda_tests neighbor_finder_gpu = GPUNeighborFinder(eligible=CuArray(trues(n_atoms, n_atoms)), n_steps_reorder=10, dist_cutoff=1.2u"nm") end @@ -457,7 +449,7 @@ end E0 = potential_energy(s) @time simulate!(s, simulator, n_steps) - if run_gpu_tests + if run_cuda_tests s_gpu = System( atoms=CuArray(atoms), coords=CuArray(coords), @@ -574,7 +566,7 @@ end end @testset "Position restraints" begin - for gpu in gpu_list + for AT in array_list n_atoms = 10 n_atoms_res = n_atoms ÷ 2 n_steps = 2_000 @@ -585,8 +577,8 @@ end sim = Langevin(dt=0.001u"ps", temperature=300.0u"K", friction=1.0u"ps^-1") sys = System( - atoms=(gpu ? CuArray(atoms) : atoms), - coords=(gpu ? CuArray(copy(starting_coords)) : copy(starting_coords)), + atoms=AT(atoms), + coords=AT(copy(starting_coords)), boundary=boundary, atoms_data=atoms_data, pairwise_inters=(LennardJones(),), @@ -1077,11 +1069,10 @@ end vvand_baro = VelocityVerlet(dt=dt, coupling=(AndersenThermostat(temp, 1.0u"ps"), barostat)) for sim in (lang_baro, vvand_baro) - for gpu in gpu_list - if gpu && sim == vvand_baro + for AT in array_list + if AT <: AbstractGPUArray && sim == vvand_baro continue end - AT = gpu ? 
CuArray : Array sys = System( atoms=AT(atoms), @@ -1141,10 +1132,9 @@ end SVector(nothing , nothing , nothing ), # Uncoupled ) - for gpu in gpu_list - AT = gpu ? CuArray : Array + for AT in array_list for (press_i, press) in enumerate(pressure_test_set) - if gpu && press_i != 2 + if AT <: AbstractGPUArray && press_i != 2 continue end @@ -1210,10 +1200,10 @@ end MonteCarloMembraneBarostat(press, tens, temp, boundary; z_axis_fixed=true), ) - for gpu in gpu_list - AT = gpu ? CuArray : Array + for AT in array_list + # GPU backends only run barostat 2; CPU runs the full set (see check below) for (barostat_i, barostat) in enumerate(barostat_test_set) - if gpu && barostat_i != 2 + if AT <: AbstractGPUArray && barostat_i != 2 continue end @@ -1333,7 +1323,7 @@ end starting_coords_f32 = [Float32.(c) for c in starting_coords] starting_velocities_f32 = [Float32.(c) for c in starting_velocities] - function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) + function test_sim(nl::Bool, parallel::Bool, f32::Bool, ::Type{AT}) where AT n_atoms = 400 n_steps = 200 atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol" @@ -1343,43 +1333,37 @@ r0 = f32 ? 0.2f0u"nm" : 0.2u"nm" bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)] specific_inter_lists = (InteractionList2Atoms( - gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)), - gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)), - gpu ? CuArray(bonds) : bonds, + AT(Int32.(collect(1:2:n_atoms))), + AT(Int32.(collect(2:2:n_atoms))), + AT(bonds), ),) - - neighbor_finder = NoNeighborFinder() cutoff = DistanceCutoff(f32 ? 1.0f0u"nm" : 1.0u"nm") - pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),) - if nl && gpu - neighbor_finder = GPUNeighborFinder( - eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms), - n_steps_reorder=10, - dist_cutoff=f32 ?
1.5f0u"nm" : 1.5u"nm", - ) - pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),) - end - if nl && !gpu - neighbor_finder = DistanceNeighborFinder( - eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms), - n_steps=10, - dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", - ) + + if nl + if AT <: CuArray + neighbor_finder = GPUNeighborFinder( + eligible=AT(trues(n_atoms, n_atoms)), + n_steps_reorder=10, + dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", + ) + else + neighbor_finder = DistanceNeighborFinder( + eligible=AT(trues(n_atoms, n_atoms)), + n_steps=10, + dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", + ) + end pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),) + else + neighbor_finder = NoNeighborFinder() + pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),) end show(devnull, neighbor_finder) - if gpu - coords = CuArray(copy(f32 ? starting_coords_f32 : starting_coords)) - velocities = CuArray(copy(f32 ? starting_velocities_f32 : starting_velocities)) - atoms = CuArray([Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", - ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) - else - coords = copy(f32 ? starting_coords_f32 : starting_coords) - velocities = copy(f32 ? starting_velocities_f32 : starting_velocities) - atoms = [Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", - ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms] - end + coords = AT(copy(f32 ? starting_coords_f32 : starting_coords)) + velocities = AT(copy(f32 ? starting_velocities_f32 : starting_velocities)) + atoms = AT([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", + ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) s = System( atoms=atoms, @@ -1391,7 +1375,7 @@ end neighbor_finder=neighbor_finder, ) - @test is_on_gpu(s) == gpu + @test is_on_gpu(s) == (AT <: AbstractGPUArray) @test float_type(s) == (f32 ? 
Float32 : Float64) n_threads = parallel ? Threads.nthreads() : 1 @@ -1402,22 +1386,26 @@ end runs = [ - ("CPU" , [false, false, false, false]), - ("CPU f32" , [false, false, true , false]), - ("CPU NL" , [true , false, false, false]), - ("CPU f32 NL", [true , false, true , false]), + ("CPU" , [false, false, false, Array]), + ("CPU f32" , [false, false, true , Array]), + ("CPU NL" , [true , false, false, Array]), + ("CPU f32 NL", [true , false, true , Array]), ] if run_parallel_tests - push!(runs, ("CPU parallel" , [false, true , false, false])) - push!(runs, ("CPU parallel f32" , [false, true , true , false])) - push!(runs, ("CPU parallel NL" , [true , true , false, false])) - push!(runs, ("CPU parallel f32 NL", [true , true , true , false])) + push!(runs, ("CPU parallel" , [false, true , false, Array])) + push!(runs, ("CPU parallel f32" , [false, true , true , Array])) + push!(runs, ("CPU parallel NL" , [true , true , false, Array])) + push!(runs, ("CPU parallel f32 NL", [true , true , true , Array])) + end + for AT in array_list[2:end] + push!(runs, ("$AT" , [false, false, false, AT])) + push!(runs, ("$AT f32" , [false, false, true , AT])) + push!(runs, ("$AT NL" , [true , false, false, AT])) + push!(runs, ("$AT f32 NL", [true , false, true , AT])) end - if run_gpu_tests - push!(runs, ("GPU" , [false, false, false, true])) - push!(runs, ("GPU f32" , [false, false, true , true])) - push!(runs, ("GPU NL" , [true , false, false, true])) - push!(runs, ("GPU f32 NL", [true , false, true , true])) + if run_metal_tests + push!(runs, ("MtlArray f32" , [false, false, true , MtlArray])) + push!(runs, ("MtlArray f32 NL", [true , false, true , MtlArray])) end final_coords_ref, E_start_ref = test_sim(runs[1][2]...)