diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 7bb822ad9..3145e136e 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -29,6 +29,7 @@ jobs: - NotGradients - Gradients steps: + - run: echo "UCX_ERROR_SIGNALS=SIGILL,SIGBUS,SIGFPE" >> "$GITHUB_ENV" - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@v2 with: diff --git a/.gitignore b/.gitignore index 293442edd..697b70410 100644 --- a/.gitignore +++ b/.gitignore @@ -2,7 +2,7 @@ *.jl.*.cov *.jl.mem docs/build -/Manifest.toml +*Manifest.toml benchmark/tune.json benchmark/results .vscode/settings.json diff --git a/Project.toml b/Project.toml index 5654df3c5..469f820ea 100644 --- a/Project.toml +++ b/Project.toml @@ -8,7 +8,6 @@ Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a" AtomsCalculators = "a3e0e189-c65a-42c1-833c-339540406eb1" BioStructures = "de9282ab-8554-53be-b2d6-f6c222edabfc" -CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" CellListMap = "69e1c6dd-3888-40e6-b3c8-31ac5f578864" Chemfiles = "46823bd8-5fb3-5f92-9aa0-96921f3dd015" Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" @@ -17,7 +16,9 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" EzXML = "8f5d6c58-4d21-5cfd-889c-e3ad7ee6a615" FLoops = "cc61a311-1640-44b5-9fba-1b764f453329" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" NearestNeighbors = "b8a86587-4115-5ab1-83bc-aa920d37bbce" PeriodicTable = "7b2266bf-644c-5ea3-82d8-af4bbd25a884" @@ -32,6 +33,7 @@ UnitfulAtomic = "a7773ee8-282e-5fa2-be4e-bd808c38a91a" UnsafeAtomicsLLVM = "d80eeb9a-aca5-4d75-85e5-170c8b632249" [weakdeps] +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Colors = "5ae59095-9a9b-59fe-a467-6f913c188581" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" GLMakie = 
"e9467ef8-e4e7-5192-8a1a-b1aee30e663a" @@ -39,6 +41,7 @@ KernelDensity = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b" PythonCall = "6099a3de-0909-46bc-b1f4-468b9a2dfc0d" [extensions] +MollyCUDAExt = "CUDA" MollyEnzymeExt = "Enzyme" MollyGLMakieExt = ["GLMakie", "Colors"] MollyKernelDensityExt = "KernelDensity" @@ -61,7 +64,9 @@ Enzyme = "0.13.20" EzXML = "1" FLoops = "0.2" GLMakie = "0.8, 0.9, 0.10, 0.11" +GPUArrays = "11" Graphs = "1.8" +KernelAbstractions = "0.9" KernelDensity = "0.5, 0.6" LinearAlgebra = "1.9" NearestNeighbors = "0.4" diff --git a/README.md b/README.md index 2db2fa5d8..182d52f52 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ Implemented features include: - [Unitful.jl](https://github.com/PainterQubits/Unitful.jl) compatibility so numbers have physical meaning. - Set up crystal systems using [SimpleCrystals.jl](https://github.com/ejmeitz/SimpleCrystals.jl). - Automatic multithreading. -- GPU acceleration on CUDA-enabled devices. +- GPU acceleration on all backends supported by [KernelAbstractions.jl](https://github.com/JuliaGPU/KernelAbstractions.jl), with better performance on CUDA-enabled devices. - Run with Float64, Float32 or other float types. - Some analysis functions, e.g. RDF. - Visualise simulations as animations with [Makie.jl](https://makie.juliaplots.org/stable). 
diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index 08e6c5b4a..c790b4463 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -17,15 +17,15 @@ else @warn "The parallel benchmarks will not be run as Julia is running on 1 thread" end -# Allow CUDA device to be specified -const DEVICE = get(ENV, "DEVICE", "0") +# Allow GPU device to be specified +const DEVICE = parse(Int, get(ENV, "DEVICE", "0")) -const run_gpu_tests = CUDA.functional() -if run_gpu_tests - device!(parse(Int, DEVICE)) - @info "The GPU benchmarks will be run on device $DEVICE" +const run_cuda_tests = CUDA.functional() +if run_cuda_tests + device!(DEVICE) + @info "The CUDA benchmarks will be run on device $DEVICE" else - @warn "The GPU benchmarks will not be run as a CUDA-enabled device is not available" + @warn "The CUDA benchmarks will not be run as a CUDA-enabled device is not available" end const SUITE = BenchmarkGroup( @@ -62,7 +62,7 @@ const starting_velocities = [random_velocity(atom_mass, 1.0u"K") for i in 1:n_at const starting_coords_f32 = [Float32.(c) for c in starting_coords] const starting_velocities_f32 = [Float32.(c) for c in starting_velocities] -function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) +function test_sim(nl::Bool, parallel::Bool, f32::Bool, ::Type{AT}) where AT n_atoms = 400 n_steps = 200 atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol" @@ -72,9 +72,9 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) r0 = f32 ? 0.2f0u"nm" : 0.2u"nm" bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)] specific_inter_lists = (InteractionList2Atoms( - gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)), - gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)), - gpu ? 
CuArray(bonds) : bonds, + AT(Int32.(collect(1:2:n_atoms))), + AT(Int32.(collect(2:2:n_atoms))), + AT(bonds), ),) neighbor_finder = NoNeighborFinder() @@ -82,24 +82,17 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),) if nl neighbor_finder = DistanceNeighborFinder( - eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms), + eligible=AT(trues(n_atoms, n_atoms)), n_steps=10, dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", ) pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),) end - if gpu - coords = CuArray(copy(f32 ? starting_coords_f32 : starting_coords)) - velocities = CuArray(copy(f32 ? starting_velocities_f32 : starting_velocities)) - atoms = CuArray([Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", - ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) - else - coords = copy(f32 ? starting_coords_f32 : starting_coords) - velocities = copy(f32 ? starting_velocities_f32 : starting_velocities) - atoms = [Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", - ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms] - end + coords = AT(copy(f32 ? starting_coords_f32 : starting_coords)) + velocities = AT(copy(f32 ? starting_velocities_f32 : starting_velocities)) + atoms = AT([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", + ϵ=f32 ? 
0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) sys = System( atoms=atoms, @@ -117,22 +110,22 @@ function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) end runs = [ - ("CPU" , [false, false, false, false]), - ("CPU f32" , [false, false, true , false]), - ("CPU NL" , [true , false, false, false]), - ("CPU f32 NL", [true , false, true , false]), + ("CPU" , [false, false, false, Array]), + ("CPU f32" , [false, false, true , Array]), + ("CPU NL" , [true , false, false, Array]), + ("CPU f32 NL", [true , false, true , Array]), ] if run_parallel_tests - push!(runs, ("CPU parallel" , [false, true , false, false])) - push!(runs, ("CPU parallel f32" , [false, true , true , false])) - push!(runs, ("CPU parallel NL" , [true , true , false, false])) - push!(runs, ("CPU parallel f32 NL", [true , true , true , false])) + push!(runs, ("CPU parallel" , [false, true , false, Array])) + push!(runs, ("CPU parallel f32" , [false, true , true , Array])) + push!(runs, ("CPU parallel NL" , [true , true , false, Array])) + push!(runs, ("CPU parallel f32 NL", [true , true , true , Array])) end -if run_gpu_tests - push!(runs, ("GPU" , [false, false, false, true])) - push!(runs, ("GPU f32" , [false, false, true , true])) - push!(runs, ("GPU NL" , [true , false, false, true])) - push!(runs, ("GPU f32 NL", [true , false, true , true])) +if run_cuda_tests + push!(runs, ("GPU" , [false, false, false, CuArray])) + push!(runs, ("GPU f32" , [false, false, true , CuArray])) + push!(runs, ("GPU NL" , [true , false, false, CuArray])) + push!(runs, ("GPU f32 NL", [true , false, true , CuArray])) end for (name, args) in runs diff --git a/benchmark/protein.jl b/benchmark/protein.jl index 30f512c07..7ff549c22 100644 --- a/benchmark/protein.jl +++ b/benchmark/protein.jl @@ -11,7 +11,7 @@ const data_dir = normpath(dirname(pathof(Molly)), "..", "data") const ff_dir = joinpath(data_dir, "force_fields") const openmm_dir = joinpath(data_dir, "openmm_6mrr") -function 
setup_system(gpu::Bool, f32::Bool, units::Bool) +function setup_system(::Type{AT}, f32::Bool, units::Bool) where AT T = f32 ? Float32 : Float64 ff = MolecularForceField( T, @@ -27,9 +27,9 @@ function setup_system(gpu::Bool, f32::Bool, units::Bool) sys = System( joinpath(data_dir, "6mrr_equil.pdb"), ff; - velocities=gpu ? CuArray(velocities) : velocities, + velocities=AT(velocities), units=units, - gpu=gpu, + array_type=AT, dist_cutoff=(units ? dist_cutoff * u"nm" : dist_cutoff), dist_neighbors=(units ? dist_neighbors * u"nm" : dist_neighbors), ) @@ -41,21 +41,21 @@ function setup_system(gpu::Bool, f32::Bool, units::Bool) end runs = [ - # run_name gpu parr f32 units - ("CPU 1 thread" , false, false, false, true ), - ("CPU 1 thread f32" , false, false, true , true ), - ("CPU 1 thread f32 nounits" , false, false, true , false), - ("CPU $n_threads threads" , false, true , false, true ), - ("CPU $n_threads threads f32" , false, true , true , true ), - ("CPU $n_threads threads f32 nounits", false, true , true , false), - ("GPU" , true , false, false, true ), - ("GPU f32" , true , false, true , true ), - ("GPU f32 nounits" , true , false, true , false), + # run_name gpu parr f32 units + ("CPU 1 thread" , Array , false, false, true ), + ("CPU 1 thread f32" , Array , false, true , true ), + ("CPU 1 thread f32 nounits" , Array , false, true , false), + ("CPU $n_threads threads" , Array , true , false, true ), + ("CPU $n_threads threads f32" , Array , true , true , true ), + ("CPU $n_threads threads f32 nounits", Array , true , true , false), + ("GPU" , CuArray, false, false, true ), + ("GPU f32" , CuArray, false, true , true ), + ("GPU f32 nounits" , CuArray, false, true , false), ] -for (run_name, gpu, parallel, f32, units) in runs +for (run_name, AT, parallel, f32, units) in runs n_threads_used = parallel ? 
n_threads : 1 - sys, sim = setup_system(gpu, f32, units) + sys, sim = setup_system(AT, f32, units) simulate!(deepcopy(sys), sim, 20; n_threads=n_threads_used) println(run_name) @time simulate!(sys, sim, n_steps; n_threads=n_threads_used) diff --git a/docs/src/documentation.md b/docs/src/documentation.md index 73521b51e..ce6d7c242 100644 --- a/docs/src/documentation.md +++ b/docs/src/documentation.md @@ -135,11 +135,21 @@ visualize(sys.loggers.coords, boundary, "sim_lj.mp4") ## GPU acceleration -To run simulations on the GPU you will need to have a CUDA-compatible device. -[CUDA.jl](https://github.com/JuliaGPU/CUDA.jl) is used to run on the device. +To run simulations on the GPU you will need to have a GPU available and then load the appropriate package: + +| Hardware Available | Necessary Package | Array Type | +| ------------------ | ----------------- | ---------- | +| Parallel CPU | none | `Array` | +| NVIDIA GPU | CUDA | `CuArray` | +| AMD GPU | AMDGPU | `ROCArray` | +| Intel GPU | oneAPI | `oneArray` | +| Apple Silicon | Metal | `MtlArray` | + +As an important note, Metal/Apple Silicon devices can only run with 32 bit precision, so be sure to use `Float32` (for example) where necessary. Simulation setup is similar to above, but with the coordinates, velocities and atoms moved to the GPU. This example also shows setting up a simulation to run with `Float32`, which gives much better performance on GPUs. Of course, you will need to determine whether this level of numerical accuracy is appropriate in your case. +Here is an example script for an NVIDIA GPU using CUDA: ```julia using Molly using CUDA @@ -168,6 +178,7 @@ sys = System( simulate!(deepcopy(sys), simulator, 20) # Compile function simulate!(sys, simulator, 1_000) ``` +To use another GPU package, just swap out `CUDA` for your desired package and `CuArray` for your desired array type. The device to run on can be changed with `device!`, e.g. `device!(1)`. 
The GPU code path is currently designed to be compatible with differentiable simulation and runs slower than related software, but this is an active area of development. Nonetheless, GPU performance is significantly better than CPU performance and is good enough for many applications. @@ -316,7 +327,7 @@ sys = System( energy=TotalEnergyLogger(10), writer=StructureWriter(10, "traj_6mrr_1ps.pdb", ["HOH"]), ), - gpu=false, + array_type=Array, ) minimizer = SteepestDescentMinimizer() @@ -352,7 +363,7 @@ Residue patches, virtual sites, file includes and any force types other than `Ha Some PDB files that read in fine can be found [here](https://github.com/greener-group/GB99dms/tree/main/structures/training/conf_1). -To run on the GPU, set `gpu=true`. +To run on the GPU, set `array_type=GPUArrayType`, where `GPUArrayType` is the array type for your GPU backend (for example `CuArray` for NVIDIA or `ROCArray` for AMD). You can use an implicit solvent method by giving the `implicit_solvent` keyword argument to [`System`](@ref). The options are `"obc1"`, `"obc2"` and `"gbn2"`, corresponding to the Onufriev-Bashford-Case GBSA model with parameter set I or II and the GB-Neck2 model. Other options include overriding the boundary dimensions in the file (`boundary`) and modifying the non-bonded interaction and neighbor list cutoff distances (`dist_cutoff` and `dist_neighbors`). @@ -1017,10 +1028,10 @@ function Molly.simulate!(sys::ReplicaSystem, end ``` -Under the hood there are two implementations for the [`forces`](@ref) function, used by [`accelerations`](@ref), and for [`potential_energy`](@ref): a version geared towards CPUs and parallelism, and a version geared towards GPUs. -You can define different versions of a simulator for CPU and GPU systems by dispatching on `System{D, false}` or `System{D, true}` respectively. 
+Under the hood there are multiple implementations for the [`forces`](@ref) function, used by [`accelerations`](@ref), and for [`potential_energy`](@ref): a version geared towards CPUs and parallelism, a CUDA version, and a version for other GPU backends. +You can define different versions of a simulator for CPU, CUDA and generic GPU systems by dispatching on `System{D, Array}`, `System{D, CuArray}` or `System{D, AT} where AT <: AbstractGPUArray` respectively. This also applies to coupling methods, neighbor finders and analysis functions. -You do not have to define two versions though: you may only intend to use the simulator one way, or one version may be performant in all cases. +You do not have to define different versions though: you may only intend to use the simulator one way, or one version may be performant in all cases. ## Coupling @@ -1321,7 +1332,7 @@ The available neighbor finders are: - [`DistanceNeighborFinder`](@ref) - [`TreeNeighborFinder`](@ref) -The recommended neighbor finder is [`CellListMapNeighborFinder`](@ref) on CPU and [`GPUNeighborFinder`](@ref) on GPU. +The recommended neighbor finder is [`CellListMapNeighborFinder`](@ref) on CPU, [`GPUNeighborFinder`](@ref) on NVIDIA GPUs and [`DistanceNeighborFinder`](@ref) on other GPUs. When using a neighbor finder you should in general also use an interaction cutoff (see [Cutoffs](@ref)) with a cutoff distance less than the neighbor finder distance. The difference between the two should be larger than an atom can move in the time of the `n_steps` defined by the neighbor finder. The exception is [`GPUNeighborFinder`](@ref), which uses the algorithm from [Eastman and Pande 2010](https://doi.org/10.1002/jcc.21413) to avoid calculating a neighbor list and should have `dist_cutoff` set to the interaction cutoff distance. 
diff --git a/src/cuda.jl b/ext/MollyCUDAExt.jl similarity index 61% rename from src/cuda.jl rename to ext/MollyCUDAExt.jl index e751f51a5..de68d0807 100644 --- a/src/cuda.jl +++ b/ext/MollyCUDAExt.jl @@ -1,6 +1,16 @@ -# CUDA.jl kernels +module MollyCUDAExt + +using Molly +using CUDA +using Atomix +using KernelAbstractions + const WARPSIZE = UInt32(32) +Molly.uses_gpu_neighbor_finder(::Type{AT}) where {AT <: CuArray} = true + +CUDA.Const(nl::Molly.NoNeighborList) = nl + macro shfl_multiple_sync(mask, target, width, vars...) all_lines = map(vars) do v Expr(:(=), v, @@ -29,71 +39,69 @@ function cuda_threads_blocks_specific(n_inters) return n_threads_gpu, n_blocks end -function pairwise_force_gpu!(buffers, sys::System{D, true, T}, pairwise_inters, nbs, step_n) where {D, T} - if typeof(nbs) == NoNeighborList - kernel = @cuda launch=false pairwise_force_kernel_nonl!( - buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, step_n, - Val(D), Val(sys.force_units)) - conf = launch_configuration(kernel.fun) - threads_basic = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512")) - nthreads = min(length(sys.atoms), threads_basic, conf.threads) - nthreads = cld(nthreads, WARPSIZE) * WARPSIZE - n_blocks_i = cld(length(sys.atoms), WARPSIZE) - n_blocks_j = cld(length(sys.atoms), nthreads) - kernel(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, - step_n, Val(D), Val(sys.force_units); threads=nthreads, - blocks=(n_blocks_i, n_blocks_j)) - else - N = length(sys.coords) - n_blocks = cld(N, WARPSIZE) - r_cut = sys.neighbor_finder.dist_cutoff - if step_n % sys.neighbor_finder.n_steps_reorder == 0 || !sys.neighbor_finder.initialized - Morton_bits = 4 - w = r_cut - typeof(ustrip(r_cut))(0.1) * unit(r_cut) - Morton_seq_cpu = sorted_Morton_seq(Array(sys.coords), w, Morton_bits) - copyto!(buffers.Morton_seq, Morton_seq_cpu) - CUDA.@sync @cuda blocks=(cld(N, WARPSIZE),) threads=(32,) kernel_min_max!( - 
buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, Val(N), - sys.boundary, Val(D)) - sys.neighbor_finder.initialized = true - CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true compress_boolean_matrices!( - buffers.Morton_seq, sys.neighbor_finder.eligible, sys.neighbor_finder.special, - buffers.compressed_eligible, buffers.compressed_special, Val(N)) - end - CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true force_kernel!( - buffers.Morton_seq, buffers.fs_mat, buffers.box_mins, buffers.box_maxs, sys.coords, - sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.force_units), pairwise_inters, - sys.boundary, step_n, buffers.compressed_special, buffers.compressed_eligible, - Val(T), Val(D)) - end +function Molly.pairwise_force_gpu!(buffers, sys::System{D, AT, T}, pairwise_inters, + nbs::Molly.NoNeighborList, step_n) where {D, AT <: CuArray, T} + kernel = @cuda launch=false pairwise_force_kernel_nonl!( + buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, step_n, + Val(D), Val(sys.force_units)) + conf = launch_configuration(kernel.fun) + threads_basic = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512")) + nthreads = min(length(sys.atoms), threads_basic, conf.threads) + nthreads = cld(nthreads, WARPSIZE) * WARPSIZE + n_blocks_i = cld(length(sys.atoms), WARPSIZE) + n_blocks_j = cld(length(sys.atoms), nthreads) + kernel(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, + step_n, Val(D), Val(sys.force_units); threads=nthreads, + blocks=(n_blocks_i, n_blocks_j)) return buffers end -function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, true, T}, pairwise_inters, nbs, step_n) where {D, T} - if typeof(nbs) == NoNeighborList - n_threads_gpu, n_blocks = cuda_threads_blocks_pairwise(length(nbs)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks pairwise_pe_kernel!( - pe_vec_nounits, sys.coords, sys.velocities, 
sys.atoms, sys.boundary, pairwise_inters, nbs, step_n, Val(sys.energy_units)) - else - # The ordering is always recomputed for potential energy - # Different buffers are used to the forces case, so sys.neighbor_finder.initialized - # is not updated - N = length(sys.coords) - n_blocks = cld(N, WARPSIZE) - r_cut = sys.neighbor_finder.dist_cutoff +function Molly.pairwise_force_gpu!(buffers, sys::System{D, AT, T}, pairwise_inters, nbs::Nothing, + step_n) where {D, AT <: CuArray, T} + N = length(sys.coords) + n_blocks = cld(N, WARPSIZE) + r_cut = sys.neighbor_finder.dist_cutoff + if step_n % sys.neighbor_finder.n_steps_reorder == 0 || !sys.neighbor_finder.initialized Morton_bits = 4 w = r_cut - typeof(ustrip(r_cut))(0.1) * unit(r_cut) Morton_seq_cpu = sorted_Morton_seq(Array(sys.coords), w, Morton_bits) copyto!(buffers.Morton_seq, Morton_seq_cpu) CUDA.@sync @cuda blocks=(cld(N, WARPSIZE),) threads=(32,) kernel_min_max!( - buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, - Val(N), sys.boundary, Val(D)) - CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true energy_kernel!( - buffers.Morton_seq, pe_vec_nounits, buffers.box_mins, buffers.box_maxs, sys.coords, - sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.energy_units), pairwise_inters, - sys.boundary, step_n, sys.neighbor_finder.special, sys.neighbor_finder.eligible, - Val(T), Val(D)) - end + buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, Val(N), + sys.boundary, Val(D)) + sys.neighbor_finder.initialized = true + CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true compress_boolean_matrices!( + buffers.Morton_seq, sys.neighbor_finder.eligible, sys.neighbor_finder.special, + buffers.compressed_eligible, buffers.compressed_special, Val(N)) + end + CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true force_kernel!( + buffers.Morton_seq, buffers.fs_mat, buffers.box_mins, buffers.box_maxs, sys.coords, + 
sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.force_units), pairwise_inters, + sys.boundary, step_n, buffers.compressed_special, buffers.compressed_eligible, + Val(T), Val(D)) + return buffers +end + +function Molly.pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT, T}, pairwise_inters, + nbs::Nothing, step_n) where {D, AT <: CuArray, T} + # The ordering is always recomputed for potential energy + # Different buffers are used to the forces case, so sys.neighbor_finder.initialized + # is not updated + N = length(sys.coords) + n_blocks = cld(N, WARPSIZE) + r_cut = sys.neighbor_finder.dist_cutoff + Morton_bits = 4 + w = r_cut - typeof(ustrip(r_cut))(0.1) * unit(r_cut) + Morton_seq_cpu = sorted_Morton_seq(Array(sys.coords), w, Morton_bits) + copyto!(buffers.Morton_seq, Morton_seq_cpu) + CUDA.@sync @cuda blocks=(cld(N, WARPSIZE),) threads=(32,) kernel_min_max!( + buffers.Morton_seq, buffers.box_mins, buffers.box_maxs, sys.coords, + Val(N), sys.boundary, Val(D)) + CUDA.@sync @cuda blocks=(n_blocks, n_blocks) threads=(32, 1) always_inline=true energy_kernel!( + buffers.Morton_seq, pe_vec_nounits, buffers.box_mins, buffers.box_maxs, sys.coords, + sys.velocities, sys.atoms, Val(N), r_cut, Val(sys.energy_units), pairwise_inters, + sys.boundary, step_n, sys.neighbor_finder.special, sys.neighbor_finder.eligible, + Val(T), Val(D)) return pe_vec_nounits end @@ -276,7 +284,6 @@ function compress_boolean_matrices!(sorted_seq, eligible_matrix, special_matrix, return nothing end - #= **The No-neighborlist pairwise force summation kernel (algorithm by Eastman, see https://onlinelibrary.wiley.com/doi/full/10.1002/jcc.21413)**: 1. 
Case j < n_blocks && i < j, i.e., `WARPSIZE`×`WARPSIZE` tiles: For such tiles each row is assiged to a different thread in a warp which calculates the @@ -406,7 +413,7 @@ function force_kernel!( spec = (special_bitmask >> (warpsize() - shuffle_idx)) | (special_bitmask << shuffle_idx) condition = (excl & 0x1) == true && r2 <= r_cut * r_cut - f = condition ? sum_pairwise_forces( + f = condition ? Molly.sum_pairwise_forces( inters_tuple, atoms_i, atoms_j_shuffle, Val(force_units), @@ -471,7 +478,7 @@ function force_kernel!( spec = (special_bitmask >> (warpsize() - m)) | (special_bitmask << m) condition = (excl & 0x1) == true && r2 <= r_cut * r_cut - f = condition ? sum_pairwise_forces( + f = condition ? Molly.sum_pairwise_forces( inters_tuple, atoms_i, atoms_j, Val(force_units), @@ -521,7 +528,7 @@ function force_kernel!( spec = (special_bitmask >> (warpsize() - m)) | (special_bitmask << m) condition = (excl & 0x1) == true && r2 <= r_cut * r_cut - f = condition ? sum_pairwise_forces( + f = condition ? Molly.sum_pairwise_forces( inters_tuple, atoms_i, atoms_j, Val(force_units), @@ -568,7 +575,7 @@ function force_kernel!( spec = (special_bitmask >> (warpsize() - m)) | (special_bitmask << m) condition = (excl & 0x1) == true && r2 <= r_cut * r_cut - f = condition ? sum_pairwise_forces( + f = condition ? 
Molly.sum_pairwise_forces( inters_tuple, atoms_i, atoms_j, Val(force_units), @@ -595,7 +602,6 @@ function force_kernel!( return nothing end - function energy_kernel!( sorted_seq, energy_nounits, @@ -828,8 +834,6 @@ function energy_kernel!( return nothing end - - function pairwise_force_kernel_nonl!(forces::AbstractArray{T}, coords_var, velocities_var, atoms_var, boundary, inters, step_n, ::Val{D}, ::Val{F}) where {T, D, F} coords = CUDA.Const(coords_var) @@ -857,8 +861,8 @@ function pairwise_force_kernel_nonl!(forces::AbstractArray{T}, coords_var, veloc j = j_0_tile + del_j if i != j atom_j, coord_j, vel_j = atoms[j], coords[j], velocities[j] - f = sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, coord_j, - boundary, vel_i, vel_j, step_n) + f = Molly.sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, + coord_j, boundary, vel_i, vel_j, step_n) for dim in 1:D forces_shmem[dim, tidx] += -ustrip(f[dim]) end @@ -882,7 +886,7 @@ function pairwise_force_kernel_nonl!(forces::AbstractArray{T}, coords_var, veloc @inbounds for _ in 1:tilesteps sync_warp() atom_j = atoms[j] - f = sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, coord_j, + f = Molly.sum_pairwise_forces(inters, atom_i, atom_j, Val(F), false, coord_i, coord_j, boundary, vel_i, vel_j, step_n) for dim in 1:D forces_shmem[dim, tidx] += -ustrip(f[dim]) @@ -898,58 +902,14 @@ function pairwise_force_kernel_nonl!(forces::AbstractArray{T}, coords_var, veloc return nothing end -function pairwise_pe_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, inters, - neighbors_var, step_n, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - neighbors = CUDA.Const(neighbors_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(neighbors) - i, j, special = neighbors[inter_i] - coord_i, coord_j, vel_i, vel_j = coords[i], coords[j], 
velocities[i], velocities[j] - dr = vector(coord_i, coord_j, boundary) - pe = potential_energy_gpu(inters[1], dr, atoms[i], atoms[j], E, special, coord_i, coord_j, - boundary, vel_i, vel_j, step_n) - for inter in inters[2:end] - pe += potential_energy_gpu(inter, dr, atoms[i], atoms[j], E, special, coord_i, coord_j, - boundary, vel_i, vel_j, step_n) - end - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing -end - -@inline function sum_pairwise_forces(inters, atom_i, atom_j, ::Val{F}, special, coord_i, coord_j, - boundary, vel_i, vel_j, step_n) where F - dr = vector(coord_i, coord_j, boundary) - f_tuple = ntuple(length(inters)) do inter_type_i - force_gpu(inters[inter_type_i], dr, atom_i, atom_j, F, special, coord_i, coord_j, boundary, - vel_i, vel_j, step_n) - end - f = sum(f_tuple) - if unit(f[1]) != F - # This triggers an error but it isn't printed - # See https://discourse.julialang.org/t/error-handling-in-cuda-kernels/79692 - # for how to throw a more meaningful error - error("wrong force unit returned, was expecting $F but got $(unit(f[1]))") - end - return f -end - @inline function sum_pairwise_potentials(inters, atom_i, atom_j, ::Val{E}, special, coord_i, coord_j, boundary, vel_i, vel_j, step_n) where E dr = vector(coord_i, coord_j, boundary) pe_tuple = ntuple(length(inters)) do inter_type_i - SVector(potential_energy_gpu(inters[inter_type_i], dr, atom_i, atom_j, E, special, coord_i, coord_j, boundary, - vel_i, vel_j, step_n)) - # SVector was required to avoid a GPU error occurring with scalars (like the quantity returned by potential_energy_gpu) + # SVector was required to avoid a GPU error occurring with scalars + SVector(Molly.potential_energy_gpu(inters[inter_type_i], dr, atom_i, atom_j, E, special, + coord_i, coord_j, boundary, vel_i, vel_j, step_n)) end pe = sum(pe_tuple) if unit(pe[1]) != E @@ -961,281 +921,4 @@ end return pe end 
-function specific_force_gpu!(fs_mat, inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_1_atoms_kernel!(fs_mat, - coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.inters, - Val(D), Val(force_units)) - return fs_mat -end - -function specific_force_gpu!(fs_mat, inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_2_atoms_kernel!(fs_mat, - coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.js, - inter_list.inters, Val(D), Val(force_units)) - return fs_mat -end - -function specific_force_gpu!(fs_mat, inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_3_atoms_kernel!(fs_mat, - coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.js, - inter_list.ks, inter_list.inters, Val(D), Val(force_units)) - return fs_mat -end - -function specific_force_gpu!(fs_mat, inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_force_4_atoms_kernel!(fs_mat, - coords, velocities, atoms, boundary, step_n, inter_list.is, inter_list.js, - inter_list.ks, inter_list.ls, 
inter_list.inters, Val(D), Val(force_units)) - return fs_mat -end - -function specific_force_1_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, inters_var, ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i = is[inter_i] - fs = force_gpu(inters[inter_i], coords[i], boundary, atoms[i], F, velocities[i], step_n) - if unit(fs.f1[1]) != F - error("wrong force unit returned, was expecting $F") - end - for dim in 1:D - Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) - end - end - return nothing -end - -function specific_force_2_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, inters_var, ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j = is[inter_i], js[inter_i] - fs = force_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], atoms[j], F, - velocities[i], velocities[j], step_n) - if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F - error("wrong force unit returned, was expecting $F") - end - for dim in 1:D - Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) - Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim]) - end - end - return nothing -end - -function specific_force_3_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, ks_var, inters_var, ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = 
CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - ks = CUDA.Const(ks_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j, k = is[inter_i], js[inter_i], ks[inter_i] - fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, atoms[i], - atoms[j], atoms[k], F, velocities[i], velocities[j], velocities[k], step_n) - if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F - error("wrong force unit returned, was expecting $F") - end - for dim in 1:D - Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) - Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim]) - Atomix.@atomic :monotonic forces[dim, k] += ustrip(fs.f3[dim]) - end - end - return nothing -end - -function specific_force_4_atoms_kernel!(forces, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, ks_var, ls_var, inters_var, - ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - ks = CUDA.Const(ks_var) - ls = CUDA.Const(ls_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] - fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], boundary, - atoms[i], atoms[j], atoms[k], atoms[l], F, velocities[i], velocities[j], - velocities[k], velocities[l], step_n) - if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F || unit(fs.f4[1]) != F - error("wrong force unit returned, was expecting $F") - end - for dim in 1:D - Atomix.@atomic :monotonic forces[dim, i] += ustrip(fs.f1[dim]) - Atomix.@atomic :monotonic forces[dim, j] += ustrip(fs.f2[dim]) - Atomix.@atomic :monotonic forces[dim, k] += 
ustrip(fs.f3[dim]) - Atomix.@atomic :monotonic forces[dim, l] += ustrip(fs.f4[dim]) - end - end - return nothing -end - - -function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_1_atoms_kernel!( - pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, - inter_list.inters, Val(energy_units)) - return pe_vec_nounits -end - -function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_2_atoms_kernel!( - pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, - inter_list.js, inter_list.inters, Val(energy_units)) - return pe_vec_nounits -end - -function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks specific_pe_3_atoms_kernel!( - pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, - inter_list.js, inter_list.ks, inter_list.inters, Val(energy_units)) - return pe_vec_nounits -end - -function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}}, - velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} - n_threads_gpu, n_blocks = cuda_threads_blocks_specific(length(inter_list)) - CUDA.@sync @cuda 
threads=n_threads_gpu blocks=n_blocks specific_pe_4_atoms_kernel!( - pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, - inter_list.js, inter_list.ks, inter_list.ls, inter_list.inters, Val(energy_units)) - return pe_vec_nounits -end - -function specific_pe_1_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, inters_var, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i = is[inter_i] - pe = potential_energy_gpu(inters[inter_i], coords[i], boundary, atoms[i], E, - velocities[i], step_n) - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing -end - -function specific_pe_2_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, inters_var, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j = is[inter_i], js[inter_i] - pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], - atoms[j], E, velocities[i], velocities[j], step_n) - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing -end - -function specific_pe_3_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, ks_var, inters_var, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = 
CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - ks = CUDA.Const(ks_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j, k = is[inter_i], js[inter_i], ks[inter_i] - pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, - atoms[i], atoms[j], atoms[k], E, velocities[i], velocities[j], - velocities[k], step_n) - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing -end - -function specific_pe_4_atoms_kernel!(energy, coords_var, velocities_var, atoms_var, boundary, - step_n, is_var, js_var, ks_var, ls_var, inters_var, ::Val{E}) where E - coords = CUDA.Const(coords_var) - velocities = CUDA.Const(velocities_var) - atoms = CUDA.Const(atoms_var) - is = CUDA.Const(is_var) - js = CUDA.Const(js_var) - ks = CUDA.Const(ks_var) - ls = CUDA.Const(ls_var) - inters = CUDA.Const(inters_var) - - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x - - @inbounds if inter_i <= length(is) - i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] - pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], - boundary, atoms[i], atoms[j], atoms[k], atoms[l], E, - velocities[i], velocities[j], velocities[k], velocities[l], - step_n) - if unit(pe) != E - error("wrong energy unit returned, was expecting $E but got $(unit(pe))") - end - Atomix.@atomic :monotonic energy[1] += ustrip(pe) - end - return nothing end diff --git a/ext/MollyEnzymeExt.jl b/ext/MollyEnzymeExt.jl index 90e015390..26fd0e882 100644 --- a/ext/MollyEnzymeExt.jl +++ b/ext/MollyEnzymeExt.jl @@ -11,13 +11,10 @@ EnzymeRules.inactive(::typeof(Molly.n_infinite_dims), args...) = nothing EnzymeRules.inactive(::typeof(random_velocity), args...) 
= nothing EnzymeRules.inactive(::typeof(random_velocities), args...) = nothing EnzymeRules.inactive(::typeof(random_velocities!), args...) = nothing -EnzymeRules.inactive(::typeof(Molly.cuda_threads_blocks_pairwise), args...) = nothing -EnzymeRules.inactive(::typeof(Molly.cuda_threads_blocks_specific), args...) = nothing EnzymeRules.inactive(::typeof(Molly.check_force_units), args...) = nothing EnzymeRules.inactive(::typeof(Molly.check_energy_units), args...) = nothing EnzymeRules.inactive(::typeof(Molly.atoms_bonded_to_N), args...) = nothing EnzymeRules.inactive(::typeof(Molly.lookup_table), args...) = nothing -EnzymeRules.inactive(::typeof(Molly.cuda_threads_blocks_gbsa), args...) = nothing EnzymeRules.inactive(::typeof(find_neighbors), args...) = nothing EnzymeRules.inactive_type(::Type{DistanceNeighborFinder}) = nothing EnzymeRules.inactive(::typeof(visualize), args...) = nothing diff --git a/ext/MollyPythonCallExt.jl b/ext/MollyPythonCallExt.jl index e1afaeb82..acbb6c675 100644 --- a/ext/MollyPythonCallExt.jl +++ b/ext/MollyPythonCallExt.jl @@ -6,7 +6,7 @@ module MollyPythonCallExt using Molly using PythonCall import AtomsCalculators -using CUDA +using GPUArrays using StaticArrays using Unitful @@ -91,9 +91,9 @@ end uconvert_vec(x...) = uconvert.(x...) -function AtomsCalculators.forces(sys::System{D, G, T}, +function AtomsCalculators.forces(sys::System{D, AT, T}, ase_calc::ASECalculator; - kwargs...) where {D, G, T} + kwargs...) where {D, AT, T} update_ase_calc!(ase_calc, sys) forces_py = ase_calc.ase_atoms.get_forces() forces_flat = reshape(transpose(pyconvert(Matrix{T}, forces_py)), length(sys) * D) @@ -105,12 +105,12 @@ function AtomsCalculators.forces(sys::System{D, G, T}, else fs_unit = uconvert_vec.(sys.force_units, fs * u"eV/Å") end - return G ? 
CuArray(fs_unit) : fs_unit + return AT(fs_unit) end -function AtomsCalculators.potential_energy(sys::System{D, G, T}, +function AtomsCalculators.potential_energy(sys::System{D, AT, T}, ase_calc::ASECalculator; - kwargs...) where {D, G, T} + kwargs...) where {D, AT, T} update_ase_calc!(ase_calc, sys) pe_py = ase_calc.ase_atoms.get_potential_energy() pe = pyconvert(T, pe_py) diff --git a/src/Molly.jl b/src/Molly.jl index 19664debc..e57e17ced 100644 --- a/src/Molly.jl +++ b/src/Molly.jl @@ -11,13 +11,14 @@ import BioStructures # Imported to avoid clashing names using CellListMap import Chemfiles using Combinatorics -using CUDA using DataStructures using Distances using Distributions using EzXML using FLoops +using GPUArrays using Graphs +using KernelAbstractions using NearestNeighbors using PeriodicTable using SimpleCrystals @@ -34,7 +35,7 @@ include("types.jl") include("units.jl") include("spatial.jl") include("cutoffs.jl") -include("cuda.jl") +include("kernels.jl") include("force.jl") include("interactions/lennard_jones.jl") include("interactions/soft_sphere.jl") diff --git a/src/analysis.jl b/src/analysis.jl index 01429b5a2..5ad057b97 100644 --- a/src/analysis.jl +++ b/src/analysis.jl @@ -88,8 +88,7 @@ Calculate the hydrodynamic radius of a set of coordinates. """ function hydrodynamic_radius(coords::AbstractArray{SVector{D, T}}, boundary) where {D, T} n_atoms = length(coords) - diag_cpu = Diagonal(ones(T, n_atoms)) - diag = isa(coords, CuArray) ? 
CuArray(diag_cpu) : diag_cpu + diag = array_type(coords)(Diagonal(ones(T, n_atoms))) dists = distances(coords, boundary) .+ diag sum_inv_dists = sum(inv.(dists)) - sum(inv(diag)) inv_R_hyd = sum_inv_dists / (2 * n_atoms^2) diff --git a/src/coupling.jl b/src/coupling.jl index da30dfae1..ae4fc7f55 100644 --- a/src/coupling.jl +++ b/src/coupling.jl @@ -58,10 +58,10 @@ struct AndersenThermostat{T, C} coupling_const::C end -function apply_coupling!(sys::System{D, false}, thermostat::AndersenThermostat, sim, +function apply_coupling!(sys::System, thermostat::AndersenThermostat, sim, neighbors=nothing, step_n::Integer=0; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where D + rng=Random.default_rng()) for i in eachindex(sys) if rand(rng) < (sim.dt / thermostat.coupling_const) sys.velocities[i] = random_velocity(mass(sys.atoms[i]), thermostat.temperature, sys.k; @@ -71,14 +71,14 @@ function apply_coupling!(sys::System{D, false}, thermostat::AndersenThermostat, return false end -function apply_coupling!(sys::System{D, true, T}, thermostat::AndersenThermostat, sim, +function apply_coupling!(sys::System{D, AT, T}, thermostat::AndersenThermostat, sim, neighbors=nothing, step_n::Integer=0; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, T} + rng=Random.default_rng()) where {D, AT <: AbstractGPUArray, T} atoms_to_bump = T.(rand(rng, length(sys)) .< (sim.dt / thermostat.coupling_const)) atoms_to_leave = one(T) .- atoms_to_bump - atoms_to_bump_dev = move_array(atoms_to_bump, sys) - atoms_to_leave_dev = move_array(atoms_to_leave, sys) + atoms_to_bump_dev = AT(atoms_to_bump) + atoms_to_leave_dev = AT(atoms_to_leave) vs = random_velocities(sys, thermostat.temperature; rng=rng) sys.velocities .= sys.velocities .* atoms_to_leave_dev .+ vs .* atoms_to_bump_dev return false @@ -231,9 +231,9 @@ function MonteCarloBarostat(P, T, boundary; n_steps=30, n_iterations=1, scale_fa max_volume_frac, trial_find_neighbors, 0, 0) end -function 
apply_coupling!(sys::System{D, G, T}, barostat::MonteCarloBarostat, sim, neighbors=nothing, +function apply_coupling!(sys::System{D, AT, T}, barostat::MonteCarloBarostat, sim, neighbors=nothing, step_n::Integer=0; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} if !iszero(step_n % barostat.n_steps) return false end @@ -371,13 +371,13 @@ function MonteCarloAnisotropicBarostat(pressure::SVector{D}, ) end -function apply_coupling!(sys::System{D, G, T}, +function apply_coupling!(sys::System{D, AT, T}, barostat::MonteCarloAnisotropicBarostat{D}, sim, neighbors=nothing, step_n::Integer=0; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} !iszero(step_n % barostat.n_steps) && return false all(isnothing, barostat.pressure) && return false @@ -546,13 +546,13 @@ function MonteCarloMembraneBarostat(pressure, ) end -function apply_coupling!(sys::System{D, G, T}, +function apply_coupling!(sys::System{D, AT, T}, barostat::MonteCarloMembraneBarostat, sim, neighbors=nothing, step_n::Integer=0; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} !iszero(step_n % barostat.n_steps) && return false kT = energy_remove_mol(sys.k * barostat.temperature) diff --git a/src/energy.jl b/src/energy.jl index 6fdd265c2..b257c6fa7 100644 --- a/src/energy.jl +++ b/src/energy.jl @@ -33,7 +33,7 @@ E_k = \frac{1}{2} \sum_{i} m_i v_i^2 ``` where ``m_i`` is the mass and ``v_i`` is the velocity of atom ``i``. 
""" -function kinetic_energy(sys::System{D, G, T}) where {D, G, T} +function kinetic_energy(sys::System) ke = kinetic_energy_noconvert(sys) return uconvert(sys.energy_units, ke) end @@ -78,8 +78,8 @@ function potential_energy(sys; n_threads::Integer=Threads.nthreads()) return potential_energy(sys, find_neighbors(sys; n_threads=n_threads); n_threads=n_threads) end -function potential_energy(sys::System{D, false, T}, neighbors, step_n::Integer=0; - n_threads::Integer=Threads.nthreads()) where {D, T} +function potential_energy(sys::System{D, AT, T}, neighbors, step_n::Integer=0; + n_threads::Integer=Threads.nthreads()) where {D, AT, T} pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters)) pairwise_inters_nl = filter( use_neighbors, values(sys.pairwise_inters)) sils_1_atoms = filter(il -> il isa InteractionList1Atoms, values(sys.specific_inter_lists)) @@ -253,11 +253,11 @@ function specific_pe(atoms, coords, velocities, boundary, energy_units, sils_1_a return pe end -function potential_energy(sys::System{D, true, T}, neighbors, step_n::Integer=0; - n_threads::Integer=Threads.nthreads()) where {D, T} - pe_vec_nounits = CUDA.zeros(T, 1) +function potential_energy(sys::System{D, AT, T}, neighbors, step_n::Integer=0; + n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T} val_ft = Val(T) - buffers = init_forces_buffer!(sys, ustrip_vec.(zero(sys.coords)), 1) + pe_vec_nounits = KernelAbstractions.zeros(get_backend(sys.coords), T, 1) + buffers = init_forces_buffer!(sys, ustrip_vec.(zero(sys.coords)), 1, true) pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters)) if length(pairwise_inters_nonl) > 0 @@ -267,7 +267,7 @@ function potential_energy(sys::System{D, true, T}, neighbors, step_n::Integer=0; pairwise_inters_nl = filter(use_neighbors, values(sys.pairwise_inters)) if length(pairwise_inters_nl) > 0 - pairwise_pe_gpu!(pe_vec_nounits, buffers, sys, pairwise_inters_nl, nothing, step_n) + 
pairwise_pe_gpu!(pe_vec_nounits, buffers, sys, pairwise_inters_nl, neighbors, step_n) end for inter_list in values(sys.specific_inter_lists) diff --git a/src/force.jl b/src/force.jl index 0b54db195..bbb67e38c 100644 --- a/src/force.jl +++ b/src/force.jl @@ -132,17 +132,19 @@ struct ForcesBuffer{F, C, M, R} compressed_special::R end -function init_forces_buffer!(sys, forces_nounits::CuArray{SVector{D, T}}, n_threads) where {D, T} +function init_forces_buffer!(sys, forces_nounits::AbstractGPUArray{SVector{D, T}}, n_threads, + for_pe::Bool=false) where {D, T} N = length(forces_nounits) C = eltype(eltype(sys.coords)) n_blocks = cld(N, 32) - fs_mat = CUDA.zeros(T, D, N) - box_mins = CUDA.zeros(C, n_blocks, D) - box_maxs = CUDA.zeros(C, n_blocks, D) - Morton_seq = CUDA.zeros(Int32, N) - compressed_eligible = CUDA.zeros(UInt32, 32, n_blocks, n_blocks) - compressed_special = CUDA.zeros(UInt32, 32, n_blocks, n_blocks) - if sys.neighbor_finder isa GPUNeighborFinder + backend = get_backend(forces_nounits) + fs_mat = KernelAbstractions.zeros(backend, T, D, N) + box_mins = KernelAbstractions.zeros(backend, C, n_blocks, D) + box_maxs = KernelAbstractions.zeros(backend, C, n_blocks, D) + Morton_seq = KernelAbstractions.zeros(backend, Int32, N) + compressed_eligible = KernelAbstractions.zeros(backend, UInt32, 32, n_blocks, n_blocks) + compressed_special = KernelAbstractions.zeros(backend, UInt32, 32, n_blocks, n_blocks) + if !for_pe && sys.neighbor_finder isa GPUNeighborFinder sys.neighbor_finder.initialized = false end return ForcesBuffer(fs_mat, box_mins, box_maxs, Morton_seq, compressed_eligible, compressed_special) @@ -165,8 +167,8 @@ function forces(sys, neighbors, step_n::Integer=0; n_threads::Integer=Threads.nt return forces_nounits .* sys.force_units end -function forces_nounits!(fs_nounits, sys::System{D, false}, neighbors, fs_chunks=nothing, - step_n::Integer=0; n_threads::Integer=Threads.nthreads()) where D +function forces_nounits!(fs_nounits, sys::System, neighbors, 
fs_chunks=nothing, + step_n::Integer=0; n_threads::Integer=Threads.nthreads()) pairwise_inters_nonl = filter(!use_neighbors, values(sys.pairwise_inters)) pairwise_inters_nl = filter( use_neighbors, values(sys.pairwise_inters)) sils_1_atoms = filter(il -> il isa InteractionList1Atoms, values(sys.specific_inter_lists)) @@ -367,9 +369,9 @@ function specific_forces!(fs_nounits, atoms, coords, velocities, boundary, force return fs_nounits end -function forces_nounits!(fs_nounits, sys::System{D, true, T}, neighbors, +function forces_nounits!(fs_nounits, sys::System{D, AT, T}, neighbors, buffers, step_n::Integer=0; - n_threads::Integer=Threads.nthreads()) where {D, T} + n_threads::Integer=Threads.nthreads()) where {D, AT <: AbstractGPUArray, T} fill!(buffers.fs_mat, zero(T)) val_ft = Val(T) @@ -382,7 +384,7 @@ function forces_nounits!(fs_nounits, sys::System{D, true, T}, neighbors, pairwise_inters_nl = filter(use_neighbors, values(sys.pairwise_inters)) if length(pairwise_inters_nl) > 0 - pairwise_force_gpu!(buffers, sys, pairwise_inters_nl, nothing, step_n) + pairwise_force_gpu!(buffers, sys, pairwise_inters_nl, neighbors, step_n) end for inter_list in values(sys.specific_inter_lists) @@ -401,5 +403,3 @@ function forces_nounits!(fs_nounits, sys::System{D, true, T}, neighbors, return fs_nounits end - - diff --git a/src/interactions/implicit_solvent.jl b/src/interactions/implicit_solvent.jl index 44e81da85..860314a91 100644 --- a/src/interactions/implicit_solvent.jl +++ b/src/interactions/implicit_solvent.jl @@ -411,10 +411,11 @@ function ImplicitSolventOBC(atoms::AbstractArray{Atom{TY, M, T, D, E}}, factor_solvent = zero(T(coulomb_const_units)) end - if isa(atoms, CuArray) - or = CuArray(offset_radii) - sor = CuArray(scaled_offset_radii) - is, js = CuArray(inds_i), CuArray(inds_j) + if isa(atoms, AbstractGPUArray) + AT = array_type(atoms) + or = AT(offset_radii) + sor = AT(scaled_offset_radii) + is, js = AT(inds_i), AT(inds_j) else or = offset_radii sor = 
scaled_offset_radii @@ -563,12 +564,13 @@ function ImplicitSolventGBN2(atoms::AbstractArray{Atom{TY, M, T, D, E}}, factor_solvent = zero(T(coulomb_const_units)) end - if isa(atoms, CuArray) - or = CuArray(offset_radii) - sor = CuArray(scaled_offset_radii) - is, js = CuArray(inds_i), CuArray(inds_j) - d0s, m0s = CuArray(table_d0), CuArray(table_m0) - αs, βs, γs = CuArray(αs_cpu), CuArray(βs_cpu), CuArray(γs_cpu) + if isa(atoms, AbstractGPUArray) + AT = array_type(atoms) + or = AT(offset_radii) + sor = AT(scaled_offset_radii) + is, js = AT(inds_i), AT(inds_j) + d0s, m0s = AT(table_d0), AT(table_m0) + αs, βs, γs = AT(αs_cpu), AT(βs_cpu), AT(γs_cpu) else or = offset_radii sor = scaled_offset_radii @@ -694,7 +696,7 @@ function born_radii_and_grad(inter::ImplicitSolventOBC{T}, coords, boundary) whe return Bs, B_grads, I_grads end -function born_radii_and_grad(inter::ImplicitSolventOBC, coords::CuArray, boundary) +function born_radii_and_grad(inter::ImplicitSolventOBC, coords::AbstractGPUArray, boundary) coords_i = @view coords[inter.is] coords_j = @view coords[inter.js] loop_res = born_radii_loop_OBC.(coords_i, coords_j, inter.oris, inter.srjs, @@ -766,7 +768,7 @@ function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords, boundary) wh return Bs, B_grads, I_grads end -function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords::CuArray, boundary) where T +function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords::AbstractGPUArray, boundary) where T Is, I_grads = gbsa_born_gpu(coords, inter.offset_radii, inter.scaled_offset_radii, inter.dist_cutoff, inter.offset, inter.neck_scale, inter.neck_cut, inter.d0s, inter.m0s, boundary, Val(T)) @@ -778,42 +780,41 @@ function born_radii_and_grad(inter::ImplicitSolventGBN2{T}, coords::CuArray, bou return Bs, B_grads, I_grads end -function cuda_threads_blocks_gbsa(n_inters) +function gpu_threads_blocks_gbsa(n_inters) n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_IMPLICIT", "512")) - n_blocks = 
cld(n_inters, n_threads_gpu) - return n_threads_gpu, n_blocks + return n_threads_gpu end function gbsa_born_gpu(coords::AbstractArray{SVector{D, C}}, offset_radii, scaled_offset_radii, dist_cutoff, offset, neck_scale, neck_cut, d0s, m0s, boundary, ::Val{T}) where {D, C, T} + backend = get_backend(coords) n_atoms = length(coords) - Is_nounits = CUDA.zeros(T, n_atoms) - I_grads_nounits = CUDA.zeros(T, n_atoms, n_atoms) + Is_nounits = KernelAbstractions.zeros(backend, T, n_atoms) + I_grads_nounits = KernelAbstractions.zeros(backend, T, n_atoms, n_atoms) n_inters = n_atoms ^ 2 - n_threads_gpu, n_blocks = cuda_threads_blocks_gbsa(n_inters) + n_threads_gpu = gpu_threads_blocks_gbsa(n_inters) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks gbsa_born_kernel!( - Is_nounits, I_grads_nounits, coords, offset_radii, scaled_offset_radii, - dist_cutoff, offset, neck_scale, neck_cut, d0s, m0s, boundary, Val(C)) + kernel! = gbsa_born_kernel!(backend, n_threads_gpu) + kernel!(Is_nounits, I_grads_nounits, coords, offset_radii, + scaled_offset_radii, dist_cutoff, offset, neck_scale, + neck_cut, d0s, m0s, boundary, Val(C), ndrange=n_inters) Is = Is_nounits * unit(dist_cutoff)^-1 I_grads = I_grads_nounits * unit(dist_cutoff)^-2 return Is, I_grads end -function gbsa_born_kernel!(Is, I_grads, coords_var, offset_radii_var, scaled_offset_radii_var, - dist_cutoff, offset, neck_scale, neck_cut, d0s_var, m0s_var, boundary, - ::Val{C}) where C - coords = CUDA.Const(coords_var) - offset_radii = CUDA.Const(offset_radii_var) - scaled_offset_radii = CUDA.Const(scaled_offset_radii_var) - d0s = CUDA.Const(d0s_var) - m0s = CUDA.Const(m0s_var) +@kernel function gbsa_born_kernel!(Is, I_grads, @Const(coords), + @Const(offset_radii), + @Const(scaled_offset_radii), + dist_cutoff, offset, neck_scale, neck_cut, + @Const(d0s), @Const(m0s), boundary, + ::Val{C}) where C n_atoms = length(coords) n_inters = n_atoms ^ 2 - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + inter_i = 
@index(Global, Linear) @inbounds if inter_i <= n_inters i = cld(inter_i, n_atoms) @@ -849,12 +850,11 @@ function gbsa_born_kernel!(Is, I_grads, coords_var, offset_radii_var, scaled_off numer = 2 * r_d0_strip + 9 * r_d0_strip^5 / 5 I_grad -= 10 * neck_scale * m0 * numer / (denom^2 * unit(dist_cutoff)) end - Atomix.@atomic :monotonic Is[i] += ustrip(unit(dist_cutoff)^-1, I) + Atomix.@atomic Is[i] += ustrip(unit(dist_cutoff)^-1, I) I_grads[i, j] += ustrip(unit(dist_cutoff)^-2, I_grad) end end end - return nothing end function gb_force_loop_1(coord_i, coord_j, i, j, charge_i, charge_j, Bi, Bj, dist_cutoff, @@ -948,8 +948,8 @@ function forces_gbsa(sys, inter, Bs, B_grads, I_grads, born_forces, atom_charges return fs end -function forces_gbsa(sys::System{D, true, T}, inter, Bs, B_grads, I_grads, born_forces, - atom_charges) where {D, T} +function forces_gbsa(sys::System{D, AT, T}, inter, Bs, B_grads, I_grads, born_forces, + atom_charges) where {D, AT <: AbstractGPUArray, T} fs_mat_1, born_forces_mod_ustrip = gbsa_force_1_gpu(sys.coords, sys.boundary, inter.dist_cutoff, inter.factor_solute, inter.factor_solvent, inter.kappa, Bs, atom_charges, sys.force_units) @@ -965,16 +965,17 @@ end function gbsa_force_1_gpu(coords::AbstractArray{SVector{D, C}}, boundary, dist_cutoff, factor_solute, factor_solvent, kappa, Bs, atom_charges::AbstractArray{T}, force_units) where {D, C, T} + backend = get_backend(coords) n_atoms = length(coords) - fs_mat = CUDA.zeros(T, D, n_atoms) - born_forces_mod_ustrip = CUDA.zeros(T, n_atoms) + fs_mat = KernelAbstractions.zeros(backend, T, D, n_atoms) + born_forces_mod_ustrip = KernelAbstractions.zeros(backend, T, n_atoms) n_inters = n_atoms_to_n_pairs(n_atoms) + n_atoms - n_threads_gpu, n_blocks = cuda_threads_blocks_gbsa(n_inters) + n_threads_gpu = gpu_threads_blocks_gbsa(n_inters) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks gbsa_force_1_kernel!( - fs_mat, born_forces_mod_ustrip, coords, boundary, dist_cutoff, - factor_solute, 
factor_solvent, kappa, Bs, atom_charges, - Val(D), Val(force_units)) + kernel! = gbsa_force_1_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, born_forces_mod_ustrip, coords, boundary, dist_cutoff, + factor_solute, factor_solvent, kappa, Bs, atom_charges, + Val(D), Val(force_units), ndrange=n_inters) return fs_mat, born_forces_mod_ustrip end @@ -982,29 +983,30 @@ end function gbsa_force_2_gpu(coords::AbstractArray{SVector{D, C}}, boundary, dist_cutoff, Bs, B_grads, I_grads, born_forces, offset_radii, scaled_offset_radii, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) n_atoms = length(coords) - fs_mat = CUDA.zeros(T, D, n_atoms) + fs_mat = KernelAbstractions.zeros(backend, T, D, n_atoms) n_inters = n_atoms ^ 2 - n_threads_gpu, n_blocks = cuda_threads_blocks_gbsa(n_inters) + n_threads_gpu = gpu_threads_blocks_gbsa(n_inters) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks gbsa_force_2_kernel!( - fs_mat, born_forces, coords, boundary, dist_cutoff, offset_radii, - scaled_offset_radii, Bs, B_grads, I_grads, Val(D), Val(force_units)) + kernel! 
= gbsa_force_2_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, born_forces, coords, boundary, dist_cutoff, offset_radii, + scaled_offset_radii, Bs, B_grads, I_grads, Val(D), Val(force_units), + ndrange=n_inters) return fs_mat end -function gbsa_force_1_kernel!(forces, born_forces_mod_ustrip, coords_var, boundary, dist_cutoff, - factor_solute, factor_solvent, kappa, Bs_var, atom_charges_var, - ::Val{D}, ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - Bs = CUDA.Const(Bs_var) - atom_charges = CUDA.Const(atom_charges_var) +@kernel function gbsa_force_1_kernel!(forces, born_forces_mod_ustrip, + @Const(coords), boundary, dist_cutoff, + factor_solute, factor_solvent, kappa, + @Const(Bs), @Const(atom_charges), + ::Val{D}, ::Val{F}) where {D, F} n_atoms = length(coords) n_inters_not_self = n_atoms_to_n_pairs(n_atoms) n_inters = n_inters_not_self + n_atoms - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + inter_i = @index(Global, Linear) @inbounds if inter_i <= n_inters if inter_i <= n_inters_not_self @@ -1034,38 +1036,33 @@ function gbsa_force_1_kernel!(forces, born_forces_mod_ustrip, coords_var, bounda dGpol_dalpha2_ij = -Gpol * exp_term * (1 + D_term) / (2 * denominator2) change_born_force_i = dGpol_dalpha2_ij * Bj - Atomix.@atomic :monotonic born_forces_mod_ustrip[i] += ustrip(change_born_force_i) + Atomix.@atomic born_forces_mod_ustrip[i] += ustrip(change_born_force_i) if i != j change_born_force_j = dGpol_dalpha2_ij * Bi - Atomix.@atomic :monotonic born_forces_mod_ustrip[j] += ustrip(change_born_force_j) + Atomix.@atomic born_forces_mod_ustrip[j] += ustrip(change_born_force_j) fdr = dr * dGpol_dr if unit(fdr[1]) != F error("wrong force unit returned, was expecting $F but got $(unit(fdr[1]))") end for dim in 1:D fval = ustrip(fdr[dim]) - Atomix.@atomic :monotonic forces[dim, i] += fval - Atomix.@atomic :monotonic forces[dim, j] += -fval + Atomix.@atomic forces[dim, i] += fval + Atomix.@atomic forces[dim, j] += -fval end end end end - return 
nothing end -function gbsa_force_2_kernel!(forces, born_forces, coords_var, boundary, dist_cutoff, or_var, - sor_var, Bs_var, B_grads_var, I_grads_var, ::Val{D}, - ::Val{F}) where {D, F} - coords = CUDA.Const(coords_var) - or = CUDA.Const(or_var) - sor = CUDA.Const(sor_var) - Bs = CUDA.Const(Bs_var) - B_grads = CUDA.Const(B_grads_var) - I_grads = CUDA.Const(I_grads_var) +@kernel function gbsa_force_2_kernel!(forces, born_forces, @Const(coords), + boundary, dist_cutoff, @Const(or), + @Const(sor), @Const(Bs), + @Const(B_grads), @Const(I_grads), + ::Val{D}, ::Val{F}) where {D, F} n_atoms = length(coords) n_inters = n_atoms ^ 2 - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + inter_i = @index(Global, Linear) @inbounds if inter_i <= n_inters i = cld(inter_i, n_atoms) @@ -1091,14 +1088,13 @@ function gbsa_force_2_kernel!(forces, born_forces, coords_var, boundary, dist_cu end for dim in 1:D fval = ustrip(fdr[dim]) - Atomix.@atomic :monotonic forces[dim, i] += fval - Atomix.@atomic :monotonic forces[dim, j] += -fval + Atomix.@atomic forces[dim, i] += fval + Atomix.@atomic forces[dim, j] += -fval end end end end end - return nothing end function AtomsCalculators.forces(sys, inter::AbstractGBSA; kwargs...) @@ -1153,7 +1149,7 @@ function gb_energy_loop(coord_i, coord_j, i, j, charge_i, charge_j, Bi, Bj, ori, end end -function AtomsCalculators.potential_energy(sys::System{<:Any, false, T}, inter::AbstractGBSA; +function AtomsCalculators.potential_energy(sys::System{<:Any, <:Any, T}, inter::AbstractGBSA; kwargs...) where T coords, boundary = sys.coords, sys.boundary Bs, B_grads, I_grads = born_radii_and_grad(inter, coords, boundary) @@ -1173,7 +1169,8 @@ function AtomsCalculators.potential_energy(sys::System{<:Any, false, T}, inter:: return E end -function AtomsCalculators.potential_energy(sys::System{<:Any, true}, inter::AbstractGBSA; kwargs...) +function AtomsCalculators.potential_energy(sys::System{<:Any, AT}, inter::AbstractGBSA; + kwargs...) 
where AT <: AbstractGPUArray coords, atoms, boundary = sys.coords, sys.atoms, sys.boundary Bs, B_grads, I_grads = born_radii_and_grad(inter, coords, boundary) diff --git a/src/kernels.jl b/src/kernels.jl new file mode 100644 index 000000000..92cb16f50 --- /dev/null +++ b/src/kernels.jl @@ -0,0 +1,365 @@ +# KernelAbstractions.jl kernels + +@inline function sum_pairwise_forces(inters, atom_i, atom_j, ::Val{F}, special, coord_i, coord_j, + boundary, vel_i, vel_j, step_n) where F + dr = vector(coord_i, coord_j, boundary) + f_tuple = ntuple(length(inters)) do inter_type_i + force_gpu(inters[inter_type_i], dr, atom_i, atom_j, F, special, coord_i, coord_j, boundary, + vel_i, vel_j, step_n) + end + f = sum(f_tuple) + if unit(f[1]) != F + error("wrong force unit returned, was expecting $F but got $(unit(f[1]))") + end + return f +end + +function gpu_threads_pairwise(n_neighbors) + n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_PAIRWISE", "512")) + return n_threads_gpu +end + +function gpu_threads_specific(n_inters) + n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_SPECIFIC", "128")) + return n_threads_gpu +end + +function pairwise_force_gpu!(buffers, sys::System{D, AT, T}, + pairwise_inters, neighbors, step_n) where {D, AT <: AbstractGPUArray, T} + if isnothing(neighbors) + error("neighbors is nothing, if you are using GPUNeighborFinder on a non-NVIDIA GPU you " * + "should use DistanceNeighborFinder instead") + end + if typeof(neighbors) == NoNeighborList + nbs = neighbors + else + nbs = @view neighbors.list[1:neighbors.n] + end + if length(neighbors) > 0 + backend = get_backend(sys.coords) + n_threads_gpu = gpu_threads_pairwise(length(nbs)) + kernel! 
= pairwise_force_kernel_nl!(backend, n_threads_gpu) + kernel!(buffers.fs_mat, sys.coords, sys.velocities, sys.atoms, sys.boundary, pairwise_inters, + nbs, step_n, Val(D), Val(sys.force_units); ndrange=length(nbs)) + end + return buffers +end + +@kernel inbounds=true function pairwise_force_kernel_nl!(forces, @Const(coords), + @Const(velocities), @Const(atoms), + boundary, inters, + @Const(neighbors), step_n, ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + if inter_i <= length(neighbors) + i, j, special = neighbors[inter_i] + f = sum_pairwise_forces(inters, atoms[i], atoms[j], Val(F), special, coords[i], coords[j], + boundary, velocities[i], velocities[j], step_n) + for dim in 1:D + fval = ustrip(f[dim]) + Atomix.@atomic forces[dim, i] += -fval + Atomix.@atomic forces[dim, j] += fval + end + end +end + +function specific_force_gpu!(fs_mat, inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! = specific_force_1_atoms_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.inters, Val(D), Val(force_units); + ndrange = length(inter_list)) + return fs_mat +end + +function specific_force_gpu!(fs_mat, inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! 
= specific_force_2_atoms_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.inters, Val(D), Val(force_units); + ndrange = length(inter_list)) + return fs_mat +end + +function specific_force_gpu!(fs_mat, inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! = specific_force_3_atoms_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.ks, inter_list.inters, Val(D), + Val(force_units); ndrange = length(inter_list)) + return fs_mat +end + +function specific_force_gpu!(fs_mat, inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, force_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! 
= specific_force_4_atoms_kernel!(backend, n_threads_gpu) + kernel!(fs_mat, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.ks, inter_list.ls, inter_list.inters, + Val(D), Val(force_units); ndrange = length(inter_list)) + return fs_mat +end + +@kernel inbounds=true function specific_force_1_atoms_kernel!(forces, @Const(coords), + @Const(velocities), + @Const(atoms), boundary, + step_n, @Const(is), + @Const(inters), ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + if inter_i <= length(is) + i = is[inter_i] + fs = force_gpu(inters[inter_i], coords[i], boundary, atoms[i], F, velocities[i], step_n) + if unit(fs.f1[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic forces[dim, i] += ustrip(fs.f1[dim]) + end + end +end + +@kernel inbounds=true function specific_force_2_atoms_kernel!(forces, @Const(coords), + @Const(velocities), + @Const(atoms), boundary, + step_n, @Const(is), @Const(js), + @Const(inters), ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + if inter_i <= length(is) + i, j = is[inter_i], js[inter_i] + fs = force_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], atoms[j], F, + velocities[i], velocities[j], step_n) + if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic forces[dim, i] += ustrip(fs.f1[dim]) + Atomix.@atomic forces[dim, j] += ustrip(fs.f2[dim]) + end + end +end + +@kernel inbounds=true function specific_force_3_atoms_kernel!(forces, @Const(coords), + @Const(velocities), + @Const(atoms), boundary, + step_n, @Const(is), + @Const(js), @Const(ks), + @Const(inters), ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + if inter_i <= length(is) + i, j, k = is[inter_i], js[inter_i], ks[inter_i] + fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, atoms[i], + 
atoms[j], atoms[k], F, velocities[i], velocities[j], velocities[k], step_n) + if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic forces[dim, i] += ustrip(fs.f1[dim]) + Atomix.@atomic forces[dim, j] += ustrip(fs.f2[dim]) + Atomix.@atomic forces[dim, k] += ustrip(fs.f3[dim]) + end + end +end + +@kernel inbounds=true function specific_force_4_atoms_kernel!(forces, @Const(coords), + @Const(velocities), + @Const(atoms), boundary, + step_n, @Const(is), + @Const(js), @Const(ks), + @Const(ls), + @Const(inters), ::Val{D}, + ::Val{F}) where {D, F} + + inter_i = @index(Global, Linear) + + if inter_i <= length(is) + i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] + fs = force_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], boundary, + atoms[i], atoms[j], atoms[k], atoms[l], F, velocities[i], velocities[j], + velocities[k], velocities[l], step_n) + if unit(fs.f1[1]) != F || unit(fs.f2[1]) != F || unit(fs.f3[1]) != F || unit(fs.f4[1]) != F + error("wrong force unit returned, was expecting $F") + end + for dim in 1:D + Atomix.@atomic forces[dim, i] += ustrip(fs.f1[dim]) + Atomix.@atomic forces[dim, j] += ustrip(fs.f2[dim]) + Atomix.@atomic forces[dim, k] += ustrip(fs.f3[dim]) + Atomix.@atomic forces[dim, l] += ustrip(fs.f4[dim]) + end + end +end + +function pairwise_pe_gpu!(pe_vec_nounits, buffers, sys::System{D, AT}, + pairwise_inters, neighbors, step_n) where {D, AT <: AbstractGPUArray} + if isnothing(neighbors) + error("neighbors is nothing, if you are using GPUNeighborFinder on a non-NVIDIA GPU you " * + "should use DistanceNeighborFinder instead") + end + if typeof(neighbors) == NoNeighborList + nbs = neighbors + else + nbs = @view neighbors.list[1:neighbors.n] + end + if length(neighbors) > 0 + backend = get_backend(sys.coords) + n_threads_gpu = gpu_threads_pairwise(length(nbs)) + kernel! 
= pairwise_pe_kernel!(backend, n_threads_gpu) + kernel!(pe_vec_nounits, sys.coords, sys.velocities, sys.atoms, sys.boundary, + pairwise_inters, nbs, step_n, Val(sys.energy_units); ndrange=length(nbs)) + end + return pe_vec_nounits +end + +@kernel inbounds=true function pairwise_pe_kernel!(energy, @Const(coords), @Const(velocities), + @Const(atoms), boundary, inters, + @Const(neighbors), step_n, ::Val{E}) where E + + inter_i = @index(Global, Linear) + + if inter_i <= length(neighbors) + i, j, special = neighbors[inter_i] + coord_i, coord_j, vel_i, vel_j = coords[i], coords[j], velocities[i], velocities[j] + dr = vector(coord_i, coord_j, boundary) + pe = potential_energy_gpu(inters[1], dr, atoms[i], atoms[j], E, special, coord_i, coord_j, + boundary, vel_i, vel_j, step_n) + for inter in inters[2:end] + pe += potential_energy_gpu(inter, dr, atoms[i], atoms[j], E, special, coord_i, coord_j, + boundary, vel_i, vel_j, step_n) + end + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic energy[1] += ustrip(pe) + end +end + +function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList1Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! = specific_pe_1_atoms_kernel!(backend, n_threads_gpu) + kernel!(pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.inters, Val(energy_units); ndrange = length(inter_list)) + return pe_vec_nounits +end + +function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList2Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} + + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! 
= specific_pe_2_atoms_kernel!(backend, n_threads_gpu) + kernel!(pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.inters, Val(energy_units); ndrange = length(inter_list)) + return pe_vec_nounits +end + +function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList3Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! = specific_pe_3_atoms_kernel!(backend, n_threads_gpu) + kernel!(pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.ks, inter_list.inters, Val(energy_units); + ndrange = length(inter_list)) + return pe_vec_nounits +end + +function specific_pe_gpu!(pe_vec_nounits, inter_list::InteractionList4Atoms, coords::AbstractArray{SVector{D, C}}, + velocities, atoms, boundary, step_n, energy_units, ::Val{T}) where {D, C, T} + backend = get_backend(coords) + n_threads_gpu = gpu_threads_specific(length(inter_list)) + kernel! 
= specific_pe_4_atoms_kernel!(backend, n_threads_gpu) + kernel!(pe_vec_nounits, coords, velocities, atoms, boundary, step_n, inter_list.is, + inter_list.js, inter_list.ks, inter_list.ls, inter_list.inters, Val(energy_units); + ndrange = length(inter_list)) + return pe_vec_nounits +end + +@kernel inbounds=true function specific_pe_1_atoms_kernel!(energy, @Const(coords), @Const(velocities), + @Const(atoms), boundary, step_n, @Const(is), @Const(inters), ::Val{E}) where E + + inter_i = @index(Global, Linear) + + if inter_i <= length(is) + i = is[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], boundary, atoms[i], E, + velocities[i], step_n) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic energy[1] += ustrip(pe) + end +end + +@kernel inbounds=true function specific_pe_2_atoms_kernel!(energy, @Const(coords), @Const(velocities), + @Const(atoms), boundary, step_n, @Const(is), @Const(js), @Const(inters), + ::Val{E}) where E + + + inter_i = @index(Global, Linear) + + if inter_i <= length(is) + i, j = is[inter_i], js[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], boundary, atoms[i], + atoms[j], E, velocities[i], velocities[j], step_n) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic energy[1] += ustrip(pe) + end +end + +@kernel inbounds=true function specific_pe_3_atoms_kernel!(energy, @Const(coords), @Const(velocities), + @Const(atoms), boundary, step_n, @Const(is), @Const(js), @Const(ks), + @Const(inters), ::Val{E}) where E + + inter_i = @index(Global, Linear) + + if inter_i <= length(is) + i, j, k = is[inter_i], js[inter_i], ks[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], boundary, + atoms[i], atoms[j], atoms[k], E, velocities[i], velocities[j], + velocities[k], step_n) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got 
$(unit(pe))") + end + Atomix.@atomic energy[1] += ustrip(pe) + end +end + +@kernel inbounds=true function specific_pe_4_atoms_kernel!(energy, @Const(coords), @Const(velocities), + @Const(atoms), boundary, step_n, @Const(is), @Const(js), @Const(ks), + @Const(ls), @Const(inters), ::Val{E}) where E + + inter_i = @index(Global, Linear) + + if inter_i <= length(is) + i, j, k, l = is[inter_i], js[inter_i], ks[inter_i], ls[inter_i] + pe = potential_energy_gpu(inters[inter_i], coords[i], coords[j], coords[k], coords[l], + boundary, atoms[i], atoms[j], atoms[k], atoms[l], E, + velocities[i], velocities[j], velocities[k], velocities[l], + step_n) + if unit(pe) != E + error("wrong energy unit returned, was expecting $E but got $(unit(pe))") + end + Atomix.@atomic energy[1] += ustrip(pe) + end +end diff --git a/src/neighbors.jl b/src/neighbors.jl index 7f3625a88..dbece0938 100644 --- a/src/neighbors.jl +++ b/src/neighbors.jl @@ -43,13 +43,16 @@ find_neighbors(sys::System; kwargs...) = find_neighbors(sys, sys.neighbor_finder find_neighbors(sys::System, nf::NoNeighborFinder, args...; kwargs...) = nothing +# Indicates whether an array type is compatible with GPUNeighborFinder +uses_gpu_neighbor_finder(AT) = false + """ GPUNeighborFinder(; eligible, dist_cutoff, special, n_steps_reorder, initialized) Use the non-bonded forces/potential energy algorithm from [Eastman and Pande 2010](https://doi.org/10.1002/jcc.21413) to avoid calculating a neighbor list. -This is the recommended neighbor finder on GPU. +This is the recommended neighbor finder on NVIDIA GPUs. """ mutable struct GPUNeighborFinder{B, D} eligible::B @@ -75,6 +78,8 @@ find_neighbors(sys::System, nf::GPUNeighborFinder, args...; kwargs...) = nothing DistanceNeighborFinder(; eligible, dist_cutoff, special, n_steps) Find close atoms by distance. + +This is the recommended neighbor finder on non-NVIDIA GPUs. 
""" struct DistanceNeighborFinder{B, D} eligible::B @@ -93,12 +98,12 @@ function DistanceNeighborFinder(; eligible, dist_cutoff, special, n_steps, zero(eligible)) end -function find_neighbors(sys::System{D, false}, +function find_neighbors(sys::System, nf::DistanceNeighborFinder, current_neighbors=nothing, step_n::Integer=0, force_recompute::Bool=false; - n_threads::Integer=Threads.nthreads()) where D + n_threads::Integer=Threads.nthreads()) if !force_recompute && !iszero(step_n % nf.n_steps) return current_neighbors end @@ -121,20 +126,18 @@ function find_neighbors(sys::System{D, false}, return NeighborList(length(neighbors_list), neighbors_list) end -function cuda_threads_blocks_dnf(n_inters) +function gpu_threads_blocks_dnf(n_inters) n_threads_gpu = parse(Int, get(ENV, "MOLLY_GPUNTHREADS_DISTANCENF", "512")) - n_blocks = cld(n_inters, n_threads_gpu) - return n_threads_gpu, n_blocks + return n_threads_gpu end -function distance_neighbor_finder_kernel!(neighbors, coords_var, eligible_var, - boundary, sq_dist_neighbors) - coords = CUDA.Const(coords_var) - eligible = CUDA.Const(eligible_var) - +@kernel function distance_neighbor_finder_kernel!(neighbors, + @Const(coords), + @Const(eligible), + boundary, sq_dist_neighbors) n_atoms = length(coords) n_inters = n_atoms_to_n_pairs(n_atoms) - inter_i = (blockIdx().x - 1) * blockDim().x + threadIdx().x + inter_i = @index(Global, Linear) @inbounds if inter_i <= n_inters i, j = pair_index(n_atoms, inter_i) @@ -146,28 +149,28 @@ function distance_neighbor_finder_kernel!(neighbors, coords_var, eligible_var, end end end - return nothing end lists_to_tuple_list(i, j, w) = (Int32(i), Int32(j), w) -function find_neighbors(sys::System{D, true}, +function find_neighbors(sys::System{D, AT}, nf::DistanceNeighborFinder, current_neighbors=nothing, step_n::Integer=0, force_recompute::Bool=false; - kwargs...) where D + kwargs...) 
where {D, AT <: AbstractGPUArray} if !force_recompute && !iszero(step_n % nf.n_steps) return current_neighbors end nf.neighbors .= false n_inters = n_atoms_to_n_pairs(length(sys)) - n_threads_gpu, n_blocks = cuda_threads_blocks_dnf(n_inters) + n_threads_gpu = gpu_threads_blocks_dnf(n_inters) - CUDA.@sync @cuda threads=n_threads_gpu blocks=n_blocks distance_neighbor_finder_kernel!( - nf.neighbors, sys.coords, nf.eligible, sys.boundary, nf.dist_cutoff^2, - ) + backend = get_backend(sys.coords) + kernel! = distance_neighbor_finder_kernel!(backend, n_threads_gpu) + kernel!(nf.neighbors, sys.coords, nf.eligible, sys.boundary, + nf.dist_cutoff^2, ndrange=n_inters) pairs = findall(nf.neighbors) nbsi, nbsj = getindex.(pairs, 1), getindex.(pairs, 2) @@ -199,12 +202,12 @@ function TreeNeighborFinder(; return TreeNeighborFinder{typeof(dist_cutoff)}(eligible, dist_cutoff, special, n_steps) end -function find_neighbors(sys::System, +function find_neighbors(sys::System{<:Any, AT}, nf::TreeNeighborFinder, current_neighbors=nothing, step_n::Integer=0, force_recompute::Bool=false; - n_threads::Integer=Threads.nthreads()) + n_threads::Integer=Threads.nthreads()) where AT if !force_recompute && !iszero(step_n % nf.n_steps) return current_neighbors end @@ -227,7 +230,7 @@ function find_neighbors(sys::System, end end - return NeighborList(length(neighbors_list), move_array(neighbors_list, sys)) + return NeighborList(length(neighbors_list), AT(neighbors_list)) end """ @@ -337,19 +340,19 @@ function reduce_pairs(neighbors::NeighborList, neighbors_threaded::Vector{Neighb return neighbors end -function find_neighbors(sys::System{D, G}, +function find_neighbors(sys::System{D, AT}, nf::CellListMapNeighborFinder, current_neighbors=nothing, step_n::Integer=0, force_recompute::Bool=false; - n_threads::Integer=Threads.nthreads()) where {D, G} + n_threads::Integer=Threads.nthreads()) where {D, AT} if !force_recompute && !iszero(step_n % nf.n_steps) return current_neighbors end if 
isnothing(current_neighbors) neighbors = NeighborList() - elseif G + elseif AT <: AbstractGPUArray neighbors = NeighborList(current_neighbors.n, Array(current_neighbors.list)) else neighbors = current_neighbors @@ -381,8 +384,8 @@ function find_neighbors(sys::System{D, G}, ) nf.cl = cl - if G - return NeighborList(neighbors.n, CuArray(neighbors.list)) + if AT <: AbstractGPUArray + return NeighborList(neighbors.n, AT(neighbors.list)) else return neighbors end diff --git a/src/setup.jl b/src/setup.jl index e0b5efbe8..35b151f45 100644 --- a/src/setup.jl +++ b/src/setup.jl @@ -428,8 +428,8 @@ are not available when reading Gromacs files. - `loggers=()`: the loggers that record properties of interest during a simulation. - `units::Bool=true`: whether to use Unitful quantities. -- `gpu::Bool=false`: whether to move the relevant parts of the system onto - the GPU. +- `array_type=Array`: the array type for the simulation, for example + use `CuArray` or `ROCArray` for GPU support. - `dist_cutoff=1.0u"nm"`: cutoff distance for long-range interactions. - `dist_neighbors=1.2u"nm"`: cutoff distance for the neighbor list, should be greater than `dist_cutoff`. @@ -452,7 +452,7 @@ function System(coord_file::AbstractString, velocities=nothing, loggers=(), units::Bool=true, - gpu::Bool=false, + array_type::Type{AT}=Array, dist_cutoff=units ? 1.0u"nm" : 1.0, dist_neighbors=units ? 1.2u"nm" : 1.2, center_coords::Bool=true, @@ -460,7 +460,7 @@ function System(coord_file::AbstractString, data=nothing, implicit_solvent=nothing, kappa=0.0u"nm^-1", - rename_terminal_res::Bool=true) + rename_terminal_res::Bool=true) where AT <: AbstractArray T = typeof(force_field.weight_14_coulomb) # Chemfiles uses zero-based indexing, be careful @@ -824,9 +824,9 @@ function System(coord_file::AbstractString, specific_inter_array = [] if length(bonds.is) > 0 push!(specific_inter_array, InteractionList2Atoms( - gpu ? CuArray(bonds.is) : bonds.is, - gpu ? CuArray(bonds.js) : bonds.js, - gpu ? 
CuArray([bonds.inters...]) : [bonds.inters...], + AT(bonds.is), + AT(bonds.js), + AT([bonds.inters...]), bonds.types, )) topology = MolecularTopology(bonds.is, bonds.js, n_atoms) @@ -835,30 +835,30 @@ function System(coord_file::AbstractString, end if length(angles.is) > 0 push!(specific_inter_array, InteractionList3Atoms( - gpu ? CuArray(angles.is) : angles.is, - gpu ? CuArray(angles.js) : angles.js, - gpu ? CuArray(angles.ks) : angles.ks, - gpu ? CuArray([angles.inters...]) : [angles.inters...], + AT(angles.is), + AT(angles.js), + AT(angles.ks), + AT([angles.inters...]), angles.types, )) end if length(torsions.is) > 0 push!(specific_inter_array, InteractionList4Atoms( - gpu ? CuArray(torsions.is) : torsions.is, - gpu ? CuArray(torsions.js) : torsions.js, - gpu ? CuArray(torsions.ks) : torsions.ks, - gpu ? CuArray(torsions.ls) : torsions.ls, - gpu ? CuArray(torsion_inters_pad) : torsion_inters_pad, + AT(torsions.is), + AT(torsions.js), + AT(torsions.ks), + AT(torsions.ls), + AT(torsion_inters_pad), torsions.types, )) end if length(impropers.is) > 0 push!(specific_inter_array, InteractionList4Atoms( - gpu ? CuArray(impropers.is) : impropers.is, - gpu ? CuArray(impropers.js) : impropers.js, - gpu ? CuArray(impropers.ks) : impropers.ks, - gpu ? CuArray(impropers.ls) : impropers.ls, - gpu ? 
CuArray(improper_inters_pad) : improper_inters_pad, + AT(impropers.is), + AT(impropers.js), + AT(impropers.ks), + AT(impropers.ls), + AT(improper_inters_pad), impropers.types, )) end @@ -887,15 +887,14 @@ function System(coord_file::AbstractString, end coords = wrap_coords.(coords, (boundary_used,)) - if gpu + if uses_gpu_neighbor_finder(AT) neighbor_finder = GPUNeighborFinder( - eligible=CuArray(eligible), + eligible=AT(eligible), dist_cutoff=T(dist_neighbors), - special=CuArray(special), + special=AT(special), n_steps_reorder=10, - initialized=false, ) - elseif use_cell_list + elseif use_cell_list && !(AT <: AbstractGPUArray) neighbor_finder = CellListMapNeighborFinder( eligible=eligible, special=special, @@ -906,19 +905,15 @@ function System(coord_file::AbstractString, ) else neighbor_finder = DistanceNeighborFinder( - eligible=eligible, - special=special, + eligible=AT(eligible), + special=AT(special), n_steps=10, dist_cutoff=T(dist_neighbors), ) end - if gpu - atoms = CuArray([atoms_abst...]) - coords_dev = CuArray(coords) - else - atoms = [atoms_abst...] - coords_dev = coords - end + + atoms = AT([atoms_abst...]) + coords_dev = AT(coords) if isnothing(velocities) if units @@ -973,12 +968,12 @@ function System(T::Type, velocities=nothing, loggers=(), units::Bool=true, - gpu::Bool=false, + array_type::Type{AT}=Array, dist_cutoff=units ? 1.0u"nm" : 1.0, dist_neighbors=units ? 1.2u"nm" : 1.2, center_coords::Bool=true, use_cell_list::Bool=true, - data=nothing) + data=nothing) where AT <: AbstractArray # Read force field and topology file atomtypes = Dict{String, Atom}() bondtypes = Dict{String, HarmonicBond}() @@ -1254,9 +1249,9 @@ function System(T::Type, specific_inter_array = [] if length(bonds.is) > 0 push!(specific_inter_array, InteractionList2Atoms( - gpu ? CuArray(bonds.is) : bonds.is, - gpu ? CuArray(bonds.js) : bonds.js, - gpu ? 
CuArray([bonds.inters...]) : [bonds.inters...], + AT(bonds.is), + AT(bonds.js), + AT([bonds.inters...]), bonds.types, )) topology = MolecularTopology(bonds.is, bonds.js, n_atoms) @@ -1265,34 +1260,33 @@ function System(T::Type, end if length(angles.is) > 0 push!(specific_inter_array, InteractionList3Atoms( - gpu ? CuArray(angles.is) : angles.is, - gpu ? CuArray(angles.js) : angles.js, - gpu ? CuArray(angles.ks) : angles.ks, - gpu ? CuArray([angles.inters...]) : [angles.inters...], + AT(angles.is), + AT(angles.js), + AT(angles.ks), + AT([angles.inters...]), angles.types, )) end if length(torsions.is) > 0 push!(specific_inter_array, InteractionList4Atoms( - gpu ? CuArray(torsions.is) : torsions.is, - gpu ? CuArray(torsions.js) : torsions.js, - gpu ? CuArray(torsions.ks) : torsions.ks, - gpu ? CuArray(torsions.ls) : torsions.ls, - gpu ? CuArray([torsions.inters...]) : [torsions.inters...], + AT(torsions.is), + AT(torsions.js), + AT(torsions.ks), + AT(torsions.ls), + AT([torsions.inters...]), torsions.types, )) end specific_inter_lists = tuple(specific_inter_array...) - if gpu + if uses_gpu_neighbor_finder(AT) neighbor_finder = GPUNeighborFinder( - eligible=CuArray(eligible), + eligible=AT(eligible), dist_cutoff=T(dist_neighbors), - special=CuArray(special), + special=AT(special), n_steps_reorder=10, - initialized=false, ) - elseif use_cell_list + elseif use_cell_list && !(AT <: AbstractGPUArray) neighbor_finder = CellListMapNeighborFinder( eligible=eligible, special=special, @@ -1303,19 +1297,15 @@ function System(T::Type, ) else neighbor_finder = DistanceNeighborFinder( - eligible=eligible, - special=special, + eligible=AT(eligible), + special=AT(special), n_steps=10, dist_cutoff=T(dist_neighbors), ) end - if gpu - atoms = CuArray([atoms_abst...]) - coords_dev = CuArray(coords) - else - atoms = [atoms_abst...] 
- coords_dev = coords - end + + atoms = AT([atoms_abst...]) + coords_dev = AT(coords) if isnothing(velocities) if units @@ -1382,10 +1372,10 @@ The `atom_selector` function takes in each atom and atom data and determines whe that atom. For example, [`is_heavy_atom`](@ref) means non-hydrogen atoms are restrained. """ -function add_position_restraints(sys, +function add_position_restraints(sys::System{<:Any, AT}, k; atom_selector::Function=is_any_atom, - restrain_coords=sys.coords) + restrain_coords=sys.coords) where AT k_array = isa(k, AbstractArray) ? k : fill(k, length(sys)) if length(k_array) != length(sys) throw(ArgumentError("the system has $(length(sys)) atoms but there are $(length(k_array)) k values")) @@ -1402,7 +1392,7 @@ function add_position_restraints(sys, push!(inters, HarmonicPositionRestraint(k_res, x0)) end end - restraints = InteractionList1Atoms(move_array(is, sys), move_array([inters...], sys), types) + restraints = InteractionList1Atoms(AT(is), AT([inters...]), types) sis = (sys.specific_inter_lists..., restraints) return System( atoms=deepcopy(sys.atoms), diff --git a/src/simulators.jl b/src/simulators.jl index ad77d1217..141fc0a47 100644 --- a/src/simulators.jl +++ b/src/simulators.jl @@ -831,12 +831,12 @@ Attempt an exchange of replicas `n` and `m` in a [`ReplicaSystem`](@ref) during Successful exchanges should exchange coordinates and velocities as appropriate. Returns acceptance quantity `Δ` and a `Bool` indicating whether the exchange was successful. 
""" -function remd_exchange!(sys::ReplicaSystem{D, G, T}, +function remd_exchange!(sys::ReplicaSystem, sim::TemperatureREMD, n::Integer, m::Integer; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) T_n, T_m = sim.temperatures[n], sim.temperatures[m] β_n, β_m = inv(sys.k * T_n), inv(sys.k * T_m) neighbors_n = find_neighbors(sys.replicas[n], sys.replicas[n].neighbor_finder; @@ -922,12 +922,12 @@ function simulate!(sys::ReplicaSystem, return simulate_remd!(sys, sim, n_steps; n_threads=n_threads, run_loggers=run_loggers, rng=rng) end -function remd_exchange!(sys::ReplicaSystem{D, G, T}, +function remd_exchange!(sys::ReplicaSystem, sim::HamiltonianREMD, n::Integer, m::Integer; n_threads::Integer=Threads.nthreads(), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) T_sim = sim.temperature β_sim = inv(sys.k * T_sim) neighbors_n = find_neighbors(sys.replicas[n], sys.replicas[n].neighbor_finder; @@ -1047,12 +1047,12 @@ function MetropolisMonteCarlo(; temperature, trial_moves, trial_args=Dict()) return MetropolisMonteCarlo(temperature, trial_moves, trial_args) end -@inline function simulate!(sys::System{D, G, T}, +@inline function simulate!(sys::System, sim::MetropolisMonteCarlo, n_steps::Integer; n_threads::Integer=Threads.nthreads(), run_loggers=true, - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) neighbors = find_neighbors(sys, sys.neighbor_finder; n_threads=n_threads) E_old = potential_energy(sys, neighbors; n_threads=n_threads) coords_old = similar(sys.coords) @@ -1090,9 +1090,9 @@ Performs a random translation of the coordinates of a randomly selected atom in The translation is generated using a uniformly selected direction and uniformly selected length in range [0, 1) scaled by `shift_size` which should have appropriate length units. 
""" -function random_uniform_translation!(sys::System{D, G, T}; +function random_uniform_translation!(sys::System{D, AT, T}; shift_size=oneunit(eltype(eltype(sys.coords))), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} rand_idx = rand(rng, eachindex(sys)) direction = random_unit_vector(T, D, rng) magnitude = rand(rng, T) * shift_size @@ -1110,9 +1110,9 @@ The translation is generated using a uniformly chosen direction and length selec the standard normal distribution i.e. with mean 0 and standard deviation 1, scaled by `shift_size` which should have appropriate length units. """ -function random_normal_translation!(sys::System{D, G, T}; +function random_normal_translation!(sys::System{D, AT, T}; shift_size=oneunit(eltype(eltype(sys.coords))), - rng=Random.default_rng()) where {D, G, T} + rng=Random.default_rng()) where {D, AT, T} rand_idx = rand(rng, eachindex(sys)) direction = random_unit_vector(T, D, rng) magnitude = randn(rng, T) * shift_size diff --git a/src/spatial.jl b/src/spatial.jl index f918a827a..728577f48 100644 --- a/src/spatial.jl +++ b/src/spatial.jl @@ -613,12 +613,12 @@ function random_velocities(sys::AtomsBase.AbstractSystem{2}, temp; rng=Random.de return random_velocity_2D.(masses(sys), temp, sys.k, rng) end -function random_velocities(sys::System{3, true}, temp; rng=Random.default_rng()) - return CuArray(random_velocity_3D.(Array(masses(sys)), temp, sys.k, rng)) +function random_velocities(sys::System{3, AT}, temp; rng=Random.default_rng()) where AT <: AbstractGPUArray + return AT(random_velocity_3D.(Array(masses(sys)), temp, sys.k, rng)) end -function random_velocities(sys::System{2, true}, temp; rng=Random.default_rng()) - return CuArray(random_velocity_2D.(Array(masses(sys)), temp, sys.k, rng)) +function random_velocities(sys::System{2, AT}, temp; rng=Random.default_rng()) where AT <: AbstractGPUArray + return AT(random_velocity_2D.(Array(masses(sys)), temp, sys.k, rng)) end """ @@ -738,9 +738,9 @@ 
function virial(sys, neighbors, step_n::Integer=0; n_threads::Integer=Threads.nt return v end -function virial(sys::System{D, G, T}, neighbors_dev, step_n, pairwise_inters_nonl, - pairwise_inters_nl) where {D, G, T} - if G +function virial(sys::System{D, AT, T}, neighbors_dev, step_n, pairwise_inters_nonl, + pairwise_inters_nl) where {D, AT, T} + if AT <: AbstractGPUArray coords, velocities, atoms = Array(sys.coords), Array(sys.velocities), Array(sys.atoms) if isnothing(neighbors_dev) neighbors = neighbors_dev @@ -792,7 +792,7 @@ function virial(sys::System{D, G, T}, neighbors_dev, step_n, pairwise_inters_non end # Default for general interactions -function virial(inter, sys::System{D, G, T}, args...; kwargs...) where {D, G, T} +function virial(inter, sys::System{D, AT, T}, args...; kwargs...) where {D, AT, T} return zero(T) * sys.energy_units end @@ -874,8 +874,9 @@ function molecule_centers(coords::AbstractArray{SVector{D, C}}, boundary, topolo end end -function molecule_centers(coords::CuArray, boundary, topology) - return CuArray(molecule_centers(Array(coords), boundary, topology)) +function molecule_centers(coords::AbstractGPUArray, boundary, topology) + AT = array_type(coords) + return AT(molecule_centers(Array(coords), boundary, topology)) end # Allows scaling multiple vectors at once by broadcasting this function @@ -895,7 +896,7 @@ This can be disabled with `ignore_molecules=true`. Not currently compatible with [`TriclinicBoundary`](@ref) if the topology is set. 
""" -function scale_coords!(sys, scale_factor; ignore_molecules=false) +function scale_coords!(sys::System{<:Any, AT}, scale_factor; ignore_molecules=false) where AT if ignore_molecules || isnothing(sys.topology) sys.boundary = scale_boundary(sys.boundary, scale_factor) sys.coords .= scale_vec.(sys.coords, Ref(scale_factor)) @@ -926,7 +927,7 @@ function scale_coords!(sys, scale_factor; ignore_molecules=false) coords_nounits[i] = wrap_coords( coords_nounits[i] .+ shift_vecs[mi] .- center_shifts[mi], boundary_nounits) end - sys.coords .= move_array(coords_nounits .* coord_units, sys) + sys.coords .= AT(coords_nounits .* coord_units) end return sys end diff --git a/src/types.jl b/src/types.jl index 225d9cff3..023883d33 100644 --- a/src/types.jl +++ b/src/types.jl @@ -15,6 +15,7 @@ export inject_gradients, extract_parameters, ReplicaSystem, + array_type, is_on_gpu, float_type, masses, @@ -182,39 +183,23 @@ function Base.:+(il1::InteractionList4Atoms{I, T}, il2::InteractionList4Atoms{I, ) end -function inject_interaction_list(inter::InteractionList1Atoms, params_dic, gpu) - if gpu - inters_grad = CuArray(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) - else - inters_grad = inject_interaction.(inter.inters, inter.types, (params_dic,)) - end +function inject_interaction_list(inter::InteractionList1Atoms, params_dic, AT) + inters_grad = AT(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) InteractionList1Atoms(inter.is, inters_grad, inter.types) end -function inject_interaction_list(inter::InteractionList2Atoms, params_dic, gpu) - if gpu - inters_grad = CuArray(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) - else - inters_grad = inject_interaction.(inter.inters, inter.types, (params_dic,)) - end +function inject_interaction_list(inter::InteractionList2Atoms, params_dic, AT) + inters_grad = AT(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) InteractionList2Atoms(inter.is, inter.js, 
inters_grad, inter.types) end -function inject_interaction_list(inter::InteractionList3Atoms, params_dic, gpu) - if gpu - inters_grad = CuArray(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) - else - inters_grad = inject_interaction.(inter.inters, inter.types, (params_dic,)) - end +function inject_interaction_list(inter::InteractionList3Atoms, params_dic, AT) + inters_grad = AT(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) InteractionList3Atoms(inter.is, inter.js, inter.ks, inters_grad, inter.types) end -function inject_interaction_list(inter::InteractionList4Atoms, params_dic, gpu) - if gpu - inters_grad = CuArray(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) - else - inters_grad = inject_interaction.(inter.inters, inter.types, (params_dic,)) - end +function inject_interaction_list(inter::InteractionList4Atoms, params_dic, AT) + inters_grad = AT(inject_interaction.(Array(inter.inters), inter.types, (params_dic,))) InteractionList4Atoms(inter.is, inter.js, inter.ks, inter.ls, inters_grad, inter.types) end @@ -431,8 +416,6 @@ Base.firstindex(::NoNeighborList) = 1 Base.lastindex(nl::NoNeighborList) = length(nl) Base.eachindex(nl::NoNeighborList) = Base.OneTo(length(nl)) -CUDA.Const(nl::NoNeighborList) = nl - """ System(; ) @@ -481,7 +464,7 @@ interface described there. modified in some simulations. `k` is chosen based on the `energy_units` given. - `data::DA=nothing`: arbitrary data associated with the system. 
""" -mutable struct System{D, G, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, +mutable struct System{D, AT, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, L, F, E, K, M, DA} <: AtomsBase.AbstractSystem{D} atoms::A coords::C @@ -521,7 +504,7 @@ function System(; k=default_k(energy_units), data=nothing) D = AtomsBase.n_dimensions(boundary) - G = isa(coords, CuArray) + AT = array_type(coords) T = float_type(boundary) A = typeof(atoms) C = typeof(coords) @@ -567,19 +550,19 @@ function System(; end end - if isa(atoms, CuArray) && !isa(coords, CuArray) + if isa(atoms, AbstractGPUArray) && !isa(coords, AbstractGPUArray) throw(ArgumentError("the atoms are on the GPU but the coordinates are not")) end - if isa(coords, CuArray) && !isa(atoms, CuArray) + if isa(coords, AbstractGPUArray) && !isa(atoms, AbstractGPUArray) throw(ArgumentError("the coordinates are on the GPU but the atoms are not")) end - if isa(atoms, CuArray) && !isa(vels, CuArray) + if isa(atoms, AbstractGPUArray) && !isa(vels, AbstractGPUArray) throw(ArgumentError("the atoms are on the GPU but the velocities are not")) end - if isa(vels, CuArray) && !isa(atoms, CuArray) + if isa(vels, AbstractGPUArray) && !isa(atoms, AbstractGPUArray) throw(ArgumentError("the velocities are on the GPU but the atoms are not")) end - if isa(atoms, CuArray) && length(constraints) > 0 + if isa(atoms, AbstractGPUArray) && length(constraints) > 0 @warn "Constraints are not currently compatible with simulation on the GPU" end @@ -596,7 +579,7 @@ function System(; check_units(atoms, coords, vels, energy_units, force_units, pairwise_inters, specific_inter_lists, general_inters, boundary) - return System{D, G, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, L, F, E, K, M, DA}( + return System{D, AT, T, A, C, B, V, AD, TO, PI, SI, GI, CN, NF, L, F, E, K, M, DA}( atoms, coords, boundary, vels, atoms_data, topology, pairwise_inters, specific_inter_lists, general_inters, constraints, neighbor_finder, loggers, df, force_units, energy_units, 
k_converted, atom_masses, data) @@ -653,7 +636,7 @@ Construct a `System` from a SimpleCrystals.jl `Crystal` struct. Properties unused in the simulation or in analysis can be left with their default values. -`atoms`, `atoms_data`, `coords` and `boundary` are automatically calcualted from +`atoms`, `atoms_data`, `coords` and `boundary` are automatically calculated from -the `Crystal` struct. Extra atom paramaters like `σ` have to be added manually +the `Crystal` struct. Extra atom parameters like `σ` have to be added manually after construction using the convenience constructor `System(sys; )`. @@ -721,19 +704,15 @@ Allows gradients for individual parameters to be tracked. Returns atoms, pairwise interactions, specific interaction lists and general interactions. """ -function inject_gradients(sys::System{D, G}, params_dic) where {D, G} - if G - atoms_grad = CuArray(inject_atom.(Array(sys.atoms), sys.atoms_data, (params_dic,))) - else - atoms_grad = inject_atom.(sys.atoms, sys.atoms_data, (params_dic,)) - end +function inject_gradients(sys::System{D, AT}, params_dic) where {D, AT} + atoms_grad = AT(inject_atom.(Array(sys.atoms), sys.atoms_data, (params_dic,))) if length(sys.pairwise_inters) > 0 pis_grad = inject_interaction.(sys.pairwise_inters, (params_dic,)) else pis_grad = sys.pairwise_inters end if length(sys.specific_inter_lists) > 0 - sis_grad = inject_interaction_list.(sys.specific_inter_lists, (params_dic,), G) + sis_grad = inject_interaction_list.(sys.specific_inter_lists, (params_dic,), AT) else sis_grad = sys.specific_inter_lists end @@ -847,7 +826,7 @@ construction where `n` is the number of threads to be used per replica. modified in some simulations. `k` is chosen based on the `energy_units` given. - `data::DA=nothing`: arbitrary data associated with the replica system. 
""" -mutable struct ReplicaSystem{D, G, T, A, AD, EL, F, E, K, R, DA} <: AtomsBase.AbstractSystem{D} +mutable struct ReplicaSystem{D, AT, T, A, AD, EL, F, E, K, R, DA} <: AtomsBase.AbstractSystem{D} atoms::A n_replicas::Int atoms_data::AD @@ -884,7 +863,7 @@ function ReplicaSystem(; k=default_k(energy_units), data=nothing) D = AtomsBase.n_dimensions(boundary) - G = isa(replica_coords[1], CuArray) + AT = array_type(replica_coords[1]) T = float_type(boundary) A = typeof(atoms) AD = typeof(atoms_data) @@ -995,25 +974,25 @@ function ReplicaSystem(; throw(ArgumentError("there are $(length(atoms)) atoms but $(length(atoms_data)) atom data entries")) end - n_cuarray = sum(y -> isa(y, CuArray), replica_coords) - if !(n_cuarray == n_replicas || n_cuarray == 0) - throw(ArgumentError("the coordinates for $n_cuarray out of $n_replicas replicas are on GPU")) + n_gpu_array = sum(y -> isa(y, AbstractGPUArray), replica_coords) + if !(n_gpu_array == n_replicas || n_gpu_array == 0) + throw(ArgumentError("the coordinates for $n_gpu_array out of $n_replicas replicas are on GPU")) end - if isa(atoms, CuArray) && n_cuarray != n_replicas + if isa(atoms, AbstractGPUArray) && n_gpu_array != n_replicas throw(ArgumentError("the atoms are on the GPU but the coordinates are not")) end - if n_cuarray == n_replicas && !isa(atoms, CuArray) + if n_gpu_array == n_replicas && !isa(atoms, AbstractGPUArray) throw(ArgumentError("the coordinates are on the GPU but the atoms are not")) end - n_cuarray = sum(y -> isa(y, CuArray), replica_velocities) - if !(n_cuarray == n_replicas || n_cuarray == 0) - throw(ArgumentError("the velocities for $n_cuarray out of $n_replicas replicas are on GPU")) + n_gpu_array = sum(y -> isa(y, AbstractGPUArray), replica_velocities) + if !(n_gpu_array == n_replicas || n_gpu_array == 0) + throw(ArgumentError("the velocities for $n_gpu_array out of $n_replicas replicas are on GPU")) end - if isa(atoms, CuArray) && n_cuarray != n_replicas + if isa(atoms, AbstractGPUArray) && 
n_gpu_array != n_replicas throw(ArgumentError("the atoms are on the GPU but the velocities are not")) end - if n_cuarray == n_replicas && !isa(atoms, CuArray) + if n_gpu_array == n_replicas && !isa(atoms, AbstractGPUArray) throw(ArgumentError("the velocities are on the GPU but the atoms are not")) end @@ -1023,7 +1002,7 @@ function ReplicaSystem(; k_converted = convert_k_units(T, k, energy_units) K = typeof(k_converted) - replicas = Tuple(System{D, G, T, A, C, B, V, AD, TO, typeof(replica_pairwise_inters[i]), + replicas = Tuple(System{D, AT, T, A, C, B, V, AD, TO, typeof(replica_pairwise_inters[i]), typeof(replica_specific_inter_lists[i]), typeof(replica_general_inters[i]), typeof(replica_constraints[i]), NF, typeof(replica_loggers[i]), F, E, K, M, Nothing}( @@ -1034,17 +1013,27 @@ function ReplicaSystem(; force_units, energy_units, k_converted, atom_masses, nothing) for i in 1:n_replicas) R = typeof(replicas) - return ReplicaSystem{D, G, T, A, AD, EL, F, E, K, R, DA}( + return ReplicaSystem{D, AT, T, A, AD, EL, F, E, K, R, DA}( atoms, n_replicas, atoms_data, exchange_logger, force_units, energy_units, k_converted, replicas, data) end +""" + array_type(sys) + array_type(arr) + +The array type of a [`System`](@ref), [`ReplicaSystem`](@ref) or array, for example +`Array` for systems on CPU or `CuArray` for systems on a NVIDIA GPU. +""" +array_type(::AT) where AT = AT.name.wrapper +array_type(::Union{System{D, AT}, ReplicaSystem{D, AT}}) where {D, AT} = AT + """ is_on_gpu(sys) Whether a [`System`](@ref) or [`ReplicaSystem`](@ref) is on the GPU. """ -is_on_gpu(::Union{System{D, G}, ReplicaSystem{D, G}}) where {D, G} = G +is_on_gpu(::Union{System{D, AT}, ReplicaSystem{D, AT}}) where {D, AT} = AT <: AbstractGPUArray """ float_type(sys) @@ -1052,7 +1041,7 @@ is_on_gpu(::Union{System{D, G}, ReplicaSystem{D, G}}) where {D, G} = G The float type a [`System`](@ref), [`ReplicaSystem`](@ref) or bounding box uses. 
""" -float_type(::Union{System{D, G, T}, ReplicaSystem{D, G, T}}) where {D, G, T} = T +float_type(::Union{System{D, AT, T}, ReplicaSystem{D, AT, T}}) where {D, AT, T} = T """ masses(sys) @@ -1070,10 +1059,6 @@ The partial charges of the atoms in a [`System`](@ref) or [`ReplicaSystem`](@ref charges(s::Union{System, ReplicaSystem}) = charge.(s.atoms) charge(s::Union{System, ReplicaSystem}, i::Integer) = charge(s.atoms[i]) -# Move an array to the GPU depending on whether the system is on the GPU -move_array(arr, ::System{D, false}) where {D} = arr -move_array(arr, ::System{D, true }) where {D} = CuArray(arr) - Base.getindex(s::Union{System, ReplicaSystem}, i::Union{Integer, AbstractVector}) = s.atoms[i] Base.length(s::Union{System, ReplicaSystem}) = length(s.atoms) Base.eachindex(s::Union{System, ReplicaSystem}) = Base.OneTo(length(s)) diff --git a/test/Project.toml b/test/Project.toml index 3901cc98f..c183b5756 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,4 +1,5 @@ [deps] +AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" AtomsBase = "a963bdd2-2df7-4f54-a1ee-49d51e6be12a" AtomsBaseTesting = "ed7c10db-df7e-4efa-a7be-4f4190f7f227" @@ -9,17 +10,23 @@ DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab" Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" FiniteDifferences = "26cc04aa-876d-5657-8c51-4c34ba976000" GLMakie = "e9467ef8-e4e7-5192-8a1a-b1aee30e663a" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" KernelDensity = "5ab0869b-81aa-558d-bb23-cbf5423bbe9b" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Metal = "dde4c033-4e86-420c-a63e-0dd931031962" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SimpleCrystals = "64031d72-e220-11ed-1a7e-43a2532b2fa8" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b" [compat] +AMDGPU = "1" Aqua = "0.8" AtomsBaseTesting = "0.4" +CUDA = "5" DelimitedFiles = "1.9" 
FiniteDifferences = "0.12" -GLMakie = "0.9, 0.10" +Metal = "1.5" Test = "1.9" +oneAPI = "2" diff --git a/test/basic.jl b/test/basic.jl index b9273cf0f..e510bf6e3 100644 --- a/test/basic.jl +++ b/test/basic.jl @@ -176,22 +176,22 @@ @test mcs == [SVector(0.05, 0.0), SVector(1.0, 1.0)] ff = MolecularForceField(joinpath.(ff_dir, ["ff99SBildn.xml", "tip3p_standard.xml", "his.xml"])...) - for gpu in gpu_list - sys = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; gpu=gpu, use_cell_list=false) + for AT in array_list + sys = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; array_type=AT, use_cell_list=false) mcs = molecule_centers(sys.coords, sys.boundary, sys.topology) - @test isapprox(Array(mcs)[1], mean(sys.coords[1:1170]); atol=0.04u"nm") + @test isapprox(Array(mcs)[1], mean(sys.coords[1:1170]); atol=0.08u"nm") # Mark all pairs as ineligible for pairwise interactions and check that the # potential energy from the specific interactions does not change on scaling no_nbs = falses(length(sys), length(sys)) - if gpu + if AT <: CuArray sys.neighbor_finder = GPUNeighborFinder( - eligible=(gpu ? CuArray(no_nbs) : no_nbs), + eligible=AT(no_nbs), dist_cutoff=1.0u"nm", ) - else + else sys.neighbor_finder = DistanceNeighborFinder( - eligible=(gpu ? 
CuArray(no_nbs) : no_nbs), + eligible=AT(no_nbs), dist_cutoff=1.0u"nm", ) end @@ -317,8 +317,8 @@ end end - if run_gpu_tests - sys_gpu = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; gpu=true) + for AT in array_list[2:end] + sys_gpu = System(joinpath(data_dir, "6mrr_equil.pdb"), ff; array_type=AT) for neighbor_finder in (DistanceNeighborFinder,) nf_gpu = neighbor_finder( eligible=sys_gpu.neighbor_finder.eligible, @@ -327,7 +327,7 @@ end ) neighbors_gpu = find_neighbors(sys_gpu, nf_gpu) @test length(neighbors_gpu) == n_neighbors_ref - CUDA.allowscalar() do + GPUArrays.allowscalar() do @test neighbors_gpu[10] isa Tuple{Int32, Int32, Bool} end @test identical_neighbors(neighbors_gpu, neighbors_ref) @@ -343,8 +343,8 @@ end coords_1 = SVector{3, Float64}.(eachcol(cm_1)) / 10 * u"nm" coords_2 = SVector{3, Float64}.(eachcol(cm_2)) / 10 * u"nm" @test rmsd(coords_1, coords_2) ≈ 2.54859467758795u"Å" - if run_gpu_tests - @test rmsd(CuArray(coords_1), CuArray(coords_2)) ≈ 2.54859467758795u"Å" + for AT in array_list[2:end] + @test rmsd(AT(coords_1), AT(coords_2)) ≈ 2.54859467758795u"Å" end bb_atoms = BioStructures.collectatoms(struc[1], BioStructures.backboneselector) diff --git a/test/energy_conservation.jl b/test/energy_conservation.jl index f29aca2b4..22d020c4b 100644 --- a/test/energy_conservation.jl +++ b/test/energy_conservation.jl @@ -1,12 +1,15 @@ # Energy conservation test using Molly +using GPUArrays +using AMDGPU using CUDA using Test @testset "Lennard-Jones energy conservation" begin - function test_energy_conservation(nl::Bool, gpu::Bool, n_threads::Integer, n_steps::Integer) + function test_energy_conservation(nl::Bool, ::Type{AT}, n_threads::Integer, + n_steps::Integer) where AT n_atoms = 2_000 atom_mass = 40.0u"g/mol" temp = 1.0u"K" @@ -24,25 +27,27 @@ using Test for cutoff in cutoffs coords = place_atoms(n_atoms, boundary; min_dist=0.1u"nm") - neighbor_finder = NoNeighborFinder() - if nl && gpu - neighbor_finder=GPUNeighborFinder( -
eligible=CuArray(trues(n_atoms, n_atoms)), - n_steps_reorder=10, - dist_cutoff=dist_cutoff, - ) - end - if nl && !gpu - neighbor_finder=DistanceNeighborFinder( - eligible=trues(n_atoms, n_atoms), - n_steps=10, - dist_cutoff=dist_cutoff, - ) + if nl + if AT <: CuArray + neighbor_finder=GPUNeighborFinder( + eligible=AT(trues(n_atoms, n_atoms)), + n_steps_reorder=10, + dist_cutoff=dist_cutoff, + ) + else + neighbor_finder=DistanceNeighborFinder( + eligible=AT(trues(n_atoms, n_atoms)), + n_steps=10, + dist_cutoff=dist_cutoff, + ) + end + else + neighbor_finder = NoNeighborFinder() end sys = System( - atoms=(gpu ? CuArray(atoms) : atoms), - coords=(gpu ? CuArray(coords) : coords), + atoms=AT(atoms), + coords=AT(coords), boundary=boundary, pairwise_inters=(LennardJones(cutoff=cutoff, use_neighbors=ifelse(nl, true, false)),), neighbor_finder=neighbor_finder, @@ -61,7 +66,7 @@ using Test @test isapprox(Es[1], E0; atol=1e-7u"kJ * mol^-1") max_ΔE = maximum(abs.(Es .- E0)) - platform_str = gpu ? "GPU" : "CPU $n_threads thread(s)" + platform_str = (AT <: AbstractGPUArray ? 
"$AT" : "CPU $n_threads thread(s)") cutoff_str = Base.typename(typeof(cutoff)).wrapper @info "$platform_str - $cutoff_str - max energy difference $max_ΔE" @test max_ΔE < 5e-4u"kJ * mol^-1" @@ -72,16 +77,18 @@ using Test end end - test_energy_conservation(true, false, 1, 10_000) - test_energy_conservation(false, false, 1, 10_000) + test_energy_conservation(true , Array, 1, 10_000) + test_energy_conservation(false, Array, 1, 10_000) if Threads.nthreads() > 1 - test_energy_conservation(true, false, Threads.nthreads(), 50_000) - test_energy_conservation(false, false, Threads.nthreads(), 50_000) + test_energy_conservation(true , Array, Threads.nthreads(), 50_000) + test_energy_conservation(false, Array, Threads.nthreads(), 50_000) end if CUDA.functional() - test_energy_conservation(true, true, 1, 100_000) - test_energy_conservation(false, true, 1, 100_000) + test_energy_conservation(true , CuArray, 1, 100_000) + test_energy_conservation(false, CuArray, 1, 100_000) + end + if AMDGPU.functional() + test_energy_conservation(true , ROCArray, 1, 100_000) + test_energy_conservation(false, ROCArray, 1, 100_000) end end - - diff --git a/test/gradients.jl b/test/gradients.jl index cce785a6b..bdda204aa 100644 --- a/test/gradients.jl +++ b/test/gradients.jl @@ -36,24 +36,24 @@ end @testset "Differentiable simulation" begin runs = [ # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 - ("CPU" , false, false, false, false, false, false, 1e-4, 1e-4), - ("CPU forward" , false, false, true , false, false, false, 0.5 , 0.1 ), - ("CPU f32" , false, false, false, true , false, false, 0.01, 5e-4), - ("CPU obc2" , false, false, false, false, true , false, 1e-4, 1e-4), - ("CPU gbn2" , false, false, false, false, false, true , 1e-4, 1e-4), - ("CPU gbn2 forward", false, false, true , false, false, true , 0.5 , 0.1 ), + ("CPU" , Array, false, false, false, false, false, 1e-4, 1e-4), + ("CPU forward" , Array, false, true , false, false, false, 0.5 , 0.1 ), + ("CPU f32" , Array, false, false, true , false, 
false, 0.01, 5e-4), + ("CPU obc2" , Array, false, false, false, true , false, 1e-4, 1e-4), + ("CPU gbn2" , Array, false, false, false, false, true , 1e-4, 1e-4), + ("CPU gbn2 forward", Array, false, true , false, false, true , 0.5 , 0.1 ), ] - if run_parallel_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 - push!(runs, ("CPU parallel" , false, true , false, false, false, false, 1e-4, 1e-4)) - push!(runs, ("CPU parallel forward", false, true , true , false, false, false, 0.5 , 0.1 )) - push!(runs, ("CPU parallel f32" , false, true , false, true , false, false, 0.01, 5e-4)) + if run_parallel_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 + push!(runs, ("CPU parallel" , Array, true , false, false, false, false, 1e-4, 1e-4)) + push!(runs, ("CPU parallel forward", Array, true , true , false, false, false, 0.5 , 0.1 )) + push!(runs, ("CPU parallel f32" , Array, true , false, true , false, false, 0.01, 5e-4)) end - if run_gpu_tests # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 - push!(runs, ("GPU" , true , false, false, false, false, false, 0.25, 20.0)) - push!(runs, ("GPU forward" , true , false, true , false, false, false, 0.25, 20.0)) - push!(runs, ("GPU f32" , true , false, false, true , false, false, 0.5 , 50.0)) - push!(runs, ("GPU obc2" , true , false, false, false, true , false, 0.25, 20.0)) - push!(runs, ("GPU gbn2" , true , false, false, false, false, true , 0.25, 20.0)) + for AT in array_list[2:end] # gpu par fwd f32 obc2 gbn2 tol_σ tol_r0 + push!(runs, ("$AT" , AT , false, false, false, false, false, 0.25, 20.0)) + push!(runs, ("$AT forward" , AT , false, true , false, false, false, 0.25, 20.0)) + push!(runs, ("$AT f32" , AT , false, false, true , false, false, 0.5 , 50.0)) + push!(runs, ("$AT obc2" , AT , false, false, false, true , false, 0.25, 20.0)) + push!(runs, ("$AT gbn2" , AT , false, false, false, false, true , 0.25, 20.0)) end function mean_min_separation(coords, boundary, ::Val{T}) where T @@ -103,9 +103,8 @@ end return mean_min_separation(sys.coords, 
boundary, Val(T)) end - for (name, gpu, parallel, forward, f32, obc2, gbn2, tol_σ, tol_r0) in runs + for (name, AT, parallel, forward, f32, obc2, gbn2, tol_σ, tol_r0) in runs T = f32 ? Float32 : Float64 - AT = gpu ? CuArray : Array σ = T(0.4) r0 = T(1.0) n_atoms = 50 @@ -245,13 +244,13 @@ end end @testset "Differentiable protein" begin - function create_sys(gpu::Bool) + function create_sys(AT) ff = MolecularForceField(joinpath.(ff_dir, ["ff99SBildn.xml", "his.xml"])...; units=false) return System( joinpath(data_dir, "6mrr_nowater.pdb"), ff; units=false, - gpu=gpu, + array_type=AT, implicit_solvent="gbn2", kappa=0.7, ) @@ -402,10 +401,10 @@ end - platform_runs = [("CPU", false, false)] + platform_runs = [("CPU", Array, false)] if run_parallel_tests - push!(platform_runs, ("CPU parallel", false, true)) + push!(platform_runs, ("CPU parallel", Array, true)) end - if run_gpu_tests - push!(platform_runs, ("GPU", true, false)) + for AT in array_list[2:end] + push!(platform_runs, ("$AT", AT, false)) end test_runs = [ ("Energy", test_energy_grad, 1e-8), @@ -423,8 +422,8 @@ ) for (test_name, test_fn, test_tol) in test_runs - for (platform, gpu, parallel) in platform_runs - sys_ref = create_sys(gpu) + for (platform, AT, parallel) in platform_runs + sys_ref = create_sys(AT) n_threads = parallel ? 
Threads.nthreads() : 1 grads_enzyme = Dict(k => 0.0 for k in keys(params_dic)) autodiff( diff --git a/test/minimization.jl b/test/minimization.jl index 83a10f0e8..c3baa0826 100644 --- a/test/minimization.jl +++ b/test/minimization.jl @@ -42,14 +42,14 @@ @test isapprox(potential_energy(sys; n_threads=1) * u"kJ * mol^-1", -3.0u"kJ * mol^-1"; atol=1e-4u"kJ * mol^-1") - if run_gpu_tests - coords = CuArray([ + for AT in array_list[2:end] + coords = AT([ SVector(1.0, 1.0, 1.0)u"nm", SVector(1.6, 1.0, 1.0)u"nm", SVector(1.4, 1.6, 1.0)u"nm", ]) sys = System( - atoms=CuArray([Atom(σ=(0.4 / (2 ^ (1 / 6)))u"nm", ϵ=1.0u"kJ * mol^-1") for i in 1:3]), + atoms=AT([Atom(σ=(0.4 / (2 ^ (1 / 6)))u"nm", ϵ=1.0u"kJ * mol^-1") for i in 1:3]), coords=coords, boundary=CubicBoundary(5.0u"nm"), pairwise_inters=(LennardJones(),), @@ -57,10 +57,10 @@ sim = SteepestDescentMinimizer(tol=1.0u"kJ * mol^-1 * nm^-1") simulate!(sys, sim) - dists = distances(sys.coords, sys.boundary) + dists = Array(distances(sys.coords, sys.boundary)) dists_flat = dists[triu(trues(3, 3), 1)] - @test all(x -> isapprox(x, 0.4u"nm"; atol=1e-3u"nm"), dists_flat) + @test all(x -> isapprox(x, 0.4u"nm"; atol=1e-2u"nm"), dists_flat) @test isapprox(potential_energy(sys), -3.0u"kJ * mol^-1"; - atol=1e-4u"kJ * mol^-1") + atol=1e-2u"kJ * mol^-1") end end diff --git a/test/protein.jl b/test/protein.jl index 9d7ab007e..c68016527 100644 --- a/test/protein.jl +++ b/test/protein.jl @@ -179,12 +179,12 @@ end @test pis_grad == sys_nounits.pairwise_inters # Test the same simulation on the GPU - if run_gpu_tests + for AT in array_list[2:end] sys = System( joinpath(data_dir, "6mrr_equil.pdb"), ff; - velocities=CuArray(copy(velocities_start)), - gpu=true, + velocities=AT(copy(velocities_start)), + array_type=AT, center_coords=false, ) @test kinetic_energy(sys) ≈ 65521.87288132431u"kJ * mol^-1" @@ -211,9 +211,9 @@ end sys_nounits = System( joinpath(data_dir, "6mrr_equil.pdb"), ff_nounits; - 
velocities=CuArray(copy(ustrip_vec.(velocities_start))), + velocities=AT(copy(ustrip_vec.(velocities_start))), units=false, - gpu=true, + array_type=AT, center_coords=false, ) @test kinetic_energy(sys_nounits)u"kJ * mol^-1" ≈ 65521.87288132431u"kJ * mol^-1" @@ -248,13 +248,13 @@ end @testset "Implicit solvent" begin ff = MolecularForceField(joinpath.(ff_dir, ["ff99SBildn.xml", "his.xml"])...) - for gpu in gpu_list + for AT in array_list for solvent_model in ("obc2", "gbn2") sys = System( joinpath(data_dir, "6mrr_nowater.pdb"), ff; boundary=CubicBoundary(100.0u"nm"), - gpu=gpu, + array_type=AT, dist_cutoff=5.0u"nm", dist_neighbors=5.0u"nm", implicit_solvent=solvent_model, diff --git a/test/runtests.jl b/test/runtests.jl index b18d4d73c..3c602a14c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,4 +1,5 @@ using Molly +using AMDGPU using Aqua import AtomsBase using AtomsBaseTesting @@ -8,7 +9,10 @@ import BioStructures # Imported to avoid clashing names using CUDA using Enzyme using FiniteDifferences +using GPUArrays using KernelDensity +using Metal +using oneAPI import SimpleCrystals using DelimitedFiles @@ -49,18 +53,45 @@ else @warn "The parallel tests will not be run as Julia is running on 1 thread" end -# Allow CUDA device to be specified +const run_gpu_tests = get(ENV, "GPUTESTS", "1") != "0" +# Allow GPU device to be specified const DEVICE = parse(Int, get(ENV, "DEVICE", "0")) -const run_gpu_tests = get(ENV, "GPUTESTS", "1") != "0" && CUDA.functional() -const gpu_list = (run_gpu_tests ? 
(false, true) : (false,)) -if run_gpu_tests - device!(DEVICE) - @info "The GPU tests will be run on device $DEVICE" -elseif get(ENV, "GPUTESTS", "1") == "0" - @warn "The GPU tests will not be run as GPUTESTS is set to 0" +const run_cuda_tests = run_gpu_tests && CUDA.functional() +const run_rocm_tests = run_gpu_tests && AMDGPU.functional() +const run_oneapi_tests = run_gpu_tests && oneAPI.functional() +const run_metal_tests = run_gpu_tests && Metal.functional() + +array_list = (Array,) + +if run_cuda_tests + array_list = (array_list..., CuArray) + CUDA.device!(DEVICE) + @info "The CUDA tests will be run on device $DEVICE" +else + @warn "The CUDA tests will not be run as a CUDA-enabled device is not available" +end + +if run_rocm_tests + array_list = (array_list..., ROCArray) + AMDGPU.device!(AMDGPU.device(DEVICE+1)) + @info "The AMDGPU tests will be run on device $DEVICE" +else + @warn "The AMDGPU tests will not be run as an AMDGPU-enabled device is not available" +end + +if run_oneapi_tests + array_list = (array_list..., oneArray) + oneAPI.device!(DEVICE) + @info "The oneAPI tests will be run on device $DEVICE" +else + @warn "The oneAPI tests will not be run as a oneAPI-enabled device is not available" +end + +if run_metal_tests + @info "The Metal tests will be run" else - @warn "The GPU tests will not be run as a CUDA-enabled device is not available" + @warn "The Metal tests will not be run as a Metal-enabled device is not available" end const data_dir = normpath(@__DIR__, "..", "data") diff --git a/test/simulation.jl b/test/simulation.jl index 8ecbab863..6b277609e 100644 --- a/test/simulation.jl +++ b/test/simulation.jl @@ -1,6 +1,5 @@ @testset "Lennard-Jones 2D" begin - for gpu in gpu_list - AT = gpu ? CuArray : Array + for AT in array_list n_atoms = 10 n_steps = 20_000 temp = 100.0u"K" @@ -8,7 +7,7 @@ simulator = VelocityVerlet(dt=0.001u"ps", coupling=AndersenThermostat(temp, 10.0u"ps")) gen_temp_wrapper(s, args...; kwargs...) 
= temperature(s) - if gpu + if AT <: CuArray neighbor_finder = GPUNeighborFinder( eligible=eligible=AT(trues(n_atoms, n_atoms)), n_steps_reorder=10, @@ -16,7 +15,7 @@ ) else neighbor_finder = DistanceNeighborFinder( - eligible=trues(n_atoms, n_atoms), + eligible=AT(trues(n_atoms, n_atoms)), n_steps=10, dist_cutoff=2.0u"nm", ) @@ -221,39 +220,32 @@ end OverdampedLangevin(dt=0.002u"ps", temperature=temp, friction=10.0u"ps^-1"), ] - s = System( - atoms=[Atom(mass=10.0u"g/mol", charge=0.0, σ=0.3u"nm", ϵ=0.2u"kJ * mol^-1") for i in 1:n_atoms], - coords=coords, - boundary=boundary, - pairwise_inters=(LennardJones(use_neighbors=true),), - neighbor_finder=DistanceNeighborFinder( - eligible=trues(n_atoms, n_atoms), - n_steps=10, - dist_cutoff=2.0u"nm", - ), - loggers=(coords=CoordinatesLogger(100),), - ) - random_velocities!(s, temp) - - if run_gpu_tests - s_gpu = System( - atoms=CuArray([Atom(mass=10.0u"g/mol", charge=0.0, σ=0.3u"nm", ϵ=0.2u"kJ * mol^-1") for i in 1:n_atoms]), - coords=CuArray(coords), - boundary=boundary, - pairwise_inters=(LennardJones(use_neighbors=true),), - neighbor_finder=GPUNeighborFinder( - eligible=CuArray(trues(n_atoms, n_atoms)), + for AT in array_list + if AT <: CuArray + neighbor_finder = GPUNeighborFinder( + eligible=AT(trues(n_atoms, n_atoms)), n_steps_reorder=10, dist_cutoff=2.0u"nm", - ), + ) + else + neighbor_finder = DistanceNeighborFinder( + eligible=AT(trues(n_atoms, n_atoms)), + n_steps=10, + dist_cutoff=2.0u"nm", + ) + end + s = System( + atoms=AT([Atom(mass=10.0u"g/mol", charge=0.0, σ=0.3u"nm", ϵ=0.2u"kJ * mol^-1") + for i in 1:n_atoms]), + coords=AT(coords), + boundary=boundary, + pairwise_inters=(LennardJones(use_neighbors=true),), + neighbor_finder=neighbor_finder, loggers=(coords=CoordinatesLogger(100),), ) - end - - for simulator in simulators - @time simulate!(s, simulator, n_steps; n_threads=1) - if run_gpu_tests - @time simulate!(s_gpu, simulator, n_steps; n_threads=1) + random_velocities!(s, temp) + for simulator in 
simulators + @time simulate!(s, simulator, n_steps; n_threads=1) end end end @@ -285,7 +277,7 @@ end loggers=(coords=CoordinatesLogger(100),), ) - if run_gpu_tests + if run_cuda_tests s_gpu = System( atoms=CuArray([Atom(mass=10.0u"g/mol", charge=0.0, σ=0.1u"nm", ϵ=0.2u"kJ * mol^-1") for i in 1:n_atoms]), coords=CuArray(coords), @@ -303,7 +295,7 @@ end for simulator in simulators @time simulate!(s, simulator, n_steps; n_threads=1) - if run_gpu_tests + if run_cuda_tests @time simulate!(s_gpu, simulator, n_steps; n_threads=1) coord_diff = sum(sum(map(x -> abs.(x), s.coords .- Array(s_gpu.coords)))) / (3 * n_atoms) E_diff = abs(potential_energy(s) - potential_energy(s_gpu)) @@ -437,7 +429,7 @@ end neighbor_finder = NoNeighborFinder() end - if run_gpu_tests + if run_cuda_tests neighbor_finder_gpu = GPUNeighborFinder(eligible=CuArray(trues(n_atoms, n_atoms)), n_steps_reorder=10, dist_cutoff=1.2u"nm") end @@ -457,7 +449,7 @@ end E0 = potential_energy(s) @time simulate!(s, simulator, n_steps) - if run_gpu_tests + if run_cuda_tests s_gpu = System( atoms=CuArray(atoms), coords=CuArray(coords), @@ -574,7 +566,7 @@ end end @testset "Position restraints" begin - for gpu in gpu_list + for AT in array_list n_atoms = 10 n_atoms_res = n_atoms ÷ 2 n_steps = 2_000 @@ -585,8 +577,8 @@ end sim = Langevin(dt=0.001u"ps", temperature=300.0u"K", friction=1.0u"ps^-1") sys = System( - atoms=(gpu ? CuArray(atoms) : atoms), - coords=(gpu ? CuArray(copy(starting_coords)) : copy(starting_coords)), + atoms=AT(atoms), + coords=AT(copy(starting_coords)), boundary=boundary, atoms_data=atoms_data, pairwise_inters=(LennardJones(),), @@ -1077,11 +1069,10 @@ end vvand_baro = VelocityVerlet(dt=dt, coupling=(AndersenThermostat(temp, 1.0u"ps"), barostat)) for sim in (lang_baro, vvand_baro) - for gpu in gpu_list - if gpu && sim == vvand_baro + for AT in array_list + if AT <: AbstractGPUArray && sim == vvand_baro continue end - AT = gpu ? 
CuArray : Array sys = System( atoms=AT(atoms), @@ -1141,10 +1132,9 @@ end SVector(nothing , nothing , nothing ), # Uncoupled ) - for gpu in gpu_list - AT = gpu ? CuArray : Array + for AT in array_list for (press_i, press) in enumerate(pressure_test_set) - if gpu && press_i != 2 + if AT <: AbstractGPUArray && press_i != 2 continue end @@ -1210,10 +1200,10 @@ end MonteCarloMembraneBarostat(press, tens, temp, boundary; z_axis_fixed=true), ) - for gpu in gpu_list - AT = gpu ? CuArray : Array + for AT in array_list + # GPU backends only run barostat 2; CPU runs the full set (see check below) for (barostat_i, barostat) in enumerate(barostat_test_set) - if gpu && barostat_i != 2 + if AT <: AbstractGPUArray && barostat_i != 2 continue end @@ -1333,7 +1323,7 @@ end starting_coords_f32 = [Float32.(c) for c in starting_coords] starting_velocities_f32 = [Float32.(c) for c in starting_velocities] - function test_sim(nl::Bool, parallel::Bool, f32::Bool, gpu::Bool) + function test_sim(nl::Bool, parallel::Bool, f32::Bool, ::Type{AT}) where AT n_atoms = 400 n_steps = 200 atom_mass = f32 ? 10.0f0u"g/mol" : 10.0u"g/mol" @@ -1343,43 +1333,37 @@ r0 = f32 ? 0.2f0u"nm" : 0.2u"nm" bonds = [HarmonicBond(k=k, r0=r0) for i in 1:(n_atoms ÷ 2)] specific_inter_lists = (InteractionList2Atoms( - gpu ? CuArray(Int32.(collect(1:2:n_atoms))) : Int32.(collect(1:2:n_atoms)), - gpu ? CuArray(Int32.(collect(2:2:n_atoms))) : Int32.(collect(2:2:n_atoms)), - gpu ? CuArray(bonds) : bonds, + AT(Int32.(collect(1:2:n_atoms))), + AT(Int32.(collect(2:2:n_atoms))), + AT(bonds), ),) - - neighbor_finder = NoNeighborFinder() cutoff = DistanceCutoff(f32 ? 1.0f0u"nm" : 1.0u"nm") - pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),) - if nl && gpu - neighbor_finder = GPUNeighborFinder( - eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms), - n_steps_reorder=10, - dist_cutoff=f32 ?
1.5f0u"nm" : 1.5u"nm", - ) - pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),) - end - if nl && !gpu - neighbor_finder = DistanceNeighborFinder( - eligible=gpu ? CuArray(trues(n_atoms, n_atoms)) : trues(n_atoms, n_atoms), - n_steps=10, - dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", - ) + + if nl + if AT <: CuArray + neighbor_finder = GPUNeighborFinder( + eligible=AT(trues(n_atoms, n_atoms)), + n_steps_reorder=10, + dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", + ) + else + neighbor_finder = DistanceNeighborFinder( + eligible=AT(trues(n_atoms, n_atoms)), + n_steps=10, + dist_cutoff=f32 ? 1.5f0u"nm" : 1.5u"nm", + ) + end pairwise_inters = (LennardJones(use_neighbors=true, cutoff=cutoff),) + else + neighbor_finder = NoNeighborFinder() + pairwise_inters = (LennardJones(use_neighbors=false, cutoff=cutoff),) end show(devnull, neighbor_finder) - if gpu - coords = CuArray(copy(f32 ? starting_coords_f32 : starting_coords)) - velocities = CuArray(copy(f32 ? starting_velocities_f32 : starting_velocities)) - atoms = CuArray([Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", - ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) - else - coords = copy(f32 ? starting_coords_f32 : starting_coords) - velocities = copy(f32 ? starting_velocities_f32 : starting_velocities) - atoms = [Atom(mass=atom_mass, charge=f32 ? 0.0f0 : 0.0, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", - ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms] - end + coords = AT(copy(f32 ? starting_coords_f32 : starting_coords)) + velocities = AT(copy(f32 ? starting_velocities_f32 : starting_velocities)) + atoms = AT([Atom(charge=f32 ? 0.0f0 : 0.0, mass=atom_mass, σ=f32 ? 0.2f0u"nm" : 0.2u"nm", + ϵ=f32 ? 0.2f0u"kJ * mol^-1" : 0.2u"kJ * mol^-1") for i in 1:n_atoms]) s = System( atoms=atoms, @@ -1391,7 +1375,7 @@ end neighbor_finder=neighbor_finder, ) - @test is_on_gpu(s) == gpu + @test is_on_gpu(s) == (AT <: AbstractGPUArray) @test float_type(s) == (f32 ? 
Float32 : Float64) n_threads = parallel ? Threads.nthreads() : 1 @@ -1402,22 +1386,26 @@ end runs = [ - ("CPU" , [false, false, false, false]), - ("CPU f32" , [false, false, true , false]), - ("CPU NL" , [true , false, false, false]), - ("CPU f32 NL", [true , false, true , false]), + ("CPU" , [false, false, false, Array]), + ("CPU f32" , [false, false, true , Array]), + ("CPU NL" , [true , false, false, Array]), + ("CPU f32 NL", [true , false, true , Array]), ] if run_parallel_tests - push!(runs, ("CPU parallel" , [false, true , false, false])) - push!(runs, ("CPU parallel f32" , [false, true , true , false])) - push!(runs, ("CPU parallel NL" , [true , true , false, false])) - push!(runs, ("CPU parallel f32 NL", [true , true , true , false])) + push!(runs, ("CPU parallel" , [false, true , false, Array])) + push!(runs, ("CPU parallel f32" , [false, true , true , Array])) + push!(runs, ("CPU parallel NL" , [true , true , false, Array])) + push!(runs, ("CPU parallel f32 NL", [true , true , true , Array])) + end + for AT in array_list[2:end] + push!(runs, ("$AT" , [false, false, false, AT])) + push!(runs, ("$AT f32" , [false, false, true , AT])) + push!(runs, ("$AT NL" , [true , false, false, AT])) + push!(runs, ("$AT f32 NL", [true , false, true , AT])) end - if run_gpu_tests - push!(runs, ("GPU" , [false, false, false, true])) - push!(runs, ("GPU f32" , [false, false, true , true])) - push!(runs, ("GPU NL" , [true , false, false, true])) - push!(runs, ("GPU f32 NL", [true , false, true , true])) + if run_metal_tests + push!(runs, ("MtlArray f32" , [false, false, true , MtlArray])) + push!(runs, ("MtlArray f32 NL", [true , false, true , MtlArray])) end final_coords_ref, E_start_ref = test_sim(runs[1][2]...)