Skip to content
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3"
RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Expand All @@ -34,13 +35,13 @@ TrixiBase = "9a0f1c46-06d5-4909-a5a3-ce25d3fa3284"
WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
OrdinaryDiffEqCore = "bbf590c4-e513-4bbe-9b18-05decba2e5d8"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

[extensions]
TrixiParticlesOrdinaryDiffEqExt = ["OrdinaryDiffEq", "OrdinaryDiffEqCore"]
TrixiParticlesCUDAExt = "CUDA"
TrixiParticlesOrdinaryDiffEqExt = ["OrdinaryDiffEq", "OrdinaryDiffEqCore"]

[compat]
Accessors = "0.1.43"
Expand All @@ -63,6 +64,7 @@ Polyester = "0.7.10"
ReadVTK = "0.2"
RecipesBase = "1"
Reexport = "1"
SIMD = "3.7.2"
SciMLBase = "2"
StaticArrays = "1"
Statistics = "1"
Expand Down
1 change: 1 addition & 0 deletions src/TrixiParticles.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ using Random: seed!
using SciMLBase: SciMLBase, CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!,
get_tmp_cache, set_proposed_dt!, ODESolution, ODEProblem, terminate!,
add_tstop!
using SIMD: SIMD
@reexport using StaticArrays: SVector
using StaticArrays: @SMatrix, SMatrix, setindex
using Statistics: Statistics
Expand Down
47 changes: 38 additions & 9 deletions src/general/abstract_system.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,46 @@ end
end

# Return `A[:, :, i]` as an `SMatrix`.
@inline function extract_smatrix(A, system, particle)
@boundscheck checkbounds(A, ndims(system), ndims(system), particle)
@propagate_inbounds function extract_smatrix(A, system, particle)
return extract_smatrix(A, Val(ndims(system)), particle)
end

@inline function extract_smatrix(A, ::Val{NDIMS}, particle) where {NDIMS}
@boundscheck checkbounds(A, NDIMS, NDIMS, particle)

# Extract the matrix elements for this particle as a tuple to pass to SMatrix
return SMatrix{ndims(system),
ndims(system)}(ntuple(@inline(i->@inbounds A[mod(i - 1,
ndims(system)) + 1,
div(i - 1,
ndims(system)) + 1,
particle]),
Val(ndims(system)^2)))
return SMatrix{NDIMS, NDIMS}(ntuple(@inline(i->@inbounds A[mod(i - 1, NDIMS) + 1,
div(i - 1, NDIMS) + 1,
particle]),
Val(NDIMS^2)))
end

# Optimized version for 2D, which uses SIMD.jl to combine the 4 loads of the 2x2 matrix
# into a single wide load. This is significantly faster on GPUs than 4 individual loads.
# WARNING:
# 1. This only works if the matrix elements are stored contiguously in memory.
# The 4 elements of the 2x2 matrix for a particle must immediately follow the
# 4 elements of the previous particle's matrix, with no padding in between.
# 2. The pointer of `A` must be aligned to the size of `Vec{4, eltype(A)}`.
# This is guaranteed if `A` is allocated by Julia and has the correct size,
# but may not be true if `A` is a view or subarray.
@propagate_inbounds function extract_smatrix_aligned(A, system, particle)
return extract_smatrix_aligned(A, Val(ndims(system)), particle)
end

@inline function extract_smatrix_aligned(A, ::Val{2}, particle)
@boundscheck checkbounds(A, 2, 2, particle)

# Note that this doesn't work in 3D because it requires a stride of 2^n.
x = SIMD.vloada(SIMD.Vec{4, eltype(A)}, pointer(A, 4 * (particle - 1) + 1))

return SMatrix{2, 2}(Tuple(x))
end

# Fall back to the generic version when not in 2D.
@propagate_inbounds function extract_smatrix_aligned(A, ::Val{NDIMS},
particle) where {NDIMS}
return extract_smatrix(A, Val(NDIMS), particle)
end

# Specifically get the current coordinates of a particle for all system types.
Expand Down
4 changes: 3 additions & 1 deletion src/general/corrections.jl
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,9 @@ end

function correction_matrix_inversion_step!(corr_matrix, system, semi)
@threaded semi for particle in eachparticle(system)
L = extract_smatrix(corr_matrix, system, particle)
# `corr_matrix` is not a view, so we can use the fast `extract_smatrix_aligned`
# instead of the generic `extract_smatrix`.
L = extract_smatrix_aligned(corr_matrix, system, particle)

# The matrix `L` only becomes singular when the particle and all neighbors
# are collinear (in 2D) or lie all in the same plane (in 3D).
Expand Down
6 changes: 6 additions & 0 deletions src/general/neighborhood_search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ function PointNeighbors.foreach_point_neighbor(f, system, neighbor_system,
points, parallelization_backend)
end

@propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, backend, particle)
PointNeighbors.foreach_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, particle)
Comment thread
efaulhaber marked this conversation as resolved.
end

# === Compact support selection ===
# -- Generic
@inline function compact_support(system, neighbor)
Expand Down
5 changes: 5 additions & 0 deletions src/schemes/boundary/wall_boundary/system.jl
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,11 @@ end
return kernel(smoothing_kernel, distance, smoothing_length)
end

@inline function smoothing_kernel_unsafe(system::WallBoundarySystem, distance, particle)
(; smoothing_kernel, smoothing_length) = system.boundary_model
return kernel_unsafe(smoothing_kernel, distance, smoothing_length)
end

@inline function smoothing_length(system::WallBoundarySystem, particle)
return smoothing_length(system.boundary_model, particle)
end
Expand Down
65 changes: 40 additions & 25 deletions src/schemes/structure/total_lagrangian_sph/penalty_force.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,38 +14,53 @@ struct PenaltyForceGanzenmueller{ELTYPE}
end
end

@inline function dv_penalty_force(penalty_force::Nothing,
particle, neighbor, initial_pos_diff, initial_distance,
current_pos_diff, current_distance,
system, m_a, m_b, rho_a, rho_b)
return zero(initial_pos_diff)
@inline function dv_penalty_force!(dv_particle, penalty_force::Nothing,
particle, neighbor, initial_pos_diff, initial_distance,
current_pos_diff, current_distance,
system, m_a, m_b, rho_a, rho_b, F_a, F_b)
return dv_particle
end

@propagate_inbounds function dv_penalty_force(penalty_force::PenaltyForceGanzenmueller,
particle, neighbor, initial_pos_diff,
initial_distance,
current_pos_diff, current_distance,
system, m_a, m_b, rho_a, rho_b)
volume_a = m_a / rho_a
volume_b = m_b / rho_b
@propagate_inbounds function dv_penalty_force!(dv_particle,
penalty_force::PenaltyForceGanzenmueller,
particle, neighbor, initial_pos_diff,
initial_distance,
current_pos_diff, current_distance,
system, m_a, m_b, rho_a, rho_b, F_a, F_b)
(; alpha) = penalty_force

kernel_weight = smoothing_kernel(system, initial_distance, particle)
# Since this is one of the most performance critical functions, using fast divisions
# here gives a significant speedup on GPUs.
# See the docs page "Development" for more details on `div_fast`.
volume_a = div_fast(m_a, rho_a)
volume_b = div_fast(m_b, rho_b)

F_a = deformation_gradient(system, particle)
F_b = deformation_gradient(system, neighbor)
# This function is called after a compact support check, so we can use the unsafe
# kernel function, which does not check the distance again.
kernel_weight = smoothing_kernel_unsafe(system, initial_distance, particle)

inv_current_distance = 1 / current_distance
E_a = young_modulus(system, particle)
E_b = young_modulus(system, neighbor)

# Use the symmetry of epsilon to simplify computations
eps_sum = (F_a + F_b) * initial_pos_diff - 2 * current_pos_diff
delta_sum = dot(eps_sum, current_pos_diff) * inv_current_distance
# Note that this is actually ϵ^b_ab = -ϵ^b_ba in the paper, so we later compute
# δ^b_ab instead of δ^b_ba, but δ^b_ab = δ^b_ba because of antisymmetry of x_ab and ϵ_ab.
eps_a = F_a * initial_pos_diff - current_pos_diff
eps_b = F_b * initial_pos_diff - current_pos_diff

E = young_modulus(system, particle)
# This is (E_a * delta_a + E_b * delta_b) * current_distance.
# Pulling the division by `current_distance` out allows us to do one division by
# `current_distance^2` instead.
delta_sum = E_a * dot(eps_a, current_pos_diff) + E_b * dot(eps_b, current_pos_diff)

f = (penalty_force.alpha / 2) * volume_a * volume_b *
kernel_weight / initial_distance^2 * E * delta_sum * current_pos_diff *
inv_current_distance
# The division contains all scalar factors, which are then multiplied by
# the vector `current_pos_diff` at the end.
# We already divide by `m_a` to obtain an acceleration.
# Since this is one of the most performance critical functions, using fast divisions
# here gives a significant speedup on GPUs.
# See the docs page "Development" for more details on `div_fast`.
dv_particle[] += div_fast((alpha / 2) * volume_a * volume_b * kernel_weight * delta_sum,
initial_distance^2 * current_distance^2 * m_a) *
current_pos_diff

# Divide force by mass to obtain acceleration
return f / m_a
return dv_particle
end
93 changes: 54 additions & 39 deletions src/schemes/structure/total_lagrangian_sph/rhs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ end

# Everything here is done in the initial coordinates
system_coords = initial_coordinates(system)
neighborhood_search = get_neighborhood_search(system, semi)
backend = semi.parallelization_backend

# For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
# and the density diffusion divide by zero.
Expand All @@ -29,48 +31,61 @@ end
h = initial_smoothing_length(system)
almostzero = sqrt(eps(h^2))

# Loop over all pairs of particles and neighbors within the kernel cutoff.
# For structure-structure interaction, this has to happen in the initial coordinates.
foreach_point_neighbor(system, system, system_coords, system_coords, semi;
points=each_integrated_particle(system)) do particle, neighbor,
initial_pos_diff,
initial_distance
# Skip neighbors with the same position because the kernel gradient is zero.
# Note that `return` only exits the closure, i.e., skips the current neighbor.
skip_zero_distance(system) && initial_distance < almostzero && return

# Now that we know that `distance` is not zero, we can safely call the unsafe
# version of the kernel gradient to avoid redundant zero checks.
grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
initial_distance, particle)

rho_a = @inbounds system.material_density[particle]
rho_b = @inbounds system.material_density[neighbor]

@threaded semi for particle in each_integrated_particle(system)
# We are looping over the particles of `system`, so it is guaranteed
# that `particle` is in bounds of `system`.
m_a = @inbounds system.mass[particle]
m_b = @inbounds system.mass[neighbor]

rho_a = @inbounds system.material_density[particle]
# PK1 / rho^2
pk1_rho2_a = @inbounds pk1_rho2(system, particle)
pk1_rho2_b = @inbounds pk1_rho2(system, neighbor)

current_pos_diff_ = @inbounds current_coords(system, particle) -
current_coords(system, neighbor)
# On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
current_pos_diff = convert.(eltype(system), current_pos_diff_)
current_distance = norm(current_pos_diff)

dv_stress = m_b * (pk1_rho2_a + pk1_rho2_b) * grad_kernel

dv_penalty_force_ = @inbounds dv_penalty_force(penalty_force, particle, neighbor,
initial_pos_diff, initial_distance,
current_pos_diff, current_distance,
system, m_a, m_b, rho_a, rho_b)

dv_particle = Ref(dv_stress + dv_penalty_force_)
@inbounds dv_viscosity_tlsph!(dv_particle, system, v_system, particle, neighbor,
current_pos_diff, current_distance,
m_a, m_b, rho_a, rho_b, grad_kernel)
current_coords_a = @inbounds current_coords(system, particle)
F_a = @inbounds deformation_gradient(system, particle)

# Accumulate the RHS contributions over all neighbors before writing to `dv`
# to reduce the number of memory writes.
# Note that we need a `Ref` in order to be able to update these variables
# inside the closure in the `foreach_neighbor` loop.
dv_particle = Ref(zero(current_coords_a))

# Loop over all neighbors within the kernel cutoff
@inbounds foreach_neighbor(system_coords, system_coords,
neighborhood_search, backend,
particle) do particle, neighbor,
initial_pos_diff, initial_distance
# Skip neighbors with the same position because the kernel gradient is zero.
# Note that `return` only exits the closure, i.e., skips the current neighbor.
skip_zero_distance(system) && initial_distance < almostzero && return

# Now that we know that `distance` is not zero, we can safely call the unsafe
# version of the kernel gradient to avoid redundant zero checks.
grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
initial_distance, particle)

rho_b = @inbounds system.material_density[neighbor]
m_b = @inbounds system.mass[neighbor]
# PK1 / rho^2
pk1_rho2_b = @inbounds pk1_rho2(system, neighbor)
current_coords_b = @inbounds current_coords(system, neighbor)

# The compiler is smart enough to optimize this away if no penalty force is used
F_b = @inbounds deformation_gradient(system, neighbor)
Comment thread
efaulhaber marked this conversation as resolved.

current_pos_diff_ = current_coords_a - current_coords_b
# On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
current_pos_diff = convert.(eltype(system), current_pos_diff_)
current_distance = norm(current_pos_diff)

dv_particle[] += m_b * (pk1_rho2_a + pk1_rho2_b) * grad_kernel

@inbounds dv_penalty_force!(dv_particle, penalty_force, particle, neighbor,
initial_pos_diff, initial_distance,
current_pos_diff, current_distance,
system, m_a, m_b, rho_a, rho_b, F_a, F_b)

@inbounds dv_viscosity_tlsph!(dv_particle, system, v_system, particle, neighbor,
current_pos_diff, current_distance,
m_a, m_b, rho_a, rho_b, F_a, grad_kernel)
end

for i in 1:ndims(system)
@inbounds dv[i, particle] += dv_particle[][i]
Expand Down
Loading
Loading