Skip to content
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3"
RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Expand All @@ -34,13 +35,13 @@ TrixiBase = "9a0f1c46-06d5-4909-a5a3-ce25d3fa3284"
WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"

[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
OrdinaryDiffEqCore = "bbf590c4-e513-4bbe-9b18-05decba2e5d8"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

[extensions]
TrixiParticlesOrdinaryDiffEqExt = ["OrdinaryDiffEq", "OrdinaryDiffEqCore"]
TrixiParticlesCUDAExt = "CUDA"
TrixiParticlesOrdinaryDiffEqExt = ["OrdinaryDiffEq", "OrdinaryDiffEqCore"]

[compat]
Accessors = "0.1.43"
Expand All @@ -63,6 +64,7 @@ Polyester = "0.7.10"
ReadVTK = "0.2"
RecipesBase = "1"
Reexport = "1"
SIMD = "3.7.2"
SciMLBase = "2"
StaticArrays = "1"
Statistics = "1"
Expand Down
1 change: 1 addition & 0 deletions src/TrixiParticles.jl
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ using Random: seed!
using SciMLBase: SciMLBase, CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!,
get_tmp_cache, set_proposed_dt!, ODESolution, ODEProblem, terminate!,
add_tstop!
using SIMD: SIMD
@reexport using StaticArrays: SVector
using StaticArrays: @SMatrix, SMatrix, setindex
using Statistics: Statistics
Expand Down
30 changes: 21 additions & 9 deletions src/general/abstract_system.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,17 +68,29 @@ end
end

# Return `A[:, :, i]` as an `SMatrix`.
@inline function extract_smatrix(A, system, particle)
@boundscheck checkbounds(A, ndims(system), ndims(system), particle)
@propagate_inbounds function extract_smatrix(A, system, particle)
return extract_smatrix(A, Val(ndims(system)), particle)
end

@inline function extract_smatrix(A, ::Val{NDIMS}, particle) where {NDIMS}
@boundscheck checkbounds(A, NDIMS, NDIMS, particle)

# Extract the matrix elements for this particle as a tuple to pass to SMatrix
return SMatrix{ndims(system),
ndims(system)}(ntuple(@inline(i->@inbounds A[mod(i - 1,
ndims(system)) + 1,
div(i - 1,
ndims(system)) + 1,
particle]),
Val(ndims(system)^2)))
return SMatrix{NDIMS, NDIMS}(ntuple(@inline(i->@inbounds A[mod(i - 1, NDIMS) + 1,
div(i - 1, NDIMS) + 1,
particle]),
Val(NDIMS^2)))
end

# Optimized version for 2D, which uses SIMD.jl to combine the 4 loads of the 2x2 matrix
# into a single wide load. This is significantly faster on GPUs than 4 individual loads.
@inline function extract_smatrix(A, ::Val{2}, particle)
@boundscheck checkbounds(A, 2, 2, particle)

# Note that this doesn't work in 3D because it requires a stride of 2^n.
x = SIMD.vloada(SIMD.Vec{4, eltype(A)}, pointer(A, 4 * (particle - 1) + 1))
Comment thread
efaulhaber marked this conversation as resolved.
Outdated

return SMatrix{2, 2}(Tuple(x))
end

# Specifically get the current coordinates of a particle for all system types.
Expand Down
6 changes: 6 additions & 0 deletions src/general/neighborhood_search.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@ function PointNeighbors.foreach_point_neighbor(f, system, neighbor_system,
points, parallelization_backend)
end

@propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, backend, particle)
PointNeighbors.foreach_neighbor(f, system_coords, neighbor_coords,
neighborhood_search, particle)
Comment thread
efaulhaber marked this conversation as resolved.
end

# === Compact support selection ===
# -- Generic
@inline function compact_support(system, neighbor)
Expand Down
5 changes: 5 additions & 0 deletions src/schemes/boundary/wall_boundary/system.jl
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,11 @@ end
return kernel(smoothing_kernel, distance, smoothing_length)
end

@inline function smoothing_kernel_unsafe(system::WallBoundarySystem, distance, particle)
(; smoothing_kernel, smoothing_length) = system.boundary_model
return kernel_unsafe(smoothing_kernel, distance, smoothing_length)
end

@inline function smoothing_length(system::WallBoundarySystem, particle)
return smoothing_length(system.boundary_model, particle)
end
Expand Down
63 changes: 38 additions & 25 deletions src/schemes/structure/total_lagrangian_sph/penalty_force.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,38 +14,51 @@ struct PenaltyForceGanzenmueller{ELTYPE}
end
end

@inline function dv_penalty_force(penalty_force::Nothing,
particle, neighbor, initial_pos_diff, initial_distance,
current_pos_diff, current_distance,
system, m_a, m_b, rho_a, rho_b)
return zero(initial_pos_diff)
@inline function dv_penalty_force!(dv_particle, penalty_force::Nothing,
particle, neighbor, initial_pos_diff, initial_distance,
current_pos_diff, current_distance,
system, m_a, m_b, rho_a, rho_b, F_a, F_b)
return dv_particle
end

@propagate_inbounds function dv_penalty_force(penalty_force::PenaltyForceGanzenmueller,
particle, neighbor, initial_pos_diff,
initial_distance,
current_pos_diff, current_distance,
system, m_a, m_b, rho_a, rho_b)
volume_a = m_a / rho_a
volume_b = m_b / rho_b
@propagate_inbounds function dv_penalty_force!(dv_particle,
penalty_force::PenaltyForceGanzenmueller,
particle, neighbor, initial_pos_diff,
initial_distance,
current_pos_diff, current_distance,
system, m_a, m_b, rho_a, rho_b, F_a, F_b)
(; alpha) = penalty_force

kernel_weight = smoothing_kernel(system, initial_distance, particle)
# Since this is one of the most performance critical functions, using fast divisions
# here gives a significant speedup on GPUs.
# See the docs page "Development" for more details on `div_fast`.
volume_a = div_fast(m_a, rho_a)
volume_b = div_fast(m_b, rho_b)

F_a = deformation_gradient(system, particle)
F_b = deformation_gradient(system, neighbor)
# This function is called after a compact support check, so we can use the unsafe
# kernel function, which does not check the distance again.
kernel_weight = smoothing_kernel_unsafe(system, initial_distance, particle)

inv_current_distance = 1 / current_distance
E_a = young_modulus(system, particle)
E_b = young_modulus(system, neighbor)

# Use the symmetry of epsilon to simplify computations
eps_sum = (F_a + F_b) * initial_pos_diff - 2 * current_pos_diff
delta_sum = dot(eps_sum, current_pos_diff) * inv_current_distance
eps_a = F_a * initial_pos_diff - current_pos_diff
eps_b = -(F_b * initial_pos_diff - current_pos_diff)
Comment thread
svchb marked this conversation as resolved.
Outdated

E = young_modulus(system, particle)
# This is (E_a * delta_a + E_b * delta_b) * current_distance.
# Pulling the division by `current_distance` out allows us to do one division by
# `current_distance^2` instead.
delta_sum = E_a * dot(eps_a, current_pos_diff) + E_b * dot(eps_b, current_pos_diff)

f = (penalty_force.alpha / 2) * volume_a * volume_b *
kernel_weight / initial_distance^2 * E * delta_sum * current_pos_diff *
inv_current_distance
# The division contains all scalar factors, which are then multiplied by
# the vector `current_pos_diff` at the end.
# We already divide by `m_a` to obtain an acceleration.
# Since this is one of the most performance critical functions, using fast divisions
# here gives a significant speedup on GPUs.
# See the docs page "Development" for more details on `div_fast`.
dv_particle[] += div_fast((alpha / 2) * volume_a * volume_b * kernel_weight * delta_sum,
initial_distance^2 * current_distance^2 * m_a) *
current_pos_diff

# Divide force by mass to obtain acceleration
return f / m_a
return dv_particle
end
93 changes: 54 additions & 39 deletions src/schemes/structure/total_lagrangian_sph/rhs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ end

# Everything here is done in the initial coordinates
system_coords = initial_coordinates(system)
neighborhood_search = get_neighborhood_search(system, system, semi)
backend = semi.parallelization_backend

# For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
# and the density diffusion divide by zero.
Expand All @@ -29,48 +31,61 @@ end
h = initial_smoothing_length(system)
almostzero = sqrt(eps(h^2))

# Loop over all pairs of particles and neighbors within the kernel cutoff.
# For structure-structure interaction, this has to happen in the initial coordinates.
foreach_point_neighbor(system, system, system_coords, system_coords, semi;
points=each_integrated_particle(system)) do particle, neighbor,
initial_pos_diff,
initial_distance
# Skip neighbors with the same position because the kernel gradient is zero.
# Note that `return` only exits the closure, i.e., skips the current neighbor.
skip_zero_distance(system) && initial_distance < almostzero && return

# Now that we know that `distance` is not zero, we can safely call the unsafe
# version of the kernel gradient to avoid redundant zero checks.
grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
initial_distance, particle)

rho_a = @inbounds system.material_density[particle]
rho_b = @inbounds system.material_density[neighbor]

@threaded semi for particle in each_integrated_particle(system)
# We are looping over the particles of `system`, so it is guaranteed
# that `particle` is in bounds of `system`.
m_a = @inbounds system.mass[particle]
m_b = @inbounds system.mass[neighbor]

rho_a = @inbounds system.material_density[particle]
# PK1 / rho^2
pk1_rho2_a = @inbounds pk1_rho2(system, particle)
pk1_rho2_b = @inbounds pk1_rho2(system, neighbor)

current_pos_diff_ = @inbounds current_coords(system, particle) -
current_coords(system, neighbor)
# On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
current_pos_diff = convert.(eltype(system), current_pos_diff_)
current_distance = norm(current_pos_diff)

dv_stress = m_b * (pk1_rho2_a + pk1_rho2_b) * grad_kernel

dv_penalty_force_ = @inbounds dv_penalty_force(penalty_force, particle, neighbor,
initial_pos_diff, initial_distance,
current_pos_diff, current_distance,
system, m_a, m_b, rho_a, rho_b)

dv_particle = Ref(dv_stress + dv_penalty_force_)
@inbounds dv_viscosity_tlsph!(dv_particle, system, v_system, particle, neighbor,
current_pos_diff, current_distance,
m_a, m_b, rho_a, rho_b, grad_kernel)
current_coords_a = @inbounds current_coords(system, particle)
F_a = @inbounds deformation_gradient(system, particle)

# Accumulate the RHS contributions over all neighbors before writing to `dv`
# to reduce the number of memory writes.
# Note that we need a `Ref` in order to be able to update these variables
# inside the closure in the `foreach_neighbor` loop.
dv_particle = Ref(zero(current_coords_a))

# Loop over all neighbors within the kernel cutoff
@inbounds foreach_neighbor(system_coords, system_coords,
neighborhood_search, backend,
particle) do particle, neighbor,
initial_pos_diff, initial_distance
# Skip neighbors with the same position because the kernel gradient is zero.
# Note that `return` only exits the closure, i.e., skips the current neighbor.
skip_zero_distance(system) && initial_distance < almostzero && return

# Now that we know that `distance` is not zero, we can safely call the unsafe
# version of the kernel gradient to avoid redundant zero checks.
grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
initial_distance, particle)

rho_b = @inbounds system.material_density[neighbor]
m_b = @inbounds system.mass[neighbor]
# PK1 / rho^2
pk1_rho2_b = @inbounds pk1_rho2(system, neighbor)
current_coords_b = @inbounds current_coords(system, neighbor)

# The compiler is smart enough to optimize this away if no penalty force is used
F_b = @inbounds deformation_gradient(system, neighbor)
Comment thread
efaulhaber marked this conversation as resolved.

current_pos_diff_ = current_coords_a - current_coords_b
# On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
current_pos_diff = convert.(eltype(system), current_pos_diff_)
current_distance = norm(current_pos_diff)

dv_particle[] += m_b * (pk1_rho2_a + pk1_rho2_b) * grad_kernel

@inbounds dv_penalty_force!(dv_particle, penalty_force, particle, neighbor,
initial_pos_diff, initial_distance,
current_pos_diff, current_distance,
system, m_a, m_b, rho_a, rho_b, F_a, F_b)

@inbounds dv_viscosity_tlsph!(dv_particle, system, v_system, particle, neighbor,
current_pos_diff, current_distance,
m_a, m_b, rho_a, rho_b, F_a, grad_kernel)
end

for i in 1:ndims(system)
@inbounds dv[i, particle] += dv_particle[][i]
Expand Down
60 changes: 39 additions & 21 deletions src/schemes/structure/total_lagrangian_sph/system.jl
Original file line number Diff line number Diff line change
Expand Up @@ -489,30 +489,48 @@ end

# Loop over all pairs of particles and neighbors within the kernel cutoff
initial_coords = initial_coordinates(system)
foreach_point_neighbor(system, system, initial_coords, initial_coords,
semi) do particle, neighbor, initial_pos_diff, initial_distance
# Skip neighbors with the same position because the kernel gradient is zero.
# Note that `return` only exits the closure, i.e., skips the current neighbor.
skip_zero_distance(system) && initial_distance < almostzero && return
neighborhood_search = get_neighborhood_search(system, system, semi)
backend = semi.parallelization_backend

# Now that we know that `distance` is not zero, we can safely call the unsafe
# version of the kernel gradient to avoid redundant zero checks.
grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
initial_distance, particle)

volume = @inbounds mass[neighbor] / material_density[neighbor]
pos_diff_ = @inbounds current_coords(system, particle) -
current_coords(system, neighbor)
# On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
pos_diff = convert.(eltype(system), pos_diff_)

# Multiply by L_{0a}
L = @inbounds correction_matrix(system, particle)

result = volume * pos_diff * grad_kernel' * L'
@threaded semi for particle in each_integrated_particle(system)
# We are looping over the particles of `system`, so it is guaranteed
# that `particle` is in bounds of `system`.
current_coords_a = @inbounds current_coords(system, particle)
L_a = @inbounds correction_matrix(system, particle)

# Accumulate the contributions over all neighbors before writing
# to `deformation_grad` to reduce the number of memory writes.
# Note that we need a `Ref` in order to be able to update these variables
# inside the closure in the `foreach_neighbor` loop.
result = Ref(zero(L_a))

# Loop over all neighbors within the kernel cutoff
@inbounds foreach_neighbor(initial_coords, initial_coords,
neighborhood_search, backend,
particle) do particle, neighbor,
initial_pos_diff, initial_distance
# Skip neighbors with the same position because the kernel gradient is zero.
# Note that `return` only exits the closure, i.e., skips the current neighbor.
skip_zero_distance(system) && initial_distance < almostzero && return

# Now that we know that `distance` is not zero, we can safely call the unsafe
# version of the kernel gradient to avoid redundant zero checks.
grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
initial_distance, particle)

volume = @inbounds mass[neighbor] / material_density[neighbor]
current_coords_b = @inbounds current_coords(system, neighbor)
pos_diff_ = current_coords_a - current_coords_b
# On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
pos_diff = convert.(eltype(system), pos_diff_)
Comment thread
LasNikas marked this conversation as resolved.

# The tensor product pos_diff ⊗ (L_{0a} * ∇W) is equivalent to multiplication
# by the transpose: pos_diff * (L_{0a} * ∇W)ᵀ = pos_diff * ∇Wᵀ * L_{0a}ᵀ.
result[] -= volume * pos_diff * grad_kernel' * L_a'
end

for j in 1:ndims(system), i in 1:ndims(system)
@inbounds deformation_grad[i, j, particle] -= result[i, j]
@inbounds deformation_grad[i, j, particle] += result[][i, j]
end
end

Expand Down
Loading
Loading