trixi-framework · efaulhaber · Apr 16, 2026 · Apr 8, 2026 · Apr 9, 2026 · Apr 14, 2026
diff --git a/Project.toml b/Project.toml
@@ -25,6 +25,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ReadVTK = "dc215faf-f008-4882-a9f7-a79a826fadc3"
 RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
+SIMD = "fdea26ae-647d-5447-a871-4b548cad5224"
 SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
@@ -34,13 +35,13 @@ TrixiBase = "9a0f1c46-06d5-4909-a5a3-ce25d3fa3284"
 WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
 
 [weakdeps]
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
 OrdinaryDiffEqCore = "bbf590c4-e513-4bbe-9b18-05decba2e5d8"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 
 [extensions]
-TrixiParticlesOrdinaryDiffEqExt = ["OrdinaryDiffEq", "OrdinaryDiffEqCore"]
 TrixiParticlesCUDAExt = "CUDA"
+TrixiParticlesOrdinaryDiffEqExt = ["OrdinaryDiffEq", "OrdinaryDiffEqCore"]
 
 [compat]
 Accessors = "0.1.43"
@@ -63,6 +64,7 @@ Polyester = "0.7.10"
 ReadVTK = "0.2"
 RecipesBase = "1"
 Reexport = "1"
+SIMD = "3.7.2"
 SciMLBase = "2"
 StaticArrays = "1"
 Statistics = "1"

diff --git a/src/TrixiParticles.jl b/src/TrixiParticles.jl
@@ -25,6 +25,7 @@ using Random: seed!
 using SciMLBase: SciMLBase, CallbackSet, DiscreteCallback, DynamicalODEProblem, u_modified!,
                  get_tmp_cache, set_proposed_dt!, ODESolution, ODEProblem, terminate!,
                  add_tstop!
+using SIMD: SIMD
 @reexport using StaticArrays: SVector
 using StaticArrays: @SMatrix, SMatrix, setindex
 using Statistics: Statistics

diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl
@@ -68,17 +68,46 @@ end
 end
 
 # Return `A[:, :, i]` as an `SMatrix`.
-@inline function extract_smatrix(A, system, particle)
-    @boundscheck checkbounds(A, ndims(system), ndims(system), particle)
+@propagate_inbounds function extract_smatrix(A, system, particle)
+    return extract_smatrix(A, Val(ndims(system)), particle)
+end
+
+@inline function extract_smatrix(A, ::Val{NDIMS}, particle) where {NDIMS}
+    @boundscheck checkbounds(A, NDIMS, NDIMS, particle)
 
     # Extract the matrix elements for this particle as a tuple to pass to SMatrix
-    return SMatrix{ndims(system),
-                   ndims(system)}(ntuple(@inline(i->@inbounds A[mod(i - 1,
-                                                                    ndims(system)) + 1,
-                                                                div(i - 1,
-                                                                    ndims(system)) + 1,
-                                                                particle]),
-                                         Val(ndims(system)^2)))
+    return SMatrix{NDIMS, NDIMS}(ntuple(@inline(i->@inbounds A[mod(i - 1, NDIMS) + 1,
+                                                               div(i - 1, NDIMS) + 1,
+                                                               particle]),
+                                        Val(NDIMS^2)))
+end
+
+# Optimized version for 2D, which uses SIMD.jl to combine the 4 loads of the 2x2 matrix
+# into a single wide load. This is significantly faster on GPUs than 4 individual loads.
+# WARNING:
+# 1. This only works if the matrix elements are stored contiguously in memory.
+#    The 4 elements of the 2x2 matrix for a particle must immediately follow the
+#    4 elements of the previous particle's matrix, with no padding in between.
+# 2. The pointer of `A` must be aligned to the size of `Vec{4, eltype(A)}`.
+#    This is guaranteed if `A` is allocated by Julia and has the correct size,
+#    but may not be true if `A` is a view or subarray.
+@propagate_inbounds function extract_smatrix_aligned(A, system, particle)
+    return extract_smatrix_aligned(A, Val(ndims(system)), particle)
+end
+
+@inline function extract_smatrix_aligned(A, ::Val{2}, particle)
+    @boundscheck checkbounds(A, 2, 2, particle)
+
+    # Note that this doesn't work in 3D because it requires a stride of 2^n.
+    x = SIMD.vloada(SIMD.Vec{4, eltype(A)}, pointer(A, 4 * (particle - 1) + 1))
+
+    return SMatrix{2, 2}(Tuple(x))
+end
+
+# Fall back to the generic version when not in 2D.
+@propagate_inbounds function extract_smatrix_aligned(A, ::Val{NDIMS},
+                                                     particle) where {NDIMS}
+    return extract_smatrix(A, Val(NDIMS), particle)
 end
 
 # Specifically get the current coordinates of a particle for all system types.

diff --git a/src/general/corrections.jl b/src/general/corrections.jl
@@ -395,7 +395,9 @@ end
 
 function correction_matrix_inversion_step!(corr_matrix, system, semi)
     @threaded semi for particle in eachparticle(system)
-        L = extract_smatrix(corr_matrix, system, particle)
+        # `corr_matrix` is not a view, so we can use the fast `extract_smatrix_aligned`
+        # instead of the generic `extract_smatrix`.
+        L = extract_smatrix_aligned(corr_matrix, system, particle)
 
         # The matrix `L` only becomes singular when the particle and all neighbors
         # are collinear (in 2D) or lie all in the same plane (in 3D).

diff --git a/src/general/neighborhood_search.jl b/src/general/neighborhood_search.jl
@@ -11,6 +11,12 @@ function PointNeighbors.foreach_point_neighbor(f, system, neighbor_system,
                            points, parallelization_backend)
 end
 
+@propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_coords,
+                                              neighborhood_search, backend, particle)
+    PointNeighbors.foreach_neighbor(f, system_coords, neighbor_coords,
+                                    neighborhood_search, particle)
+end
+
 # === Compact support selection ===
 # -- Generic
 @inline function compact_support(system, neighbor)

diff --git a/src/schemes/boundary/wall_boundary/system.jl b/src/schemes/boundary/wall_boundary/system.jl
@@ -179,6 +179,11 @@ end
     return kernel(smoothing_kernel, distance, smoothing_length)
 end
 
+@inline function smoothing_kernel_unsafe(system::WallBoundarySystem, distance, particle)
+    (; smoothing_kernel, smoothing_length) = system.boundary_model
+    return kernel_unsafe(smoothing_kernel, distance, smoothing_length)
+end
+
 @inline function smoothing_length(system::WallBoundarySystem, particle)
     return smoothing_length(system.boundary_model, particle)
 end

diff --git a/src/schemes/structure/total_lagrangian_sph/penalty_force.jl b/src/schemes/structure/total_lagrangian_sph/penalty_force.jl
@@ -14,38 +14,53 @@ struct PenaltyForceGanzenmueller{ELTYPE}
     end
 end
 
-@inline function dv_penalty_force(penalty_force::Nothing,
-                                  particle, neighbor, initial_pos_diff, initial_distance,
-                                  current_pos_diff, current_distance,
-                                  system, m_a, m_b, rho_a, rho_b)
-    return zero(initial_pos_diff)
+@inline function dv_penalty_force!(dv_particle, penalty_force::Nothing,
+                                   particle, neighbor, initial_pos_diff, initial_distance,
+                                   current_pos_diff, current_distance,
+                                   system, m_a, m_b, rho_a, rho_b, F_a, F_b)
+    return dv_particle
 end
 
-@propagate_inbounds function dv_penalty_force(penalty_force::PenaltyForceGanzenmueller,
-                                              particle, neighbor, initial_pos_diff,
-                                              initial_distance,
-                                              current_pos_diff, current_distance,
-                                              system, m_a, m_b, rho_a, rho_b)
-    volume_a = m_a / rho_a
-    volume_b = m_b / rho_b
+@propagate_inbounds function dv_penalty_force!(dv_particle,
+                                               penalty_force::PenaltyForceGanzenmueller,
+                                               particle, neighbor, initial_pos_diff,
+                                               initial_distance,
+                                               current_pos_diff, current_distance,
+                                               system, m_a, m_b, rho_a, rho_b, F_a, F_b)
+    (; alpha) = penalty_force
 
-    kernel_weight = smoothing_kernel(system, initial_distance, particle)
+    # Since this is one of the most performance critical functions, using fast divisions
+    # here gives a significant speedup on GPUs.
+    # See the docs page "Development" for more details on `div_fast`.
+    volume_a = div_fast(m_a, rho_a)
+    volume_b = div_fast(m_b, rho_b)
 
-    F_a = deformation_gradient(system, particle)
-    F_b = deformation_gradient(system, neighbor)
+    # This function is called after a compact support check, so we can use the unsafe
+    # kernel function, which does not check the distance again.
+    kernel_weight = smoothing_kernel_unsafe(system, initial_distance, particle)
 
-    inv_current_distance = 1 / current_distance
+    E_a = young_modulus(system, particle)
+    E_b = young_modulus(system, neighbor)
 
-    # Use the symmetry of epsilon to simplify computations
-    eps_sum = (F_a + F_b) * initial_pos_diff - 2 * current_pos_diff
-    delta_sum = dot(eps_sum, current_pos_diff) * inv_current_distance
+    # Note that this is actually ϵ^b_ab = -ϵ^b_ba in the paper, so we later compute
+    # δ^b_ab instead of δ^b_ba, but δ^b_ab = δ^b_ba because of antisymmetry of x_ab and ϵ_ab.
+    eps_a = F_a * initial_pos_diff - current_pos_diff
+    eps_b = F_b * initial_pos_diff - current_pos_diff
 
-    E = young_modulus(system, particle)
+    # This is (E_a * delta_a + E_b * delta_b) * current_distance.
+    # Pulling the division by `current_distance` out allows us to do one division by
+    # `current_distance^2` instead.
+    delta_sum = E_a * dot(eps_a, current_pos_diff) + E_b * dot(eps_b, current_pos_diff)
 
-    f = (penalty_force.alpha / 2) * volume_a * volume_b *
-        kernel_weight / initial_distance^2 * E * delta_sum * current_pos_diff *
-        inv_current_distance
+    # The division contains all scalar factors, which are then multiplied by
+    # the vector `current_pos_diff` at the end.
+    # We already divide by `m_a` to obtain an acceleration.
+    # Since this is one of the most performance critical functions, using fast divisions
+    # here gives a significant speedup on GPUs.
+    # See the docs page "Development" for more details on `div_fast`.
+    dv_particle[] += div_fast((alpha / 2) * volume_a * volume_b * kernel_weight * delta_sum,
+                              initial_distance^2 * current_distance^2 * m_a) *
+                     current_pos_diff
 
-    # Divide force by mass to obtain acceleration
-    return f / m_a
+    return dv_particle
 end
diff --git a/src/schemes/structure/total_lagrangian_sph/rhs.jl b/src/schemes/structure/total_lagrangian_sph/rhs.jl
@@ -19,6 +19,8 @@ end
 
     # Everything here is done in the initial coordinates
     system_coords = initial_coordinates(system)
+    neighborhood_search = get_neighborhood_search(system, semi)
+    backend = semi.parallelization_backend
 
     # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
     # and the density diffusion divide by zero.
@@ -29,48 +31,61 @@ end
     h = initial_smoothing_length(system)
     almostzero = sqrt(eps(h^2))
 
-    # Loop over all pairs of particles and neighbors within the kernel cutoff.
-    # For structure-structure interaction, this has to happen in the initial coordinates.
-    foreach_point_neighbor(system, system, system_coords, system_coords, semi;
-                           points=each_integrated_particle(system)) do particle, neighbor,
-                                                                       initial_pos_diff,
-                                                                       initial_distance
-        # Skip neighbors with the same position because the kernel gradient is zero.
-        # Note that `return` only exits the closure, i.e., skips the current neighbor.
-        skip_zero_distance(system) && initial_distance < almostzero && return
-
-        # Now that we know that `distance` is not zero, we can safely call the unsafe
-        # version of the kernel gradient to avoid redundant zero checks.
-        grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
-                                                   initial_distance, particle)
-
-        rho_a = @inbounds system.material_density[particle]
-        rho_b = @inbounds system.material_density[neighbor]
-
+    @threaded semi for particle in each_integrated_particle(system)
+        # We are looping over the particles of `system`, so it is guaranteed
+        # that `particle` is in bounds of `system`.
         m_a = @inbounds system.mass[particle]
-        m_b = @inbounds system.mass[neighbor]
-
+        rho_a = @inbounds system.material_density[particle]
         # PK1 / rho^2
         pk1_rho2_a = @inbounds pk1_rho2(system, particle)
-        pk1_rho2_b = @inbounds pk1_rho2(system, neighbor)
-
-        current_pos_diff_ = @inbounds current_coords(system, particle) -
-                                      current_coords(system, neighbor)
-        # On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
-        current_pos_diff = convert.(eltype(system), current_pos_diff_)
-        current_distance = norm(current_pos_diff)
-
-        dv_stress = m_b * (pk1_rho2_a + pk1_rho2_b) * grad_kernel
-
-        dv_penalty_force_ = @inbounds dv_penalty_force(penalty_force, particle, neighbor,
-                                                       initial_pos_diff, initial_distance,
-                                                       current_pos_diff, current_distance,
-                                                       system, m_a, m_b, rho_a, rho_b)
-
-        dv_particle = Ref(dv_stress + dv_penalty_force_)
-        @inbounds dv_viscosity_tlsph!(dv_particle, system, v_system, particle, neighbor,
-                                      current_pos_diff, current_distance,
-                                      m_a, m_b, rho_a, rho_b, grad_kernel)
+        current_coords_a = @inbounds current_coords(system, particle)
+        F_a = @inbounds deformation_gradient(system, particle)
+
+        # Accumulate the RHS contributions over all neighbors before writing to `dv`
+        # to reduce the number of memory writes.
+        # Note that we need a `Ref` in order to be able to update these variables
+        # inside the closure in the `foreach_neighbor` loop.
+        dv_particle = Ref(zero(current_coords_a))
+
+        # Loop over all neighbors within the kernel cutoff
+        @inbounds foreach_neighbor(system_coords, system_coords,
+                                   neighborhood_search, backend,
+                                   particle) do particle, neighbor,
+                                                initial_pos_diff, initial_distance
+            # Skip neighbors with the same position because the kernel gradient is zero.
+            # Note that `return` only exits the closure, i.e., skips the current neighbor.
+            skip_zero_distance(system) && initial_distance < almostzero && return
+
+            # Now that we know that `distance` is not zero, we can safely call the unsafe
+            # version of the kernel gradient to avoid redundant zero checks.
+            grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
+                                                       initial_distance, particle)
+
+            rho_b = @inbounds system.material_density[neighbor]
+            m_b = @inbounds system.mass[neighbor]
+            # PK1 / rho^2
+            pk1_rho2_b = @inbounds pk1_rho2(system, neighbor)
+            current_coords_b = @inbounds current_coords(system, neighbor)
+
+            # The compiler is smart enough to optimize this away if no penalty force is used
+            F_b = @inbounds deformation_gradient(system, neighbor)
+
+            current_pos_diff_ = current_coords_a - current_coords_b
+            # On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
+            current_pos_diff = convert.(eltype(system), current_pos_diff_)
+            current_distance = norm(current_pos_diff)
+
+            dv_particle[] += m_b * (pk1_rho2_a + pk1_rho2_b) * grad_kernel
+
+            @inbounds dv_penalty_force!(dv_particle, penalty_force, particle, neighbor,
+                                        initial_pos_diff, initial_distance,
+                                        current_pos_diff, current_distance,
+                                        system, m_a, m_b, rho_a, rho_b, F_a, F_b)
+
+            @inbounds dv_viscosity_tlsph!(dv_particle, system, v_system, particle, neighbor,
+                                          current_pos_diff, current_distance,
+                                          m_a, m_b, rho_a, rho_b, F_a, grad_kernel)
+        end
 
         for i in 1:ndims(system)
             @inbounds dv[i, particle] += dv_particle[][i]