diff --git a/Project.toml b/Project.toml
index f00a02ee74..9f17f48c0a 100644
--- a/Project.toml
+++ b/Project.toml
@@ -58,7 +58,7 @@ JSON = "1"
 KernelAbstractions = "0.9"
 OrdinaryDiffEq = "6.91"
 OrdinaryDiffEqCore = "2, 3"
-PointNeighbors = "0.6.5"
+PointNeighbors = "0.6.6"
 Polyester = "0.7.10"
 ReadVTK = "0.2"
 RecipesBase = "1"
diff --git a/src/general/corrections.jl b/src/general/corrections.jl
index 77f4fae0dd..2cebdb8db8 100644
--- a/src/general/corrections.jl
+++ b/src/general/corrections.jl
@@ -35,15 +35,16 @@ end
 # `rho_mean` is the mean density of the fluid, which is used to determine correction values near the free surface.
 #  Return a tuple `(viscosity_correction, pressure_correction, surface_tension_correction)` representing the correction terms.
 @inline function free_surface_correction(correction::AkinciFreeSurfaceCorrection,
-                                         particle_system, rho_mean)
+                                         particle_system, rho_a, rho_b)
     # Equation 4 in ref
+    rho_mean = (rho_a + rho_b) / 2
     k = correction.rho0 / rho_mean
 
     # Viscosity, pressure, surface_tension
     return k, 1, k
 end
 
-@inline function free_surface_correction(correction, particle_system, rho_mean)
+@inline function free_surface_correction(correction, particle_system, rho_a, rho_b)
     return 1, 1, 1
 end
 
diff --git a/src/general/neighborhood_search.jl b/src/general/neighborhood_search.jl
index b2953a6a20..ce318cea2d 100644
--- a/src/general/neighborhood_search.jl
+++ b/src/general/neighborhood_search.jl
@@ -17,6 +17,18 @@ end
                                     neighborhood_search, particle)
 end
 
+# We cannot dispatch by `AbstractGPUArray` because this is called from within
+# a kernel, where the arrays are device arrays (like `CuDeviceArray`),
+# which are not `AbstractGPUArray`s.
+@inline function foreach_neighbor(f, system_coords, neighbor_coords, neighborhood_search,
+                                  backend::KernelAbstractions.GPU, particle)
+    # On GPUs, remove all bounds checks for maximum performance.
+    # Note that this is not safe if the neighborhood search was not initialized correctly.
+    # For example, this is unsafe when benchmarking `interact!` with the wrong NHS.
+    PointNeighbors.foreach_neighbor_unsafe(f, system_coords, neighbor_coords,
+                                           neighborhood_search, particle)
+end
+
 # === Compact support selection ===
 # -- Generic
 @inline function compact_support(system, neighbor)
diff --git a/src/schemes/fluid/entropically_damped_sph/rhs.jl b/src/schemes/fluid/entropically_damped_sph/rhs.jl
index 142dd2ccf1..2ea1e4dc4d 100644
--- a/src/schemes/fluid/entropically_damped_sph/rhs.jl
+++ b/src/schemes/fluid/entropically_damped_sph/rhs.jl
@@ -76,7 +76,7 @@ function interact!(dv, v_particle_system, u_particle_system,
         @inbounds dv_shifting!(dv_particle, shifting_technique(particle_system),
                                particle_system, neighbor_system,
                                v_particle_system, v_neighbor_system,
-                               particle, neighbor, m_a, m_b, rho_a, rho_b,
+                               particle, neighbor, m_a, m_b, rho_a, rho_b, v_a, v_b,
                                pos_diff, distance, grad_kernel, correction)
 
         @inbounds surface_tension_force!(dv_particle, surface_tension_a,
diff --git a/src/schemes/fluid/implicit_incompressible_sph/rhs.jl b/src/schemes/fluid/implicit_incompressible_sph/rhs.jl
index eedf81bb15..8112021b1a 100644
--- a/src/schemes/fluid/implicit_incompressible_sph/rhs.jl
+++ b/src/schemes/fluid/implicit_incompressible_sph/rhs.jl
@@ -47,17 +47,14 @@ function interact!(dv, v_particle_system, u_particle_system,
         m_a = @inbounds hydrodynamic_mass(particle_system, particle)
         m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
 
+        p_a = @inbounds current_pressure(v_particle_system, particle_system, particle)
         # The following call is equivalent to
-        #     `p_a = particle_pressure(v_particle_system, particle_system, particle)`
-        #     `p_b = particle_pressure(v_neighbor_system, neighbor_system, neighbor)`
-        # Only when the neighbor system is a `WallBoundarySystem` or a `TotalLagrangianSPHSystem`
-        # with the boundary model `PressureMirroring`, this will return `p_b = p_a`, which is
-        # the pressure of the fluid particle.
-        p_a,
-        p_b = @inbounds particle_neighbor_pressure(v_particle_system,
-                                                   v_neighbor_system,
-                                                   particle_system, neighbor_system,
-                                                   particle, neighbor)
+        #     `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
+        # Only when the neighbor system is a `WallBoundarySystem`
+        # or a `TotalLagrangianSPHSystem` with the boundary model `PressureMirroring`,
+        # this will return `p_b = p_a`, which is the pressure of the fluid particle.
+        p_b = @inbounds neighbor_pressure(v_neighbor_system, neighbor_system,
+                                          neighbor, p_a)
 
         dv_pressure = pressure_acceleration(particle_system, neighbor_system,
                                             particle, neighbor,
diff --git a/src/schemes/fluid/shifting_techniques.jl b/src/schemes/fluid/shifting_techniques.jl
index 99e29366da..bfb9ab8117 100644
--- a/src/schemes/fluid/shifting_techniques.jl
+++ b/src/schemes/fluid/shifting_techniques.jl
@@ -43,7 +43,7 @@ end
 # Additional term in the momentum equation due to the shifting technique
 @inline function dv_shifting!(dv_particle, shifting, system, neighbor_system,
                               v_system, v_neighbor_system, particle, neighbor,
-                              m_a, m_b, rho_a, rho_b, pos_diff, distance,
+                              m_a, m_b, rho_a, rho_b, v_a, v_b, pos_diff, distance,
                               grad_kernel, correction)
     return dv_particle
 end
@@ -329,11 +329,11 @@ struct MomentumEquationTermSun2019 end
                                           shifting::ParticleShiftingTechnique,
                                           system, neighbor_system,
                                           v_system, v_neighbor_system, particle, neighbor,
-                                          m_a, m_b, rho_a, rho_b, pos_diff, distance,
-                                          grad_kernel, correction)
+                                          m_a, m_b, rho_a, rho_b, v_a, v_b, pos_diff,
+                                          distance, grad_kernel, correction)
     return dv_shifting!(dv_particle, shifting.momentum_equation_term, system,
                         neighbor_system, v_system, v_neighbor_system,
-                        particle, neighbor, m_a, m_b, rho_a, rho_b,
+                        particle, neighbor, m_a, m_b, rho_a, rho_b, v_a, v_b,
                         pos_diff, distance, grad_kernel, correction)
 end
 
@@ -341,15 +341,17 @@ end
                                           system, neighbor_system,
                                           v_system, v_neighbor_system,
                                           particle, neighbor, m_a, m_b, rho_a, rho_b,
-                                          pos_diff, distance, grad_kernel, correction)
+                                          v_a, v_b, pos_diff, distance,
+                                          grad_kernel, correction)
     delta_v_a = delta_v(system, particle)
     delta_v_b = delta_v(neighbor_system, neighbor)
 
-    v_a = current_velocity(v_system, system, particle)
-    v_b = current_velocity(v_neighbor_system, neighbor_system, neighbor)
-
     tensor_product = v_a * delta_v_a' + v_b * delta_v_b'
-    dv_particle[] += m_b / rho_b *
+
+    # Since this is one of the most performance critical functions, using fast divisions
+    # here gives a significant speedup on GPUs.
+    # See the docs page "Development" for more details on `div_fast`.
+    dv_particle[] += div_fast(m_b, rho_b) *
                      (tensor_product * grad_kernel +
                       v_a * dot(delta_v_a - delta_v_b, grad_kernel))
 
@@ -388,7 +390,10 @@ struct ContinuityEquationTermSun2019 end
                                                              ::ContinuityEquationTermSun2019,
                                                              delta_v_a, delta_v_b,
                                                              rho_a, rho_b)
-    return v_diff + delta_v_a + rho_b / rho_a * delta_v_b
+    # Since this is one of the most performance critical functions, using fast divisions
+    # here gives a significant speedup on GPUs.
+    # See the docs page "Development" for more details on `div_fast`.
+    return v_diff + delta_v_a + div_fast(rho_b, rho_a) * delta_v_b
 end
 
 @inline function second_continuity_equation_term(v_diff, second_continuity_equation_term,
@@ -579,12 +584,9 @@ end
 @propagate_inbounds function dv_shifting!(dv_particle, ::TransportVelocityAdami,
                                           system, neighbor_system,
                                           v_system, v_neighbor_system, particle, neighbor,
-                                          m_a, m_b, rho_a, rho_b, pos_diff, distance,
-                                          grad_kernel, correction)
-    v_a = current_velocity(v_system, system, particle)
+                                          m_a, m_b, rho_a, rho_b, v_a, v_b, pos_diff,
+                                          distance, grad_kernel, correction)
     delta_v_a = delta_v(system, particle)
-
-    v_b = current_velocity(v_neighbor_system, neighbor_system, neighbor)
     delta_v_b = delta_v(neighbor_system, neighbor)
 
     A_a = rho_a * v_a * delta_v_a'
diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl
index 27c3ff8dbf..836063538b 100644
--- a/src/schemes/fluid/weakly_compressible_sph/rhs.jl
+++ b/src/schemes/fluid/weakly_compressible_sph/rhs.jl
@@ -14,145 +14,123 @@ function interact!(dv, v_particle_system, u_particle_system,
 
     system_coords = current_coordinates(u_particle_system, particle_system)
     neighbor_system_coords = current_coordinates(u_neighbor_system, neighbor_system)
-
-    # In order to visualize quantities like pressure forces or viscosity forces, uncomment
-    # the following code and the two other lines below that are marked as "debug example".
-    # debug_array = zeros(ndims(particle_system), nparticles(particle_system))
-
-    # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
-    # and the density diffusion divide by zero.
-    # To account for rounding errors, we check if `distance` is almost zero.
-    # Since the coordinates are in the order of the smoothing length `h`, `distance^2` is in
-    # the order of `h^2`, so we need to check `distance < sqrt(eps(h^2))`.
-    # Note that `sqrt(eps(h^2)) != eps(h)`.
-    h = initial_smoothing_length(particle_system)
-    almostzero = sqrt(eps(h^2))
-
-    # Loop over all pairs of particles and neighbors within the kernel cutoff
-    foreach_point_neighbor(particle_system, neighbor_system,
-                           system_coords, neighbor_system_coords, semi;
-                           points=each_integrated_particle(particle_system)) do particle,
-                                                                                neighbor,
-                                                                                pos_diff,
-                                                                                distance
-        # Skip neighbors with the same position because the kernel gradient is zero.
-        # Note that `return` only exits the closure, i.e., skips the current neighbor.
-        skip_zero_distance(particle_system) && distance < almostzero && return
-
-        # Now that we know that `distance` is not zero, we can safely call the unsafe
-        # version of the kernel gradient to avoid redundant zero checks.
-        grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff,
-                                                   distance, particle)
-
-        # `foreach_point_neighbor` makes sure that `particle` and `neighbor` are
-        # in bounds of the respective system. For performance reasons, we use `@inbounds`
-        # in this hot loop to avoid bounds checking when extracting particle quantities.
-        rho_a = @inbounds current_density(v_particle_system, particle_system, particle)
-        rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor)
-        rho_mean = (rho_a + rho_b) / 2
-
-        v_a = @inbounds current_velocity(v_particle_system, particle_system, particle)
-        v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor)
-
-        # Determine correction factors.
-        # This can be ignored, as these are all 1 when no correction is used.
-        (viscosity_correction, pressure_correction,
-         surface_tension_correction) = free_surface_correction(correction, particle_system,
-                                                               rho_mean)
-
+    neighborhood_search = get_neighborhood_search(particle_system, neighbor_system, semi)
+    backend = semi.parallelization_backend
+
+    # For `distance == 0`, the analytical gradient is zero, but the unsafe gradient divides
+    # by zero. To account for rounding errors, we check if `distance` is almost zero.
+    # Since the coordinates are in the order of the compact support `c`, `distance^2` is in
+    # the order of `c^2`, so we need to check `distance < sqrt(eps(c^2))`.
+    # Note that `sqrt(eps(c^2)) != eps(c)`.
+    compact_support_ = compact_support(particle_system, neighbor_system)
+    almostzero = sqrt(eps(compact_support_^2))
+
+    @threaded semi for particle in each_integrated_particle(particle_system)
+        # We are looping over the particles of `particle_system`, so it is guaranteed
+        # that `particle` is in bounds of `particle_system`.
         m_a = @inbounds hydrodynamic_mass(particle_system, particle)
-        m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
+        p_a = @inbounds current_pressure(v_particle_system, particle_system, particle)
 
         v_a = @inbounds current_velocity(v_particle_system, particle_system, particle)
-        v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor)
-
-        # The following call is equivalent to
-        #     `p_a = current_pressure(v_particle_system, particle_system, particle)`
-        #     `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
-        # Only when the neighbor system is a `WallBoundarySystem` or a `TotalLagrangianSPHSystem`
-        # with the boundary model `PressureMirroring`, this will return `p_b = p_a`, which is
-        # the pressure of the fluid particle.
-        p_a,
-        p_b = @inbounds particle_neighbor_pressure(v_particle_system,
-                                                   v_neighbor_system,
-                                                   particle_system, neighbor_system,
-                                                   particle, neighbor)
-
-        dv_pressure = pressure_correction *
-                      pressure_acceleration(particle_system, neighbor_system,
-                                            particle, neighbor,
-                                            m_a, m_b, p_a, p_b, rho_a, rho_b, pos_diff,
-                                            distance, grad_kernel, correction)
-
-        dv_particle = Ref(dv_pressure)
-
-        # Propagate `@inbounds` to the viscosity function, which accesses particle data
-        @inbounds dv_viscosity!(dv_particle, particle_system, neighbor_system,
-                                v_particle_system, v_neighbor_system,
-                                particle, neighbor, pos_diff, distance,
-                                sound_speed, m_a, m_b, rho_a, rho_b,
-                                v_a, v_b, grad_kernel, viscosity_correction)
-
-        # Extra terms in the momentum equation when using a shifting technique
-        @inbounds dv_shifting!(dv_particle, shifting_technique(particle_system),
-                               particle_system, neighbor_system,
-                               v_particle_system, v_neighbor_system,
-                               particle, neighbor, m_a, m_b, rho_a, rho_b,
-                               pos_diff, distance, grad_kernel, correction)
-
-        @inbounds surface_tension_force!(dv_particle,
-                                         surface_tension_a, surface_tension_b,
-                                         particle_system, neighbor_system,
-                                         particle, neighbor, pos_diff, distance,
-                                         rho_a, rho_b, grad_kernel,
-                                         surface_tension_correction)
-
-        @inbounds adhesion_force!(dv_particle, surface_tension_a, particle_system,
-                                  neighbor_system,
-                                  particle, neighbor, pos_diff, distance)
-
-        for i in 1:ndims(particle_system)
-            @inbounds dv[i, particle] += dv_particle[][i]
-            # Debug example
-            # debug_array[i, particle] += dv_pressure[i]
-        end
+        rho_a = @inbounds current_density(v_particle_system, particle_system, particle)
 
+        # Accumulate the RHS contributions over all neighbors before writing to `dv`,
+        # to reduce the number of memory writes.
+        # Note that we need a `Ref` in order to be able to update these variables
+        # inside the closure in the `foreach_neighbor` loop.
+        dv_particle = Ref(zero(v_a))
         drho_particle = Ref(zero(rho_a))
 
-        # TODO If variable smoothing_length is used, this should use the neighbor smoothing length
-        # Propagate `@inbounds` to the continuity equation, which accesses particle data
-        @inbounds continuity_equation!(drho_particle, density_calculator,
-                                       particle_system, neighbor_system,
-                                       particle, neighbor, pos_diff, distance,
-                                       m_b, rho_a, rho_b, v_a, v_b, grad_kernel)
+        # Loop over all neighbors within the kernel cutoff
+        @inbounds foreach_neighbor(system_coords, neighbor_system_coords,
+                                   neighborhood_search, backend,
+                                   particle) do particle, neighbor, pos_diff, distance
+            # Skip neighbors with the same position because the kernel gradient is zero.
+            # Note that `return` only exits the closure, i.e., skips the current neighbor.
+            skip_zero_distance(particle_system) && distance < almostzero && return
+
+            # Now that we know that `distance` is not zero, we can safely call the unsafe
+            # version of the kernel gradient to avoid redundant zero checks.
+            grad_kernel = smoothing_kernel_grad_unsafe(particle_system, pos_diff,
+                                                       distance, particle)
+
+            # `foreach_neighbor` makes sure that `neighbor` is in bounds of `neighbor_system`
+            m_b = @inbounds hydrodynamic_mass(neighbor_system, neighbor)
+            v_b = @inbounds current_velocity(v_neighbor_system, neighbor_system, neighbor)
+            rho_b = @inbounds current_density(v_neighbor_system, neighbor_system, neighbor)
+
+            # The following call is equivalent to
+            #     `p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)`
+            # Only when the neighbor system is a `WallBoundarySystem`
+            # or a `TotalLagrangianSPHSystem` with the boundary model `PressureMirroring`,
+            # this will return `p_b = p_a`, which is the pressure of the fluid particle.
+            p_b = @inbounds neighbor_pressure(v_neighbor_system, neighbor_system,
+                                              neighbor, p_a)
+
+            # Determine correction factors.
+            # This can usually be ignored, as these are all 1 when no correction is used.
+            (viscosity_correction, pressure_correction,
+             surface_tension_correction) = free_surface_correction(correction,
+                                                                   particle_system,
+                                                                   rho_a, rho_b)
+
+            # For `ContinuityDensity` without correction, this is equivalent to
+            # dv_pressure = -m_b * (p_a + p_b) / (rho_a * rho_b) * grad_kernel
+            dv_pressure = pressure_acceleration(particle_system, neighbor_system,
+                                                particle, neighbor,
+                                                m_a, m_b, p_a, p_b, rho_a, rho_b, pos_diff,
+                                                distance, grad_kernel, correction)
+            dv_particle[] += dv_pressure * pressure_correction
+
+            # Propagate `@inbounds` to the viscosity function, which accesses particle data
+            @inbounds dv_viscosity!(dv_particle, particle_system, neighbor_system,
+                                    v_particle_system, v_neighbor_system,
+                                    particle, neighbor, pos_diff, distance,
+                                    sound_speed, m_a, m_b, rho_a, rho_b,
+                                    v_a, v_b, grad_kernel, viscosity_correction)
+
+            # Extra terms in the momentum equation when using a shifting technique
+            @inbounds dv_shifting!(dv_particle, shifting_technique(particle_system),
+                                   particle_system, neighbor_system,
+                                   v_particle_system, v_neighbor_system,
+                                   particle, neighbor, m_a, m_b, rho_a, rho_b, v_a, v_b,
+                                   pos_diff, distance, grad_kernel, correction)
+
+            @inbounds surface_tension_force!(dv_particle,
+                                             surface_tension_a, surface_tension_b,
+                                             particle_system, neighbor_system,
+                                             particle, neighbor, pos_diff, distance,
+                                             rho_a, rho_b, grad_kernel,
+                                             surface_tension_correction)
+
+            @inbounds adhesion_force!(dv_particle, surface_tension_a, particle_system,
+                                      neighbor_system,
+                                      particle, neighbor, pos_diff, distance)
+
+            # TODO If variable smoothing_length is used, this should use the neighbor smoothing length
+            # Propagate `@inbounds` to the continuity equation, which accesses particle data
+            @inbounds continuity_equation!(drho_particle, density_calculator,
+                                           particle_system, neighbor_system,
+                                           particle, neighbor, pos_diff, distance,
+                                           m_b, rho_a, rho_b, v_a, v_b, grad_kernel)
+        end
 
+        for i in eachindex(dv_particle[])
+            @inbounds dv[i, particle] += dv_particle[][i]
+        end
         @inbounds write_drho_particle!(dv, density_calculator, drho_particle, particle)
     end
-    # Debug example
-    # periodic_box = neighborhood_search.periodic_box
-    # Note: this saves a file in every stage of the integrator
-    # if !@isdefined iter; iter = 0; end
-    # TODO: This call should use public API. This requires some additional changes to simplify the calls.
-    # trixi2vtk(v_particle_system, u_particle_system, -1.0, particle_system, periodic_box, debug=debug_array, prefix="debug", iter=iter += 1)
 
     return dv
 end
 
-@propagate_inbounds function particle_neighbor_pressure(v_particle_system,
-                                                        v_neighbor_system,
-                                                        particle_system, neighbor_system,
-                                                        particle, neighbor)
-    p_a = current_pressure(v_particle_system, particle_system, particle)
-    p_b = current_pressure(v_neighbor_system, neighbor_system, neighbor)
-
-    return p_a, p_b
+@propagate_inbounds function neighbor_pressure(v_neighbor_system, neighbor_system,
+                                               neighbor, p_a)
+    return current_pressure(v_neighbor_system, neighbor_system, neighbor)
 end
 
-@inline function particle_neighbor_pressure(v_particle_system, v_neighbor_system,
-                                            particle_system,
-                                            neighbor_system::WallBoundarySystem{<:BoundaryModelDummyParticles{PressureMirroring}},
-                                            particle, neighbor)
-    p_a = current_pressure(v_particle_system, particle_system, particle)
-
-    return p_a, p_a
+@inline function neighbor_pressure(v_neighbor_system,
+                                   neighbor_system::WallBoundarySystem{<:BoundaryModelDummyParticles{PressureMirroring}},
+                                   neighbor, p_a)
+    return p_a
 end