Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 22 additions & 9 deletions ext/cuda/operators_sem_shmem.jl
Original file line number Diff line number Diff line change
Expand Up @@ -164,9 +164,10 @@ Base.@propagate_inbounds function operator_shmem(
FT = Spaces.undertype(space)
QS = Spaces.quadrature_style(space)
Nq = Quadratures.degrees_of_freedom(QS)
JT = operator_return_eltype(op, eltype(arg1), FT)
# allocate temp output for Ju1 and psi
Ju1 = CUDA.CuStaticSharedArray(FT, (Nq, Nvt))
psi = CUDA.CuStaticSharedArray(FT, (Nq, Nvt))
Ju1 = CUDA.CuStaticSharedArray(JT, (Nq, Nvt))
psi = CUDA.CuStaticSharedArray(eltype(arg2), (Nq, Nvt))
return (Ju1, psi)
end
Base.@propagate_inbounds function operator_shmem(
Expand All @@ -179,10 +180,11 @@ Base.@propagate_inbounds function operator_shmem(
FT = Spaces.undertype(space)
QS = Spaces.quadrature_style(space)
Nq = Quadratures.degrees_of_freedom(QS)
JT = operator_return_eltype(op, eltype(arg1), FT)
# allocate temp output for Ju1, Ju2, and psi
Ju1 = CUDA.CuStaticSharedArray(FT, (Nq, Nq, Nvt))
Ju2 = CUDA.CuStaticSharedArray(FT, (Nq, Nq, Nvt))
psi = CUDA.CuStaticSharedArray(FT, (Nq, Nq, Nvt))
Ju1 = CUDA.CuStaticSharedArray(JT, (Nq, Nq, Nvt))
Ju2 = CUDA.CuStaticSharedArray(JT, (Nq, Nq, Nvt))
psi = CUDA.CuStaticSharedArray(eltype(arg2), (Nq, Nq, Nvt))
return (Ju1, Ju2, psi)
end

Expand All @@ -198,7 +200,11 @@ Base.@propagate_inbounds function operator_fill_shmem!(
vt = threadIdx().z
local_geometry = get_local_geometry(space, ij, slabidx)
i, _ = ij.I
Ju1[i, vt] = local_geometry.J * Geometry.contravariant1(arg1, local_geometry)
Ju1[i, vt] =
local_geometry.J ⊠ RecursiveApply.rmap(
u -> Geometry.contravariant1(u, local_geometry),
arg1,
)
psi[i, vt] = arg2
end

Expand All @@ -214,9 +220,16 @@ Base.@propagate_inbounds function operator_fill_shmem!(
vt = threadIdx().z
local_geometry = get_local_geometry(space, ij, slabidx)
i, j = ij.I

Ju1[i, j, vt] = local_geometry.J * Geometry.contravariant1(arg1, local_geometry)
Ju2[i, j, vt] = local_geometry.J * Geometry.contravariant2(arg1, local_geometry)
Ju1[i, j, vt] =
local_geometry.J ⊠ RecursiveApply.rmap(
u -> Geometry.contravariant1(u, local_geometry),
arg1,
)
Ju2[i, j, vt] =
local_geometry.J ⊠ RecursiveApply.rmap(
u -> Geometry.contravariant2(u, local_geometry),
arg1,
)
psi[i, j, vt] = arg2
end

Expand Down
44 changes: 24 additions & 20 deletions ext/cuda/operators_spectral_element.jl
Original file line number Diff line number Diff line change
Expand Up @@ -294,18 +294,20 @@ Base.@propagate_inbounds function operator_evaluate(
QS = Spaces.quadrature_style(space)
Nq = Quadratures.degrees_of_freedom(QS)
D = Quadratures.differentiation_matrix(FT, QS)
RT = Geometry.rmul_return_type(eltype(Ju1), eltype(psi))

local_geometry = get_local_geometry(space, ij, slabidx)

# Compute split-form divergence: sum_j D[i,j] * flux_ij
# where flux_ij = 0.5 * (Ju1[i] + Ju1[j]) * (psi[i] + psi[j])
result = zero(FT)
result = zero(RT)
for j in 1:Nq
flux_ij = 0.5 * (Ju1[i, vt] + Ju1[j, vt]) * (psi[i, vt] + psi[j, vt])
result = result + D[i, j] * flux_ij
j == i && continue
F1 = RecursiveApply.rdiv(
(Ju1[i, vt] ⊞ Ju1[j, vt]) ⊠ (psi[i, vt] ⊞ psi[j, vt]),
2,
)
result = result ⊞ D[i, j] ⊠ F1
end

return result * local_geometry.invJ
return result ⊠ local_geometry.invJ
end
Base.@propagate_inbounds function operator_evaluate(
op::SplitDivergence{(1, 2)},
Expand All @@ -321,26 +323,28 @@ Base.@propagate_inbounds function operator_evaluate(
QS = Spaces.quadrature_style(space)
Nq = Quadratures.degrees_of_freedom(QS)
D = Quadratures.differentiation_matrix(FT, QS)
RT = Geometry.rmul_return_type(eltype(Ju1), eltype(psi))

local_geometry = get_local_geometry(space, ij, slabidx)

# Compute split-form divergence in 2D
# First dimension: sum_k D[i,k] * flux_ik
# where flux_ik = 0.5 * (Ju1[i,j] + Ju1[k,j]) * (psi[i,j] + psi[k,j])
result = zero(FT)
result = zero(RT)
for k in 1:Nq
flux_ik = 0.5 * (Ju1[i, j, vt] + Ju1[k, j, vt]) * (psi[i, j, vt] + psi[k, j, vt])
result = result + D[i, k] * flux_ik
k == i && continue
F1 = RecursiveApply.rdiv(
(Ju1[i, j, vt] ⊞ Ju1[k, j, vt]) ⊠ (psi[i, j, vt] ⊞ psi[k, j, vt]),
2,
)
result = result ⊞ D[i, k] ⊠ F1
end

# Second dimension: sum_k D[j,k] * flux_jk
# where flux_jk = 0.5 * (Ju2[i,j] + Ju2[i,k]) * (psi[i,j] + psi[i,k])
for k in 1:Nq
flux_jk = 0.5 * (Ju2[i, j, vt] + Ju2[i, k, vt]) * (psi[i, j, vt] + psi[i, k, vt])
result = result + D[j, k] * flux_jk
k == j && continue
F2 = RecursiveApply.rdiv(
(Ju2[i, j, vt] ⊞ Ju2[i, k, vt]) ⊠ (psi[i, j, vt] ⊞ psi[i, k, vt]),
2,
)
result = result ⊞ D[j, k] ⊠ F2
end

return result * local_geometry.invJ
return result ⊠ local_geometry.invJ
end

Base.@propagate_inbounds function operator_evaluate(
Expand Down
180 changes: 106 additions & 74 deletions src/Operators/spectralelement.jl
Original file line number Diff line number Diff line change
Expand Up @@ -625,9 +625,9 @@ end

"""
split_div = SplitDivergence()
split_div.(u, ψ)
split_div.(ρu, ψ)

Computes the divergence of the product `u * ψ` using a **split-form (entropy-stable)** discretization.
Computes the divergence of the product `ρu * ψ` using a **split-form (entropy-stable)** discretization.

This operator is designed for the advection of scalar quantities in conservation laws (e.g.,
thermodynamic variables or tracers). By evaluating the divergence using a specific averaging of the
Expand All @@ -636,43 +636,72 @@ of two spectrally variable fields, thereby inhibiting the growth of quadratic in
cold temperature spikes) without requiring hyperviscosity.

# Arguments
- `u`: The transport vector field, typically the **mass flux** (``\\rho \\mathbf{u}``). It must be a vector quantity (e.g., `Geometry.Covariant12Vector`).
- `ρu`: The transport vector field, typically the **mass flux**. It must be a vector quantity (e.g., `Geometry.Contravariant12Vector`).
- `ψ`: The **specific** scalar quantity to be advected (e.g., specific total energy ``e_{tot}`` or specific humidity ``q_{tot}``).

# Mathematical Formulation

## Continuous
The split form is defined as the arithmetic mean of the conservative form and the advective form:
The split form of the divergence operator is defined as the arithmetic mean of
the conservative and advective forms:
```math
\\nabla \\cdot (\\rho \\mathbf{u} \\psi)|_{split} = \\frac{1}{2} \\nabla \\cdot (\\rho \\mathbf{u} \\psi) + \\frac{1}{2} \\left( \\psi \\nabla \\cdot (\\rho \\mathbf{u}) + \\rho \\mathbf{u} \\cdot \\nabla \\psi \\right)
\\nabla \\cdot (\\rho \\mathbf{u} \\psi)|_\\textrm{split} =
\\frac{1}{2} \\nabla \\cdot (\\rho \\mathbf{u} \\psi) +
\\frac{1}{2} \\left(
\\psi \\nabla \\cdot (\\rho \\mathbf{u}) +
\\rho \\mathbf{u} \\cdot \\nabla \\psi
\\right)
```

## Discrete (SBP)
The implementation uses the generalized summation-by-parts (SBP) property, where the divergence is the inverse of the mass matrix acting on the stiffness matrix. Let \\delta V_i = w_i J_i be the physical volume element (weights w, Jacobian J) and Q be the skew-symmetric stiffness matrix. The divergence is:
## Discrete
The discretized split operator is equivalent to using the strong formulation of
the gradient operator and the weak formulation of the divergence operator:
```math
(Div_{split})_i = \\frac{1}{\\delta V_i} \\sum_j Q_{ij} F_{ij}
\\textrm{split_div}(\\rho \\mathbf{u}, \\psi) =
\\frac{1}{2} \\textrm{wdiv}(\\rho \\mathbf{u} \\psi) +
\\frac{1}{2} \\left(
\\psi \\textrm{wdiv}(\\rho \\mathbf{u}) +
\\rho \\mathbf{u} \\cdot \\textrm{grad}(\\psi)
\\right)
```
where F_{ij} is the symmetric two-point flux.

**Reduction to differentiation matrix form:** For Legendre-Gauss-Lobatto (LGL) elements, the stiffness matrix satisfies Q_{ij} = w_i D_{ij}. Substituting this into the equation above, the quadrature weights w_i cancel out:
```math
\\frac{1}{w_i J_i} \\sum_j (w_i D_{ij}) F_{ij} = \\frac{1}{J_i} \\sum_j D_{ij} F_{ij}
Swapping the weak and strong formulations in the last two terms also results in
the same operator. The discrete form of the divergence theorem, which stems from
the generalized summation-by-parts (SBP) property, guarantees that the integral
of the first term vanishes,
```math
\\int_\\Omega \\textrm{wdiv}(\\rho \\mathbf{u} \\psi) dV = 0
```
while the integrals of the other two terms cancel out,
```math
\\int_\\Omega \\psi \\textrm{wdiv}(\\rho \\mathbf{u}) dV =
-\\int_\\Omega \\rho \\mathbf{u} \\cdot \\textrm{grad}(\\psi) dV
```
So, this discretization ensures that the split operator conserves the integral
of ``\\rho \\mathbf{u} \\psi``.

This is the form implemented in the code:
## Two-Point
A more compact representation of the discretized operator can be obtained with
the symmetric two-point flux, whose values in one dimension are
```math
(Div_{split})_i = \\frac{1}{J_i} \\sum_j D_{ij} F_{ij}
(F^1)_{ij} =
\\frac{1}{2} (\\rho_i J_i (u^1)_i + \\rho_j J_j (u^1)_j) (\\psi_i + \\psi_j)
```
where ``F_{ij}`` is the symmetric **two-point flux** between nodes ``i`` and ``j``:
With ``D`` denoting the spectral derivative matrix, the split operator in one
dimension can be expressed as
```math
F_{ij} = \\frac{1}{2} \\left( U_i + U_j \\right) (\\psi_i + \\psi_j)
\\textrm{split_div}(\\rho \\mathbf{u}, \\psi)_i =
\\frac{1}{J_i} \\sum_{j \\neq i} D_{ij} (F^1)_{ij}
```
Here, ``U`` represents the Jacobian-weighted contravariant flux component (``U^1 = J u^1``). In 2D/3D tensor-product grids, this 1D split operator is
applied sequentially along each dimension.
In two dimensions, ``F^1`` and the analogous quantity ``F^2`` provide a similar
expression for the split divergence, with the one-dimensional operator applied
sequentially along each dimension.

# Properties
1. **Conservation:** The operator remains conservative.
2. **Consistency:** If `ψ` is spatially constant (e.g., `ψ = 1`), the operator degenerates to the standard divergence of `u` (mass continuity).
3. **Complexity:** Unlike standard spectral element divergence which uses tensor product factorization for ``O(N)`` cost, this operator requires explicit interaction between all node pairs on a 1D line, resulting in ``O(N^2)`` complexity per line (where ``N`` is the polynomial order).
1. **Conservation:** The split operator conserves ``\\rho \\mathbf{u} \\psi``
2. **Consistency:** If ``\\psi = 1``, the split operator degenerates to the
weak formulation of ``\\nabla \\cdot \\rho \\mathbf{u}`` (mass continuity)
3. **Complexity:** The split operator has the same ``O(N^2)`` complexity per
element as the strong and weak operators, but needs twice as many operations

# References
- Fisher, T. C., & Carpenter, M. H. (2013). High-order entropy stable finite difference schemes for nonlinear conservation laws: Finite domains. Journal of Computational Physics, 252, 518-557. [https://doi.org/10.1016/j.jcp.2013.06.014](https://doi.org/10.1016/j.jcp.2013.06.014)
Expand All @@ -693,36 +722,39 @@ function apply_operator(op::SplitDivergence{(1,)}, space, slabidx, arg1, arg2)
QS = Spaces.quadrature_style(space)
Nq = Quadratures.degrees_of_freedom(QS)
D = Quadratures.differentiation_matrix(FT, QS)
JT = operator_return_eltype(op, eltype(arg1), FT)
RT = operator_return_eltype(op, eltype(arg1), eltype(arg2))
out = IF{RT, Nq}(MArray, FT)
fill!(parent(out), zero(FT))

Ju1_slab = MArray{Tuple{Nq}, FT}(undef)
psi_slab = MArray{Tuple{Nq}, FT}(undef)

Ju1 = IF{JT, Nq}(MArray, FT)
psi = IF{eltype(arg2), Nq}(MArray, eltype(arg2))
@inbounds for i in 1:Nq
ij = CartesianIndex((i,))
local_geometry = get_local_geometry(space, ij, slabidx)
u = get_node(space, arg1, ij, slabidx)
psi = get_node(space, arg2, ij, slabidx)
Ju1 = local_geometry.J * Geometry.contravariant1(u, local_geometry)
Ju1_slab[i] = Ju1
psi_slab[i] = psi
Ju1[slab_index(i)] =
local_geometry.J ⊠ RecursiveApply.rmap(
u -> Geometry.contravariant1(u, local_geometry),
get_node(space, arg1, ij, slabidx),
)
psi[slab_index(i)] = get_node(space, arg2, ij, slabidx)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just curious: I find these 6 lines harder to read than the pattern it replaces. What's the advantage of writing it this way?

Copy link
Copy Markdown
Member Author

@dennisYatunin dennisYatunin Jan 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The reason I did this was to enforce consistency with the rest of the code in this file, so future PRs that refactor RecursiveApply and DataLayouts will not be behavior changing. But I totally agree that this is harder to read and needs to be refactored soon.

The RecursiveApply functions in this file allow iteration over multiple mass fluxes and/or multiple scalar quantities within the same fused operation. I'm working on removing the RecursiveApply module entirely from ClimaCore while maintining this behavior in #2417, which should eliminate most of the complexity introduced here.

As for the convoluted indexing required by DataLayouts, I'm honestly not sure why that is used throughout this file. The original intent of DataLayouts was to rearrange values in global memory so that L1/L2 caching could be improved, but it's not clear to me how this is useful for MArrays that are already cached in register memory. I will explore this more carefully in a future PR that refactors the DataLayouts module. If I don't end up finding any benefit to wrapping register caches in IF and IJF DataLayouts, I will remove these wrappers from psi, out, and all other register caches in ClimaCore.

end

out = IF{RT, Nq}(MArray, FT)
fill!(parent(out), zero(FT))
@inbounds for i in 1:Nq
val = zero(FT)
for j in 1:Nq
flux_ij = (Ju1_slab[i] + Ju1_slab[j]) * (psi_slab[i] + psi_slab[j]) / 2
val += D[i, j] * flux_ij
for j in 1:(i - 1) # loop over half the indices, since F1[i,j] = F1[j,i]
F1 = RecursiveApply.rdiv(
(Ju1[slab_index(i)] ⊞ Ju1[slab_index(j)]) ⊠
(psi[slab_index(i)] ⊞ psi[slab_index(j)]),
2,
)
out[slab_index(i)] = out[slab_index(i)] ⊞ D[i, j] ⊠ F1
out[slab_index(j)] = out[slab_index(j)] ⊞ D[j, i] ⊠ F1
end
out[slab_index(i)] = val
end

@inbounds for i in 1:Nq
ij = CartesianIndex((i,))
local_geometry = get_local_geometry(space, ij, slabidx)
out[slab_index(i)] = out[slab_index(i)] * local_geometry.invJ
out[slab_index(i)] = out[slab_index(i)] local_geometry.invJ
end

return Field(SArray(out), space)
Expand All @@ -733,55 +765,55 @@ function apply_operator(op::SplitDivergence{(1, 2)}, space, slabidx, arg1, arg2)
QS = Spaces.quadrature_style(space)
Nq = Quadratures.degrees_of_freedom(QS)
D = Quadratures.differentiation_matrix(FT, QS)
JT = operator_return_eltype(op, eltype(arg1), FT)
RT = operator_return_eltype(op, eltype(arg1), eltype(arg2))
out = DataLayouts.IJF{RT, Nq}(MArray, FT)
fill!(parent(out), zero(FT))

Ju1_slab = MArray{Tuple{Nq, Nq}, FT}(undef)
Ju2_slab = MArray{Tuple{Nq, Nq}, FT}(undef)
psi_slab = MArray{Tuple{Nq, Nq}, FT}(undef)

Ju1 = DataLayouts.IJF{JT, Nq}(MArray, FT)
Ju2 = DataLayouts.IJF{JT, Nq}(MArray, FT)
psi = DataLayouts.IJF{eltype(arg2), Nq}(MArray, eltype(arg2))
@inbounds for j in 1:Nq, i in 1:Nq
ij = CartesianIndex((i, j))
local_geometry = get_local_geometry(space, ij, slabidx)
u = get_node(space, arg1, ij, slabidx)
psi = get_node(space, arg2, ij, slabidx)

Ju1_slab[i, j] = local_geometry.J * Geometry.contravariant1(u, local_geometry)
Ju2_slab[i, j] = local_geometry.J * Geometry.contravariant2(u, local_geometry)
psi_slab[i, j] = psi
Ju1[slab_index(i, j)] =
local_geometry.J ⊠ RecursiveApply.rmap(
u -> Geometry.contravariant1(u, local_geometry),
u,
)
Ju2[slab_index(i, j)] =
local_geometry.J ⊠ RecursiveApply.rmap(
u -> Geometry.contravariant2(u, local_geometry),
u,
)
psi[slab_index(i, j)] = get_node(space, arg2, ij, slabidx)
end

@inbounds for j in 1:Nq
for i in 1:Nq
val = zero(FT)
for k in 1:Nq
flux_ik =
(Ju1_slab[i, j] + Ju1_slab[k, j]) *
(psi_slab[i, j] + psi_slab[k, j]) / 2
val += D[i, k] * flux_ik
end
out[slab_index(i, j)] += val
out = DataLayouts.IJF{RT, Nq}(MArray, FT)
fill!(parent(out), zero(FT))
@inbounds for j in 1:Nq, i in 1:Nq
for k in 1:(i - 1) # loop over half the indices, since F1[i,k] = F1[k,i]
F1 = RecursiveApply.rdiv(
(Ju1[slab_index(i, j)] ⊞ Ju1[slab_index(k, j)]) ⊠
(psi[slab_index(i, j)] ⊞ psi[slab_index(k, j)]),
2,
)
out[slab_index(i, j)] = out[slab_index(i, j)] ⊞ D[i, k] ⊠ F1
out[slab_index(k, j)] = out[slab_index(k, j)] ⊞ D[k, i] ⊠ F1
end
end

@inbounds for i in 1:Nq
for j in 1:Nq
val = zero(FT)
for k in 1:Nq
flux_jk =
(Ju2_slab[i, j] + Ju2_slab[i, k]) *
(psi_slab[i, j] + psi_slab[i, k]) / 2
val += D[j, k] * flux_jk
end
out[slab_index(i, j)] += val
for k in 1:(j - 1) # loop over half the indices, since F2[j,k] = F2[k,j]
F2 = RecursiveApply.rdiv(
(Ju2[slab_index(i, j)] ⊞ Ju2[slab_index(i, k)]) ⊠
(psi[slab_index(i, j)] ⊞ psi[slab_index(i, k)]),
2,
)
out[slab_index(i, j)] = out[slab_index(i, j)] ⊞ D[j, k] ⊠ F2
out[slab_index(i, k)] = out[slab_index(i, k)] ⊞ D[k, j] ⊠ F2
end
end

@inbounds for j in 1:Nq, i in 1:Nq
ij = CartesianIndex((i, j))
local_geometry = get_local_geometry(space, ij, slabidx)
out[slab_index(i, j)] *= local_geometry.invJ
out[slab_index(i, j)] = out[slab_index(i, j)] ⊠ local_geometry.invJ
end

return Field(SArray(out), space)
Expand Down
Loading
Loading