diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 28ca591a8c..9e5913c8b5 100755
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -57,13 +57,6 @@ steps:
         agents:
           slurm_gpus: 1
 
-  - group: "Unit: RecursiveApply"
-    steps:
-
-      - label: "Unit: RecursiveApply"
-        key: unit_recursive_apply
-        command: "julia --color=yes --check-bounds=yes --project=.buildkite test/RecursiveApply/unit_recursive_apply.jl"
-
   - group: "Unit: CUDA utils"
     steps:
 
@@ -93,6 +86,10 @@ steps:
         key: unit_plushalf
         command: "julia --color=yes --check-bounds=yes --project=.buildkite test/Utilities/unit_plushalf.jl"
 
+      - label: "Unit: auto_broadcaster"
+        key: unit_auto_broadcaster
+        command: "julia --color=yes --check-bounds=yes --project=.buildkite test/Utilities/unit_auto_broadcaster.jl"
+
   - group: "Unit: DataLayouts"
     steps:
 
@@ -227,9 +224,9 @@ steps:
         key: unit_axistensors
         command: "julia --color=yes --check-bounds=yes --project=.buildkite test/Geometry/axistensors.jl"
 
-      - label: "Unit: rmul_with_projection"
-        key: unit_rmul_with_projection
-        command: "julia --color=yes --check-bounds=yes --project=.buildkite test/Geometry/rmul_with_projection.jl"
+      - label: "Unit: mul_with_projection"
+        key: unit_mul_with_projection
+        command: "julia --color=yes --check-bounds=yes --project=.buildkite test/Geometry/mul_with_projection.jl"
 
   - group: "Unit: Meshes"
     steps:
diff --git a/benchmarks/3d/se_kernels.jl b/benchmarks/3d/se_kernels.jl
index 4e8a613981..cc8032892d 100644
--- a/benchmarks/3d/se_kernels.jl
+++ b/benchmarks/3d/se_kernels.jl
@@ -13,9 +13,7 @@ import ClimaCore:
     Spaces,
     Quadratures,
     Topologies,
-    DataLayouts,
-    RecursiveApply
-
+    DataLayouts
 const C1 = ClimaCore.Geometry.Covariant1Vector
 const C2 = ClimaCore.Geometry.Covariant2Vector
 const C3 = ClimaCore.Geometry.Covariant3Vector
@@ -25,8 +23,6 @@ const CT123 = Geometry.Contravariant123Vector
 const ᶜinterp = Operators.InterpolateF2C()
 const ᶠinterp = Operators.InterpolateC2F()
 
-const ⊞ = RecursiveApply.radd
-
 init_uθ(ϕ, z, R) = 1.0 / R
 init_vθ(ϕ, z, R) = 1.0 / R
 init_w(ϕ, z) = 1.0
diff --git a/benchmarks/bickleyjet/bickleyjet_dg.jl b/benchmarks/bickleyjet/bickleyjet_dg.jl
index 2fe4df39a7..052248d4b1 100644
--- a/benchmarks/bickleyjet/bickleyjet_dg.jl
+++ b/benchmarks/bickleyjet/bickleyjet_dg.jl
@@ -7,9 +7,6 @@ import ClimaCore.Operators
 using ClimaCore.Geometry
 import ClimaCore.Geometry: Abstract2DPoint
 
-using ClimaCore.RecursiveApply
-
-
 const parameters = (
     ϵ = 0.1,  # perturbation size for initial condition
     l = 0.5, # Gaussian width
@@ -60,7 +57,7 @@ roe_average(ρ⁻, ρ⁺, var⁻, var⁺) =
     (sqrt(ρ⁻) * var⁻ + sqrt(ρ⁺) * var⁺) / (sqrt(ρ⁻) + sqrt(ρ⁺))
 
 function roeflux(n, (y⁻, parameters⁻), (y⁺, parameters⁺))
-    Favg = RecursiveApply.rdiv(flux(y⁻, parameters⁻) ⊞ flux(y⁺, parameters⁺), 2)
+    Favg = map((F⁻, F⁺) -> (F⁻ + F⁺) / 2, flux(y⁻, parameters⁻), flux(y⁺, parameters⁺))
 
     λ = sqrt(parameters⁻.g)
 
@@ -115,7 +112,7 @@ function roeflux(n, (y⁻, parameters⁻), (y⁺, parameters⁺))
     fluxᵀn_ρθ = ((w1 + w2) * θ + w5) * 0.5
 
     Δf = (ρ = -fluxᵀn_ρ, ρu = -fluxᵀn_ρu, ρθ = -fluxᵀn_ρθ)
-    RecursiveApply.rmap(f -> f' * n, Favg) ⊞ Δf
+    return map((F, Δ) -> F' * n + Δ, Favg, Δf) # fieldwise; NamedTuples define no `+` or `'`
 end
 
 function volume!(dydt, y, (parameters,), t)
diff --git a/benchmarks/bickleyjet/core_vs_ref.jl b/benchmarks/bickleyjet/core_vs_ref.jl
index 3c84bbdf68..a1f57939db 100644
--- a/benchmarks/bickleyjet/core_vs_ref.jl
+++ b/benchmarks/bickleyjet/core_vs_ref.jl
@@ -30,8 +30,7 @@ for Nq in Nqs
     volume!(dydt, y0, (parameters,), 0.0)
     # TODO: move this to volume!
     dydt_data = Fields.field_values(dydt)
-    dydt_data .=
-        RecursiveApply.rdiv.(dydt_data, Spaces.local_geometry_data(space).WJ)
+    dydt_data ./= Spaces.local_geometry_data(space).WJ
 
     # setup reference
     X = coordinates(Val(Nq), n1, n2)
@@ -84,8 +83,7 @@ for Nq in Nqs
     add_face!(dydt, y0, (parameters,), 0.0)
     # TODO: move this to volume!
     dydt_data = Fields.field_values(dydt)
-    dydt_data .=
-        RecursiveApply.rdiv.(dydt_data, Spaces.local_geometry_data(space).WJ)
+    dydt_data ./= Spaces.local_geometry_data(space).WJ
 
     fill!(dydt_ref, 0.0)
     add_face_ref!(dydt_ref, y0_ref, (n1, n2, parameters, Val(Nq)), 0.0)
diff --git a/docs/make.jl b/docs/make.jl
index f359d168cf..8f57d04fc7 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -100,7 +100,6 @@ withenv("GKSwstype" => "nul") do
                 "Limiters" => "APIs/limiters_api.md",
                 "InputOutput" => "APIs/input_output_api.md",
                 "Remapping" => "APIs/remapping_api.md",
-                "RecursiveApply" => "APIs/recursive_apply_api.md",
                 "Devices" => "APIs/devices_api.md",
                 "DebugOnly" => "APIs/debug_only_api.md",
             ],
diff --git a/docs/src/APIs/geometry_api.md b/docs/src/APIs/geometry_api.md
index 6d7fbeb77f..9d2b1d09f1 100644
--- a/docs/src/APIs/geometry_api.md
+++ b/docs/src/APIs/geometry_api.md
@@ -21,6 +21,7 @@ Geometry.LocalGeometry
 
 ```@docs
 Geometry.Δz_metric_component
+Geometry.:⊗
 ```
 
 ## Coordinates
diff --git a/docs/src/APIs/recursive_apply_api.md b/docs/src/APIs/recursive_apply_api.md
deleted file mode 100644
index ef38087dda..0000000000
--- a/docs/src/APIs/recursive_apply_api.md
+++ /dev/null
@@ -1,9 +0,0 @@
-# RecursiveApply
-
-```@meta
-CurrentModule = ClimaCore
-```
-
-```@docs
-RecursiveApply
-```
diff --git a/docs/src/APIs/utilities_api.md b/docs/src/APIs/utilities_api.md
index ac1f0b91c3..201112b1ff 100644
--- a/docs/src/APIs/utilities_api.md
+++ b/docs/src/APIs/utilities_api.md
@@ -5,11 +5,30 @@ CurrentModule = ClimaCore
 ```
 
 ```@docs
-Utilities.PlusHalf
-Utilities.half
+Utilities.unionall_type
 Utilities.replace_type_parameter
 Utilities.fieldtype_vals
 Utilities.new
+Utilities.unsafe_eltype
+Utilities.safe_eltype
+```
+
+## Utilities.PlusHalf
+
+```@docs
+Utilities.PlusHalf
+Utilities.half
+```
+
+## Utilities.AutoBroadcaster
+
+```@docs
+Utilities.AutoBroadcaster
+Utilities.is_auto_broadcastable
+Utilities.add_auto_broadcasters
+Utilities.drop_auto_broadcasters
+Utilities.auto_broadcasted
+Utilities.nested_broadcast
 ```
 
 ## Utilities.Cache
diff --git a/docs/src/geometry.md b/docs/src/geometry.md
index 483704dfa3..ef5fb3d450 100644
--- a/docs/src/geometry.md
+++ b/docs/src/geometry.md
@@ -6,7 +6,5 @@ CurrentModule = ClimaCore.Geometry
 
 ```@docs
 mul_with_projection
-rmul_with_projection
 mul_return_type
-rmul_return_type
 ```
diff --git a/docs/src/matrix_fields.md b/docs/src/matrix_fields.md
index 6fb0a3136e..c9a90b2f6b 100644
--- a/docs/src/matrix_fields.md
+++ b/docs/src/matrix_fields.md
@@ -210,15 +210,11 @@ J = MatrixFields.FieldMatrix((@name(f), @name(g))=> ∂f_∂g)
 
 ## Optimizations
 
-Each entry of a `FieldMatrix` can be a `ColumnwiseBandMatrixField`, a `DiagonalMatrixRow`, or an
-`UniformScaling`.
-
-A `ColumnwiseBandMatrixField` is a `Field` with a `BandMatrixRow` at each point. It is intended
-to represent a collection of banded matrices, where there is one band matrix for each column
-of the space the `Field` is on. Beyond only storing the diagonals of the band matrix, an `entry`
-can be optimized to use less memory. Each optimized representation can be indexed equivalently to
-non optimized representations, and used in addition, subtraction, matrix-vector multiplication,
-Matrix-matrix multiplication, `RecursiveApply`, and `FieldMatrixSolver`.
+Each entry of a `FieldMatrix` can be a `ColumnwiseBandMatrixField`, a `DiagonalMatrixRow`, or a
+`UniformScaling`. A `ColumnwiseBandMatrixField` is a `Field` with a `BandMatrixRow` at each point.
+It represents a collection of banded matrices, with each column of the `Field` corresponding to a
+specific matrix. If all columns correspond to a constant multiple of the identity matrix, the `Field`
+may be replaced with a `ScalingFieldMatrixEntry` (i.e., a `DiagonalMatrixRow` or `UniformScaling`).
 
 For the following sections, `space` is a column space with $N_v$ levels. A column space is
 used for simplicity in this example, but the optimizations work with any space with columns.
diff --git a/docs/src/operators.md b/docs/src/operators.md
index f75470fccc..b3b6a25d8b 100644
--- a/docs/src/operators.md
+++ b/docs/src/operators.md
@@ -112,8 +112,6 @@ column_accumulate!
 ## Internal APIs
 
 ```@docs
-getidx_return_type
-stencil_return_type
 return_eltype
 return_space
 stencil_interior_width
diff --git a/examples/bickleyjet/bickleyjet_dg.jl b/examples/bickleyjet/bickleyjet_dg.jl
index 7b729c1668..052be0b5eb 100644
--- a/examples/bickleyjet/bickleyjet_dg.jl
+++ b/examples/bickleyjet/bickleyjet_dg.jl
@@ -7,12 +7,10 @@ import ClimaCore:
     Geometry,
     Meshes,
     Operators,
-    RecursiveApply,
     Spaces,
     Quadratures,
     Topologies
 import ClimaCore.Geometry: ⊗
-import ClimaCore.RecursiveApply: ⊞, rdiv, rmap
 
 using OrdinaryDiffEqSSPRK: ODEProblem, solve, SSPRK33
 
@@ -106,8 +104,6 @@ roe_average(ρ⁻, ρ⁺, var⁻, var⁺) =
     (sqrt(ρ⁻) * var⁻ + sqrt(ρ⁺) * var⁺) / (sqrt(ρ⁻) + sqrt(ρ⁺))
 
 function roeflux(n, (y⁻, parameters⁻), (y⁺, parameters⁺))
-    Favg = rdiv(flux(y⁻, parameters⁻) ⊞ flux(y⁺, parameters⁺), 2)
-
     λ = sqrt(parameters⁻.g)
 
     ρ⁻, ρu⁻, ρθ⁻ = y⁻.ρ, y⁻.ρu, y⁻.ρθ
@@ -159,9 +155,11 @@ function roeflux(n, (y⁻, parameters⁻), (y⁺, parameters⁺))
         (w1 * (u - c * n) + w2 * (u + c * n) + w3 * u + w4 * (Δu - Δuₙ * n)) *
         0.5
     fluxᵀn_ρθ = ((w1 + w2) * θ + w5) * 0.5
-
     Δf = (ρ = -fluxᵀn_ρ, ρu = -fluxᵀn_ρu, ρθ = -fluxᵀn_ρθ)
-    rmap(f -> f' * n, Favg) ⊞ Δf
+
+    return map(flux(y⁻, parameters⁻), flux(y⁺, parameters⁺), Δf) do F⁻, F⁺, Δf
+        ((F⁻ + F⁺) / 2)' * n + Δf
+    end
 end
 
 
@@ -205,9 +203,8 @@ function rhs!(dydt, y, (parameters, numflux), t)
     end
 
     # 6. Solve for final result
-    dydt_data = Fields.field_values(dydt)
-    dydt_data .=
-        RecursiveApply.rdiv.(dydt_data, Spaces.local_geometry_data(space).WJ)
+    dydt_data = Fields.field_values(dydt)
+    dydt_data ./= Spaces.local_geometry_data(space).WJ # in place, so dydt itself is scaled
     M = Quadratures.cutoff_filter_matrix(
         Float64,
         Spaces.quadrature_style(space),
diff --git a/examples/hybrid/sphere/deformation_flow.jl b/examples/hybrid/sphere/deformation_flow.jl
index 5b8107f5c7..29f0414ac7 100644
--- a/examples/hybrid/sphere/deformation_flow.jl
+++ b/examples/hybrid/sphere/deformation_flow.jl
@@ -62,16 +62,15 @@ ode_algorithm = ExplicitAlgorithm(SSP33ShuOsher())
 const hdiv = Operators.Divergence()
 const hwdiv = Operators.WeakDivergence()
 const hgrad = Operators.Gradient()
-const If2c = Operators.InterpolateF2C()
-const Ic2f = Operators.InterpolateC2F(
+const interp = Operators.InterpolateC2F(
     bottom = Operators.Extrapolate(),
     top = Operators.Extrapolate(),
 )
-const ᶠwinterp = Operators.WeightedInterpolateC2F(
+const winterp = Operators.WeightedInterpolateC2F(
     bottom = Operators.Extrapolate(),
     top = Operators.Extrapolate(),
 )
-const vdivf2c = Operators.DivergenceF2C(
+const vdiv = Operators.DivergenceF2C(
     top = Operators.SetValue(Geometry.Contravariant3Vector(FT(0))),
     bottom = Operators.SetValue(Geometry.Contravariant3Vector(FT(0))),
 )
@@ -97,10 +96,6 @@ const LinVanLeerFlux = Operators.LinVanLeerC2F(
     top = Operators.FirstOrderOneSided(),
     constraint = Operators.MonotoneLocalExtrema(),
 )
-const FCTBorisBook = Operators.FCTBorisBook(
-    bottom = Operators.FirstOrderOneSided(),
-    top = Operators.FirstOrderOneSided(),
-)
 
 # Reference pressure and density
 p(z) = p_0 * exp(-z / H)
@@ -145,75 +140,52 @@ end
 
 function horizontal_tendency!(Yₜ, Y, cache, t)
     (; u, Δₕq) = cache
-    coord = Fields.coordinate_field(u)
+    coord = Fields.coordinate_field(Y.c)
     @. u = local_velocity(coord, t)
     @. Δₕq = hwdiv(hgrad(Y.c.ρq / Y.c.ρ))
     Spaces.weighted_dss!(Δₕq)
     @. Yₜ.c.ρ = -hdiv(Y.c.ρ * u)
-    for n in 1:5 # TODO: update RecursiveApply/Operators to eliminate this loop
-        ρq_n = Y.c.ρq.:($n)
-        ρqₜ_n = Yₜ.c.ρq.:($n)
-        @. ρqₜ_n = -hdiv(ρq_n * u)
-    end
-    @. Yₜ.c.ρq -= D₄ * hwdiv(Y.c.ρ * hgrad(Δₕq))
+    @. Yₜ.c.ρq = -hdiv(Y.c.ρq * u) - D₄ * hwdiv(Y.c.ρ * hgrad(Δₕq))
 end
 
 function vertical_tendency!(Yₜ, Y, cache, t)
-    (; q_n, face_u, face_uₕ, face_uᵥ, fct_op, dt) = cache
+    (; q, face_ρ, face_u, fct_op, dt) = cache
+    (; J) = Fields.local_geometry_field(Y.c)
     face_coord = Fields.coordinate_field(face_u)
+    @. q = Y.c.ρq / Y.c.ρ
+    @. face_ρ = winterp(J, Y.c.ρ)
     @. face_u = local_velocity(face_coord, t)
-    @. face_uₕ = Geometry.project(Geometry.Covariant12Axis(), face_u)
-    @. face_uᵥ = Geometry.project(Geometry.Covariant3Axis(), face_u)
-    @. Yₜ.c.ρ = -vdivf2c(Ic2f(Y.c.ρ) * face_u)
-    ᶜJ = Fields.local_geometry_field(axes(Y.c.ρ)).J
-    for n in 1:5 # TODO: update RecursiveApply/Operators to eliminate this loop
-        ρq_n = Y.c.ρq.:($n)
-        ρqₜ_n = Yₜ.c.ρq.:($n)
-        @. q_n = ρq_n / Y.c.ρ
-        @. ρqₜ_n = -vdivf2c(Ic2f(ρq_n) * face_uₕ)
-        if isnothing(fct_op)
-            @. ρqₜ_n -= vdivf2c(ᶠwinterp(ᶜJ, Y.c.ρ) * face_uᵥ * Ic2f(q_n))
-        elseif fct_op == upwind1
-            @. ρqₜ_n -= vdivf2c(ᶠwinterp(ᶜJ, Y.c.ρ) * upwind1(face_uᵥ, q_n))
-        elseif fct_op == upwind3
-            @. ρqₜ_n -= vdivf2c(ᶠwinterp(ᶜJ, Y.c.ρ) * upwind3(face_uᵥ, q_n))
-        elseif fct_op == FCTBorisBook
-            @. ρqₜ_n -= vdivf2c(
-                ᶠwinterp(ᶜJ, Y.c.ρ) * (
-                    upwind1(face_uᵥ, q_n) + FCTBorisBook(
-                        upwind3(face_uᵥ, q_n) - upwind1(face_uᵥ, q_n),
-                        q_n / dt -
-                        vdivf2c(ᶠwinterp(ᶜJ, Y.c.ρ) * upwind1(face_uᵥ, q_n)) / Y.c.ρ,
-                    )
+    @. Yₜ.c.ρ = -vdiv(face_ρ * face_u)
+    if isnothing(fct_op)
+        @. Yₜ.c.ρq = -vdiv(face_ρ * face_u * interp(q))
+    elseif fct_op == upwind1
+        @. Yₜ.c.ρq = -vdiv(face_ρ * upwind1(face_u, q))
+    elseif fct_op == upwind3
+        @. Yₜ.c.ρq = -vdiv(face_ρ * upwind3(face_u, q))
+    elseif fct_op == FCTZalesak
+        @. Yₜ.c.ρq =
+            -vdiv(
+                face_ρ * upwind1(face_u, q) +
+                FCTZalesak(
+                    face_ρ * (upwind3(face_u, q) - upwind1(face_u, q)),
+                    q / dt,
+                    q / dt - vdiv(face_ρ * upwind1(face_u, q)) / Y.c.ρ,
                 ),
             )
-        elseif fct_op == FCTZalesak
-            @. ρqₜ_n -= vdivf2c(
-                ᶠwinterp(ᶜJ, Y.c.ρ) * (
-                    upwind1(face_uᵥ, q_n) + FCTZalesak(
-                        upwind3(face_uᵥ, q_n) - upwind1(face_uᵥ, q_n),
-                        q_n / dt,
-                        q_n / dt -
-                        vdivf2c(ᶠwinterp(ᶜJ, Y.c.ρ) * upwind1(face_uᵥ, q_n)) / Y.c.ρ,
-                    )
+    elseif fct_op == SlopeLimitedFlux
+        @. Yₜ.c.ρq =
+            -vdiv(
+                face_ρ * upwind1(face_u, q) +
+                SlopeLimitedFlux(
+                    face_ρ * (upwind3(face_u, q) - upwind1(face_u, q)),
+                    q / dt,
+                    face_u,
                 ),
             )
-        elseif fct_op == SlopeLimitedFlux
-            @. ρqₜ_n -= vdivf2c(
-                ᶠwinterp(ᶜJ, Y.c.ρ) * (
-                    upwind1(face_uᵥ, q_n) + SlopeLimitedFlux(
-                        upwind3(face_uᵥ, q_n) - upwind1(face_uᵥ, q_n),
-                        q_n / dt,
-                        face_uᵥ,
-                    )
-                ),
-            )
-        elseif fct_op == LinVanLeerFlux
-            @. ρqₜ_n -=
-                vdivf2c(ᶠwinterp(ᶜJ, Y.c.ρ) * LinVanLeerFlux(face_uᵥ, q_n, dt))
-        else
-            error("unrecognized FCT operator $fct_op")
-        end
+    elseif fct_op == LinVanLeerFlux
+        @. Yₜ.c.ρq = -vdiv(face_ρ * LinVanLeerFlux(face_u, q, dt))
+    else
+        error("unrecognized FCT operator $fct_op")
     end
 end
 
@@ -289,11 +261,10 @@ function run_deformation_flow(use_limiter, fct_op, dt)
 
     cache = (;
         u = Fields.Field(Geometry.UVWVector{FT}, cent_space),
+        q = Fields.Field(NTuple{5, FT}, cent_space),
         Δₕq = Fields.Field(NTuple{5, FT}, cent_space),
-        q_n = Fields.Field(FT, cent_space),
+        face_ρ = Fields.Field(FT, face_space),
         face_u = Fields.Field(Geometry.UVWVector{FT}, face_space),
-        face_uₕ = Fields.Field(Geometry.Covariant12Vector{FT}, face_space),
-        face_uᵥ = Fields.Field(Geometry.Covariant3Vector{FT}, face_space),
         limiter = use_limiter ? Limiters.QuasiMonotoneLimiter(Y.c.ρq) : nothing,
         fct_op,
         dt,
@@ -305,12 +276,7 @@ function run_deformation_flow(use_limiter, fct_op, dt)
         (0, t_end),
         cache,
     )
-    sol = solve(
-        problem,
-        ode_algorithm;
-        dt,
-        saveat = collect(0.0:(t_end / 2):t_end),
-    )
+    sol = solve(problem, ode_algorithm; dt)
     if !(cache.limiter isa Nothing)
         @show cache.limiter.rtol
         Limiters.print_convergence_stats(cache.limiter)
@@ -318,157 +284,117 @@ function run_deformation_flow(use_limiter, fct_op, dt)
     return sol
 end
 
-function conservation_errors(sol)
-    initial_total_mass = sum(sol.u[1].c.ρ)
-    initial_tracer_masses = map(n -> sum(sol.u[1].c.ρq.:($n)), 1:5)
-    final_total_mass = sum(sol.u[end].c.ρ)
-    final_tracer_masses = map(n -> sum(sol.u[end].c.ρq.:($n)), 1:5)
-    return (
-        (final_total_mass - initial_total_mass) / initial_total_mass,
-        (final_tracer_masses .- initial_tracer_masses) ./ initial_tracer_masses,
-    )
+function total_conservation_error(sol)
+    initial_mass = sum(sol[1].c.ρ)
+    final_mass = sum(sol[end].c.ρ)
+    return abs(final_mass - initial_mass) / initial_mass
 end
 
-# Roughness is measure as a deviation from the mean value
-tracer_roughnesses(sol) =
-    map(1:5) do n
-        q_n = sol.u[end].c.ρq.:($n) ./ sol.u[end].c.ρ
-        mean_q_n = mean(q_n) # TODO: replace the mean with a low-pass filter
-        return mean(abs.(q_n .- mean_q_n))
-    end
+function tracer_conservation_errors(sol)
+    initial_masses = sum(sol[1].c.ρq)
+    final_masses = sum(sol[end].c.ρq)
+    return abs.(final_masses .- initial_masses) ./ initial_masses
+end
 
-tracer_ranges(sol) =
-    map(1:5) do n
-        q_n = sol.u[end].c.ρq.:($n) ./ sol.u[end].c.ρ
-        return maximum(q_n) - minimum(q_n)
-    end
+# Roughness measured as deviation from mean (TODO: use a low-pass filter instead)
+function tracer_roughnesses(sol)
+    final_q = sol[end].c.ρq ./ sol[end].c.ρ
+    return mean(abs.(final_q .- mean(final_q)))
+end
+
+function tracer_ranges(sol)
+    final_q = sol[end].c.ρq ./ sol[end].c.ρ
+    return maximum(final_q) .- minimum(final_q)
+end
 
-@info "Slope Limited Solutions"
-tvd_sol = run_deformation_flow(false, SlopeLimitedFlux, _dt)
-lim_tvd_sol = run_deformation_flow(true, SlopeLimitedFlux, _dt)
-@info "vanLeer Flux Solutions"
-lvl_sol = run_deformation_flow(false, LinVanLeerFlux, _dt)
-lim_lvl_sol = run_deformation_flow(true, LinVanLeerFlux, _dt)
-@info "Third-Order Upwind Solutions"
-third_upwind_sol = run_deformation_flow(false, upwind3, _dt)
-lim_third_upwind_sol = run_deformation_flow(true, upwind3, _dt)
-@info "Zalesak Flux-Corrected Transport Solutions"
-fct_sol = run_deformation_flow(false, FCTZalesak, _dt)
-lim_fct_sol = run_deformation_flow(true, FCTZalesak, _dt)
-@info "First-Order Upwind Solutions"
-lim_first_upwind_sol = run_deformation_flow(true, upwind1, _dt)
-lim_centered_sol = run_deformation_flow(true, nothing, _dt)
-
-third_upwind_ρ_err, third_upwind_ρq_errs = conservation_errors(third_upwind_sol)
-fct_ρ_err, fct_ρq_errs = conservation_errors(fct_sol)
-lim_third_upwind_ρ_err, lim_third_upwind_ρq_errs =
-    conservation_errors(lim_third_upwind_sol)
-lim_fct_ρ_err, lim_fct_ρq_errs = conservation_errors(lim_fct_sol)
-lim_first_upwind_ρ_err, lim_first_upwind_ρq_errs =
-    conservation_errors(lim_first_upwind_sol)
-lim_centered_ρ_err, lim_centered_ρq_errs = conservation_errors(lim_centered_sol)
-
-# Check that the conservation errors are not too big.
-max_err = 64 * eps(FT)
-@test abs(third_upwind_ρ_err) < max_err
-@test all(abs.(third_upwind_ρq_errs) .< max_err)
-@test all(abs.(fct_ρq_errs) .< max_err)
-@test all(abs.(lim_third_upwind_ρq_errs) .< max_err)
-@test all(abs.(lim_fct_ρq_errs) .< max_err)
-@test all(abs.(lim_first_upwind_ρ_err) .< max_err)
-@test all(abs.(lim_centered_ρq_errs) .< max_err)
-
-# Check that the different upwinding modes with the limiter have no effect on ρ.
-@test third_upwind_ρ_err ==
-      fct_ρ_err ==
-      lim_third_upwind_ρ_err ==
-      lim_fct_ρ_err ==
-      lim_first_upwind_ρ_err ==
-      lim_centered_ρ_err
-
-# Check that the different upwinding modes with the limiter have no effect on the tracer with q = 1, or at
-# least no effect up to round-off error.
-max_q5_roundoff_err = 2 * eps(FT)
-@test third_upwind_ρq_errs[5] ≈ third_upwind_ρ_err atol = max_q5_roundoff_err
-@test fct_ρq_errs[5] ≈ third_upwind_ρ_err atol = max_q5_roundoff_err
-@test lim_third_upwind_ρq_errs[5] ≈ third_upwind_ρ_err atol =
-    max_q5_roundoff_err
-@test lim_fct_ρq_errs[5] ≈ third_upwind_ρ_err atol = max_q5_roundoff_err
-@test lim_first_upwind_ρq_errs[5] ≈ third_upwind_ρ_err atol =
-    max_q5_roundoff_err
-@test lim_centered_ρq_errs[5] ≈ third_upwind_ρ_err atol = max_q5_roundoff_err
-
-compare_tracer_props(a, b; buffer = 1) = all(
-    x -> x[1] < x[2] * buffer || (x[1] ≤ 100eps() && x[2] ≤ 100eps()),
-    zip(a, b),
+@info "Centered Differences"
+centered_sol_no_lim = run_deformation_flow(false, nothing, _dt)
+centered_sol_with_lim = run_deformation_flow(true, nothing, _dt)
+@info "First-Order Upwinding"
+upwind1_sol_no_lim = run_deformation_flow(false, upwind1, _dt)
+upwind1_sol_with_lim = run_deformation_flow(true, upwind1, _dt)
+@info "Third-Order Upwinding"
+upwind3_sol_no_lim = run_deformation_flow(false, upwind3, _dt)
+upwind3_sol_with_lim = run_deformation_flow(true, upwind3, _dt)
+@info "Flux-Corrected Transport"
+fct_sol_no_lim = run_deformation_flow(false, FCTZalesak, _dt)
+fct_sol_with_lim = run_deformation_flow(true, FCTZalesak, _dt)
+@info "Slope-Limited Transport"
+tvd_sol_no_lim = run_deformation_flow(false, SlopeLimitedFlux, _dt)
+tvd_sol_with_lim = run_deformation_flow(true, SlopeLimitedFlux, _dt)
+@info "van Leer Transport"
+lvl_sol_no_lim = run_deformation_flow(false, LinVanLeerFlux, _dt)
+lvl_sol_with_lim = run_deformation_flow(true, LinVanLeerFlux, _dt)
+
+sols_no_lim = (;
+    centered = centered_sol_no_lim,
+    upwind1 = upwind1_sol_no_lim,
+    upwind3 = upwind3_sol_no_lim,
+    fct = fct_sol_no_lim,
+    tvd = tvd_sol_no_lim,
+    lvl = lvl_sol_no_lim,
 )
+sols_with_lim = (;
+    centered = centered_sol_with_lim,
+    upwind1 = upwind1_sol_with_lim,
+    upwind3 = upwind3_sol_with_lim,
+    fct = fct_sol_with_lim,
+    tvd = tvd_sol_with_lim,
+    lvl = lvl_sol_with_lim,
+)
+
+ρ_errs_no_lim = map(total_conservation_error, sols_no_lim)
+ρ_errs_with_lim = map(total_conservation_error, sols_with_lim)
+ρq_errs_no_lim = map(tracer_conservation_errors, sols_no_lim)
+ρq_errs_with_lim = map(tracer_conservation_errors, sols_with_lim)
+roughnesses_no_lim = map(tracer_roughnesses, sols_no_lim)
+roughnesses_with_lim = map(tracer_roughnesses, sols_with_lim)
+ranges_no_lim = map(tracer_ranges, sols_no_lim)
+ranges_with_lim = map(tracer_ranges, sols_with_lim)
+
+# Check that upwinding has no effect on total mass.
+for ρ_errs_data in (ρ_errs_no_lim, ρ_errs_with_lim), ρ_err in ρ_errs_data
+    @test ρ_err == ρ_errs_no_lim.centered
+end
+
+# Check that upwinding has no effect on the constant tracer q5, and that the
+# other non-constant tracers are all conserved, accounting for round-off errors.
+for ρq_errs_data in (ρq_errs_no_lim, ρq_errs_with_lim), ρq_errs in ρq_errs_data
+    @test ρq_errs[5] ≈ ρ_errs_no_lim.centered atol = eps(FT)
+    @test all(ρq_errs[1:4] .< 40 * eps(FT))
+end
+
+# Check that using a limiter improves the "smoothness" of non-constant tracers.
+for (no_lim, with_lim) in zip(roughnesses_no_lim, roughnesses_with_lim)
+    @test all(with_lim[1:4] .< no_lim[1:4] .* 0.9999)
+end
+for (no_lim, with_lim) in zip(ranges_no_lim, ranges_with_lim)
+    @test all(with_lim[1:4] .< no_lim[1:4] .* 0.992)
+end
 
-# Check that the different upwinding modes with the limiter improve the "smoothness" of the tracers.
-#! format: off
-@testset "Test tracer properties" begin
-    @test compare_tracer_props(tracer_roughnesses(fct_sol)             , tracer_roughnesses(third_upwind_sol); buffer = 1.0)
-    @test compare_tracer_props(tracer_roughnesses(lim_third_upwind_sol), tracer_roughnesses(third_upwind_sol); buffer = 1.0)
-    @test compare_tracer_props(tracer_roughnesses(lim_fct_sol)         , tracer_roughnesses(third_upwind_sol); buffer = 0.93)
-    @test compare_tracer_props(tracer_ranges(fct_sol)                  , tracer_ranges(third_upwind_sol); buffer = 1.0)
-    @test compare_tracer_props(tracer_ranges(lim_third_upwind_sol)     , tracer_ranges(third_upwind_sol); buffer = 1.2)
-    @test compare_tracer_props(tracer_ranges(lim_fct_sol)              , tracer_ranges(third_upwind_sol); buffer = 1.0)
-    @test compare_tracer_props(tracer_ranges(lim_first_upwind_sol)     , tracer_ranges(third_upwind_sol); buffer = 0.6)
-    @test compare_tracer_props(tracer_ranges(lim_centered_sol)         , tracer_ranges(third_upwind_sol); buffer = 1.3)
+# Check that the relative effects of different upwinding schemes are consistent.
+for data in (roughnesses_no_lim, roughnesses_with_lim, ranges_no_lim, ranges_with_lim)
+    @test all((data.upwind1 .< data.tvd .< data.lvl .< data.fct .< data.upwind3)[1:4])
 end
-#! format: on
 
 ENV["GKSwstype"] = "nul"
 using ClimaCorePlots, Plots
 Plots.GRBackend()
 path = joinpath(@__DIR__, "output", "deformation_flow")
 mkpath(path)
-for (sol, suffix) in (
-    (lim_centered_sol, "_lim_centered"),
-    (lim_first_upwind_sol, "_lim_first_upwind"),
-    (third_upwind_sol, "_third_upwind"),
-    (fct_sol, "_fct"),
-    (tvd_sol, "_tvd"),
-    (lvl_sol, "_lvl"),
-    (lim_third_upwind_sol, "_lim_third_upwind"),
-    (lim_fct_sol, "_lim_fct"),
-    (lim_tvd_sol, "_lim_tvd"),
-    (lim_lvl_sol, "_lim_lvl"),
-)
-    for (sol_index, day) in ((1, 6), (2, 12))
+
+ref_final_q3 = upwind3_sol_with_lim[end].c.ρq.:3 ./ upwind3_sol_with_lim[end].c.ρ
+for (lim_suffix, sols) in (("no_lim", sols_no_lim), ("with_lim", sols_with_lim))
+    for (name, sol) in pairs(sols)
+        final_q3 = sol[end].c.ρq.:3 ./ sol[end].c.ρ
         Plots.png(
-            Plots.plot(
-                sol.u[sol_index].c.ρq.:3 ./ sol.u[sol_index].c.ρ,
-                level = 15,
-                clim = (-1, 1),
-            ),
-            joinpath(path, "q3_day$day$suffix.png"),
+            Plots.plot(final_q3, level = 15, clim = (-1, 1)),
+            joinpath(path, "q3_day12_$(name)_$(lim_suffix).png"),
         )
-    end
-end
-
-for (sol, suffix) in (
-    (lim_centered_sol, "_lim_centered"),
-    (lim_first_upwind_sol, "_lim_first_upwind"),
-    (third_upwind_sol, "_third_upwind"),
-    (fct_sol, "_fct"),
-    (tvd_sol, "_tvd"),
-    (lvl_sol, "_lvl"),
-    (lim_fct_sol, "_lim_fct"),
-    (lim_lvl_sol, "_lim_lvl"),
-)
-    for (sol_index, day) in ((1, 6), (2, 12))
+        sol === upwind3_sol_with_lim && continue # skip diff plot for reference
         Plots.png(
-            Plots.plot(
-                (
-                    ((sol.u[sol_index].c.ρq.:3) ./ sol.u[sol_index].c.ρ) .- (
-                        lim_third_upwind_sol[sol_index].c.ρq.:3 ./
-                        lim_third_upwind_sol[sol_index].c.ρ
-                    )
-                ),
-                level = 15,
-                clim = (-1, 1),
-            ),
-            joinpath(path, "q3_day_diff_$day$suffix.png"),
+            Plots.plot(final_q3 .- ref_final_q3, level = 15, clim = (-0.2, 0.2)),
+            joinpath(path, "q3_diff_day12_$(name)_$(lim_suffix).png"),
         )
     end
 end
diff --git a/ext/ClimaCoreCUDAExt.jl b/ext/ClimaCoreCUDAExt.jl
index c30283a099..2c5551b087 100644
--- a/ext/ClimaCoreCUDAExt.jl
+++ b/ext/ClimaCoreCUDAExt.jl
@@ -15,10 +15,7 @@ import ClimaCore.DataLayouts: mapreduce_cuda
 import ClimaCore.DataLayouts: ToCUDA
 import ClimaCore.DataLayouts: NoMask, IJHMask
 import ClimaCore.DataLayouts: slab, column
-import ClimaCore.Utilities: half
-import ClimaCore.Utilities: cart_ind, linear_ind
-import ClimaCore.RecursiveApply:
-    ⊠, ⊞, ⊟, radd, rmul, rsub, rdiv, rmap, rzero, rmin, rmax
+import ClimaCore.Utilities: half, new, cart_ind, linear_ind
 import ClimaCore.DataLayouts: get_N, get_Nv, get_Nij, get_Nij, get_Nh
 import ClimaCore.DataLayouts: UniversalSize
 
diff --git a/ext/cuda/column_matrix_helpers.jl b/ext/cuda/column_matrix_helpers.jl
index 443ddb832a..6466a80733 100644
--- a/ext/cuda/column_matrix_helpers.jl
+++ b/ext/cuda/column_matrix_helpers.jl
@@ -19,15 +19,15 @@ Base.@propagate_inbounds function row_mul_mat!(
     pd1, pd2 = MatrixFields.outer_diagonals(prod_eltype)
     li = 1i32
     ri = CUDA.blockDim().x - 1i32
-    zero_entry = rzero(eltype(prod_eltype))
+    zero_entry = zero(eltype(prod_eltype))
     prod_entries = UnrolledUtilities.unrolled_map((pd1:pd2...,)) do pd
         if v + pd < li || v + pd > ri
             zero_entry
         else
-            UnrolledUtilities.unrolled_mapreduce(⊞, (ld1:ud1...,)) do mat1_row_d
+            UnrolledUtilities.unrolled_mapreduce(+, (ld1:ud1...,)) do mat1_row_d
                 if ld2 <= pd - mat1_row_d <= ud2 &&
                    (0i32 < v + mat1_row_d + half <= CUDA.blockDim().x)
-                    @inbounds mat1_row[mat1_row_d] ⊠
+                    @inbounds mat1_row[mat1_row_d] *
                               matrix2[v + mat1_row_d + half + (i - 1i32) * CUDA.blockDim().x][pd - mat1_row_d]
                 else
                     zero_entry
@@ -55,15 +55,15 @@ Base.@propagate_inbounds function row_mul_mat!(
     pd1, pd2 = MatrixFields.outer_diagonals(prod_eltype)
     li = 1i32
     ri = CUDA.blockDim().x
-    zero_entry = rzero(eltype(prod_eltype))
+    zero_entry = zero(eltype(prod_eltype))
     prod_entries = UnrolledUtilities.unrolled_map((pd1:pd2...,)) do pd
         if v + pd < li || v + pd > ri
             zero_entry
         else
-            UnrolledUtilities.unrolled_mapreduce(⊞, (ld1:ud1...,)) do mat1_row_d
+            UnrolledUtilities.unrolled_mapreduce(+, (ld1:ud1...,)) do mat1_row_d
                 if ld2 <= pd - mat1_row_d <= ud2 &&
                    (0i32 < v + mat1_row_d - half < CUDA.blockDim().x)
-                    @inbounds mat1_row[mat1_row_d] ⊠
+                    @inbounds mat1_row[mat1_row_d] *
                               matrix2[v + mat1_row_d - half + (i - 1i32) * CUDA.blockDim().x][pd - mat1_row_d]
                 else
                     zero_entry
@@ -91,15 +91,15 @@ Base.@propagate_inbounds function row_mul_mat!(
     pd1, pd2 = MatrixFields.outer_diagonals(prod_eltype)
     li = 1i32
     ri = CUDA.blockDim().x - 1i32
-    zero_entry = rzero(eltype(prod_eltype))
+    zero_entry = zero(eltype(prod_eltype))
     prod_entries = UnrolledUtilities.unrolled_map((pd1:pd2...,)) do pd
         if v + pd < li || v + pd > ri
             zero_entry
         else
-            UnrolledUtilities.unrolled_mapreduce(⊞, (ld1:ud1...,)) do mat1_row_d
+            UnrolledUtilities.unrolled_mapreduce(+, (ld1:ud1...,)) do mat1_row_d
                 if ld2 <= pd - mat1_row_d <= ud2 &&
                    (0i32 < v + mat1_row_d <= CUDA.blockDim().x - 1i32)
-                    @inbounds mat1_row[mat1_row_d] ⊠
+                    @inbounds mat1_row[mat1_row_d] *
                               matrix2[v + mat1_row_d + (i - 1i32) * CUDA.blockDim().x][pd - mat1_row_d]
                 else
                     zero_entry
@@ -129,14 +129,14 @@ Base.@propagate_inbounds function row_mul_mat!(
     li = 1i32
     ri = CUDA.blockDim().x
 
-    zero_entry = rzero(eltype(prod_eltype))
+    zero_entry = zero(eltype(prod_eltype))
     prod_entries = UnrolledUtilities.unrolled_map((pd1:pd2...,)) do pd
         if v + pd < li || v + pd > ri
             zero_entry
         else
-            UnrolledUtilities.unrolled_mapreduce(⊞, (ld1:ud1...,)) do mat1_row_d
+            UnrolledUtilities.unrolled_mapreduce(+, (ld1:ud1...,)) do mat1_row_d
                 if ld2 <= pd - mat1_row_d <= ud2 && (0i32 < v + mat1_row_d <= CUDA.blockDim().x)
-                    @inbounds mat1_row[mat1_row_d] ⊠
+                    @inbounds mat1_row[mat1_row_d] *
                               matrix2[v + mat1_row_d + (i - 1i32) * CUDA.blockDim().x][pd - mat1_row_d]
                 else
                     zero_entry
@@ -164,15 +164,15 @@ Base.@propagate_inbounds function row_mul_mat!(
     pd1, pd2 = MatrixFields.outer_diagonals(prod_eltype)
     li = 1i32
     ri = CUDA.blockDim().x
-    zero_entry = rzero(eltype(prod_eltype))
+    zero_entry = zero(eltype(prod_eltype))
     prod_entries = UnrolledUtilities.unrolled_map((pd1:pd2...,)) do pd
         if v + pd + half < li || v + pd + half > ri
             zero_entry
         else
-            UnrolledUtilities.unrolled_mapreduce(⊞, (ld1:ud1...,)) do mat1_row_d
+            UnrolledUtilities.unrolled_mapreduce(+, (ld1:ud1...,)) do mat1_row_d
                 if ld2 <= pd - mat1_row_d <= ud2 &&
                    (0i32 < v + mat1_row_d + half <= CUDA.blockDim().x)
-                    @inbounds mat1_row[mat1_row_d] ⊠
+                    @inbounds mat1_row[mat1_row_d] *
                               matrix2[v + mat1_row_d + half + (i - 1i32) * CUDA.blockDim().x][pd - mat1_row_d]
                 else
                     zero_entry
@@ -200,15 +200,15 @@ Base.@propagate_inbounds function row_mul_mat!(
     pd1, pd2 = MatrixFields.outer_diagonals(prod_eltype)
     li = 1i32
     ri = CUDA.blockDim().x
-    zero_entry = rzero(eltype(prod_eltype))
+    zero_entry = zero(eltype(prod_eltype))
     prod_entries = UnrolledUtilities.unrolled_map((pd1:pd2...,)) do pd
         if v + pd + half < li || v + pd + half > ri
             zero_entry
         else
-            UnrolledUtilities.unrolled_mapreduce(⊞, (ld1:ud1...,)) do mat1_row_d
+            UnrolledUtilities.unrolled_mapreduce(+, (ld1:ud1...,)) do mat1_row_d
                 if ld2 <= pd - mat1_row_d <= ud2 &&
                    (0i32 < v + mat1_row_d - half < CUDA.blockDim().x)
-                    @inbounds mat1_row[mat1_row_d] ⊠
+                    @inbounds mat1_row[mat1_row_d] *
                               matrix2[v + mat1_row_d - half + (i - 1i32) * CUDA.blockDim().x][pd - mat1_row_d]
                 else
                     zero_entry
@@ -236,14 +236,14 @@ Base.@propagate_inbounds function row_mul_mat!(
     pd1, pd2 = MatrixFields.outer_diagonals(prod_eltype)
     li = 1i32
     ri = CUDA.blockDim().x
-    zero_entry = rzero(eltype(prod_eltype))
+    zero_entry = zero(eltype(prod_eltype))
     prod_entries = UnrolledUtilities.unrolled_map((pd1:pd2...,)) do pd
         if v + pd + half < li || v + pd + half > ri
             zero_entry
         else
-            UnrolledUtilities.unrolled_mapreduce(⊞, (ld1:ud1...,)) do mat1_row_d
+            UnrolledUtilities.unrolled_mapreduce(+, (ld1:ud1...,)) do mat1_row_d
                 if ld2 <= pd - mat1_row_d <= ud2 && (0i32 < v + mat1_row_d <= CUDA.blockDim().x)
-                    @inbounds mat1_row[mat1_row_d] ⊠
+                    @inbounds mat1_row[mat1_row_d] *
                               matrix2[v + mat1_row_d + (i - 1i32) * CUDA.blockDim().x][pd - mat1_row_d]
                 else
                     zero_entry
@@ -271,14 +271,14 @@ Base.@propagate_inbounds function row_mul_mat!(
     pd1, pd2 = MatrixFields.outer_diagonals(prod_eltype)
     li = 1i32
     ri = CUDA.blockDim().x
-    zero_entry = rzero(eltype(prod_eltype))
+    zero_entry = zero(eltype(prod_eltype))
     prod_entries = UnrolledUtilities.unrolled_map((pd1:pd2...,)) do pd
         if v + pd + half < li || v + pd + half > ri
             zero_entry
         else
-            UnrolledUtilities.unrolled_mapreduce(⊞, (ld1:ud1...,)) do mat1_row_d
+            UnrolledUtilities.unrolled_mapreduce(+, (ld1:ud1...,)) do mat1_row_d
                 if ld2 <= pd - mat1_row_d <= ud2 && (0i32 < v + mat1_row_d < CUDA.blockDim().x)
-                    @inbounds mat1_row[mat1_row_d] ⊠
+                    @inbounds mat1_row[mat1_row_d] *
                               matrix2[v + mat1_row_d + (i - 1i32) * CUDA.blockDim().x][pd - mat1_row_d]
                 else
                     zero_entry
@@ -307,9 +307,9 @@ Base.@propagate_inbounds function row_mul_vec!(
     ld1, ud1 = MatrixFields.outer_diagonals(mat1_eltype)
     li = 1i32
     ri = CUDA.blockDim().x - 1i32
-    zero_entry = rzero(prod_eltype)
+    zero_entry = zero(prod_eltype)
     return UnrolledUtilities.unrolled_mapreduce(
-        ⊞,
+        +,
         ld1:ud1;
         init = zero_entry,
     ) do mat1_row_d
@@ -338,9 +338,9 @@ Base.@propagate_inbounds function row_mul_vec!(
     ld1, ud1 = MatrixFields.outer_diagonals(mat1_eltype)
     li = 1i32
     ri = CUDA.blockDim().x
-    zero_entry = rzero(prod_eltype)
+    zero_entry = zero(prod_eltype)
     return UnrolledUtilities.unrolled_mapreduce(
-        ⊞,
+        +,
         ld1:ud1;
         init = zero_entry,
     ) do mat1_row_d
@@ -369,9 +369,9 @@ Base.@propagate_inbounds function row_mul_vec!(
     ld1, ud1 = MatrixFields.outer_diagonals(mat1_eltype)
     li = 1i32
     ri = CUDA.blockDim().x - 1i32
-    zero_entry = rzero(prod_eltype)
+    zero_entry = zero(prod_eltype)
     return UnrolledUtilities.unrolled_mapreduce(
-        ⊞,
+        +,
         ld1:ud1;
         init = zero_entry,
     ) do mat1_row_d
@@ -400,9 +400,9 @@ Base.@propagate_inbounds function row_mul_vec!(
     ld1, ud1 = MatrixFields.outer_diagonals(mat1_eltype)
     li = 1i32
     ri = CUDA.blockDim().x
-    zero_entry = rzero(prod_eltype)
+    zero_entry = zero(prod_eltype)
     return UnrolledUtilities.unrolled_mapreduce(
-        ⊞,
+        +,
         ld1:ud1;
         init = zero_entry,
     ) do mat1_row_d
@@ -420,23 +420,9 @@ end
 # Handles multiplication in row_mul_vec!.
-# Basically rmul, but some operators matrices require special handling
+# Basically `*`, but some operator matrices require special handling
 # general case
-Base.@propagate_inbounds outer_or_mul(x::T1, y::T2) where {T1, T2} = x ⊠ y
+Base.@propagate_inbounds outer_or_mul(x::T1, y::T2) where {T1, T2} = x * y
 # case for grad of a vec
 Base.@propagate_inbounds outer_or_mul(x::T1, y::T2) where {T1 <: AbstractVector, T2} = x ⊗ y
-Base.@propagate_inbounds outer_or_mul(
-    x::T1,
-    y::T2,
-) where {T1, T2 <: Union{Tuple, NamedTuple}} =
-    RecursiveApply.rmap(Base.Fix1(outer_or_mul, x), y)
-Base.@propagate_inbounds outer_or_mul(
-    x::T1,
-    y::T2,
-) where {T1 <: Union{Tuple, NamedTuple}, T2 <: Union{Tuple, NamedTuple}} = x ⊠ y
-Base.@propagate_inbounds outer_or_mul(
-    x::T1,
-    y::T2,
-) where {T1 <: AbstractVector, T2 <: Union{Tuple, NamedTuple}} =
-    RecursiveApply.rmap(Base.Fix1(outer_or_mul, x), y)
 # case for divgrad of a vec
 Base.@propagate_inbounds outer_or_mul(
     x::T1,
diff --git a/ext/cuda/limiters.jl b/ext/cuda/limiters.jl
index dc20ef5c4b..642dfe40ca 100644
--- a/ext/cuda/limiters.jl
+++ b/ext/cuda/limiters.jl
@@ -23,8 +23,12 @@ function compute_element_bounds!(
     ρ,
     dev::ClimaComms.CUDADevice,
 )
-    ρ_values = Fields.field_values(Operators.strip_space(ρ, axes(ρ)))
-    ρq_values = Fields.field_values(Operators.strip_space(ρq, axes(ρq)))
+    ρ_values = Base.broadcastable(
+        Fields.field_values(Operators.strip_space(ρ, axes(ρ))),
+    )
+    ρq_values = Base.broadcastable(
+        Fields.field_values(Operators.strip_space(ρq, axes(ρq))),
+    )
     (_, _, _, Nv, Nh) = DataLayouts.universal_size(ρ_values)
     nthreads, nblocks = config_threadblock(Nv, Nh)
 
@@ -53,13 +57,13 @@ function compute_element_bounds_kernel!(limiter, ρq, ρ)
         slab_ρ = slab(ρ, v, h)
         for j in 1:Nj
             for i in 1:Ni
-                q = rdiv(slab_ρq[slab_index(i, j)], slab_ρ[slab_index(i, j)])
+                q = slab_ρq[slab_index(i, j)] / slab_ρ[slab_index(i, j)]
                 if i == 1 && j == 1
                     q_min = q
                     q_max = q
                 else
-                    q_min = rmin(q_min, q)
-                    q_max = rmax(q_max, q)
+                    q_min = min(q_min, q)
+                    q_max = max(q_max, q)
                 end
             end
         end
@@ -107,7 +111,8 @@ function compute_neighbor_bounds_local_kernel!(
     tidx = thread_index()
     @inbounds if valid_range(tidx, prod(n))
         (v, h) = kernel_indexes(tidx, n).I
-        (; q_bounds, q_bounds_nbr, ghost_buffer, rtol) = limiter
+        (; q_bounds_nbr, ghost_buffer, rtol) = limiter
+        q_bounds = Base.broadcastable(limiter.q_bounds)
         slab_q_bounds = slab(q_bounds, v, h)
         q_min = slab_q_bounds[slab_index(1)]
         q_max = slab_q_bounds[slab_index(2)]
@@ -115,8 +120,8 @@ function compute_neighbor_bounds_local_kernel!(
             local_neighbor_elem_offset[h]:(local_neighbor_elem_offset[h + 1] - 1)
             h_nbr = local_neighbor_elem[lne]
             slab_q_bounds = slab(q_bounds, v, h_nbr)
-            q_min = rmin(q_min, slab_q_bounds[slab_index(1)])
-            q_max = rmax(q_max, slab_q_bounds[slab_index(2)])
+            q_min = min(q_min, slab_q_bounds[slab_index(1)])
+            q_max = max(q_max, slab_q_bounds[slab_index(2)])
         end
         slab_q_bounds_nbr = slab(q_bounds_nbr, v, h)
         slab_q_bounds_nbr[slab_index(1)] = q_min
diff --git a/ext/cuda/matrix_fields_multiple_field_solve.jl b/ext/cuda/matrix_fields_multiple_field_solve.jl
index c1bb2abde7..e573f3cc9e 100644
--- a/ext/cuda/matrix_fields_multiple_field_solve.jl
+++ b/ext/cuda/matrix_fields_multiple_field_solve.jl
@@ -15,8 +15,8 @@ NVTX.@annotate function multiple_field_solve!(
     x,
     A,
     b,
-    x1,
 )
+    x1 = first(values(x))
     names = MatrixFields.matrix_row_keys(keys(A))
     Nnames = length(names)
     Ni, Nj, _, _, Nh = size(Fields.field_values(x1))
@@ -29,13 +29,12 @@ NVTX.@annotate function multiple_field_solve!(
     xs = map(name -> ssx[name], names)
     As = map(name -> ssA[name, name], names)
     bs = map(name -> ssb[name], names)
-    x1 = first(xs)
 
     device = ClimaComms.device(x[first(names)])
 
     us = UniversalSize(Fields.field_values(x1))
     cart_inds = cartesian_indices_multiple_field_solve(us; Nnames)
-    args = (device, caches, xs, As, bs, x1, us, mask, cart_inds, Val(Nnames))
+    args = (device, caches, xs, As, bs, us, mask, cart_inds, Val(Nnames))
 
     nitems = Ni * Nj * Nh * Nnames
     (; threads, blocks) = config_via_occupancy(multiple_field_solve_kernel!, nitems, args)
@@ -46,7 +45,7 @@ NVTX.@annotate function multiple_field_solve!(
         blocks_s = blocks,
         always_inline = true,
     )
-    call_post_op_callback() && post_op_callback(x, dev, cache, x, A, b, x1)
+    call_post_op_callback() && post_op_callback(x, dev, cache, x, A, b)
 end
 
 Base.@propagate_inbounds column_A(A::UniformScaling, i, j, h) = A
@@ -83,7 +82,6 @@ function multiple_field_solve_kernel!(
     xs,
     As,
     bs,
-    x1,
     us::UniversalSize,
     mask,
     cart_inds,
diff --git a/ext/cuda/matrix_fields_single_field_solve.jl b/ext/cuda/matrix_fields_single_field_solve.jl
index b32000f190..fbb2ed5f42 100644
--- a/ext/cuda/matrix_fields_single_field_solve.jl
+++ b/ext/cuda/matrix_fields_single_field_solve.jl
@@ -11,7 +11,6 @@ import ClimaCore.DataLayouts: vindex
 import ClimaCore.MatrixFields: single_field_solve!
 import ClimaCore.MatrixFields: _single_field_solve!
 import ClimaCore.MatrixFields: band_matrix_solve!, unzip_tuple_field_values
-import ClimaCore.RecursiveApply: ⊠, ⊞, ⊟, rmap, rzero, rdiv
 
 function single_field_solve!(device::ClimaComms.CUDADevice, cache, x, A, b)
     Ni, Nj, _, _, Nh = size(Fields.field_values(A))
@@ -73,7 +72,7 @@ function _single_field_solve_diag_matrix_row!(
     b_data = Fields.field_values(b)
     Nv = DataLayouts.nlevels(x_data)
     @inbounds for v in 1:Nv
-        x_data[vi(v)] = inv(A₀[vi(v)]) ⊠ b_data[vi(v)]
+        x_data[vi(v)] = inv(A₀[vi(v)]) * b_data[vi(v)]
     end
 end
 
@@ -108,7 +107,7 @@ function _single_field_solve!(
     b_data = Fields.field_values(b)
     Nv = DataLayouts.nlevels(x_data)
     @inbounds for v in 1:Nv
-        x_data[vindex(v)] = inv(A.λ) ⊠ b_data[vindex(v)]
+        x_data[vindex(v)] = inv(A.λ) * b_data[vindex(v)]
     end
 end
 
@@ -121,7 +120,7 @@ function _single_field_solve!(
 )
     x_data = Fields.field_values(x)
     b_data = Fields.field_values(b)
-    x_data[] = inv(A.λ) ⊠ b_data[]
+    x_data[] = inv(A.λ) * b_data[]
 end
 
 using StaticArrays: MArray
@@ -207,7 +206,7 @@ function band_matrix_solve_local_mem!(
     Nv = DataLayouts.nlevels(x)
     (A₀,) = Aⱼs
     @inbounds for v in 1:Nv
-        x[vindex(v)] = inv(A₀[vindex(v)]) ⊠ b[vindex(v)]
+        x[vindex(v)] = inv(A₀[vindex(v)]) * b[vindex(v)]
     end
     return nothing
 end
diff --git a/ext/cuda/operators_fd_eager.jl b/ext/cuda/operators_fd_eager.jl
index 60580b05f4..df5c04e9d4 100644
--- a/ext/cuda/operators_fd_eager.jl
+++ b/ext/cuda/operators_fd_eager.jl
@@ -2,10 +2,9 @@ import ClimaCore: Spaces, Quadratures, Topologies, Operators
 import Base.Broadcast: Broadcasted
 import ClimaCore.Fields: Field, field_values, AbstractFieldStyle
 import ClimaComms
-import ClimaCore.Utilities: half
+import ClimaCore.Utilities: half, new
 import ClimaCore.Operators
 import ClimaCore.Geometry: ⊗, project
-import ClimaCore.RecursiveApply: rzero, ⊞, ⊠, rmuladd, rmap
 import ClimaCore.Operators:
     StencilBroadcasted, setidx!, getidx, reconstruct_placeholder_space
 import ClimaCore.MatrixFields: FaceToCenter, CenterToFace, Square, CenterToCenter,
@@ -263,7 +262,7 @@ Base.@propagate_inbounds function calc_level_val(
         CUDA.sync_threads()
         # if the output is on centers, the CUDA.blockDim().xth thread can just return 0
         mat1_space.staggering isa Spaces.CellCenter && v == CUDA.blockDim().x &&
-            return new_struct(eltype(bc))
+            return new(eltype(bc))
         if mat1_space.staggering isa Spaces.CellCenter
             mat1_shape =
                 eltype(ClimaCore.MatrixFields.outer_diagonals(typeof(mat1_row))) <:
@@ -308,7 +307,7 @@ Base.@propagate_inbounds function calc_level_val(
         h = blockIdx().z
         hidx = (i, j, h)
         if space.staggering isa Spaces.CellCenter
-            v == CUDA.blockDim().x && return @inline @inbounds new_struct(eltype(bc))
+            v == CUDA.blockDim().x && return @inline @inbounds new(eltype(bc))
         end
         li = space.staggering isa Spaces.CellCenter ? 1i32 : half
         idx = v - 1i32 + li
@@ -355,7 +354,7 @@ Base.@propagate_inbounds function calc_level_val(
     h = blockIdx().z
     hidx = (i, j, h)
     if space.staggering isa Spaces.CellCenter
-        v == CUDA.blockDim().x && return @inline @inbounds new_struct(eltype(bc))
+        v == CUDA.blockDim().x && return @inline @inbounds new(eltype(bc))
     end
     li = space.staggering isa Spaces.CellCenter ? 1i32 : half
     idx = v - 1i32 + li
@@ -366,7 +365,7 @@ end
     calc_level_val(f::Field, space)
 
 Returns the value of the field `f` at the thread's index.
-When the staggering of `space` is `CellCenter`, the thread with `v == CUDA.blockDim().x` returns `new_struct(eltype(f))`
+When the staggering of `space` is `CellCenter`, the thread with `v == CUDA.blockDim().x` returns `new(eltype(f))`
 """
 Base.@propagate_inbounds function calc_level_val(
     arg::F,
@@ -380,7 +379,7 @@ Base.@propagate_inbounds function calc_level_val(
     if space isa
        Union{Spaces.ExtrudedFiniteDifferenceSpace, Spaces.FiniteDifferenceSpace} &&
        space.staggering isa Spaces.CellCenter
-        v == CUDA.blockDim().x && return @inline @inbounds new_struct(eltype(data))
+        v == CUDA.blockDim().x && return @inline @inbounds new(eltype(data))
     end
     return @inline @inbounds data[CartesianIndex(i, j, 1i32, v, h)]
 end
@@ -421,7 +420,7 @@ Base.@propagate_inbounds function get_op_row(op, args, space)
     outputs_to_face = space.staggering isa ClimaCore.Grids.CellFace
     row_type = @inbounds @inline op_matrix_row_type(op, FT, args[1:(end - 1)]...)
     if !outputs_to_face && v == CUDA.blockDim().x
-        return new_struct(row_type)
+        return new(row_type)
     end
     v_half = outputs_to_face ? v - half : v
     in_left_bnd = Operators.should_call_left_boundary(v_half, space, op, nothing)
@@ -484,7 +483,7 @@ Base.@propagate_inbounds function project_row2_for_mul(mat1_row, mat2_row, space
     project_onto =
         ClimaCore.Geometry.recursively_find_dual_axes_for_projection(mat1_et)
     if space.staggering isa Spaces.CellCenter && v == CUDA.blockDim().x
-        lg = new_struct(Spaces.local_geometry_type(typeof(space)))
+        lg = new(Spaces.local_geometry_type(typeof(space)))
     else
         v_maybe_half = space.staggering isa Spaces.CellFace ? v - half : v
         @inbounds lg = Geometry.LocalGeometry(space, v_maybe_half, hidx)
@@ -503,20 +502,14 @@ end
 Recursively project `y` onto the axes in `projection_tuple[1]` using the local geometry in
 `projection_tuple[2]`.
 """
-Base.@propagate_inbounds recursively_project(
-    projection_tuple::T,
-    y::Y,
-) where {T, Y <: BandMatrixRow} = map(Base.Fix1(recursively_project, projection_tuple), y)
 Base.@propagate_inbounds recursively_project(projection_tuple::T, y::Y) where {T, Y} =
-    rmap(Base.Fix1(recursively_project, projection_tuple), y)
+    map(Base.Fix1(recursively_project, projection_tuple), y)
 Base.@propagate_inbounds recursively_project(
     projection_tuple::T,
     y::Y,
 ) where {T, Y <: AxisTensor} =
     @inbounds @inline project(projection_tuple[1], y, projection_tuple[2])
 
-@generated new_struct(::Type{T}) where {T} = Expr(:new, :T)
-
 if hasfield(Method, :recursion_relation)
     dont_limit = (args...) -> true
     for m in methods(recursively_project)
diff --git a/ext/cuda/operators_fd_shmem.jl b/ext/cuda/operators_fd_shmem.jl
index da1d3ffdd9..7fb8f3f317 100644
--- a/ext/cuda/operators_fd_shmem.jl
+++ b/ext/cuda/operators_fd_shmem.jl
@@ -1,8 +1,7 @@
-import ClimaCore: DataLayouts, Spaces, Geometry, RecursiveApply, DataLayouts
+import ClimaCore: DataLayouts, Spaces, Geometry, DataLayouts
 import CUDA
 import ClimaCore.Operators: return_eltype, get_local_geometry
 import ClimaCore.Geometry: ⊗
-import ClimaCore.RecursiveApply: ⊟, ⊞
 
 Base.@propagate_inbounds function fd_operator_shmem(
     space,
@@ -75,7 +74,7 @@ Base.@propagate_inbounds function fd_operator_evaluate(
         if !on_boundary(idx, space, op)
             Ju³₋ = Ju³[vt]   # corresponds to idx - half
             Ju³₊ = Ju³[vt + 1] # corresponds to idx + half
-            return (Ju³₊ ⊟ Ju³₋) ⊠ lg.invJ
+            return (Ju³₊ - Ju³₋) * lg.invJ
         else
             bloc =
                 on_left_boundary(idx, space, op) ?
@@ -87,7 +86,7 @@ Base.@propagate_inbounds function fd_operator_evaluate(
                 if bc isa Operators.SetValue
                     Ju³₋ = lJu³[1]   # corresponds to idx - half
                     Ju³₊ = Ju³[vt + 1] # corresponds to idx + half
-                    return (Ju³₊ ⊟ Ju³₋) ⊠ lg.invJ
+                    return (Ju³₊ - Ju³₋) * lg.invJ
                 else
                     # @assert bc isa Operators.SetDivergence
                     return lJu³[1]
@@ -97,7 +96,7 @@ Base.@propagate_inbounds function fd_operator_evaluate(
                 if bc isa Operators.SetValue
                     Ju³₋ = Ju³[vt]   # corresponds to idx - half
                     Ju³₊ = rJu³[1] # corresponds to idx + half
-                    return (Ju³₊ ⊟ Ju³₋) ⊠ lg.invJ
+                    return (Ju³₊ - Ju³₋) * lg.invJ
                 else
                     @assert bc isa Operators.SetDivergence
                     return rJu³[1]
@@ -174,7 +173,7 @@ Base.@propagate_inbounds function fd_operator_evaluate(
         if !on_boundary(idx, space, op)
             u₋ = u[vt - 1]   # corresponds to idx - half
             u₊ = u[vt] # corresponds to idx + half
-            return u₊ ⊟ u₋
+            return u₊ - u₋
         else
             bloc =
                 on_left_boundary(idx, space, op) ?
@@ -186,14 +185,14 @@ Base.@propagate_inbounds function fd_operator_evaluate(
                 if bc isa Operators.SetValue
                     u₋ = 2 * lb[1]   # corresponds to idx - half
                     u₊ = 2 * u[vt] # corresponds to idx + half
-                    return u₊ ⊟ u₋
+                    return u₊ - u₋
                 end
             else
                 @assert on_right_boundary(idx, space)
                 if bc isa Operators.SetValue
                     u₋ = 2 * u[vt - 1]   # corresponds to idx - half
                     u₊ = 2 * rb[1] # corresponds to idx + half
-                    return u₊ ⊟ u₋
+                    return u₊ - u₋
                 end
             end
         end
@@ -273,7 +272,7 @@ Base.@propagate_inbounds function fd_operator_evaluate(
         if !on_boundary(idx, space, op)
             u₋ = u[ᶜidx - 1]   # corresponds to idx - half
             u₊ = u[ᶜidx] # corresponds to idx + half
-            return RecursiveApply.rdiv(u₊ ⊞ u₋, 2)
+            return (u₊ + u₋) / 2
         else
             bloc =
                 on_left_boundary(idx, space, op) ?
@@ -289,7 +288,7 @@ Base.@propagate_inbounds function fd_operator_evaluate(
                 elseif bc isa Operators.SetGradient
                     u₋ = lb[1]   # corresponds to idx - half
                     u₊ = u[ᶜidx] # corresponds to idx + half
-                    return u₊ ⊟ RecursiveApply.rdiv(u₋, 2)
+                    return u₊ - u₋ / 2
                 else
                     @assert bc isa Operators.Extrapolate
                     return u[ᶜidx]
@@ -301,7 +300,7 @@ Base.@propagate_inbounds function fd_operator_evaluate(
                 elseif bc isa Operators.SetGradient
                     u₋ = u[ᶜidx - 1] # corresponds to idx - half
                     u₊ = rb[1]   # corresponds to idx + half
-                    return u₋ ⊞ RecursiveApply.rdiv(u₊, 2)
+                    return u₋ + u₊ / 2
                 else
                     @assert bc isa Operators.Extrapolate
                     return u[ᶜidx - 1]
diff --git a/ext/cuda/operators_fd_shmem_common.jl b/ext/cuda/operators_fd_shmem_common.jl
index 6aded04ead..cb978a084e 100644
--- a/ext/cuda/operators_fd_shmem_common.jl
+++ b/ext/cuda/operators_fd_shmem_common.jl
@@ -1,4 +1,4 @@
-import ClimaCore: DataLayouts, Spaces, Geometry, RecursiveApply, DataLayouts
+import ClimaCore: DataLayouts, Spaces, Geometry, DataLayouts
 import CUDA
 import ClimaCore.Operators: return_eltype, get_local_geometry
 import ClimaCore.Operators: getidx
@@ -9,18 +9,11 @@ import ClimaCore.Utilities
 ##### Boundary helpers
 #####
 
-@inline has_left_boundary(space, op) =
-    Operators.has_boundary(op, Operators.left_boundary_window(space))
-@inline has_right_boundary(space, op) =
-    Operators.has_boundary(op, Operators.right_boundary_window(space))
-
 @inline on_boundary(idx, space, op) =
     on_left_boundary(idx, space, op) || on_right_boundary(idx, space, op)
 
-@inline on_left_boundary(idx, space, op) =
-    has_left_boundary(space, op) && on_left_boundary(idx, space)
-@inline on_right_boundary(idx, space, op) =
-    has_right_boundary(space, op) && on_right_boundary(idx, space)
+@inline on_left_boundary(idx, space, op) = on_left_boundary(idx, space)
+@inline on_right_boundary(idx, space, op) = on_right_boundary(idx, space)
 
 @inline on_boundary(idx::PlusHalf, space) =
     idx == Operators.left_face_boundary_idx(space) ||
@@ -40,8 +33,7 @@ import ClimaCore.Utilities
     idx == Operators.right_center_boundary_idx(space)
 
 @inline on_any_boundary(idx, space, op) =
-    (has_left_boundary(space, op) && on_left_boundary(idx, space)) ||
-    has_right_boundary(space, op) && on_right_boundary(idx, space)
+    on_left_boundary(idx, space) || on_right_boundary(idx, space)
 
 @inline function is_out_of_bounds(idx::Integer, space)
     ᶜspace = Spaces.center_space(space)
diff --git a/ext/cuda/operators_sem_shmem.jl b/ext/cuda/operators_sem_shmem.jl
index 5a5a17ef03..6e8f308931 100644
--- a/ext/cuda/operators_sem_shmem.jl
+++ b/ext/cuda/operators_sem_shmem.jl
@@ -1,4 +1,4 @@
-import ClimaCore: DataLayouts, Spaces, Geometry, RecursiveApply, DataLayouts
+import ClimaCore: DataLayouts, Spaces, Geometry, DataLayouts
 import CUDA
 import ClimaCore.Operators:
     Divergence,
@@ -51,11 +51,8 @@ Base.@propagate_inbounds function operator_fill_shmem!(
     vt = threadIdx().z
     local_geometry = get_local_geometry(space, ij, slabidx)
     i, _ = ij.I
-    Jv¹[i, vt] =
-        local_geometry.J ⊠ RecursiveApply.rmap(
-            v -> Geometry.contravariant1(v, local_geometry),
-            arg,
-        )
+    (; J) = local_geometry
+    Jv¹[i, vt] = J * Geometry.contravariant1(arg, local_geometry)
 end
 Base.@propagate_inbounds function operator_fill_shmem!(
     op::Divergence{(1, 2)},
@@ -68,17 +65,9 @@ Base.@propagate_inbounds function operator_fill_shmem!(
     vt = threadIdx().z
     local_geometry = get_local_geometry(space, ij, slabidx)
     i, j = ij.I
-
-    Jv¹[i, j, vt] =
-        local_geometry.J ⊠ RecursiveApply.rmap(
-            v -> Geometry.contravariant1(v, local_geometry),
-            arg,
-        )
-    Jv²[i, j, vt] =
-        local_geometry.J ⊠ RecursiveApply.rmap(
-            v -> Geometry.contravariant2(v, local_geometry),
-            arg,
-        )
+    (; J) = local_geometry
+    Jv¹[i, j, vt] = J * Geometry.contravariant1(arg, local_geometry)
+    Jv²[i, j, vt] = J * Geometry.contravariant2(arg, local_geometry)
 end
 
 Base.@propagate_inbounds function operator_shmem(
@@ -122,11 +111,8 @@ Base.@propagate_inbounds function operator_fill_shmem!(
     vt = threadIdx().z
     local_geometry = get_local_geometry(space, ij, slabidx)
     i, _ = ij.I
-    WJv¹[i, vt] =
-        local_geometry.WJ ⊠ RecursiveApply.rmap(
-            v -> Geometry.contravariant1(v, local_geometry),
-            arg,
-        )
+    (; WJ) = local_geometry
+    WJv¹[i, vt] = WJ * Geometry.contravariant1(arg, local_geometry)
 end
 Base.@propagate_inbounds function operator_fill_shmem!(
     op::WeakDivergence{(1, 2)},
@@ -139,17 +125,9 @@ Base.@propagate_inbounds function operator_fill_shmem!(
     vt = threadIdx().z
     local_geometry = get_local_geometry(space, ij, slabidx)
     i, j = ij.I
-
-    WJv¹[i, j, vt] =
-        local_geometry.WJ ⊠ RecursiveApply.rmap(
-            v -> Geometry.contravariant1(v, local_geometry),
-            arg,
-        )
-    WJv²[i, j, vt] =
-        local_geometry.WJ ⊠ RecursiveApply.rmap(
-            v -> Geometry.contravariant2(v, local_geometry),
-            arg,
-        )
+    (; WJ) = local_geometry
+    WJv¹[i, j, vt] = WJ * Geometry.contravariant1(arg, local_geometry)
+    WJv²[i, j, vt] = WJ * Geometry.contravariant2(arg, local_geometry)
 end
 
 Base.@propagate_inbounds function operator_shmem(
@@ -198,11 +176,8 @@ Base.@propagate_inbounds function operator_fill_shmem!(
     vt = threadIdx().z
     local_geometry = get_local_geometry(space, ij, slabidx)
     i, _ = ij.I
-    Ju1[i, vt] =
-        local_geometry.J ⊠ RecursiveApply.rmap(
-            u -> Geometry.contravariant1(u, local_geometry),
-            arg1,
-        )
+    (; J) = local_geometry
+    Ju1[i, vt] = J * Geometry.contravariant1(arg1, local_geometry)
     psi[i, vt] = arg2
 end
 
@@ -218,16 +193,9 @@ Base.@propagate_inbounds function operator_fill_shmem!(
     vt = threadIdx().z
     local_geometry = get_local_geometry(space, ij, slabidx)
     i, j = ij.I
-    Ju1[i, j, vt] =
-        local_geometry.J ⊠ RecursiveApply.rmap(
-            u -> Geometry.contravariant1(u, local_geometry),
-            arg1,
-        )
-    Ju2[i, j, vt] =
-        local_geometry.J ⊠ RecursiveApply.rmap(
-            u -> Geometry.contravariant2(u, local_geometry),
-            arg1,
-        )
+    (; J) = local_geometry
+    Ju1[i, j, vt] = J * Geometry.contravariant1(arg1, local_geometry)
+    Ju2[i, j, vt] = J * Geometry.contravariant2(arg1, local_geometry)
     psi[i, j, vt] = arg2
 end
 
@@ -318,7 +286,7 @@ Base.@propagate_inbounds function operator_fill_shmem!(
     local_geometry = get_local_geometry(space, ij, slabidx)
     W = local_geometry.WJ * local_geometry.invJ
     i, _ = ij.I
-    Wf[i, vt] = W ⊠ arg
+    Wf[i, vt] = W * arg
 end
 Base.@propagate_inbounds function operator_fill_shmem!(
     op::WeakGradient{(1, 2)},
@@ -332,7 +300,7 @@ Base.@propagate_inbounds function operator_fill_shmem!(
     local_geometry = get_local_geometry(space, ij, slabidx)
     W = local_geometry.WJ * local_geometry.invJ
     i, j = ij.I
-    Wf[i, j, vt] = W ⊠ arg
+    Wf[i, j, vt] = W * arg
 end
 
 Base.@propagate_inbounds function operator_shmem(
@@ -515,14 +483,14 @@ Base.@propagate_inbounds function operator_fill_shmem!(
     RT = operator_return_eltype(op, typeof(arg))
     if RT <: Geometry.Contravariant3Vector
         _, Wv₂ = work
-        Wv₂[i, vt] = W ⊠ Geometry.covariant2(arg, local_geometry)
+        Wv₂[i, vt] = W * Geometry.covariant2(arg, local_geometry)
     elseif RT <: Geometry.Contravariant2Vector
         (Wv₃,) = work
-        Wv₃[i, vt] = W ⊠ Geometry.covariant3(arg, local_geometry)
+        Wv₃[i, vt] = W * Geometry.covariant3(arg, local_geometry)
     else
         _, Wv₂, Wv₃ = work
-        Wv₂[i, vt] = W ⊠ Geometry.covariant2(arg, local_geometry)
-        Wv₃[i, vt] = W ⊠ Geometry.covariant3(arg, local_geometry)
+        Wv₂[i, vt] = W * Geometry.covariant2(arg, local_geometry)
+        Wv₃[i, vt] = W * Geometry.covariant3(arg, local_geometry)
     end
 end
 Base.@propagate_inbounds function operator_fill_shmem!(
@@ -540,15 +508,15 @@ Base.@propagate_inbounds function operator_fill_shmem!(
     RT = operator_return_eltype(op, typeof(arg))
     if RT <: Geometry.Contravariant3Vector
         Wv₁, Wv₂ = work
-        Wv₁[i, j, vt] = W ⊠ Geometry.covariant1(arg, local_geometry)
-        Wv₂[i, j, vt] = W ⊠ Geometry.covariant2(arg, local_geometry)
+        Wv₁[i, j, vt] = W * Geometry.covariant1(arg, local_geometry)
+        Wv₂[i, j, vt] = W * Geometry.covariant2(arg, local_geometry)
     elseif RT <: Geometry.Contravariant12Vector
         (Wv₃,) = work
-        Wv₃[i, j, vt] = W ⊠ Geometry.covariant3(arg, local_geometry)
+        Wv₃[i, j, vt] = W * Geometry.covariant3(arg, local_geometry)
     else
         Wv₁, Wv₂, Wv₃ = work
-        Wv₁[i, j, vt] = W ⊠ Geometry.covariant1(arg, local_geometry)
-        Wv₂[i, j, vt] = W ⊠ Geometry.covariant2(arg, local_geometry)
-        Wv₃[i, j, vt] = W ⊠ Geometry.covariant3(arg, local_geometry)
+        Wv₁[i, j, vt] = W * Geometry.covariant1(arg, local_geometry)
+        Wv₂[i, j, vt] = W * Geometry.covariant2(arg, local_geometry)
+        Wv₃[i, j, vt] = W * Geometry.covariant3(arg, local_geometry)
     end
 end
diff --git a/ext/cuda/operators_spectral_element.jl b/ext/cuda/operators_spectral_element.jl
index a849f76f06..4988793a84 100644
--- a/ext/cuda/operators_spectral_element.jl
+++ b/ext/cuda/operators_spectral_element.jl
@@ -1,5 +1,5 @@
 import ClimaCore: Spaces, Quadratures, Topologies
-import ClimaCore: Operators, Geometry, Quadratures, RecursiveApply
+import ClimaCore: Operators, Geometry, Quadratures
 import ClimaComms
 using CUDA
 import ClimaCore.Operators: AbstractSpectralStyle, strip_space
@@ -198,11 +198,11 @@ Base.@propagate_inbounds function operator_evaluate(
 
     local_geometry = get_local_geometry(space, ij, slabidx)
 
-    DJv = D[i, 1] ⊠ Jv¹[1, vt]
+    DJv = D[i, 1] * Jv¹[1, vt]
     for k in 2:Nq
-        DJv = DJv ⊞ D[i, k] ⊠ Jv¹[k, vt]
+        DJv += D[i, k] * Jv¹[k, vt]
     end
-    return RecursiveApply.rmul(DJv, local_geometry.invJ)
+    return DJv * local_geometry.invJ
 end
 Base.@propagate_inbounds function operator_evaluate(
     op::Divergence{(1, 2)},
@@ -221,14 +221,14 @@ Base.@propagate_inbounds function operator_evaluate(
 
     local_geometry = get_local_geometry(space, ij, slabidx)
 
-    DJv = D[i, 1] ⊠ Jv¹[1, j, vt]
+    DJv = D[i, 1] * Jv¹[1, j, vt]
     for k in 2:Nq
-        DJv = DJv ⊞ D[i, k] ⊠ Jv¹[k, j, vt]
+        DJv += D[i, k] * Jv¹[k, j, vt]
     end
     for k in 1:Nq
-        DJv = DJv ⊞ D[j, k] ⊠ Jv²[i, k, vt]
+        DJv += D[j, k] * Jv²[i, k, vt]
     end
-    return RecursiveApply.rmul(DJv, local_geometry.invJ)
+    return DJv * local_geometry.invJ
 end
 
 Base.@propagate_inbounds function operator_evaluate(
@@ -248,11 +248,11 @@ Base.@propagate_inbounds function operator_evaluate(
 
     local_geometry = get_local_geometry(space, ij, slabidx)
 
-    Dᵀ₁WJv¹ = D[1, i] ⊠ WJv¹[1, vt]
+    Dᵀ₁WJv¹ = D[1, i] * WJv¹[1, vt]
     for k in 2:Nq
-        Dᵀ₁WJv¹ = Dᵀ₁WJv¹ ⊞ D[k, i] ⊠ WJv¹[k, vt]
+        Dᵀ₁WJv¹ += D[k, i] * WJv¹[k, vt]
     end
-    return ⊟(RecursiveApply.rdiv(Dᵀ₁WJv¹, local_geometry.WJ))
+    return -Dᵀ₁WJv¹ / local_geometry.WJ
 end
 Base.@propagate_inbounds function operator_evaluate(
     op::WeakDivergence{(1, 2)},
@@ -271,13 +271,13 @@ Base.@propagate_inbounds function operator_evaluate(
 
     local_geometry = get_local_geometry(space, ij, slabidx)
 
-    Dᵀ₁WJv¹ = D[1, i] ⊠ WJv¹[1, j, vt]
-    Dᵀ₂WJv² = D[1, j] ⊠ WJv²[i, 1, vt]
+    Dᵀ₁WJv¹ = D[1, i] * WJv¹[1, j, vt]
+    Dᵀ₂WJv² = D[1, j] * WJv²[i, 1, vt]
     for k in 2:Nq
-        Dᵀ₁WJv¹ = Dᵀ₁WJv¹ ⊞ D[k, i] ⊠ WJv¹[k, j, vt]
-        Dᵀ₂WJv² = Dᵀ₂WJv² ⊞ D[k, j] ⊠ WJv²[i, k, vt]
+        Dᵀ₁WJv¹ += D[k, i] * WJv¹[k, j, vt]
+        Dᵀ₂WJv² += D[k, j] * WJv²[i, k, vt]
     end
-    return ⊟(RecursiveApply.rdiv(Dᵀ₁WJv¹ ⊞ Dᵀ₂WJv², local_geometry.WJ))
+    return -(Dᵀ₁WJv¹ + Dᵀ₂WJv²) / local_geometry.WJ
 end
 
 Base.@propagate_inbounds function operator_evaluate(
@@ -294,20 +294,17 @@ Base.@propagate_inbounds function operator_evaluate(
     QS = Spaces.quadrature_style(space)
     Nq = Quadratures.degrees_of_freedom(QS)
     D = Quadratures.differentiation_matrix(FT, QS)
-    RT = Geometry.rmul_return_type(eltype(Ju1), eltype(psi))
+    RT = Geometry.mul_return_type(eltype(Ju1), eltype(psi))
 
     local_geometry = get_local_geometry(space, ij, slabidx)
 
     result = zero(RT)
     for j in 1:Nq
         j == i && continue
-        F1 = RecursiveApply.rdiv(
-            (Ju1[i, vt] ⊞ Ju1[j, vt]) ⊠ (psi[i, vt] ⊞ psi[j, vt]),
-            2,
-        )
-        result = result ⊞ D[i, j] ⊠ F1
+        result +=
+            D[i, j] * (Ju1[i, vt] + Ju1[j, vt]) * (psi[i, vt] + psi[j, vt]) / 2
     end
-    return result ⊠ local_geometry.invJ
+    return result * local_geometry.invJ
 end
 Base.@propagate_inbounds function operator_evaluate(
     op::SplitDivergence{(1, 2)},
@@ -323,28 +320,24 @@ Base.@propagate_inbounds function operator_evaluate(
     QS = Spaces.quadrature_style(space)
     Nq = Quadratures.degrees_of_freedom(QS)
     D = Quadratures.differentiation_matrix(FT, QS)
-    RT = Geometry.rmul_return_type(eltype(Ju1), eltype(psi))
+    RT = Geometry.mul_return_type(eltype(Ju1), eltype(psi))
 
     local_geometry = get_local_geometry(space, ij, slabidx)
 
     result = zero(RT)
     for k in 1:Nq
         k == i && continue
-        F1 = RecursiveApply.rdiv(
-            (Ju1[i, j, vt] ⊞ Ju1[k, j, vt]) ⊠ (psi[i, j, vt] ⊞ psi[k, j, vt]),
-            2,
-        )
-        result = result ⊞ D[i, k] ⊠ F1
+        result +=
+            D[i, k] *
+            (Ju1[i, j, vt] + Ju1[k, j, vt]) * (psi[i, j, vt] + psi[k, j, vt]) / 2
     end
     for k in 1:Nq
         k == j && continue
-        F2 = RecursiveApply.rdiv(
-            (Ju2[i, j, vt] ⊞ Ju2[i, k, vt]) ⊠ (psi[i, j, vt] ⊞ psi[i, k, vt]),
-            2,
-        )
-        result = result ⊞ D[j, k] ⊠ F2
+        result +=
+            D[j, k] *
+            (Ju2[i, j, vt] + Ju2[i, k, vt]) * (psi[i, j, vt] + psi[i, k, vt]) / 2
     end
-    return result ⊠ local_geometry.invJ
+    return result * local_geometry.invJ
 end
 
 Base.@propagate_inbounds function operator_evaluate(
@@ -363,9 +356,9 @@ Base.@propagate_inbounds function operator_evaluate(
     D = Quadratures.differentiation_matrix(FT, QS)
 
     @inbounds begin
-        ∂f∂ξ₁ = D[i, 1] ⊠ input[1, vt]
+        ∂f∂ξ₁ = D[i, 1] * input[1, vt]
         for k in 2:Nq
-            ∂f∂ξ₁ = ∂f∂ξ₁ ⊞ D[i, k] ⊠ input[k, vt]
+            ∂f∂ξ₁ += D[i, k] * input[k, vt]
         end
     end
     if eltype(input) <: Number
@@ -394,11 +387,11 @@ Base.@propagate_inbounds function operator_evaluate(
     D = Quadratures.differentiation_matrix(FT, QS)
 
     @inbounds begin
-        ∂f∂ξ₁ = D[i, 1] ⊠ input[1, j, vt]
-        ∂f∂ξ₂ = D[j, 1] ⊠ input[i, 1, vt]
+        ∂f∂ξ₁ = D[i, 1] * input[1, j, vt]
+        ∂f∂ξ₂ = D[j, 1] * input[i, 1, vt]
         for k in 2:Nq
-            ∂f∂ξ₁ = ∂f∂ξ₁ ⊞ D[i, k] ⊠ input[k, j, vt]
-            ∂f∂ξ₂ = ∂f∂ξ₂ ⊞ D[j, k] ⊠ input[i, k, vt]
+            ∂f∂ξ₁ += D[i, k] * input[k, j, vt]
+            ∂f∂ξ₂ += D[j, k] * input[i, k, vt]
         end
     end
     if eltype(input) <: Number
@@ -431,11 +424,11 @@ Base.@propagate_inbounds function operator_evaluate(
     local_geometry = get_local_geometry(space, ij, slabidx)
     W = local_geometry.WJ * local_geometry.invJ
 
-    Dᵀ₁Wf = D[1, i] ⊠ Wf[1, vt]
+    Dᵀ₁Wf = D[1, i] * Wf[1, vt]
     for k in 2:Nq
-        Dᵀ₁Wf = Dᵀ₁Wf ⊞ D[k, i] ⊠ Wf[k, vt]
+        Dᵀ₁Wf += D[k, i] * Wf[k, vt]
     end
-    return Geometry.Covariant1Vector(⊟(RecursiveApply.rdiv(Dᵀ₁Wf, W)))
+    return Geometry.Covariant1Vector(-Dᵀ₁Wf) / W
 end
 Base.@propagate_inbounds function operator_evaluate(
     op::WeakGradient{(1, 2)},
@@ -455,16 +448,13 @@ Base.@propagate_inbounds function operator_evaluate(
     local_geometry = get_local_geometry(space, ij, slabidx)
     W = local_geometry.WJ * local_geometry.invJ
 
-    Dᵀ₁Wf = D[1, i] ⊠ Wf[1, j, vt]
-    Dᵀ₂Wf = D[1, j] ⊠ Wf[i, 1, vt]
+    Dᵀ₁Wf = D[1, i] * Wf[1, j, vt]
+    Dᵀ₂Wf = D[1, j] * Wf[i, 1, vt]
     for k in 2:Nq
-        Dᵀ₁Wf = Dᵀ₁Wf ⊞ D[k, i] ⊠ Wf[k, j, vt]
-        Dᵀ₂Wf = Dᵀ₂Wf ⊞ D[k, j] ⊠ Wf[i, k, vt]
+        Dᵀ₁Wf += D[k, i] * Wf[k, j, vt]
+        Dᵀ₂Wf += D[k, j] * Wf[i, k, vt]
     end
-    return Geometry.Covariant12Vector(
-        ⊟(RecursiveApply.rdiv(Dᵀ₁Wf, W)),
-        ⊟(RecursiveApply.rdiv(Dᵀ₂Wf, W)),
-    )
+    return Geometry.Covariant12Vector(-Dᵀ₁Wf, -Dᵀ₂Wf) / W
 end
 
 Base.@propagate_inbounds function operator_evaluate(
@@ -485,35 +475,29 @@ Base.@propagate_inbounds function operator_evaluate(
 
     if length(work) == 2
         _, v₂ = work
-        D₁v₂ = D[i, 1] ⊠ v₂[1, vt]
+        D₁v₂ = D[i, 1] * v₂[1, vt]
         for k in 2:Nq
-            D₁v₂ = D₁v₂ ⊞ D[i, k] ⊠ v₂[k, vt]
+            D₁v₂ += D[i, k] * v₂[k, vt]
         end
-        return Geometry.Contravariant3Vector(
-            RecursiveApply.rmul(D₁v₂, local_geometry.invJ),
-        )
+        result = Geometry.Contravariant3Vector(D₁v₂)
     elseif length(work) == 1
         (v₃,) = work
-        D₁v₃ = D[i, 1] ⊠ v₃[1, vt]
+        D₁v₃ = D[i, 1] * v₃[1, vt]
         for k in 2:Nq
-            D₁v₃ = D₁v₃ ⊞ D[i, k] ⊠ v₃[k, vt]
+            D₁v₃ += D[i, k] * v₃[k, vt]
         end
-        return Geometry.Contravariant2Vector(
-            ⊟(RecursiveApply.rmul(D₁v₃, local_geometry.invJ)),
-        )
+        result = Geometry.Contravariant2Vector(-D₁v₃)
     else
         _, v₂, v₃ = work
-        D₁v₂ = D[i, 1] ⊠ v₂[1, vt]
-        D₁v₃ = D[i, 1] ⊠ v₃[1, vt]
+        D₁v₂ = D[i, 1] * v₂[1, vt]
+        D₁v₃ = D[i, 1] * v₃[1, vt]
         @simd for k in 2:Nq
-            D₁v₂ = D₁v₂ ⊞ D[i, k] ⊠ v₂[k, vt]
-            D₁v₃ = D₁v₃ ⊞ D[i, k] ⊠ v₃[k, vt]
+            D₁v₂ += D[i, k] * v₂[k, vt]
+            D₁v₃ += D[i, k] * v₃[k, vt]
         end
-        return Geometry.Contravariant23Vector(
-            ⊟(RecursiveApply.rmul(D₁v₃, local_geometry.invJ)),
-            RecursiveApply.rmul(D₁v₂, local_geometry.invJ),
-        )
+        result = Geometry.Contravariant23Vector(-D₁v₃, D₁v₂)
     end
+    return result * local_geometry.invJ
 end
 Base.@propagate_inbounds function operator_evaluate(
     op::Curl{(1, 2)},
@@ -533,45 +517,37 @@ Base.@propagate_inbounds function operator_evaluate(
 
     if length(work) == 2
         v₁, v₂ = work
-        D₁v₂ = D[i, 1] ⊠ v₂[1, j, vt]
-        D₂v₁ = D[j, 1] ⊠ v₁[i, 1, vt]
+        D₁v₂ = D[i, 1] * v₂[1, j, vt]
+        D₂v₁ = D[j, 1] * v₁[i, 1, vt]
         for k in 2:Nq
-            D₁v₂ = D₁v₂ ⊞ D[i, k] ⊠ v₂[k, j, vt]
-            D₂v₁ = D₂v₁ ⊞ D[j, k] ⊠ v₁[i, k, vt]
+            D₁v₂ += D[i, k] * v₂[k, j, vt]
+            D₂v₁ += D[j, k] * v₁[i, k, vt]
         end
-        return Geometry.Contravariant3Vector(
-            RecursiveApply.rmul(D₁v₂ ⊟ D₂v₁, local_geometry.invJ),
-        )
+        result = Geometry.Contravariant3Vector(D₁v₂ - D₂v₁)
     elseif length(work) == 1
         (v₃,) = work
-        D₁v₃ = D[i, 1] ⊠ v₃[1, j, vt]
-        D₂v₃ = D[j, 1] ⊠ v₃[i, 1, vt]
+        D₁v₃ = D[i, 1] * v₃[1, j, vt]
+        D₂v₃ = D[j, 1] * v₃[i, 1, vt]
         for k in 2:Nq
-            D₁v₃ = D₁v₃ ⊞ D[i, k] ⊠ v₃[k, j, vt]
-            D₂v₃ = D₂v₃ ⊞ D[j, k] ⊠ v₃[i, k, vt]
+            D₁v₃ += D[i, k] * v₃[k, j, vt]
+            D₂v₃ += D[j, k] * v₃[i, k, vt]
         end
-        return Geometry.Contravariant12Vector(
-            RecursiveApply.rmul(D₂v₃, local_geometry.invJ),
-            ⊟(RecursiveApply.rmul(D₁v₃, local_geometry.invJ)),
-        )
+        result = Geometry.Contravariant12Vector(D₂v₃, -D₁v₃)
     else
         v₁, v₂, v₃ = work
-        D₁v₂ = D[i, 1] ⊠ v₂[1, j, vt]
-        D₂v₁ = D[j, 1] ⊠ v₁[i, 1, vt]
-        D₁v₃ = D[i, 1] ⊠ v₃[1, j, vt]
-        D₂v₃ = D[j, 1] ⊠ v₃[i, 1, vt]
+        D₁v₂ = D[i, 1] * v₂[1, j, vt]
+        D₂v₁ = D[j, 1] * v₁[i, 1, vt]
+        D₁v₃ = D[i, 1] * v₃[1, j, vt]
+        D₂v₃ = D[j, 1] * v₃[i, 1, vt]
         @simd for k in 2:Nq
-            D₁v₂ = D₁v₂ ⊞ D[i, k] ⊠ v₂[k, j, vt]
-            D₂v₁ = D₂v₁ ⊞ D[j, k] ⊠ v₁[i, k, vt]
-            D₁v₃ = D₁v₃ ⊞ D[i, k] ⊠ v₃[k, j, vt]
-            D₂v₃ = D₂v₃ ⊞ D[j, k] ⊠ v₃[i, k, vt]
+            D₁v₂ += D[i, k] * v₂[k, j, vt]
+            D₂v₁ += D[j, k] * v₁[i, k, vt]
+            D₁v₃ += D[i, k] * v₃[k, j, vt]
+            D₂v₃ += D[j, k] * v₃[i, k, vt]
         end
-        return Geometry.Contravariant123Vector(
-            RecursiveApply.rmul(D₂v₃, local_geometry.invJ),
-            ⊟(RecursiveApply.rmul(D₁v₃, local_geometry.invJ)),
-            RecursiveApply.rmul(D₁v₂ ⊟ D₂v₁, local_geometry.invJ),
-        )
+        result = Geometry.Contravariant123Vector(D₂v₃, -D₁v₃, D₁v₂ - D₂v₁)
     end
+    return result * local_geometry.invJ
 end
 
 Base.@propagate_inbounds function operator_evaluate(
@@ -592,35 +568,29 @@ Base.@propagate_inbounds function operator_evaluate(
 
     if length(work) == 2
         _, Wv₂ = work
-        Dᵀ₁Wv₂ = D[1, i] ⊠ Wv₂[1, vt]
+        Dᵀ₁Wv₂ = D[1, i] * Wv₂[1, vt]
         for k in 2:Nq
-            Dᵀ₁Wv₂ = Dᵀ₁Wv₂ ⊞ D[k, i] ⊠ Wv₂[k, vt]
+            Dᵀ₁Wv₂ += D[k, i] * Wv₂[k, vt]
         end
-        return Geometry.Contravariant3Vector(
-            RecursiveApply.rdiv(⊟(Dᵀ₁Wv₂), local_geometry.WJ),
-        )
+        result = Geometry.Contravariant3Vector(-Dᵀ₁Wv₂)
     elseif length(work) == 1
         (Wv₃,) = work
-        Dᵀ₁Wv₃ = D[1, i] ⊠ Wv₃[1, vt]
+        Dᵀ₁Wv₃ = D[1, i] * Wv₃[1, vt]
         for k in 2:Nq
-            Dᵀ₁Wv₃ = Dᵀ₁Wv₃ ⊞ D[k, i] ⊠ Wv₃[k, vt]
+            Dᵀ₁Wv₃ += D[k, i] * Wv₃[k, vt]
         end
-        return Geometry.Contravariant2Vector(
-            RecursiveApply.rdiv(Dᵀ₁Wv₃, local_geometry.WJ),
-        )
+        result = Geometry.Contravariant2Vector(Dᵀ₁Wv₃)
     else
         _, Wv₂, Wv₃ = work
-        Dᵀ₁Wv₂ = D[1, i] ⊠ Wv₂[1, vt]
-        Dᵀ₁Wv₃ = D[1, i] ⊠ Wv₃[1, vt]
+        Dᵀ₁Wv₂ = D[1, i] * Wv₂[1, vt]
+        Dᵀ₁Wv₃ = D[1, i] * Wv₃[1, vt]
         @simd for k in 2:Nq
-            Dᵀ₁Wv₂ = Dᵀ₁Wv₂ ⊞ D[k, i] ⊠ Wv₂[k, vt]
-            Dᵀ₁Wv₃ = Dᵀ₁Wv₃ ⊞ D[k, i] ⊠ Wv₃[k, vt]
+            Dᵀ₁Wv₂ += D[k, i] * Wv₂[k, vt]
+            Dᵀ₁Wv₃ += D[k, i] * Wv₃[k, vt]
         end
-        return Geometry.Contravariant23Vector(
-            RecursiveApply.rdiv(Dᵀ₁Wv₃, local_geometry.WJ),
-            RecursiveApply.rdiv(⊟(Dᵀ₁Wv₂), local_geometry.WJ),
-        )
+        result = Geometry.Contravariant23Vector(Dᵀ₁Wv₃, -Dᵀ₁Wv₂)
     end
+    return result / local_geometry.WJ
 end
 Base.@propagate_inbounds function operator_evaluate(
     op::WeakCurl{(1, 2)},
@@ -640,43 +610,35 @@ Base.@propagate_inbounds function operator_evaluate(
 
     if length(work) == 2
         Wv₁, Wv₂ = work
-        Dᵀ₁Wv₂ = D[1, i] ⊠ Wv₂[1, j, vt]
-        Dᵀ₂Wv₁ = D[1, j] ⊠ Wv₁[i, 1, vt]
+        Dᵀ₁Wv₂ = D[1, i] * Wv₂[1, j, vt]
+        Dᵀ₂Wv₁ = D[1, j] * Wv₁[i, 1, vt]
         for k in 2:Nq
-            Dᵀ₁Wv₂ = Dᵀ₁Wv₂ ⊞ D[k, i] ⊠ Wv₂[k, j, vt]
-            Dᵀ₂Wv₁ = Dᵀ₂Wv₁ ⊞ D[k, j] ⊠ Wv₁[i, k, vt]
+            Dᵀ₁Wv₂ += D[k, i] * Wv₂[k, j, vt]
+            Dᵀ₂Wv₁ += D[k, j] * Wv₁[i, k, vt]
         end
-        return Geometry.Contravariant3Vector(
-            RecursiveApply.rdiv(Dᵀ₂Wv₁ ⊟ Dᵀ₁Wv₂, local_geometry.WJ),
-        )
+        result = Geometry.Contravariant3Vector(Dᵀ₂Wv₁ - Dᵀ₁Wv₂)
     elseif length(work) == 1
         (Wv₃,) = work
-        Dᵀ₁Wv₃ = D[1, i] ⊠ Wv₃[1, j, vt]
-        Dᵀ₂Wv₃ = D[1, j] ⊠ Wv₃[i, 1, vt]
+        Dᵀ₁Wv₃ = D[1, i] * Wv₃[1, j, vt]
+        Dᵀ₂Wv₃ = D[1, j] * Wv₃[i, 1, vt]
         for k in 2:Nq
-            Dᵀ₁Wv₃ = Dᵀ₁Wv₃ ⊞ D[k, i] ⊠ Wv₃[k, j, vt]
-            Dᵀ₂Wv₃ = Dᵀ₂Wv₃ ⊞ D[k, j] ⊠ Wv₃[i, k, vt]
+            Dᵀ₁Wv₃ += D[k, i] * Wv₃[k, j, vt]
+            Dᵀ₂Wv₃ += D[k, j] * Wv₃[i, k, vt]
         end
-        return Geometry.Contravariant12Vector(
-            ⊟(RecursiveApply.rdiv(Dᵀ₂Wv₃, local_geometry.WJ)),
-            RecursiveApply.rdiv(Dᵀ₁Wv₃, local_geometry.WJ),
-        )
+        result = Geometry.Contravariant12Vector(-Dᵀ₂Wv₃, Dᵀ₁Wv₃)
     else
         Wv₁, Wv₂, Wv₃ = work
-        Dᵀ₁Wv₂ = D[1, i] ⊠ Wv₂[1, j, vt]
-        Dᵀ₂Wv₁ = D[1, j] ⊠ Wv₁[i, 1, vt]
-        Dᵀ₁Wv₃ = D[1, i] ⊠ Wv₃[1, j, vt]
-        Dᵀ₂Wv₃ = D[1, j] ⊠ Wv₃[i, 1, vt]
+        Dᵀ₁Wv₂ = D[1, i] * Wv₂[1, j, vt]
+        Dᵀ₂Wv₁ = D[1, j] * Wv₁[i, 1, vt]
+        Dᵀ₁Wv₃ = D[1, i] * Wv₃[1, j, vt]
+        Dᵀ₂Wv₃ = D[1, j] * Wv₃[i, 1, vt]
         @simd for k in 2:Nq
-            Dᵀ₁Wv₂ = Dᵀ₁Wv₂ ⊞ D[k, i] ⊠ Wv₂[k, j, vt]
-            Dᵀ₂Wv₁ = Dᵀ₂Wv₁ ⊞ D[k, j] ⊠ Wv₁[i, k, vt]
-            Dᵀ₁Wv₃ = Dᵀ₁Wv₃ ⊞ D[k, i] ⊠ Wv₃[k, j, vt]
-            Dᵀ₂Wv₃ = Dᵀ₂Wv₃ ⊞ D[k, j] ⊠ Wv₃[i, k, vt]
+            Dᵀ₁Wv₂ += D[k, i] * Wv₂[k, j, vt]
+            Dᵀ₂Wv₁ += D[k, j] * Wv₁[i, k, vt]
+            Dᵀ₁Wv₃ += D[k, i] * Wv₃[k, j, vt]
+            Dᵀ₂Wv₃ += D[k, j] * Wv₃[i, k, vt]
         end
-        return Geometry.Contravariant123Vector(
-            ⊟(RecursiveApply.rdiv(Dᵀ₂Wv₃, local_geometry.WJ)),
-            RecursiveApply.rdiv(Dᵀ₁Wv₃, local_geometry.WJ),
-            RecursiveApply.rdiv(Dᵀ₂Wv₁ ⊟ Dᵀ₁Wv₂, local_geometry.WJ),
-        )
+        result = Geometry.Contravariant123Vector(-Dᵀ₂Wv₃, Dᵀ₁Wv₃, Dᵀ₂Wv₁ - Dᵀ₁Wv₂)
     end
+    return result / local_geometry.WJ
 end
diff --git a/ext/cuda/topologies_dss.jl b/ext/cuda/topologies_dss.jl
index 25dd5caa89..c02070e43a 100644
--- a/ext/cuda/topologies_dss.jl
+++ b/ext/cuda/topologies_dss.jl
@@ -262,8 +262,7 @@ function dss_transform_kernel!(
             local_geometry[loc],
             dss_weights[loc],
         )
-        perimeter_data[CI(p, 1, 1, level, elem)] =
-            Topologies.drop_vert_dim(eltype(perimeter_data), src)
+        perimeter_data[CI(p, 1, 1, level, elem)] = src
     end
     return nothing
 end
@@ -595,7 +594,7 @@ function Topologies.dss_1d!(
     nitems = Nv * nfaces
     threads = _max_threads_cuda()
     p = linear_partition(nitems, threads)
-    args = (data, local_geometry, dss_weights, nfaces)
+    args = (Base.broadcastable(data), local_geometry, dss_weights, nfaces)
     auto_launch!(
         dss_1d_kernel!,
         args;
@@ -621,7 +620,7 @@ function dss_1d_kernel!(data, local_geometry, dss_weights, nfaces)
                 local_geometry,
                 dss_weights,
                 left_idx,
-            ) ⊞ Topologies.dss_transform(
+            ) + Topologies.dss_transform(
                 data,
                 local_geometry,
                 dss_weights,
diff --git a/src/ClimaCore.jl b/src/ClimaCore.jl
index 01dd82287f..761dfbc782 100644
--- a/src/ClimaCore.jl
+++ b/src/ClimaCore.jl
@@ -8,7 +8,7 @@ include("DebugOnly/DebugOnly.jl")
 include("Utilities/Utilities.jl")
 include("interface.jl")
 include("devices.jl")
-include("RecursiveApply/RecursiveApply.jl")
+include("recursive_apply.jl")
 include("DataLayouts/DataLayouts.jl")
 include("Geometry/Geometry.jl")
 include("Domains/Domains.jl")
diff --git a/src/DataLayouts/DataLayouts.jl b/src/DataLayouts/DataLayouts.jl
index 81f07429fb..807e40a73e 100644
--- a/src/DataLayouts/DataLayouts.jl
+++ b/src/DataLayouts/DataLayouts.jl
@@ -73,8 +73,9 @@ using UnrolledUtilities
 
 import ..Utilities.Unrolled:
     unrolled_setindex, unrolled_insert, unrolled_map_with_inbounds
-import ..Utilities:
-    PlusHalf, unionall_type, replace_type_parameter, fieldtype_vals
+import ..Utilities: PlusHalf, unionall_type, replace_type_parameter
+import ..Utilities: fieldtype_vals, safe_eltype, unsafe_eltype, auto_broadcasted
+import ..Utilities: add_auto_broadcasters, drop_auto_broadcasters
 import ..DebugOnly: call_post_op_callback, post_op_callback
 import ..slab, ..slab_args, ..column, ..column_args, ..level, ..level_args
 export slab,
@@ -1616,6 +1617,10 @@ rebuild(data::AbstractData, ::Type{DA}) where {DA} =
 Base.copy(data::AbstractData) =
     union_all(singleton(data)){type_params(data)...}(copy(parent(data)))
 
+Base.reinterpret(::Type{S}, data::AbstractData{S}) where {S} = data
+Base.reinterpret(::Type{S}, data::AbstractData) where {S} =
+    union_all(singleton(data)){S, type_params(data)[2:end]...}(parent(data))
+
 # broadcast machinery
 include("non_extruded_broadcasted.jl")
 include("broadcast.jl")
diff --git a/src/DataLayouts/broadcast.jl b/src/DataLayouts/broadcast.jl
index a43f205148..bdbb2ac01e 100644
--- a/src/DataLayouts/broadcast.jl
+++ b/src/DataLayouts/broadcast.jl
@@ -338,7 +338,22 @@ Base.Broadcast.BroadcastStyle(
 ) where {Nv, Nij, A1, A2} =
     VIJHFStyle{Nv, Nij, promote_parent_array_type(A1, A2)}()
 
-Base.Broadcast.broadcastable(data::AbstractData) = data
+# Enable automatic nested broadcasting over supported types of iterators, in
+# addition to the standard broadcasting over array indices.
+Base.Broadcast.broadcastable(data::AbstractData) =
+    reinterpret(add_auto_broadcasters(eltype(data)), data)
+Base.Broadcast.broadcasted(style::DataStyle, f::F, args...) where {F} =
+    auto_broadcasted(style, f, args)
+
+Base.eltype(bc::Base.Broadcast.Broadcasted{<:DataStyle}) = unsafe_eltype(bc)
+
+# Remove all AutoBroadcaster wrappers when allocating a new AbstractData.
+Base.similar(bc::Base.Broadcast.Broadcasted{<:DataStyle}) =
+    similar(bc, drop_auto_broadcasters(safe_eltype(bc)))
+
+# Only allocate a new AbstractData if its concrete element type can be inferred.
+Base.copy(bc::Base.Broadcast.Broadcasted{<:DataStyle}) =
+    copyto!(similar(bc), bc)
 
 Base.@propagate_inbounds function slab(
     bc::Base.Broadcast.Broadcasted{DS},
diff --git a/src/DataLayouts/mapreduce.jl b/src/DataLayouts/mapreduce.jl
index 648a23aa76..111ef558c2 100644
--- a/src/DataLayouts/mapreduce.jl
+++ b/src/DataLayouts/mapreduce.jl
@@ -1,18 +1,22 @@
 # This is only defined for testing.
 function mapreduce_cuda end
 
-function Base.mapreduce(
+Base.mapreduce(
+    fn::F,
+    op::Op,
+    data::Union{AbstractData, Base.Broadcast.Broadcasted{<:DataStyle}},
+) where {F, Op} =
+    drop_auto_broadcasters(mapreduce_data(fn, op, Base.broadcastable(data)))
+
+function mapreduce_data(
     fn::F,
     op::Op,
     bc::BroadcastedUnionDataF{<:Any, A},
 ) where {F, Op, A}
-    mapreduce(op, 1) do v
-        Base.@_inline_meta
-        @inbounds fn(bc[])
-    end
+    @inbounds fn(bc[])
 end
 
-function Base.mapreduce(
+function mapreduce_data(
     fn::F,
     op::Op,
     bc::Union{
@@ -25,11 +29,11 @@ function Base.mapreduce(
     mapreduce(op, 1:Nh) do h
         Base.@_inline_meta
         slabview = @inbounds slab(bc, h)
-        mapreduce(fn, op, slabview)
+        mapreduce_data(fn, op, slabview)
     end
 end
 
-function Base.mapreduce(
+function mapreduce_data(
     fn::F,
     op::Op,
     bc::Union{
@@ -42,11 +46,15 @@ function Base.mapreduce(
     mapreduce(op, 1:Nh) do h
         Base.@_inline_meta
         slabview = @inbounds slab(bc, h)
-        mapreduce(fn, op, slabview)
+        mapreduce_data(fn, op, slabview)
     end
 end
 
-function Base.mapreduce(fn::F, op::Op, bc::IJF{S, Nij}) where {F, Op, S, Nij}
+function mapreduce_data(
+    fn::F,
+    op::Op,
+    bc::BroadcastedUnionIJF{<:Any, Nij, A},
+) where {F, Op, Nij, A}
     # mapreduce across DataSlab2D nodes
     mapreduce(op, Iterators.product(1:Nij, 1:Nij)) do (i, j)
         Base.@_inline_meta
@@ -56,7 +64,11 @@ function Base.mapreduce(fn::F, op::Op, bc::IJF{S, Nij}) where {F, Op, S, Nij}
     end
 end
 
-function Base.mapreduce(fn::F, op::Op, bc::IF{S, Ni}) where {F, Op, S, Ni}
+function mapreduce_data(
+    fn::F,
+    op::Op,
+    bc::BroadcastedUnionIF{<:Any, Ni, A},
+) where {F, Op, Ni, A}
     # mapreduce across DataSlab1D nodes
     mapreduce(op, 1:Ni) do i
         Base.@_inline_meta
@@ -66,7 +78,7 @@ function Base.mapreduce(fn::F, op::Op, bc::IF{S, Ni}) where {F, Op, S, Ni}
     end
 end
 
-function Base.mapreduce(
+function mapreduce_data(
     fn::F,
     op::Op,
     bc::BroadcastedUnionVF{<:Any, Nv, A},
@@ -80,7 +92,7 @@ function Base.mapreduce(
     end
 end
 
-function Base.mapreduce(
+function mapreduce_data(
     fn::F,
     op::Op,
     bc::Union{
@@ -93,11 +105,11 @@ function Base.mapreduce(
     mapreduce(op, Iterators.product(1:Ni, 1:Nh)) do (i, h)
         Base.@_inline_meta
         columnview = @inbounds column(bc, i, h)
-        mapreduce(fn, op, columnview)
+        mapreduce_data(fn, op, columnview)
     end
 end
 
-function Base.mapreduce(
+function mapreduce_data(
     fn::F,
     op::Op,
     bc::Union{
@@ -110,6 +122,6 @@ function Base.mapreduce(
     mapreduce(op, Iterators.product(1:Nij, 1:Nij, 1:Nh)) do (i, j, h)
         Base.@_inline_meta
         columnview = @inbounds column(bc, i, j, h)
-        mapreduce(fn, op, columnview)
+        mapreduce_data(fn, op, columnview)
     end
 end
diff --git a/src/Fields/Fields.jl b/src/Fields/Fields.jl
index 6f2b6759a8..efeb824844 100644
--- a/src/Fields/Fields.jl
+++ b/src/Fields/Fields.jl
@@ -25,14 +25,14 @@ import ..Spaces: nlevels, ncolumns
 import ..Spaces: get_mask, set_mask!
 import ..DataLayouts: AbstractMask
 import ..Geometry: Geometry, Cartesian12Vector
-import ..Utilities: PlusHalf, half
+import ..Utilities: PlusHalf, half, safe_eltype, unsafe_eltype
+import ..Utilities: drop_auto_broadcasters, auto_broadcasted
 
-using ..RecursiveApply
+using UnrolledUtilities
 using ClimaComms
 import Adapt
-import UnrolledUtilities: unrolled_map, unrolled_mapreduce, unrolled_findfirst, unrolled_all
 
-import StaticArrays, LinearAlgebra, Statistics, InteractiveUtils
+import StaticArrays, LinearAlgebra, Statistics
 
 """
     Field(values, space)
diff --git a/src/Fields/broadcast.jl b/src/Fields/broadcast.jl
index 011c425510..5ae30ac6b1 100644
--- a/src/Fields/broadcast.jl
+++ b/src/Fields/broadcast.jl
@@ -43,6 +43,23 @@ Base.Broadcast.BroadcastStyle(
     ::FieldStyle{DS2},
 ) where {DS1, DS2} = FieldStyle(Base.Broadcast.BroadcastStyle(DS1(), DS2()))
 
+"""
+    FieldConflict
+
+Analogue of the built-in `Broadcast.ArrayConflict` for Fields. Used in place of
+`Broadcast.Unknown` to call `Broadcast.broadcasted(::AbstractFieldStyle, ...)`.
+Without this broadcast style, such `broadcasted` methods would need more complex
+definitions that specialize on argument types, rather than just the style type.
+"""
+struct FieldConflict <: AbstractFieldStyle end
+
+Base.Broadcast.result_join(
+    ::AbstractFieldStyle,
+    ::AbstractFieldStyle,
+    ::Base.Broadcast.Unknown,
+    ::Base.Broadcast.Unknown,
+) = FieldConflict()
+
 # Override the recursive unrolling used in combine_styles (which can lead to
 # inference failures in broadcast expressions with more than 10 arguments) with
 # manual unrolling (which can have higher latency but is always inferrable).
@@ -57,67 +74,22 @@ Base.Broadcast.combine_styles(
     (arg1, arg2, arg3, args...),
 )
 
-Base.Broadcast.broadcastable(field::Field) = field
+# Define broadcastable/broadcasted/eltype/similar/copy to match DataStyle
+# broadcasting, except that copy also passes the mask of axes(bc) to copyto!.
+Base.Broadcast.broadcastable(field::Field) =
+    Field(Base.Broadcast.broadcastable(field_values(field)), axes(field))
+
+Base.Broadcast.broadcasted(style::AbstractFieldStyle, f::F, args...) where {F} =
+    auto_broadcasted(style, f, args)
 
 Base.eltype(bc::Base.Broadcast.Broadcasted{<:AbstractFieldStyle}) =
-    Base.Broadcast.combine_eltypes(bc.f, bc.args)
-
-# _first: recursively get the first element
-function _first end
-
-# If we haven't caught the datatype, then this
-# may just result in a method error-- but all
-# we're trying to do is throw a more helpful
-# error message. So, let's throw it here instead.
-_first(bc, ::Any) = throw(BroadcastInferenceError(bc))
-_first_data_layout(data::DataLayouts.VF) = data[CartesianIndex(1, 1, 1, 1, 1)]
-_first_data_layout(data::DataLayouts.DataF) = data[]
-_first(bc, x::Real) = x
-_first(bc, x::Geometry.LocalGeometry) = x
-_first(bc, data::DataLayouts.VF) = data[]
-_first(bc, field::Field) =
-    _first_data_layout(field_values(column(field, 1, 1, 1)))
-_first(bc, space::Spaces.AbstractSpace) =
-    _first_data_layout(field_values(column(space, 1, 1, 1)))
-_first(bc, x::Base.Broadcast.Broadcasted) = _first(bc, copy(x))
-_first(bc, x::Ref{T}) where {T} = x.x
-_first(bc, x::Tuple{T}) where {T} = x[1]
-
-function call_with_first(bc)
-    # Try calling with first applied to all arguments:
-    bc′ = Base.Broadcast.preprocess(nothing, bc)
-    first_args = map(arg -> _first(bc, arg), bc′.args)
-    bc.f(first_args...)
-end
-
-# we implement our own to avoid the type-widening code, and throw a more useful error
-struct BroadcastInferenceError <: Exception
-    bc::Base.Broadcast.Broadcasted
-end
-
-function Base.showerror(io::IO, err::BroadcastInferenceError)
-    print(io, "BroadcastInferenceError: cannot infer eltype.\n")
-    bc = err.bc
-    f = bc.f
-    eltypes = map(eltype, bc.args)
-    if !hasmethod(f, eltypes)
-        print(io, "  function $(f) does not have a method for $(eltypes)")
-    else
-        InteractiveUtils.code_warntype(io, f, eltypes)
-    end
-end
+    unsafe_eltype(bc)
 
-function Base.copy(
-    bc::Base.Broadcast.Broadcasted{Style},
-) where {Style <: AbstractFieldStyle}
-    ElType = eltype(bc)
-    if !Base.isconcretetype(ElType)
-        call_with_first(bc)
-        throw(BroadcastInferenceError(bc))
-    end
-    # We can trust it and defer to the simpler `copyto!`
-    return copyto!(similar(bc, ElType), bc, Spaces.get_mask(axes(bc)))
-end
+Base.similar(bc::Base.Broadcast.Broadcasted{<:AbstractFieldStyle}) =
+    similar(bc, drop_auto_broadcasters(safe_eltype(bc)))
+
+Base.copy(bc::Base.Broadcast.Broadcasted{<:AbstractFieldStyle}) =
+    copyto!(similar(bc), bc, Spaces.get_mask(axes(bc)))
 
 Base.@propagate_inbounds function slab(
     bc::Base.Broadcast.Broadcasted{Style},
@@ -216,15 +188,15 @@ Base.axes(bc::Base.Broadcast.Broadcasted{<:AbstractFieldStyle}) =
 _axes(bc, ::Nothing) = Base.Broadcast.combine_axes(bc.args...)
 _axes(bc, axes) = axes
 
-function Base.similar(
+Base.similar(
     bc::Base.Broadcast.Broadcasted{<:AbstractFieldStyle},
     ::Type{Eltype},
-) where {Eltype}
-    return Field(similar(todata(bc), Eltype), axes(bc))
-end
+) where {Eltype} = Field(Eltype, axes(bc))
 
-Base.similar(bc::Base.Broadcast.Broadcasted{<:AbstractFieldStyle}) =
-    Base.similar(bc, eltype(bc))
+Base.similar(
+    bc::Base.Broadcast.Broadcasted{<:FieldStyle},
+    ::Type{Eltype},
+) where {Eltype} = Field(similar(todata(bc), Eltype), axes(bc))
 
 @inline function Base.copyto!(
     dest::Field,
@@ -360,6 +332,7 @@ end
     return nothing
 end
 
+# By default, broadcasted Vals are put in Refs, leading to type instabilities
 Base.Broadcast.broadcasted(
     ::typeof(Base.literal_pow),
     ::typeof(^),
@@ -367,27 +340,6 @@ Base.Broadcast.broadcasted(
     ::Val{n},
 ) where {n} = Base.Broadcast.broadcasted(x -> Base.literal_pow(^, x, Val(n)), f)
 
-# Specialize handling of +, *, muladd, so that we can support broadcasting over NamedTuple element types
-# Required for ODE solvers
-
-Base.Broadcast.broadcasted(fs::AbstractFieldStyle, ::typeof(+), args...) =
-    Base.Broadcast.broadcasted(fs, RecursiveApply.:⊞, args...)
-
-Base.Broadcast.broadcasted(fs::AbstractFieldStyle, ::typeof(-), args...) =
-    Base.Broadcast.broadcasted(fs, RecursiveApply.:⊟, args...)
-
-Base.Broadcast.broadcasted(fs::AbstractFieldStyle, ::typeof(*), args...) =
-    Base.Broadcast.broadcasted(fs, RecursiveApply.:⊠, args...)
-
-Base.Broadcast.broadcasted(fs::AbstractFieldStyle, ::typeof(/), args...) =
-    Base.Broadcast.broadcasted(fs, RecursiveApply.rdiv, args...)
-
-Base.Broadcast.broadcasted(fs::AbstractFieldStyle, ::typeof(muladd), args...) =
-    Base.Broadcast.broadcasted(fs, RecursiveApply.rmuladd, args...)
-
-Base.Broadcast.broadcasted(fs::AbstractFieldStyle, ::typeof(zero), arg) =
-    Base.Broadcast.broadcasted(fs, RecursiveApply.rzero, arg)
-
 # Specialize handling of vector-based functions to automatically add LocalGeometry information
 function Base.Broadcast.broadcasted(
     fs::AbstractFieldStyle,
diff --git a/src/Fields/mapreduce.jl b/src/Fields/mapreduce.jl
index 1197d5902e..0ad71e4ac1 100644
--- a/src/Fields/mapreduce.jl
+++ b/src/Fields/mapreduce.jl
@@ -1,4 +1,5 @@
-Base.map(fn, fields::Field...) = Base.broadcast(fn, fields...)
+Base.map(fn, field::Field, fields::Field...) =
+    Base.broadcast(fn, field, fields...)
 Base.map!(fn, dest::Field, fields::Field...) =
     Base.broadcast!(fn, dest, fields...)
 
@@ -14,10 +15,9 @@ function local_sum(
     field::Union{Field, Base.Broadcast.Broadcasted{<:FieldStyle}},
     dev::ClimaComms.AbstractCPUDevice,
 )
-    result = Base.reduce(
-        RecursiveApply.radd,
+    result = Base.sum(
         Base.Broadcast.broadcasted(
-            RecursiveApply.rmul,
+            *,
             Spaces.weighted_jacobian(axes(field)),
             todata(field),
         ),
@@ -122,7 +122,7 @@ function Statistics.mean(
         DataLayouts.DataF((local_sum(field), Spaces.local_area(space)))
     ClimaComms.allreduce!(context, parent(data_combined), +)
     sum_v, area_v = data_combined[]
-    RecursiveApply.rdiv(sum_v, area_v)
+    return sum_v ./ area_v
 end
 Statistics.mean(fn, field::Field, ::ClimaComms.AbstractCPUDevice) =
     Statistics.mean(Base.Broadcast.broadcasted(fn, field))
diff --git a/src/Geometry/Geometry.jl b/src/Geometry/Geometry.jl
index 453f676f27..27238c3385 100644
--- a/src/Geometry/Geometry.jl
+++ b/src/Geometry/Geometry.jl
@@ -1,6 +1,5 @@
 module Geometry
 
-using ..RecursiveApply
 import LinearAlgebra
 import UnrolledUtilities: unrolled_findfirst
 
@@ -22,7 +21,8 @@ include("axistensors.jl")
 include("localgeometry.jl")
 include("conversions.jl")
 include("globalgeometry.jl")
-include("rmul_with_projection.jl")
+include("mul_with_projection.jl")
+include("auto_broadcaster_methods.jl")
 
 """
     Δz_metric_component(::Type{<:AbstractPoint})
diff --git a/src/Geometry/auto_broadcaster_methods.jl b/src/Geometry/auto_broadcaster_methods.jl
new file mode 100644
index 0000000000..d75856a7d5
--- /dev/null
+++ b/src/Geometry/auto_broadcaster_methods.jl
@@ -0,0 +1,48 @@
+import ..Utilities:
+    AutoBroadcaster, nested_broadcast, nested_broadcast_result_type
+
+# TODO: Avoid defining these methods by refactoring the Geometry module so that
+# all relevant functionality is expressed in terms of standard math operations
+
+(::Type{T})(x::AutoBroadcaster) where {T <: AxisTensor} = nested_broadcast(T, x)
+
+for f in (:covariant, :contravariant), n in (1, 2, 3)
+    @eval $(Symbol(f, n))(x::AutoBroadcaster, lg) =
+        nested_broadcast(Base.Fix2($(Symbol(f, n)), lg), x)
+end
+Jcontravariant3(x::AutoBroadcaster, lg) =
+    nested_broadcast(Base.Fix2(Jcontravariant3, lg), x)
+
+mul_with_projection(x::AutoBroadcaster, y::AutoBroadcaster, lg) =
+    nested_broadcast((x, y) -> mul_with_projection(x, y, lg), x, y)
+mul_with_projection(x::AutoBroadcaster, y, lg) =
+    nested_broadcast(x -> mul_with_projection(x, y, lg), x)
+mul_with_projection(x, y::AutoBroadcaster, lg) =
+    nested_broadcast(y -> mul_with_projection(x, y, lg), y)
+
+needs_projection(
+    ::Type{X},
+    ::Type{Y},
+) where {X <: AutoBroadcaster, Y <: AutoBroadcaster} =
+    needs_projection(eltype(X), eltype(Y))
+needs_projection(::Type{X}, ::Type{Y}) where {X <: AutoBroadcaster, Y} =
+    needs_projection(eltype(X), Y)
+needs_projection(::Type{X}, ::Type{Y}) where {X, Y <: AutoBroadcaster} =
+    needs_projection(X, eltype(Y))
+
+mul_return_type(
+    ::Type{X},
+    ::Type{Y},
+) where {X <: AutoBroadcaster, Y <: AutoBroadcaster} =
+    nested_broadcast_result_type(mul_return_type, X, Y)
+mul_return_type(::Type{X}, ::Type{Y}) where {X <: AutoBroadcaster, Y} =
+    nested_broadcast_result_type(Base.Fix2(mul_return_type, Y), X)
+mul_return_type(::Type{X}, ::Type{Y}) where {X, Y <: AutoBroadcaster} =
+    nested_broadcast_result_type(Base.Fix1(mul_return_type, X), Y)
+
+divergence_result_type(::Type{X}) where {X <: AutoBroadcaster} =
+    nested_broadcast_result_type(divergence_result_type, X)
+gradient_result_type(val, ::Type{X}) where {X <: AutoBroadcaster} =
+    nested_broadcast_result_type(Base.Fix1(gradient_result_type, val), X)
+curl_result_type(val, ::Type{X}) where {X <: AutoBroadcaster} =
+    nested_broadcast_result_type(Base.Fix1(curl_result_type, val), X)
diff --git a/src/Geometry/axistensors.jl b/src/Geometry/axistensors.jl
index 32f051af2d..c072d555c5 100644
--- a/src/Geometry/axistensors.jl
+++ b/src/Geometry/axistensors.jl
@@ -519,11 +519,9 @@ end
 end
 
 """
-    outer(x, y)
     x ⊗ y
 
-Compute the outer product of `x` and `y`. Typically `x` will be a vector, and
-`y` can be either a number, vector or tuple/named tuple.
+Shorthand for the outer product `x * y'`.
 
 ```julia
 # vector ⊗ scalar = vector
@@ -537,21 +535,7 @@ julia> [1.0,2.0] ⊗ [1.0,3.0]
 2×2 Matrix{Float64}:
  1.0  3.0
  2.0  6.0
-
-# vector ⊗ tuple = recursion
-julia> [1.0,2.0] ⊗ (1.0, (a=2.0, b=3.0))
-([1.0, 2.0], (a = [2.0, 4.0], b = [3.0, 6.0]))
 ```
 """
-function outer end
-const ⊗ = outer
-
-@inline function outer(x::AbstractVector, y::AbstractVector)
-    x * y'
-end
-@inline function outer(x::AbstractVector, y::Number)
-    x * y
-end
-@inline function outer(x::AbstractVector, y)
-    RecursiveApply.rmap(y -> x ⊗ y, y)
-end
+⊗(x, y) = x * y'
+const outer = ⊗ # For backwards compatibility with previous versions of ClimaCore
diff --git a/src/Geometry/rmul_with_projection.jl b/src/Geometry/mul_with_projection.jl
similarity index 63%
rename from src/Geometry/rmul_with_projection.jl
rename to src/Geometry/mul_with_projection.jl
index 108f41a751..e50985b370 100644
--- a/src/Geometry/rmul_with_projection.jl
+++ b/src/Geometry/mul_with_projection.jl
@@ -1,7 +1,5 @@
 import LinearAlgebra: Adjoint, AdjointAbsVec
-import .RecursiveApply: rmap, rmaptype
-# import LinearAlgebra: I, UniformScaling, Adjoint, AdjointAbsVec
-# Types that are treated as single values when using matrix fields.
+
 const SingleValue = Union{Number, AxisTensor, AdjointAxisTensor}
 
 """
@@ -17,17 +15,6 @@ mul_with_projection(x, y, _) = x * y
 mul_with_projection(x::Union{AdjointAxisVector, Axis2TensorOrAdj}, y::AxisTensor, lg) =
     x * project(dual(axes(x)[2]), y, lg)
 
-"""
-    rmul_with_projection(x, y, lg)
-
-Similar to `rmul(x, y)`, except that this version calls `mul_with_projection`
-instead of `*`.
-"""
-rmul_with_projection(x, y, lg) = rmap((x′, y′) -> mul_with_projection(x′, y′, lg), x, y)
-rmul_with_projection(x::SingleValue, y, lg) = rmap(y′ -> mul_with_projection(x, y′, lg), y)
-rmul_with_projection(x, y::SingleValue, lg) = rmap(x′ -> mul_with_projection(x′, y, lg), x)
-rmul_with_projection(x::SingleValue, y::SingleValue, lg) = mul_with_projection(x, y, lg)
-
 axis_tensor_type(::Type{T}, ::Type{Tuple{A1}}) where {T, A1} =
     AxisVector{T, A1, SVector{_length(A1), T}}
 function axis_tensor_type(::Type{T}, ::Type{Tuple{A1, A2}}) where {T, A1, A2}
@@ -48,37 +35,15 @@ axis2(::Type{<:AdjointAxis2Tensor{<:Any, <:Tuple{A, Any}}}) where {A} = A
 """
     needs_projection(::Type{X}, ::Type{Y})
 
-Returns `true` if multiplying an object of type `X` with an object of type `Y` would require
-projection. This always returns false if `X` or `Y` are a `Tuple` or `NamedTuple` with
-eltype any.
+Returns `true` if multiplying an object of type `X` with an object of type `Y`
+would require projection.
 """
-needs_projection(::Type{X}, ::Type{Y}) where {X <: Number, Y <: SingleValue} = false
-needs_projection(::Type{X}, ::Type{Y}) where {X <: SingleValue, Y <: SingleValue} = false
-function needs_projection(::Type{X}, ::Type{Y}) where {X, Y}
-    (eltype(X) === Any || eltype(Y) === Any) && return false
-    needs_projection(eltype(X), eltype(Y))
-end
+needs_projection(::Type{X}, ::Type{Y}) where {X, Y} = false
 needs_projection(
     ::Type{X},
     ::Type{Y},
 ) where {X <: Union{AdjointAxisVector, Axis2TensorOrAdj}, Y <: AxisTensor} =
     axes(X)[2] != Geometry.dual(axes(Y)[1])
-function needs_projection(
-    ::Type{X},
-    ::Type{Y},
-) where {X <: SingleValue, Y <: Union{Tuple, NamedTuple}}
-    X <: Number && return false
-    eltype(Y) === Any && return false
-    needs_projection(X, eltype(Y))
-end
-function needs_projection(
-    ::Type{X},
-    ::Type{Y},
-) where {X <: Union{Tuple, NamedTuple}, Y <: SingleValue}
-    Y <: Number && return false
-    eltype(X) === Any && return false
-    needs_projection(eltype(X), Y)
-end
 
 recursively_find_dual_axes_for_projection(
     ::Type{X},
@@ -147,22 +112,3 @@ mul_return_type(
     ::Type{X}, ::Type{Y},
 ) where {T1, T2, X <: Axis2TensorOrAdj{T1}, Y <: Axis2TensorOrAdj{T2}} =
     axis_tensor_type(promote_type(T1, T2), Tuple{axis1(X), axis2(Y)})
-
-"""
-    rmul_return_type(X, Y)
-
-Computes the return type of `rmul_with_projection(x, y, lg)`, where `x isa X`
-and `y isa Y`. This can also be used to obtain the return type of `rmul(x, y)`,
-although `rmul(x, y)` will throw an error when projection is necessary.
-
-Note that this is equivalent to calling the internal function `_return_type`:
-`Base._return_type(rmul_with_projection, Tuple{X, Y, LG})`, where `lg isa LG`.
-"""
-rmul_return_type(::Type{X}, ::Type{Y}) where {X, Y} =
-    rmaptype((X′, Y′) -> mul_return_type(X′, Y′), X, Y)
-rmul_return_type(::Type{X}, ::Type{Y}) where {X <: SingleValue, Y} =
-    rmaptype(Y′ -> mul_return_type(X, Y′), Y)
-rmul_return_type(::Type{X}, ::Type{Y}) where {X, Y <: SingleValue} =
-    rmaptype(X′ -> mul_return_type(X′, Y), X)
-rmul_return_type(::Type{X}, ::Type{Y}) where {X <: SingleValue, Y <: SingleValue} =
-    mul_return_type(X, Y)
diff --git a/src/Limiters/Limiters.jl b/src/Limiters/Limiters.jl
index 9859330fa3..6eef103a20 100644
--- a/src/Limiters/Limiters.jl
+++ b/src/Limiters/Limiters.jl
@@ -1,7 +1,6 @@
 module Limiters
 
 import ..DataLayouts, ..Topologies, ..Spaces, ..Fields
-import ..RecursiveApply: rdiv, rmin, rmax
 import ..DebugOnly: call_post_op_callback, post_op_callback
 import ClimaCore: slab
 
diff --git a/src/Limiters/quasimonotone.jl b/src/Limiters/quasimonotone.jl
index 35fba4fd92..6acc3d8524 100644
--- a/src/Limiters/quasimonotone.jl
+++ b/src/Limiters/quasimonotone.jl
@@ -1,6 +1,5 @@
 import ClimaComms
 import ..Operators
-import ..RecursiveApply: ⊠, ⊞, ⊟, rmap, rzero, rdiv
 import ..DataLayouts: slab_index
 import Adapt
 
@@ -186,8 +185,8 @@ function compute_element_bounds!(
     ρ,
     dev::ClimaComms.AbstractCPUDevice,
 )
-    ρ_data = Fields.field_values(ρ)
-    ρq_data = Fields.field_values(ρq)
+    ρ_data = Base.broadcastable(Fields.field_values(ρ))
+    ρq_data = Base.broadcastable(Fields.field_values(ρq))
     q_bounds = limiter.q_bounds
     (Ni, Nj, _, Nv, Nh) = size(ρq_data)
     for h in 1:Nh
@@ -197,16 +196,13 @@ function compute_element_bounds!(
             local q_min, q_max
             for j in 1:Nj
                 for i in 1:Ni
-                    q = rdiv(
-                        slab_ρq[slab_index(i, j)],
-                        slab_ρ[slab_index(i, j)],
-                    )
+                    q = slab_ρq[slab_index(i, j)] / slab_ρ[slab_index(i, j)]
                     if i == 1 && j == 1
                         q_min = q
                         q_max = q
                     else
-                        q_min = rmin(q_min, q)
-                        q_max = rmax(q_max, q)
+                        q_min = min(q_min, q)
+                        q_max = max(q_max, q)
                     end
                 end
             end
@@ -237,7 +233,7 @@ function compute_neighbor_bounds_local!(
     dev::ClimaComms.AbstractCPUDevice,
 )
     topology = Spaces.topology(axes(ρ))
-    q_bounds = limiter.q_bounds
+    q_bounds = Base.broadcastable(limiter.q_bounds)
     q_bounds_nbr = limiter.q_bounds_nbr
     (_, _, _, Nv, Nh) = size(q_bounds_nbr)
     for h in 1:Nh
@@ -247,8 +243,8 @@ function compute_neighbor_bounds_local!(
             q_max = slab_q_bounds[slab_index(2)]
             for h_nbr in Topologies.local_neighboring_elements(topology, h)
                 slab_q_bounds = slab(q_bounds, v, h_nbr)
-                q_min = rmin(q_min, slab_q_bounds[slab_index(1)])
-                q_max = rmax(q_max, slab_q_bounds[slab_index(2)])
+                q_min = min(q_min, slab_q_bounds[slab_index(1)])
+                q_max = max(q_max, slab_q_bounds[slab_index(2)])
             end
             slab_q_bounds_nbr = slab(q_bounds_nbr, v, h)
             slab_q_bounds_nbr[slab_index(1)] = q_min
@@ -275,8 +271,7 @@ function compute_neighbor_bounds_ghost!(
     q_bounds_nbr = limiter.q_bounds_nbr
     (_, _, _, Nv, Nh) = size(q_bounds_nbr)
     if limiter.ghost_buffer isa Topologies.GhostBuffer
-        q_bounds_ghost = limiter.ghost_buffer.recv_data
-
+        q_bounds_ghost = Base.broadcastable(limiter.ghost_buffer.recv_data)
         for h in 1:Nh
             for v in 1:Nv
                 slab_q_bounds = slab(q_bounds_nbr, v, h)
@@ -284,8 +279,8 @@ function compute_neighbor_bounds_ghost!(
                 q_max = slab_q_bounds[slab_index(2)]
                 for gidx in Topologies.ghost_neighboring_elements(topology, h)
                     ghost_slab_q_bounds = slab(q_bounds_ghost, v, gidx)
-                    q_min = rmin(q_min, ghost_slab_q_bounds[slab_index(1)])
-                    q_max = rmax(q_max, ghost_slab_q_bounds[slab_index(2)])
+                    q_min = min(q_min, ghost_slab_q_bounds[slab_index(1)])
+                    q_max = max(q_max, ghost_slab_q_bounds[slab_index(2)])
                 end
                 slab_q_bounds_nbr = slab(q_bounds_nbr, v, h)
                 slab_q_bounds_nbr[slab_index(1)] = q_min
diff --git a/src/MatrixFields/MatrixFields.jl b/src/MatrixFields/MatrixFields.jl
index 5be04a597c..69d1d006f1 100644
--- a/src/MatrixFields/MatrixFields.jl
+++ b/src/MatrixFields/MatrixFields.jl
@@ -16,9 +16,9 @@ for them:
 - Matrix-matrix multiplication, e.g., `@. matrix_field1 * matrix_field2`
 - Compatibility with `LinearAlgebra.I`, e.g., `@. matrix_field = (4I,)` or
     `@. matrix_field - (4I,)`
-- Integration with `RecursiveApply`, e.g., the entries of `matrix_field` can be
-    `Tuple`s or `NamedTuple`s instead of single values, which allows
-    `matrix_field` to represent multiple band matrices at the same time
+- Compatibility with generic data types, e.g., the entries of `matrix_field` can
+    be iterators instead of single values, which allows `matrix_field` to
+    represent multiple band matrices at the same time
 - Integration with `Operators`, e.g., the `matrix_field` that gets applied to
     the argument of any `FiniteDifferenceOperator` `op` can be obtained using
     the `FiniteDifferenceOperator` `operator_matrix(op)`
@@ -35,9 +35,6 @@ multiples of `LinearAlgebra.I`. This comes with the following functionality:
 - Addition and subtraction, e.g., `@. field_matrix1 + field_matrix2`
 - Matrix-vector multiplication, e.g., `@. field_matrix * field_vector`
 - Matrix-matrix multiplication, e.g., `@. field_matrix1 * field_matrix2`
-- Integration with `RecursiveApply`, e.g., the entries of `field_matrix` can be
-    specified either as matrix `Field`s of `Tuple`s or `NamedTuple`s, or as
-    separate matrix `Field`s of single values
 - The ability to solve linear equations using `FieldMatrixSolver`, which is a
     generalization of `ldiv!` that is designed to optimize solver performance
 """
@@ -54,10 +51,10 @@ import NVTX
 import Adapt
 using UnrolledUtilities
 
-import ..Utilities: PlusHalf, half
-import ..RecursiveApply:
-    rmap, rmaptype, rpromote_type, rzero, rconvert, radd, rsub, rmul, rdiv
-import ..RecursiveApply: ⊠, ⊞, ⊟
+import ..RecursiveApply: rzero
+import ..Utilities: PlusHalf, half, new
+import ..Utilities: AutoBroadcaster, is_auto_broadcastable, auto_broadcasted
+import ..Utilities: add_auto_broadcasters, drop_auto_broadcasters
 import ..DataLayouts
 import ..DataLayouts: AbstractData
 import ..DataLayouts: vindex
@@ -67,11 +64,7 @@ import ..Spaces
 import ..Spaces: local_geometry_type
 import ..Fields
 import ..Operators
-using ..Geometry:
-    rmul_with_projection,
-    mul_with_projection,
-    axis_tensor_type,
-    rmul_return_type
+using ..Geometry: mul_with_projection, mul_return_type, axis_tensor_type
 
 export DiagonalMatrixRow,
     BidiagonalMatrixRow,
@@ -104,41 +97,18 @@ include("field_matrix_solver.jl")
 include("field_matrix_iterative_solver.jl")
 include("field_matrix_with_solver.jl")
 
-const FieldOrStencilStyleType = Union{
-    Fields.Field,
-    Base.Broadcast.Broadcasted{<:Fields.AbstractFieldStyle},
-    Operators.StencilBroadcasted,
-    LazyOperatorBroadcasted,
-}
-
-function Base.Broadcast.broadcasted(
-    ::typeof(*),
-    field_or_broadcasted::FieldOrStencilStyleType,
-    args...,
-)
+# Evaluate multiplications in left-associative order. This should technically be
+# right-associative, but flipping the order worsens performance in GPU kernels,
+# where the second argument of each matrix product is cached. Left-associativity
+# makes the first argument grow in bandwidth when multiplying a chain of
+# matrices, whereas right-associativity makes the second argument grow instead.
+Base.broadcasted(::Fields.AbstractFieldStyle, ::typeof(*), arg, args...) =
+    unrolled_reduce((x, y) -> Base.broadcasted(*, x, y), args; init = arg)
 
-    unrolled_reduce(args; init = field_or_broadcasted) do arg1, arg2
-        arg1_isa_matrix =
-            arg1 isa LazyOperatorBroadcasted && length(arg1.args) > 0 ?
-            eltype(arg1.args[1]) <: BandMatrixRow ||
-            arg1.args[1] isa LazyOperatorBroadcasted :
-            eltype(arg1) <: BandMatrixRow || arg1 isa LazyOperatorBroadcasted
-        use_matrix_mul_op = arg1_isa_matrix && arg2 isa FieldOrStencilStyleType
-        op = use_matrix_mul_op ? MultiplyColumnwiseBandMatrixField() : ⊠
-        Base.Broadcast.broadcasted(op, arg1, arg2)
-    end
-end
-Base.Broadcast.broadcasted(
-    ::typeof(*),
-    single_value_or_broadcasted::SingleValueStyleType,
-    field_or_broadcasted::FieldOrStencilStyleType,
-    args...,
-) = Base.Broadcast.broadcasted(
-    ⊠,
-    single_value_or_broadcasted,
-    Base.Broadcast.broadcasted(*, field_or_broadcasted, args...),
-)
-# TODO: Generalize this to handle, e.g., @. scalar * scalar * matrix * matrix.
+Base.broadcasted(style::Fields.AbstractFieldStyle, ::typeof(*), x, y) =
+    check_entry(FieldNamePair, x) && check_entry(FieldName, y) ?
+    Base.broadcasted(MultiplyColumnwiseBandMatrixField(), x, y) :
+    auto_broadcasted(style, *, (x, y))
 
 function Base.show(io::IO, field::ColumnwiseBandMatrixField)
     print(io, eltype(field), "-valued Field")
diff --git a/src/MatrixFields/band_matrix_row.jl b/src/MatrixFields/band_matrix_row.jl
index 0b795fc091..d8d4b727e8 100644
--- a/src/MatrixFields/band_matrix_row.jl
+++ b/src/MatrixFields/band_matrix_row.jl
@@ -16,14 +16,12 @@ several aliases for commonly used subtypes of `BandMatrixRow`:
 struct BandMatrixRow{ld, bw, T} # bw is the bandwidth (the number of diagonals)
     entries::NTuple{bw, T}
     BandMatrixRow{ld, bw, T}(entries::NTuple{bw, Any}) where {ld, bw, T} =
-        new{ld, bw, T}(rconvert(NTuple{bw, T}, entries))
-    # TODO: Remove this inner constructor once Julia's default convert function
-    # is type-stable for nested Tuple/NamedTuple types.
+        new{ld, bw, T}(entries)
 end
 BandMatrixRow{ld}(entries::Vararg{Any, bw}) where {ld, bw} =
     BandMatrixRow{ld, bw}(entries...)
 BandMatrixRow{ld, bw}(entries::Vararg{Any, bw}) where {ld, bw} =
-    BandMatrixRow{ld, bw, rpromote_type(map(typeof, entries)...)}(entries)
+    BandMatrixRow{ld, bw, promote_type(map(typeof, entries)...)}(entries)
 
 const DiagonalMatrixRow{T} = BandMatrixRow{0, 1, T}
 const BidiagonalMatrixRow{T} = BandMatrixRow{-1 + half, 2, T}
@@ -74,7 +72,7 @@ function Base.promote_rule(
          row type $BMR1 and the $(ld2 isa PlusHalf ? "non-" : "")square matrix \
          row type $BMR2 to a common type",
     )
-    T = rpromote_type(eltype(BMR1), eltype(BMR2))
+    T = promote_type(eltype(BMR1), eltype(BMR2))
     return band_matrix_row_type(min(ld1, ld2), max(ud1, ud2), T)
 end
 
@@ -84,6 +82,7 @@ Base.promote_rule(
 ) where {BMR <: BandMatrixRow, US <: UniformScaling} =
     promote_rule(BMR, DiagonalMatrixRow{eltype(US)})
 
+Base.convert(::Type{BMR}, row::BMR) where {BMR <: BandMatrixRow} = row
 function Base.convert(
     ::Type{BMR},
     row::BandMatrixRow,
@@ -95,12 +94,13 @@ function Base.convert(
          row of type $(typeof(row)) to the \
          $(new_ld isa PlusHalf ? "non-" : "")square matrix row type $BMR",
     )
+    new_ld == old_ld && new_ud == old_ud && return BMR(row.entries)
     new_ld <= old_ld && new_ud >= old_ud || error(
         "Cannot convert a $(typeof(row)) to a $BMR, since that would require \
          dropping potentially non-zero row entries",
     )
-    first_zeros = ntuple(_ -> rzero(eltype(BMR)), Val(old_ld - new_ld))
-    last_zeros = ntuple(_ -> rzero(eltype(BMR)), Val(new_ud - old_ud))
+    first_zeros = ntuple(Returns(zero(eltype(BMR))), Val(old_ld - new_ld))
+    last_zeros = ntuple(Returns(zero(eltype(BMR))), Val(new_ud - old_ud))
     return BMR((first_zeros..., row.entries..., last_zeros...))
 end
 
@@ -116,44 +116,49 @@ Base.:(==)(row1::BandMatrixRow, row2::UniformScaling) =
 Base.:(==)(row1::UniformScaling, row2::BandMatrixRow) =
     ==(promote(row1, row2)...)
 
-Base.:+(row::BandMatrixRow) = map(radd, row)
+Base.:+(row::BandMatrixRow) = map(+, row)
 Base.:+(row1::BandMatrixRow, row2::BandMatrixRow) =
-    map(radd, promote(row1, row2)...)
+    map(+, promote(row1, row2)...)
 Base.:+(row1::BandMatrixRow, row2::UniformScaling) =
-    map(radd, promote(row1, row2)...)
+    map(+, promote(row1, row2)...)
 Base.:+(row1::UniformScaling, row2::BandMatrixRow) =
-    map(radd, promote(row1, row2)...)
+    map(+, promote(row1, row2)...)
 
-Base.:-(row::BandMatrixRow) = map(rsub, row)
+Base.:-(row::BandMatrixRow) = map(-, row)
 Base.:-(row1::BandMatrixRow, row2::BandMatrixRow) =
-    map(rsub, promote(row1, row2)...)
+    map(-, promote(row1, row2)...)
 Base.:-(row1::BandMatrixRow, row2::UniformScaling) =
-    map(rsub, promote(row1, row2)...)
+    map(-, promote(row1, row2)...)
 Base.:-(row1::UniformScaling, row2::BandMatrixRow) =
-    map(rsub, promote(row1, row2)...)
+    map(-, promote(row1, row2)...)
 
-Base.:*(row::BandMatrixRow, value::Geometry.SingleValue) =
-    map(entry -> rmul(entry, value), row)
-Base.:*(value::Geometry.SingleValue, row::BandMatrixRow) =
-    map(entry -> rmul(value, entry), row)
+Base.:*(row::BandMatrixRow, value::Union{Geometry.SingleValue, AutoBroadcaster}) =
+    map(Base.Fix2(*, value), row)
+Base.:*(value::Union{Geometry.SingleValue, AutoBroadcaster}, row::BandMatrixRow) =
+    map(Base.Fix1(*, value), row)
 
-Base.:/(row::BandMatrixRow, value::Number) =
-    map(entry -> rdiv(entry, value), row)
+Base.:/(row::BandMatrixRow, value::Union{Geometry.SingleValue, AutoBroadcaster}) =
+    map(Base.Fix2(/, value), row)
 
 Base.zero(row::BandMatrixRow) = zero(typeof(row))
 Base.zero(::Type{BandMatrixRow{ld, bw, T}}) where {ld, bw, T} =
-    BandMatrixRow{ld}(ntuple(_ -> rzero(T), Val(bw))...)
+    BandMatrixRow{ld}(ntuple(Returns(zero(T)), Val(bw))...)
 
 Base.one(row::BandMatrixRow) = one(typeof(row))
-Base.one(::Type{DiagonalMatrixRow{T}}) where {T} =
-    DiagonalMatrixRow(rmap(one, T))
+Base.one(::Type{DiagonalMatrixRow{T}}) where {T} = DiagonalMatrixRow(one(T))
 Base.one(::Type{BandMatrixRow{ld, bw, T}}) where {ld, bw, T} =
     ld isa PlusHalf ?
     error("A non-square matrix does not have a corresponding identity matrix") :
     one(DiagonalMatrixRow{T})
 
-inv(row::DiagonalMatrixRow) = DiagonalMatrixRow(rmap(inv, row[0]))
+inv(row::DiagonalMatrixRow) = DiagonalMatrixRow(inv(row[0]))
 inv(::BandMatrixRow{ld, bw}) where {ld, bw} = error(
     "The inverse of a matrix with $bw diagonals is generally a dense matrix, \
      so it cannot be represented using BandMatrixRows",
 )
+
+# Allow row entries to be wrapped in AutoBroadcasters, but not the row itself.
+is_auto_broadcastable(::Type{BMR}) where {BMR <: BandMatrixRow} =
+    is_auto_broadcastable(eltype(BMR))
+add_auto_broadcasters(row::BandMatrixRow) = map(add_auto_broadcasters, row)
+drop_auto_broadcasters(row::BandMatrixRow) = map(drop_auto_broadcasters, row)
diff --git a/src/MatrixFields/field_name_dict.jl b/src/MatrixFields/field_name_dict.jl
index cfb24a38f3..589f16e37b 100644
--- a/src/MatrixFields/field_name_dict.jl
+++ b/src/MatrixFields/field_name_dict.jl
@@ -75,9 +75,7 @@ is_field_broadcasted(bc) =
 check_entry(::Type{FieldName}, entry::Base.AbstractBroadcasted) =
     is_field_broadcasted(entry)
 check_entry(::Type{FieldNamePair}, entry::Base.AbstractBroadcasted) =
-    is_field_broadcasted(entry) # && eltype(entry) <: BandMatrixRow
-# TODO: Adding the eltype check introduces JET failures to several FieldMatrix
-# test cases in CI. We may to implement our own version of eltype to avoid this.
+    is_field_broadcasted(entry) && eltype(entry) <: BandMatrixRow
 
 is_diagonal_matrix_entry(::ScalingFieldMatrixEntry) = true
 is_diagonal_matrix_entry(entry) = eltype(entry) <: DiagonalMatrixRow
@@ -276,7 +274,10 @@ function get_internal_entry(
     else # fallback to broadcasted indexing on each element, currently no support for view_of_blocks
         return Base.broadcasted(entry) do matrix_row
             map(matrix_row) do matrix_row_entry
-                get_internal_entry(matrix_row_entry, name_pair)
+                get_internal_entry(
+                    drop_auto_broadcasters(matrix_row_entry),
+                    name_pair,
+                )
             end
         end
     end
@@ -546,7 +547,7 @@ e³ = Geometry.Covariant3Vector(1)
 ᶜᶠmat2 = fill(BidiagonalMatrixRow(4.3, 1.7), center_space)
 ᶜᶜmat3_uₕ_scalar = ᶜᶜmat3 .* (e¹²,)
 ρχ_unit = (;ρq_liq = 1.0, ρq_ice = 1.0)
-ᶜᶠmat2_ρχ_u₃ = map(Base.Fix1(map, Base.Fix2(⊠, ρχ_unit ⊠ e₃')), ᶜᶠmat2)
+ᶜᶠmat2_ρχ_u₃ = map(Base.Fix1(map, Base.Fix2(*, ρχ_unit * e₃')), ᶜᶠmat2)
 
 A = MatrixFields.FieldMatrix(
     (@name(c.ρχ), @name(f.u₃)) => ᶜᶠmat2_ρχ_u₃,
@@ -714,6 +715,7 @@ const SingleValueStyle =
 
 const SingleValueStyleType = Union{
     Number,
+    Ref{<:Geometry.SingleValue}, # `<:` needed: Ref is invariant, unlike Tuple
     Tuple{Geometry.SingleValue},
     Base.Broadcast.Broadcasted{<:SingleValueStyle},
 }
diff --git a/src/MatrixFields/field_name_set.jl b/src/MatrixFields/field_name_set.jl
index 677d2e09b1..f2ec6b62a3 100644
--- a/src/MatrixFields/field_name_set.jl
+++ b/src/MatrixFields/field_name_set.jl
@@ -188,7 +188,7 @@ end
 
 #=
 There are four cases that we need to support in order to be compatible with
-RecursiveApply (i.e., with rmul):
+generic data types:
 1. (_, name) * name or
    (_, name) * (name, _)
 2. (_, name_child) * name      -> (_, name_child) * name_child or
diff --git a/src/MatrixFields/lazy_operators.jl b/src/MatrixFields/lazy_operators.jl
index 050eb39e3a..cef0c89376 100644
--- a/src/MatrixFields/lazy_operators.jl
+++ b/src/MatrixFields/lazy_operators.jl
@@ -59,15 +59,9 @@ replace_lazy_operators(space, bc::LazyOperatorBroadcasted) =
     bc.f isa AbstractLazyOperator ? replace_lazy_operator(space, bc.f) :
     Base.Broadcast.broadcasted(
         bc.f,
-        replace_lazy_operators_args(space, bc.args...)...,
+        unrolled_map(Base.Fix1(replace_lazy_operators, space), bc.args)...,
     )
 
-replace_lazy_operators_args(_) = ()
-replace_lazy_operators_args(space, arg, args...) = (
-    replace_lazy_operators(space, arg),
-    replace_lazy_operators_args(space, args...)...,
-)
-
 """
     replace_lazy_operator(space, lazy_op)
 
@@ -83,11 +77,8 @@ replace_lazy_operator(_, ::AbstractLazyOperator) =
 
 largest_space(_) = nothing
 largest_space(field::Fields.Field) = axes(field)
-largest_space(bc::Base.AbstractBroadcasted) = largest_space_args(bc.args...)
-
-largest_space_args() = nothing
-largest_space_args(arg, args...) =
-    larger_space(largest_space(arg), largest_space_args(args...))
+largest_space(bc::Base.AbstractBroadcasted) =
+    unrolled_reduce(larger_space, unrolled_map(largest_space, bc.args); init = nothing)
 
 larger_space(::Nothing, ::Nothing) = nothing
 larger_space(space1, ::Nothing) = space1
diff --git a/src/MatrixFields/matrix_multiplication.jl b/src/MatrixFields/matrix_multiplication.jl
index a82b70e01e..61045668b7 100644
--- a/src/MatrixFields/matrix_multiplication.jl
+++ b/src/MatrixFields/matrix_multiplication.jl
@@ -207,15 +207,6 @@ Operators.strip_space(op::MultiplyColumnwiseBandMatrixField, _) = op
 struct TopLeftMatrixCorner <: Operators.AbstractBoundaryCondition end
 struct BottomRightMatrixCorner <: Operators.AbstractBoundaryCondition end
 
-Operators.has_boundary(
-    ::MultiplyColumnwiseBandMatrixField,
-    ::Operators.LeftBoundaryWindow{name},
-) where {name} = true
-Operators.has_boundary(
-    ::MultiplyColumnwiseBandMatrixField,
-    ::Operators.RightBoundaryWindow{name},
-) where {name} = true
-
 Operators.get_boundary(
     ::MultiplyColumnwiseBandMatrixField,
     ::Operators.LeftBoundaryWindow{name},
@@ -300,42 +291,11 @@ function Operators.return_eltype(
         ld1, ud1 = outer_diagonals(et_mat1)
         ld2, ud2 = outer_diagonals(et_arg)
         prod_ld, prod_ud = ld1 + ld2, ud1 + ud2
-        prod_value_type = rmul_return_type(eltype(et_mat1), eltype(et_arg))
-        return band_matrix_row_type(prod_ld, prod_ud, prod_value_type)
-    else # matrix-vector multiplication
-        vector = arg
-        return rmul_return_type(eltype(et_mat1), et_arg)
-    end
-end
-
-function Operators.return_eltype(
-    ::MultiplyColumnwiseBandMatrixField,
-    matrix1,
-    arg,
-    ::Type{LG},
-) where {LG}
-    et_mat1 = eltype(matrix1)
-    et_arg = eltype(arg)
-    et_mat1 <: BandMatrixRow || error(
-        "The first argument of MultiplyColumnwiseBandMatrixField must have
-         elements of type BandMatrixRow, but the given argument has $et_mat1",
-    )
-    if et_arg <: BandMatrixRow # matrix-matrix multiplication
-        matrix2 = arg
-        ld1, ud1 = outer_diagonals(et_mat1)
-        ld2, ud2 = outer_diagonals(et_arg)
-        prod_ld, prod_ud = ld1 + ld2, ud1 + ud2
-        prod_value_type = Base.promote_op(
-            rmul_with_projection,
-            eltype(et_mat1),
-            eltype(et_arg),
-            LG,
-        )
+        prod_value_type = mul_return_type(eltype(et_mat1), eltype(et_arg))
         return band_matrix_row_type(prod_ld, prod_ud, prod_value_type)
     else # matrix-vector multiplication
         vector = arg
-        prod_value_type =
-            Base.promote_op(rmul_with_projection, eltype(et_mat1), et_arg, LG)
+        return mul_return_type(eltype(et_mat1), et_arg)
     end
 end
 
@@ -369,13 +329,11 @@ function multiply_matrix_at_index(
     bc,
     ::Type{T},
 ) where {T <: BandMatrixRow}
-    # T = eltype(arg)
     lg = Geometry.LocalGeometry(space, idx, hidx)
     prod_type = Operators.return_eltype(
         MultiplyColumnwiseBandMatrixField(),
         matrix1,
         arg,
-        typeof(lg),
     )
 
     column_space1 = column_axes(matrix1, space)
@@ -415,7 +373,7 @@ function multiply_matrix_at_index(
 
     # Precompute the zero value to avoid inference issues caused by passing
     # prod_type into the function closure below.
-    zero_value = rzero(eltype(prod_type))
+    zero_value = zero(eltype(prod_type))
 
     # Compute the entries of the product matrix row. To avoid inference
     # issues at boundary points, this is implemented as a padded map from
@@ -435,10 +393,7 @@ function multiply_matrix_at_index(
                 value1 = matrix1_row[d]
                 value2 = matrix2_rows_wrapper[d][prod_d - d]
                 value2_lg = Geometry.LocalGeometry(space, idx + d, hidx)
-                prod_entry = radd(
-                    prod_entry,
-                    rmul_with_projection(value1, value2, value2_lg),
-                )
+                prod_entry += mul_with_projection(value1, value2, value2_lg)
             end # Using a for-loop is currently faster than using mapreduce.
             prod_entry
         else
@@ -457,13 +412,11 @@ function multiply_matrix_at_index(
     bc,
     ::Type{T},
 ) where {T}
-    # T = eltype(arg)
     lg = Geometry.LocalGeometry(space, idx, hidx)
     prod_type = Operators.return_eltype(
         MultiplyColumnwiseBandMatrixField(),
         matrix1,
         arg,
-        typeof(lg),
     )
 
     column_space1 = column_axes(matrix1, space)
@@ -476,13 +429,12 @@ function multiply_matrix_at_index(
     matrix1_row = @inbounds Operators.getidx(space, matrix1, idx, hidx)
 
     vector = arg
-    prod_value = rzero(prod_type)
+    prod_value = zero(prod_type)
     @inbounds for d in boundary_modified_ld1:boundary_modified_ud1
         value1 = matrix1_row[d]
         value2 = Operators.getidx(space, vector, idx + d, hidx)
         value2_lg = Geometry.LocalGeometry(space, idx + d, hidx)
-        prod_value =
-            radd(prod_value, rmul_with_projection(value1, value2, value2_lg))
+        prod_value += mul_with_projection(value1, value2, value2_lg)
     end # Using a for-loop is currently faster than using mapreduce.
     return prod_value
 end
diff --git a/src/MatrixFields/multiple_field_solver.jl b/src/MatrixFields/multiple_field_solver.jl
index 926de822f1..c065d83cde 100644
--- a/src/MatrixFields/multiple_field_solver.jl
+++ b/src/MatrixFields/multiple_field_solver.jl
@@ -1,23 +1,12 @@
-# TODO: Can different A's be different matrix styles?
-#       if so, how can we handle fuse/parallelize?
-
-# First, dispatch based on the first x and the device:
 function multiple_field_solve!(cache, x, A, b)
-    name1 = first(matrix_row_keys(keys(A)))
-    x1 = x[name1]
-    multiple_field_solve!(ClimaComms.device(axes(x1)), cache, x, A, b, x1)
+    x1 = first(values(x))
+    x_bc = FieldNameDict(keys(x), unrolled_map(Base.broadcastable, values(x)))
+    b_bc = FieldNameDict(keys(b), unrolled_map(Base.broadcastable, values(b)))
+    multiple_field_solve!(ClimaComms.device(axes(x1)), cache, x_bc, A, b_bc)
 end
 
 # TODO: fuse/parallelize
-function multiple_field_solve!(
-    ::ClimaComms.AbstractCPUDevice,
-    cache,
-    x,
-    A,
-    b,
-    x1,
-)
+multiple_field_solve!(::ClimaComms.AbstractCPUDevice, cache, x, A, b) =
     foreach(matrix_row_keys(keys(A))) do name
         single_field_solve!(cache[name], x[name], A[name, name], b[name])
     end
-end
diff --git a/src/MatrixFields/operator_matrices.jl b/src/MatrixFields/operator_matrices.jl
index 59ae6dbeba..5a332f4aae 100644
--- a/src/MatrixFields/operator_matrices.jl
+++ b/src/MatrixFields/operator_matrices.jl
@@ -225,15 +225,6 @@ operator_matrix(::O) where {O <: Operators.AbstractOperator} =
 
 ################################################################################
 
-Operators.has_boundary(
-    op_matrix::FDOperatorMatrix,
-    lbw::Operators.LeftBoundaryWindow{name},
-) where {name} = Operators.has_boundary(op_matrix.op, lbw)
-Operators.has_boundary(
-    op_matrix::FDOperatorMatrix,
-    rbw::Operators.RightBoundaryWindow{name},
-) where {name} = Operators.has_boundary(op_matrix.op, rbw)
-
 Operators.get_boundary(
     op_matrix::FDOperatorMatrix,
     lbw::Operators.LeftBoundaryWindow{name},
@@ -321,6 +312,25 @@ op_matrix_first_row(op, bc, space, idx, hidx, args...) =
 op_matrix_last_row(op, bc, space, idx, hidx, args...) =
     op_matrix_last_row(op, bc, Spaces.undertype(space))
 
+# Fallback methods for unspecified boundary conditions (need to use zero here
+# instead of NaN to avoid polluting nearby interior rows with NaNs)
+Operators.stencil_left_boundary(
+    op_matrix::FDOperatorMatrix,
+    ::Operators.NullBoundaryCondition,
+    space,
+    _,
+    _,
+    args...,
+) = zero(Operators.return_eltype(op_matrix, args...))
+Operators.stencil_right_boundary(
+    op_matrix::FDOperatorMatrix,
+    ::Operators.NullBoundaryCondition,
+    space,
+    _,
+    _,
+    args...,
+) = zero(Operators.return_eltype(op_matrix, args...))
+
 ################################################################################
 
 # Additional aliases for CenterToFace or FaceToCenter matrix rows
@@ -438,8 +448,8 @@ Base.@propagate_inbounds function op_matrix_interior_row(
 )
     w⁻ = Operators.getidx(space, weight, idx - half, hidx)
     w⁺ = Operators.getidx(space, weight, idx + half, hidx)
-    denominator = radd(w⁻, w⁺)
-    return BidiagonalMatrixRow(rdiv(w⁻, denominator), rdiv(w⁺, denominator))
+    denominator = w⁻ + w⁺
+    return BidiagonalMatrixRow(w⁻ / denominator, w⁺ / denominator)
 end
 op_matrix_first_row(
     ::Operators.WeightedInterpolateC2F,
diff --git a/src/MatrixFields/single_field_solver.jl b/src/MatrixFields/single_field_solver.jl
index 74fb45053d..0ed4d92d1d 100644
--- a/src/MatrixFields/single_field_solver.jl
+++ b/src/MatrixFields/single_field_solver.jl
@@ -12,19 +12,12 @@ inv_return_type(::Type{X}) where {T, X <: Geometry.Axis2TensorOrAdj{T}} =
         Tuple{dual_type(Geometry.axis2(X)), dual_type(Geometry.axis1(X))},
     )
 
-x_eltype(A::ScalingFieldMatrixEntry, b) = x_eltype(eltype(A), eltype(b))
+x_eltype(A::ScalingFieldMatrixEntry, b) =
+    x_type(eltype(A), eltype(Base.broadcastable(b)))
 x_eltype(A::ColumnwiseBandMatrixField, b) =
-    x_eltype(eltype(eltype(A)), eltype(b))
-x_eltype(::Type{T_A}, ::Type{T_b}) where {T_A, T_b} =
-    rmul_return_type(inv_return_type(T_A), T_b)
-# Base.promote_op(rmul_with_projection, inv_return_type(T_A), T_b, LG)
-
-unit_eltype(A::ScalingFieldMatrixEntry) = eltype(A)
-unit_eltype(A::ColumnwiseBandMatrixField) =
-    unit_eltype(eltype(eltype(A)), local_geometry_type(A))
-unit_eltype(::Type{T_A}, ::Type{LG}) where {T_A, LG} =
-    rmul_return_type(inv_return_type(T_A), T_A)
-# Base.promote_op(rmul_with_projection, inv_return_type(T_A), T_A, LG)
+    x_type(eltype(eltype(A)), eltype(Base.broadcastable(b)))
+x_type(::Type{T_A}, ::Type{T_b}) where {T_A, T_b} =
+    mul_return_type(inv_return_type(T_A), T_b)
 
 ################################################################################
 
@@ -43,32 +36,22 @@ end
 single_field_solver_cache(::ScalingFieldMatrixEntry, b) = similar(b, Tuple{})
 function single_field_solver_cache(A::ColumnwiseBandMatrixField, b)
     ud = outer_diagonals(eltype(A))[2]
-    cache_eltype =
-        ud == 0 ? Tuple{} :
-        Tuple{x_eltype(A, b), ntuple(_ -> unit_eltype(A), Val(ud))...}
-    return similar(b, cache_eltype)
+    ud == 0 && return similar(b, Tuple{})
+    T_U = mul_return_type(inv_return_type(eltype(eltype(A))), eltype(eltype(A)))
+    return similar(b, Tuple{x_eltype(A, b), ntuple(Returns(T_U), Val(ud))...})
 end
 
-function single_field_solve_diag_matrix_row!(
-    cache,
-    x,
-    A::ColumnwiseBandMatrixField,
-    b,
-)
-    # Use fields here, and not field values, so that this operation is
-    # mask-aware.
-    A₀ = A.entries.:1
-    @. x = inv(A₀) ⊠ b
-end
 single_field_solve!(_, x, A::ScalingFieldMatrixEntry, b) =
     x .= (inv(scaling_value(A)),) .* b
-function single_field_solve!(cache, x, A::ColumnwiseBandMatrixField, b)
+single_field_solve!(cache, x, A::ColumnwiseBandMatrixField, b) =
     if eltype(A) <: MatrixFields.DiagonalMatrixRow
-        single_field_solve_diag_matrix_row!(cache, x, A, b)
+        A₀ = A.entries.:1
+        @. x = inv(A₀) * b
     else
-        single_field_solve!(ClimaComms.device(axes(A)), cache, x, A, b)
+        x_bc = Base.broadcastable(x)
+        b_bc = Base.broadcastable(b)
+        single_field_solve!(ClimaComms.device(axes(A)), cache, x_bc, A, b_bc)
     end
-end
 
 single_field_solve!(::ClimaComms.AbstractCPUDevice, cache, x, A, b) =
     _single_field_solve!(ClimaComms.device(axes(A)), cache, x, A, b)
@@ -85,13 +68,12 @@ function _single_field_solve!(
     mask = Spaces.get_mask(space)
     if space isa Spaces.FiniteDifferenceSpace
         @assert mask isa DataLayouts.NoMask
-        _single_field_solve_col!(device, cache, x, A, b)
+        single_field_solve_col!(cache, x, A, b)
     else
         Fields.bycolumn(space) do colidx
             I = Fields.universal_index(colidx)
             if DataLayouts.should_compute(mask, I)
-                _single_field_solve_col!(
-                    device,
+                single_field_solve_col!(
                     cache[colidx],
                     x[colidx],
                     A[colidx],
@@ -102,28 +84,15 @@ function _single_field_solve!(
     end
 end
 
-function _single_field_solve_col!(
-    ::ClimaComms.AbstractCPUDevice,
-    cache,
-    x,
-    A,
-    b,
-)
-    if A isa Fields.ColumnField
-        band_matrix_solve!(
-            eltype(A),
-            unzip_tuple_field_values(Fields.field_values(cache)),
-            Fields.field_values(x),
-            unzip_tuple_field_values(Fields.field_values(A.entries)),
-            Fields.field_values(b),
-            vindex,
-        )
-    elseif A isa ScalingFieldMatrixEntry
-        x .= (inv(scaling_value(A)),) .* b
-    else
-        error("uncaught case")
-    end
-end
+single_field_solve_col!(cache, x, A, b) =
+    band_matrix_solve!(
+        eltype(A),
+        unzip_tuple_field_values(Fields.field_values(cache)),
+        Fields.field_values(x),
+        unzip_tuple_field_values(Fields.field_values(A.entries)),
+        Fields.field_values(b),
+        vindex,
+    )
 
 unzip_tuple_field_values(data) =
     ntuple(i -> data.:($i), Val(length(propertynames(data))))
@@ -132,7 +101,7 @@ function band_matrix_solve!(::Type{<:DiagonalMatrixRow}, _, x, Aⱼs, b, vi)
     (A₀,) = Aⱼs
     n = length(x)
     @inbounds for i in 1:n
-        x[vi(i)] = inv(A₀[vi(i)]) ⊠ b[vi(i)]
+        x[vi(i)] = inv(A₀[vi(i)]) * b[vi(i)]
     end
 end
 
@@ -163,18 +132,18 @@ function band_matrix_solve!(
     n = length(x)
     @inbounds begin
         inv_D₀ = inv(A₀[vi(1)])
-        U₊₁ᵢ₋₁ = inv_D₀ ⊠ A₊₁[vi(1)]
-        Uxᵢ₋₁ = inv_D₀ ⊠ b[vi(1)]
+        U₊₁ᵢ₋₁ = inv_D₀ * A₊₁[vi(1)]
+        Uxᵢ₋₁ = inv_D₀ * b[vi(1)]
         Ux[vi(1)] = Uxᵢ₋₁
         U₊₁[vi(1)] = U₊₁ᵢ₋₁
 
         for i in 2:n
             A₋₁ᵢ = A₋₁[vi(i)]
-            inv_D₀ = inv(A₀[vi(i)] ⊟ A₋₁ᵢ ⊠ U₊₁ᵢ₋₁)
-            Uxᵢ₋₁ = inv_D₀ ⊠ (b[vi(i)] ⊟ A₋₁ᵢ ⊠ Uxᵢ₋₁)
+            inv_D₀ = inv(A₀[vi(i)] - A₋₁ᵢ * U₊₁ᵢ₋₁)
+            Uxᵢ₋₁ = inv_D₀ * (b[vi(i)] - A₋₁ᵢ * Uxᵢ₋₁)
             Ux[vi(i)] = Uxᵢ₋₁
             if i < n
-                U₊₁ᵢ₋₁ = inv_D₀ ⊠ A₊₁[vi(i)] # U₊₁[n] is outside the matrix.
+                U₊₁ᵢ₋₁ = inv_D₀ * A₊₁[vi(i)] # U₊₁[n] is outside the matrix.
                 U₊₁[vi(i)] = U₊₁ᵢ₋₁
             end
         end
@@ -184,7 +153,7 @@ function band_matrix_solve!(
         i = (n - 1)
         # for i in (n - 1):-1:1
         while i ≥ 1
-            x[vi(i)] = Ux[vi(i)] ⊟ U₊₁[vi(i)] ⊠ x[vi(i + 1)]
+            x[vi(i)] = Ux[vi(i)] - U₊₁[vi(i)] * x[vi(i + 1)]
             i -= 1
         end
     end
@@ -222,36 +191,36 @@ function band_matrix_solve!(
     n = length(x)
     @inbounds begin
         inv_D₀ = inv(A₀[vi(1)])
-        Ux[vi(1)] = inv_D₀ ⊠ b[vi(1)]
-        U₊₁[vi(1)] = inv_D₀ ⊠ A₊₁[vi(1)]
-        U₊₂[vi(1)] = inv_D₀ ⊠ A₊₂[vi(1)]
+        Ux[vi(1)] = inv_D₀ * b[vi(1)]
+        U₊₁[vi(1)] = inv_D₀ * A₊₁[vi(1)]
+        U₊₂[vi(1)] = inv_D₀ * A₊₂[vi(1)]
 
-        inv_D₀ = inv(A₀[vi(2)] ⊟ A₋₁[vi(2)] ⊠ U₊₁[vi(1)])
-        Ux[vi(2)] = inv_D₀ ⊠ (b[vi(2)] ⊟ A₋₁[vi(2)] ⊠ Ux[vi(1)])
-        U₊₁[vi(2)] = inv_D₀ ⊠ (A₊₁[vi(2)] ⊟ A₋₁[vi(2)] ⊠ U₊₂[vi(1)])
-        U₊₂[vi(2)] = inv_D₀ ⊠ A₊₂[vi(2)]
+        inv_D₀ = inv(A₀[vi(2)] - A₋₁[vi(2)] * U₊₁[vi(1)])
+        Ux[vi(2)] = inv_D₀ * (b[vi(2)] - A₋₁[vi(2)] * Ux[vi(1)])
+        U₊₁[vi(2)] = inv_D₀ * (A₊₁[vi(2)] - A₋₁[vi(2)] * U₊₂[vi(1)])
+        U₊₂[vi(2)] = inv_D₀ * A₊₂[vi(2)]
 
         for i in 3:n
-            L₋₁ = A₋₁[vi(i)] ⊟ A₋₂[vi(i)] ⊠ U₊₁[vi(i - 2)]
+            L₋₁ = A₋₁[vi(i)] - A₋₂[vi(i)] * U₊₁[vi(i - 2)]
             inv_D₀ = inv(
-                A₀[vi(i)] ⊟ L₋₁ ⊠ U₊₁[vi(i - 1)] ⊟ A₋₂[vi(i)] ⊠ U₊₂[vi(i - 2)],
+                A₀[vi(i)] - L₋₁ * U₊₁[vi(i - 1)] - A₋₂[vi(i)] * U₊₂[vi(i - 2)],
             )
             Ux[vi(i)] =
-                inv_D₀ ⊠
-                (b[vi(i)] ⊟ L₋₁ ⊠ Ux[vi(i - 1)] ⊟ A₋₂[vi(i)] ⊠ Ux[vi(i - 2)])
-            i < n && (U₊₁[vi(i)] = inv_D₀ ⊠ (A₊₁[vi(i)] ⊟ L₋₁ ⊠ U₊₂[vi(i - 1)]))
-            i < n - 1 && (U₊₂[vi(i)] = inv_D₀ ⊠ A₊₂[vi(i)])
+                inv_D₀ *
+                (b[vi(i)] - L₋₁ * Ux[vi(i - 1)] - A₋₂[vi(i)] * Ux[vi(i - 2)])
+            i < n && (U₊₁[vi(i)] = inv_D₀ * (A₊₁[vi(i)] - L₋₁ * U₊₂[vi(i - 1)]))
+            i < n - 1 && (U₊₂[vi(i)] = inv_D₀ * A₊₂[vi(i)])
         end
 
         x[vi(n)] = Ux[vi(n)]
-        x[vi(n - 1)] = Ux[vi(n - 1)] ⊟ U₊₁[vi(n - 1)] ⊠ x[vi(n)]
+        x[vi(n - 1)] = Ux[vi(n - 1)] - U₊₁[vi(n - 1)] * x[vi(n)]
         # Avoid steprange on GPU: https://cuda.juliagpu.org/stable/tutorials/performance/#Avoiding-StepRange
         # for i in (n - 2):-1:1
         i = (n - 2)
         while i ≥ 1
             x[vi(i)] =
-                Ux[vi(i)] ⊟ U₊₁[vi(i)] ⊠ x[vi(i + 1)] ⊟
-                U₊₂[vi(i)] ⊠ x[vi(i + 2)]
+                Ux[vi(i)] - U₊₁[vi(i)] * x[vi(i + 1)] -
+                U₊₂[vi(i)] * x[vi(i + 2)]
             i -= 1
         end
     end
@@ -266,8 +235,6 @@ eltype(x), eltype(A), and eltype(b):
 - SVector{N}, SMatrix{N, N}, and SVector{N}
 - AxisVector with axis A1, Axis2TensorOrAdj with axes (A2, dual(A1)), and
   AxisVector with axis A2
-- nested type (Tuple or NamedTuple), scalar type (Number, SMatrix, or
-  Axis2TensorOrAdj), nested type (Tuple or NamedTuple)
 
 We might eventually want a single general method for band_matrix_solve!, similar
 to the BLAS.gbsv function. For now, though, the methods above should be enough.
diff --git a/src/Operators/Operators.jl b/src/Operators/Operators.jl
index 375b6b5c73..57ccaed22f 100644
--- a/src/Operators/Operators.jl
+++ b/src/Operators/Operators.jl
@@ -8,6 +8,8 @@ import Base.Broadcast: Broadcasted
 
 import ..slab, ..slab_args, ..column, ..column_args
 import ClimaComms
+import ..Utilities:
+    new, is_auto_broadcastable, add_auto_broadcasters, drop_auto_broadcasters
 import ..DebugOnly: call_post_op_callback, post_op_callback
 import ..DataLayouts: DataLayouts, Data2D, DataSlab2D
 import ..DataLayouts: vindex
@@ -18,8 +20,6 @@ import ..Meshes
 import ..Grids
 import ..Fields: Fields, Field
 
-using ..RecursiveApply
-
 include("common.jl")
 include("spectralelement.jl")
 include("numericalflux.jl")
diff --git a/src/Operators/common.jl b/src/Operators/common.jl
index e3262dbe04..98f76b94f8 100644
--- a/src/Operators/common.jl
+++ b/src/Operators/common.jl
@@ -24,7 +24,6 @@ Base.Broadcast.BroadcastStyle(
     ::Type{<:OperatorBroadcasted{Style}},
 ) where {Style} = Style()
 
-
 # recursively unwrap axes broadcast arguments in a way that is statically reducible by the optimizer
 @inline axes_args(args::Tuple) = unrolled_map(axes, args)
 
@@ -38,17 +37,14 @@ function Base.axes(opbc::OperatorBroadcasted)
         opbc.axes
     end
 end
-function Base.similar(opbc::OperatorBroadcasted, ::Type{Eltype}) where {Eltype}
-    space = axes(opbc)
-    return Field(Eltype, space)
-end
-function Base.copy(opbc::OperatorBroadcasted)
-    # figure out return type
-    dest = similar(opbc, eltype(opbc))
-    # allocate dest
-    copyto!(dest, opbc)
-end
 Base.Broadcast.broadcastable(opbc::OperatorBroadcasted) = opbc
+Base.copy(opbc::OperatorBroadcasted) = copyto!(similar(opbc), opbc)
+Base.similar(opbc::OperatorBroadcasted, ::Type{Eltype}) where {Eltype} =
+    Field(Eltype, axes(opbc))
+
+# Define similar to match DataStyle and AbstractFieldStyle broadcasting
+Base.similar(opbc::OperatorBroadcasted) =
+    similar(opbc, drop_auto_broadcasters(eltype(opbc)))
 
 function Base.Broadcast.materialize(opbc::OperatorBroadcasted)
     copy(Base.Broadcast.instantiate(opbc))
diff --git a/src/Operators/finitedifference.jl b/src/Operators/finitedifference.jl
index 3b5a846be3..7b1a321c31 100644
--- a/src/Operators/finitedifference.jl
+++ b/src/Operators/finitedifference.jl
@@ -167,8 +167,6 @@ struct RightBoundaryWindow{name} <: BoundaryWindow end
 
 An abstract type for finite difference operators. Instances of this should define:
 
-- [`getidx_return_type`](@ref)
-- [`stencil_return_type`](@ref)
 - [`return_eltype`](@ref)
 - [`return_space`](@ref)
 - [`stencil_interior_width`](@ref)
@@ -180,18 +178,6 @@ abstract type FiniteDifferenceOperator <: AbstractOperator end
 
 return_eltype(::FiniteDifferenceOperator, arg) = eltype(arg)
 
-"""
-    getidx_return_type(::Base.Broadcasted)
-    getidx_return_type(::StencilBroadcasted)
-    getidx_return_type(::Field)
-    getidx_return_type(::Any)
-    ...
-
-The return type of `getidx` on the arguemnt.
-Defaults to the type of the argument.
-"""
-function getidx_return_type end
-
 # boundary width error fallback
 @noinline invalid_boundary_condition_error(op_type::Type, bc_type::Type) =
     error("Boundary `$bc_type` is not supported for operator `$op_type`")
@@ -223,18 +209,6 @@ get_boundary(
     ::RightBoundaryWindow{name},
 ) where {name} = get_boundary(op.bcs, name)
 
-has_boundary(
-    op::FiniteDifferenceOperator,
-    ::LeftBoundaryWindow{name},
-) where {name} = hasfield(typeof(op.bcs), name)
-
-has_boundary(
-    op::FiniteDifferenceOperator,
-    ::RightBoundaryWindow{name},
-) where {name} = hasfield(typeof(op.bcs), name)
-
-has_boundary(op::FiniteDifferenceOperator, ::Interior) = false
-
 strip_space(op::FiniteDifferenceOperator, parent_space) =
     unionall_type(typeof(op))(
         NamedTuple{keys(op.bcs)}(
@@ -352,14 +326,6 @@ Defines the stencil of the operator `Op` in the interior of the domain at `idx`;
 """
 function stencil_interior end
 
-"""
-    stencil_return_type(::Op, args...)
-
-The return type of the given stencil and arguments.
-"""
-function stencil_return_type end
-
-
 """
     boundary_width(::Op, ::BC, args...)
 
@@ -371,30 +337,29 @@ defined for a specific `Op`/`BC` combination.
 function boundary_width end
 
 """
-    stencil_left_boundary(::Op, ::BC, idx, args...)
+    stencil_left_boundary(op, bc, space, idx, hidx, args...)
 
-Defines the stencil of operator `Op` at `idx` near the left boundary, with boundary condition `BC`.
+The result of stencil operator `op` at horizontal index `hidx` and some vertical
+index `idx` near the left boundary, with boundary condition `bc`. For operators
+that cannot be evaluated without a boundary condition, using the
+`NullBoundaryCondition` will always generate `NaN` values.
 """
-function stencil_left_boundary end
+stencil_left_boundary(op, ::NullBoundaryCondition, space, _, _, args...) =
+    new(return_eltype(op, args...)) * Spaces.undertype(space)(NaN)
 
 """
-    stencil_right_boundary(::Op, ::BC, idx, args...)
+    stencil_right_boundary(op, bc, space, idx, hidx, args...)
 
-Defines the stencil of operator `Op` at `idx` near the right boundary, with boundary condition `BC`.
+The result of stencil operator `op` at horizontal index `hidx` and some vertical
+index `idx` near the right boundary, with boundary condition `bc`. For operators
+that cannot be evaluated without a boundary condition, using the
+`NullBoundaryCondition` will always generate `NaN` values.
 """
-function stencil_right_boundary end
-
+stencil_right_boundary(op, ::NullBoundaryCondition, space, _, _, args...) =
+    new(return_eltype(op, args...)) * Spaces.undertype(space)(NaN)
 
 abstract type InterpolationOperator <: FiniteDifferenceOperator end
 
-# single argument interpolation must be the return type of getidx on the
-# argument, which should be cheaper / simpler than return_eltype(op, args...)
-@inline stencil_return_type(::InterpolationOperator, arg) =
-    getidx_return_type(arg)
-
-@inline stencil_return_type(op::FiniteDifferenceOperator, args...) =
-    return_eltype(op, args...)
-
 function assert_no_bcs(op, kwargs)
     length(kwargs) == 0 && return nothing
     error("InterpolateF2C does not accept boundary conditions.")
@@ -435,7 +400,7 @@ Base.@propagate_inbounds function stencil_interior(
 )
     a⁺ = getidx(space, arg, idx + half, hidx)
     a⁻ = getidx(space, arg, idx - half, hidx)
-    RecursiveApply.rdiv(a⁺ ⊞ a⁻, 2)
+    (a⁺ + a⁻) / 2
 end
 
 boundary_width(::InterpolateF2C, ::AbstractBoundaryCondition) = 0
@@ -494,7 +459,7 @@ Base.@propagate_inbounds function stencil_interior(
 )
     a⁺ = getidx(space, arg, idx + half, hidx)
     a⁻ = getidx(space, arg, idx - half, hidx)
-    RecursiveApply.rdiv(a⁺ ⊞ a⁻, 2)
+    (a⁺ + a⁻) / 2
 end
 boundary_width(::InterpolateC2F, ::AbstractBoundaryCondition) = 1
 
@@ -535,7 +500,7 @@ Base.@propagate_inbounds function stencil_left_boundary(
         getidx(space, bc.val, nothing, hidx),
         Geometry.LocalGeometry(space, idx, hidx),
     )
-    a⁺ ⊟ RecursiveApply.rdiv(v₃, 2)
+    a⁺ - v₃ / 2
 end
 Base.@propagate_inbounds function stencil_right_boundary(
     ::InterpolateC2F,
@@ -551,7 +516,7 @@ Base.@propagate_inbounds function stencil_right_boundary(
         getidx(space, bc.val, nothing, hidx),
         Geometry.LocalGeometry(space, idx, hidx),
     )
-    a⁻ ⊞ RecursiveApply.rdiv(v₃, 2)
+    a⁻ + v₃ / 2
 end
 
 Base.@propagate_inbounds function stencil_left_boundary(
@@ -1071,7 +1036,7 @@ end
 
 abstract type WeightedInterpolationOperator <: InterpolationOperator end
 # TODO: this is not in general correct and the return type
-# should be based on the component operator types (rdiv, rmul) but we don't have a good way
+# should be based on the component operator types (/, *) but we don't have a good way
 # of creating ex. one(field_type) for complex fields for inference
 return_eltype(::WeightedInterpolationOperator, weights, arg) = eltype(arg)
 
@@ -1121,7 +1086,7 @@ Base.@propagate_inbounds function stencil_interior(
     w⁻ = getidx(space, weight, idx - half, hidx)
     a⁺ = getidx(space, arg, idx + half, hidx)
     a⁻ = getidx(space, arg, idx - half, hidx)
-    RecursiveApply.rdiv((w⁺ ⊠ a⁺) ⊞ (w⁻ ⊠ a⁻), (w⁺ ⊞ w⁻))
+    (w⁺ * a⁺ + w⁻ * a⁻) / (w⁺ + w⁻)
 end
 
 boundary_width(::WeightedInterpolateF2C, ::AbstractBoundaryCondition) = 0
@@ -1181,7 +1146,7 @@ Base.@propagate_inbounds function stencil_interior(
     w⁻ = getidx(space, weight, idx - half, hidx)
     a⁺ = getidx(space, arg, idx + half, hidx)
     a⁻ = getidx(space, arg, idx - half, hidx)
-    RecursiveApply.rdiv((w⁺ ⊠ a⁺) ⊞ (w⁻ ⊠ a⁻), (w⁺ ⊞ w⁻))
+    (w⁺ * a⁺ + w⁻ * a⁻) / (w⁺ + w⁻)
 end
 
 boundary_width(::WeightedInterpolateC2F, ::AbstractBoundaryCondition) = 1
@@ -1225,7 +1190,7 @@ Base.@propagate_inbounds function stencil_left_boundary(
         getidx(space, bc.val, nothing, hidx),
         Geometry.LocalGeometry(space, idx, hidx),
     )
-    a⁺ ⊟ RecursiveApply.rdiv(v₃, 2)
+    a⁺ - v₃ / 2
 end
 Base.@propagate_inbounds function stencil_right_boundary(
     ::WeightedInterpolateC2F,
@@ -1242,7 +1207,7 @@ Base.@propagate_inbounds function stencil_right_boundary(
         getidx(space, bc.val, nothing, hidx),
         Geometry.LocalGeometry(space, idx, hidx),
     )
-    a⁻ ⊞ RecursiveApply.rdiv(v₃, 2)
+    a⁻ + v₃ / 2
 end
 
 Base.@propagate_inbounds function stencil_left_boundary(
@@ -1331,13 +1296,7 @@ return_space(
     arg_space::AllCenterFiniteDifferenceSpace,
 ) = velocity_space
 
-function upwind_biased_product(v, a⁻, a⁺)
-    RecursiveApply.rdiv(
-        ((v ⊞ RecursiveApply.rmap(abs, v)) ⊠ a⁻) ⊞
-        ((v ⊟ RecursiveApply.rmap(abs, v)) ⊠ a⁺),
-        2,
-    )
-end
+upwind_biased_product(v, a⁻, a⁺) = ((v + abs(v)) * a⁻ + (v - abs(v)) * a⁺) / 2
 
 stencil_interior_width(::UpwindBiasedProductC2F, velocity, arg) =
     ((0, 0), (-half, half))
@@ -1504,11 +1463,11 @@ function compute_Δ𝛼_linvanleer(a⁻, a⁰, a⁺, v, dt, ::MonotoneHarmonic)
     if sign(a⁰ - a⁻) == sign(a⁺ - a⁰) && Δ𝜙_avg != 0
         return ((a⁰ - a⁻) * (a⁺ - a⁰)) / (Δ𝜙_avg) * (1 - c)
     else
-        return eltype(v)(0)
+        return zero(v)
     end
 end
 
-posdiff(x, y) = ifelse(x - y ≥ 0, x - y, eltype(x)(0))
+posdiff(x, y) = ifelse(x - y ≥ 0, x - y, zero(x))
 
 function compute_Δ𝛼_linvanleer(a⁻, a⁰, a⁺, v, dt, ::PositiveDefinite)
     Δ𝜙_avg = ((a⁰ - a⁻) + (a⁺ - a⁰)) / 2
@@ -1529,11 +1488,11 @@ function slope_limited_product(v, a⁻, a⁻⁻, a⁺, a⁺⁺, dt, constraint)
     if v >= 0
         # Eqn (2,5a,5b,5c)
         Δ𝛼 = compute_Δ𝛼_linvanleer(a⁻⁻, a⁻, a⁺, v, dt, constraint)
-        return v ⊠ (a⁻ ⊞ RecursiveApply.rdiv(Δ𝛼, 2))
+        return v * (a⁻ + Δ𝛼 / 2)
     else
         # Eqn (2,5a,5b,5c)
         Δ𝛼 = compute_Δ𝛼_linvanleer(a⁻, a⁺, a⁺⁺, v, dt, constraint)
-        return v ⊠ (a⁺ ⊟ RecursiveApply.rdiv(Δ𝛼, 2))
+        return v * (a⁺ - Δ𝛼 / 2)
     end
 end
 
@@ -1699,13 +1658,11 @@ return_space(
     arg_space::AllCenterFiniteDifferenceSpace,
 ) = velocity_space
 
-function upwind_3rdorder_biased_product(v, a⁻, a⁻⁻, a⁺, a⁺⁺)
-    RecursiveApply.rdiv(
-        (v ⊠ (7 ⊠ (a⁺ + a⁻) ⊟ (a⁺⁺ + a⁻⁻))) ⊟
-        (RecursiveApply.rmap(abs, v) ⊠ (3 ⊠ (a⁺ - a⁻) ⊟ (a⁺⁺ - a⁻⁻))),
-        12,
-    )
-end
+upwind_3rdorder_biased_product(v, a⁻, a⁻⁻, a⁺, a⁺⁺) =
+    (
+        v * (7 * (a⁺ + a⁻) - (a⁺⁺ + a⁻⁻)) -
+        abs(v) * (3 * (a⁺ - a⁻) - (a⁺⁺ - a⁻⁻))
+    ) / 12
 
 stencil_interior_width(::Upwind3rdOrderBiasedProductC2F, velocity, arg) =
     ((0, 0), (-half - 1, half + 1))
@@ -1864,33 +1821,13 @@ return_space(
     arg_space::AllCenterFiniteDifferenceSpace,
 ) = velocity_space
 
-function fct_boris_book(v, a⁻⁻, a⁻, a⁺, a⁺⁺)
-    if v != zero(eltype(v))
-        sign(v) ⊠ (RecursiveApply.rmap(
-            max,
-            zero(eltype(v)),
-            RecursiveApply.rmap(
-                min,
-                RecursiveApply.rmap(abs, v),
-                RecursiveApply.rmap(
-                    min,
-                    sign(v) ⊠ (a⁺⁺ - a⁺),
-                    sign(v) ⊠ (a⁻ - a⁻⁻),
-                ),
-            ),
-        ))
-    else
-        RecursiveApply.rmap(
-            max,
-            zero(eltype(v)),
-            RecursiveApply.rmap(
-                min,
-                v,
-                RecursiveApply.rmap(min, (a⁺⁺ - a⁺), (a⁻ - a⁻⁻)),
-            ),
-        )
-    end
-end
+fct_boris_book(v, a⁻⁻, a⁻, a⁺, a⁺⁺) =
+    ifelse(
+        iszero(v),
+        max(v, min(v, a⁺⁺ - a⁺, a⁻ - a⁻⁻)),
+        sign(v) *
+        max(zero(v), min(abs(v), sign(v) * (a⁺⁺ - a⁺), sign(v) * (a⁻ - a⁻⁻))),
+    )
 
 stencil_interior_width(::FCTBorisBook, velocity, arg) =
     ((0, 0), (-half - 1, half + 1))
@@ -2003,63 +1940,6 @@ return_space(
     Φᵗᵈ_space::AllCenterFiniteDifferenceSpace,
 ) = A_space
 
-function fct_zalesak(
-    Aⱼ₋₁₂,
-    Aⱼ₊₁₂,
-    Aⱼ₊₃₂,
-    ϕⱼ₋₁,
-    ϕⱼ,
-    ϕⱼ₊₁,
-    ϕⱼ₊₂,
-    ϕⱼ₋₁ᵗᵈ,
-    ϕⱼᵗᵈ,
-    ϕⱼ₊₁ᵗᵈ,
-    ϕⱼ₊₂ᵗᵈ,
-)
-    # 1/dt is in ϕⱼ₋₁, ϕⱼ, ϕⱼ₊₁, ϕⱼ₊₂, ϕⱼ₋₁ᵗᵈ, ϕⱼᵗᵈ, ϕⱼ₊₁ᵗᵈ, ϕⱼ₊₂ᵗᵈ
-
-    stable_zero = zero(eltype(Aⱼ₊₁₂))
-    stable_one = one(eltype(Aⱼ₊₁₂))
-
-    # 𝒮5.4.2 (1)  Durran (5.32)  Zalesak's cosmetic correction
-    # which is usually omitted but used in Durran's textbook
-    # implementation of the flux corrected transport method.
-    # (Textbook suggests mixed results in 3 reported scenarios)
-    if (
-        Aⱼ₊₁₂ * (ϕⱼ₊₁ᵗᵈ - ϕⱼᵗᵈ) < stable_zero && (
-            Aⱼ₊₁₂ * (ϕⱼ₊₂ᵗᵈ - ϕⱼ₊₁ᵗᵈ) < stable_zero ||
-            Aⱼ₊₁₂ * (ϕⱼᵗᵈ - ϕⱼ₋₁ᵗᵈ) < stable_zero
-        )
-    )
-        Aⱼ₊₁₂ = stable_zero
-    end
-
-    # 𝒮5.4.2 (2)
-    # If flow is nondivergent, ϕᵗᵈ are not needed in the formulae below
-    ϕⱼᵐᵃˣ = max(ϕⱼ₋₁, ϕⱼ, ϕⱼ₊₁, ϕⱼ₋₁ᵗᵈ, ϕⱼᵗᵈ, ϕⱼ₊₁ᵗᵈ)
-    ϕⱼᵐⁱⁿ = min(ϕⱼ₋₁, ϕⱼ, ϕⱼ₊₁, ϕⱼ₋₁ᵗᵈ, ϕⱼᵗᵈ, ϕⱼ₊₁ᵗᵈ)
-    Pⱼ⁺ = max(stable_zero, Aⱼ₋₁₂) - min(stable_zero, Aⱼ₊₁₂)
-    # Zalesak also requires, in equation (5.33) Δx/Δt, which for the
-    # reference element we may assume Δζ = 1 between interfaces
-    Qⱼ⁺ = (ϕⱼᵐᵃˣ - ϕⱼᵗᵈ)
-    Rⱼ⁺ = (Pⱼ⁺ > stable_zero ? min(stable_one, Qⱼ⁺ / Pⱼ⁺) : stable_zero)
-    Pⱼ⁻ = max(stable_zero, Aⱼ₊₁₂) - min(stable_zero, Aⱼ₋₁₂)
-    Qⱼ⁻ = (ϕⱼᵗᵈ - ϕⱼᵐⁱⁿ)
-    Rⱼ⁻ = (Pⱼ⁻ > stable_zero ? min(stable_one, Qⱼ⁻ / Pⱼ⁻) : stable_zero)
-    ϕⱼ₊₁ᵐᵃˣ = max(ϕⱼ, ϕⱼ₊₁, ϕⱼ₊₂, ϕⱼᵗᵈ, ϕⱼ₊₁ᵗᵈ, ϕⱼ₊₂ᵗᵈ)
-    ϕⱼ₊₁ᵐⁱⁿ = min(ϕⱼ, ϕⱼ₊₁, ϕⱼ₊₂, ϕⱼᵗᵈ, ϕⱼ₊₁ᵗᵈ, ϕⱼ₊₂ᵗᵈ)
-    Pⱼ₊₁⁺ = max(stable_zero, Aⱼ₊₁₂) - min(stable_zero, Aⱼ₊₃₂)
-    Qⱼ₊₁⁺ = (ϕⱼ₊₁ᵐᵃˣ - ϕⱼ₊₁ᵗᵈ)
-    Rⱼ₊₁⁺ = (Pⱼ₊₁⁺ > stable_zero ? min(stable_one, Qⱼ₊₁⁺ / Pⱼ₊₁⁺) : stable_zero)
-    Pⱼ₊₁⁻ = max(stable_zero, Aⱼ₊₃₂) - min(stable_zero, Aⱼ₊₁₂)
-    Qⱼ₊₁⁻ = (ϕⱼ₊₁ᵗᵈ - ϕⱼ₊₁ᵐⁱⁿ)
-    Rⱼ₊₁⁻ = (Pⱼ₊₁⁻ > stable_zero ? min(stable_one, Qⱼ₊₁⁻ / Pⱼ₊₁⁻) : stable_zero)
-
-    Cⱼ₊₁₂ = (Aⱼ₊₁₂ ≥ stable_zero ? min(Rⱼ₊₁⁺, Rⱼ⁻) : min(Rⱼ⁺, Rⱼ₊₁⁻))
-
-    return Cⱼ₊₁₂ * Aⱼ₊₁₂
-end
-
 stencil_interior_width(::FCTZalesak, A_space, Φ_space, Φᵗᵈ_space) =
     ((-1, 1), (-half - 1, half + 1), (-half - 1, half + 1))
 
@@ -2072,45 +1952,57 @@ Base.@propagate_inbounds function stencil_interior(
     Φ_field,
     Φᵗᵈ_field,
 )
-    # cell center variables
-    ϕⱼ₋₁ = getidx(space, Φ_field, idx - half - 1, hidx)
-    ϕⱼ = getidx(space, Φ_field, idx - half, hidx)
-    ϕⱼ₊₁ = getidx(space, Φ_field, idx + half, hidx)
-    ϕⱼ₊₂ = getidx(space, Φ_field, idx + half + 1, hidx)
-    # cell center variables
-    ϕⱼ₋₁ᵗᵈ = getidx(space, Φᵗᵈ_field, idx - half - 1, hidx)
-    ϕⱼᵗᵈ = getidx(space, Φᵗᵈ_field, idx - half, hidx)
-    ϕⱼ₊₁ᵗᵈ = getidx(space, Φᵗᵈ_field, idx + half, hidx)
-    ϕⱼ₊₂ᵗᵈ = getidx(space, Φᵗᵈ_field, idx + half + 1, hidx)
-    # cell face variables
-    Aⱼ₊₁₂ = Geometry.contravariant3(
-        getidx(space, A_field, idx, hidx),
-        Geometry.LocalGeometry(space, idx, hidx),
-    )
-    Aⱼ₋₁₂ = Geometry.contravariant3(
-        getidx(space, A_field, idx - 1, hidx),
-        Geometry.LocalGeometry(space, idx - 1, hidx),
-    )
-    Aⱼ₊₃₂ = Geometry.contravariant3(
-        getidx(space, A_field, idx + 1, hidx),
-        Geometry.LocalGeometry(space, idx + 1, hidx),
-    )
+    # 1/dt is in ϕ₋₃₂, ϕ₋₁₂, ϕ₊₁₂, ϕ₊₃₂, ϕ₋₃₂ᵗᵈ, ϕ₋₁₂ᵗᵈ, ϕ₊₁₂ᵗᵈ, ϕ₊₃₂ᵗᵈ
+    ϕ₋₃₂ = getidx(space, Φ_field, idx - half - 1, hidx)
+    ϕ₋₁₂ = getidx(space, Φ_field, idx - half, hidx)
+    ϕ₊₁₂ = getidx(space, Φ_field, idx + half, hidx)
+    ϕ₊₃₂ = getidx(space, Φ_field, idx + half + 1, hidx)
+    ϕ₋₃₂ᵗᵈ = getidx(space, Φᵗᵈ_field, idx - half - 1, hidx)
+    ϕ₋₁₂ᵗᵈ = getidx(space, Φᵗᵈ_field, idx - half, hidx)
+    ϕ₊₁₂ᵗᵈ = getidx(space, Φᵗᵈ_field, idx + half, hidx)
+    ϕ₊₃₂ᵗᵈ = getidx(space, Φᵗᵈ_field, idx + half + 1, hidx)
+
+    lg₋₁ = Geometry.LocalGeometry(space, idx - 1, hidx)
+    lg = Geometry.LocalGeometry(space, idx, hidx)
+    lg₊₁ = Geometry.LocalGeometry(space, idx + 1, hidx)
+    A₋₁ = Geometry.contravariant3(getidx(space, A_field, idx - 1, hidx), lg₋₁)
+    A = Geometry.contravariant3(getidx(space, A_field, idx, hidx), lg)
+    A₊₁ = Geometry.contravariant3(getidx(space, A_field, idx + 1, hidx), lg₊₁)
 
-    return Geometry.Contravariant3Vector(
-        fct_zalesak(
-            Aⱼ₋₁₂,
-            Aⱼ₊₁₂,
-            Aⱼ₊₃₂,
-            ϕⱼ₋₁,
-            ϕⱼ,
-            ϕⱼ₊₁,
-            ϕⱼ₊₂,
-            ϕⱼ₋₁ᵗᵈ,
-            ϕⱼᵗᵈ,
-            ϕⱼ₊₁ᵗᵈ,
-            ϕⱼ₊₂ᵗᵈ,
-        ),
+    # 𝒮5.4.2 (1)  Durran (5.32)  Zalesak's cosmetic correction
+    # which is usually omitted but used in Durran's textbook
+    # implementation of the flux corrected transport method.
+    # (Textbook suggests mixed results in 3 reported scenarios)
+    A = ifelse(
+        max(
+            A * (ϕ₊₁₂ᵗᵈ - ϕ₋₁₂ᵗᵈ),
+            min(A * (ϕ₊₃₂ᵗᵈ - ϕ₊₁₂ᵗᵈ), A * (ϕ₋₁₂ᵗᵈ - ϕ₋₃₂ᵗᵈ)),
+        ) >= 0,
+        A,
+        zero(A),
     )
+
+    P₋₁₂⁻ = max(0, A) - min(0, A₋₁)
+    P₋₁₂⁺ = max(0, A₋₁) - min(0, A)
+    P₊₁₂⁻ = max(0, A₊₁) - min(0, A)
+    P₊₁₂⁺ = max(0, A) - min(0, A₊₁)
+
+    # 𝒮5.4.2 (2)
+    # If flow is nondivergent, ϕᵗᵈ are not needed in the formulae below
+    ϕ₋₁₂ᵐᵃˣ = max(ϕ₋₃₂, ϕ₋₁₂, ϕ₊₁₂, ϕ₋₃₂ᵗᵈ, ϕ₋₁₂ᵗᵈ, ϕ₊₁₂ᵗᵈ)
+    ϕ₋₁₂ᵐⁱⁿ = min(ϕ₋₃₂, ϕ₋₁₂, ϕ₊₁₂, ϕ₋₃₂ᵗᵈ, ϕ₋₁₂ᵗᵈ, ϕ₊₁₂ᵗᵈ)
+    ϕ₊₁₂ᵐᵃˣ = max(ϕ₋₁₂, ϕ₊₁₂, ϕ₊₃₂, ϕ₋₁₂ᵗᵈ, ϕ₊₁₂ᵗᵈ, ϕ₊₃₂ᵗᵈ)
+    ϕ₊₁₂ᵐⁱⁿ = min(ϕ₋₁₂, ϕ₊₁₂, ϕ₊₃₂, ϕ₋₁₂ᵗᵈ, ϕ₊₁₂ᵗᵈ, ϕ₊₃₂ᵗᵈ)
+
+    # Zalesak's limiter also requires the factor Δx/Δt in equation (5.33); for the
+    # reference element, we may assume Δζ = 1 between interfaces
+    R₋₁₂⁻ = ifelse(P₋₁₂⁻ > 0, min(1, (ϕ₋₁₂ᵗᵈ - ϕ₋₁₂ᵐⁱⁿ) / P₋₁₂⁻), zero(A))
+    R₋₁₂⁺ = ifelse(P₋₁₂⁺ > 0, min(1, (ϕ₋₁₂ᵐᵃˣ - ϕ₋₁₂ᵗᵈ) / P₋₁₂⁺), zero(A))
+    R₊₁₂⁻ = ifelse(P₊₁₂⁻ > 0, min(1, (ϕ₊₁₂ᵗᵈ - ϕ₊₁₂ᵐⁱⁿ) / P₊₁₂⁻), zero(A))
+    R₊₁₂⁺ = ifelse(P₊₁₂⁺ > 0, min(1, (ϕ₊₁₂ᵐᵃˣ - ϕ₊₁₂ᵗᵈ) / P₊₁₂⁺), zero(A))
+
+    A_fct = ifelse(A >= 0, min(R₊₁₂⁺, R₋₁₂⁻), min(R₋₁₂⁺, R₊₁₂⁻)) * A
+    return Geometry.Contravariant3Vector(A_fct)
 end
 
 boundary_width(::FCTZalesak, ::AbstractBoundaryCondition) = 2
@@ -2164,6 +2056,7 @@ A subtype of [`AbstractTVDSlopeLimiter`](@ref) limiter. See
 `TVDLimitedFluxC2F` for the general formulation.
 """
 struct RZeroLimiter <: AbstractTVDSlopeLimiter end
+limiter_coeff(r, ::RZeroLimiter) = zero(r)
 
 """
     U = RHalfLimiter(;boundaries)
@@ -2173,6 +2066,7 @@ A subtype of [`AbstractTVDSlopeLimiter`](@ref) limiter. See
 `TVDLimitedFluxC2F` for the general formulation.
 """
 struct RHalfLimiter <: AbstractTVDSlopeLimiter end
+limiter_coeff(r, ::RHalfLimiter) = one(r) / 2
 
 """
     U = RMaxLimiter(;boundaries)
@@ -2182,6 +2076,7 @@ A subtype of [`AbstractTVDSlopeLimiter`](@ref) limiter. See
 `TVDLimitedFluxC2F` for the general formulation.
 """
 struct RMaxLimiter <: AbstractTVDSlopeLimiter end
+limiter_coeff(r, ::RMaxLimiter) = one(r)
 
 """
     U = MinModLimiter(;boundaries)
@@ -2191,6 +2086,7 @@ A subtype of [`AbstractTVDSlopeLimiter`](@ref) limiter. See
 `TVDLimitedFluxC2F` for the general formulation.
 """
 struct MinModLimiter <: AbstractTVDSlopeLimiter end
+limiter_coeff(r, ::MinModLimiter) = max(0, min(1, r))
 
 """
     U = KorenLimiter(;boundaries)
@@ -2200,6 +2096,7 @@ A subtype of [`AbstractTVDSlopeLimiter`](@ref) limiter. See
 `TVDLimitedFluxC2F` for the general formulation.
 """
 struct KorenLimiter <: AbstractTVDSlopeLimiter end
+limiter_coeff(r, ::KorenLimiter) = max(0, min(2r, (1 + 2r) / 3, 2))
 
 """
     U = SuperbeeLimiter(;boundaries)
@@ -2209,6 +2106,7 @@ A subtype of [`AbstractTVDSlopeLimiter`](@ref) limiter. See
 `TVDLimitedFluxC2F` for the general formulation.
 """
 struct SuperbeeLimiter <: AbstractTVDSlopeLimiter end
+limiter_coeff(r, ::SuperbeeLimiter) = max(0, min(1, 2r), min(2, r))
 
 """
     U = MonotonizedCentralLimiter(;boundaries)
@@ -2218,34 +2116,7 @@ A subtype of [`AbstractTVDSlopeLimiter`](@ref) limiter. See
 `TVDLimitedFluxC2F` for the general formulation.
 """
 struct MonotonizedCentralLimiter <: AbstractTVDSlopeLimiter end
-
-@inline function compute_limiter_coeff(r, ::RZeroLimiter)
-    return zero(eltype(r))
-end
-
-@inline function compute_limiter_coeff(r, ::RHalfLimiter)
-    return one(eltype(r)) * 1 / 2
-end
-
-@inline function compute_limiter_coeff(r, ::RMaxLimiter)
-    return one(eltype(r))
-end
-
-@inline function compute_limiter_coeff(r, ::MinModLimiter)
-    return max(zero(eltype(r)), min(one(eltype(r)), r))
-end
-
-@inline function compute_limiter_coeff(r, ::KorenLimiter)
-    return max(zero(eltype(r)), min(2r, min(1 / 3 + 2r / 3, 2)))
-end
-
-@inline function compute_limiter_coeff(r, ::SuperbeeLimiter)
-    return max(zero(eltype(r)), min(one(eltype(r)), r), min(2, r))
-end
-
-@inline function compute_limiter_coeff(r, ::MonotonizedCentralLimiter)
-    return max(zero(eltype(r)), min(2r, (1 + r) / 2, 2))
-end
+limiter_coeff(r, ::MonotonizedCentralLimiter) = max(0, min(2r, (1 + r) / 2, 2))
 
 """
     TVDLimitedFluxC2F{BCS, M} <: AdvectionOperator
@@ -2305,15 +2176,6 @@ return_space(
     u_space::AllFaceFiniteDifferenceSpace,
 ) = A_space
 
-function tvd_limited_flux(Aⱼ₋₁₂, Aⱼ₊₁₂, ϕⱼ₋₁, ϕⱼ, ϕⱼ₊₁, ϕⱼ₊₂, rⱼ₊₁₂, constraint)
-    stable_zero = zero(eltype(Aⱼ₊₁₂))
-    stable_one = one(eltype(Aⱼ₊₁₂))
-    Cⱼ₊₁₂ = compute_limiter_coeff(rⱼ₊₁₂, constraint)
-    @assert Cⱼ₊₁₂ <= 2
-    @assert Cⱼ₊₁₂ >= 0
-    return Cⱼ₊₁₂ * Aⱼ₊₁₂
-end
-
 stencil_interior_width(::TVDLimitedFluxC2F, A_space, Φ_space, u_space) =
     ((-1, 1), (-half - 1, half + 1), (-1, +1))
 
@@ -2326,38 +2188,20 @@ Base.@propagate_inbounds function stencil_interior(
     Φ_field,
     𝓊_field,
 )
-    # cell center variables
-    ϕⱼ₋₁ = getidx(space, Φ_field, idx - half - 1, hidx)
-    ϕⱼ = getidx(space, Φ_field, idx - half, hidx)
-    ϕⱼ₊₁ = getidx(space, Φ_field, idx + half, hidx)
-    ϕⱼ₊₂ = getidx(space, Φ_field, idx + half + 1, hidx)
-    𝓊 = Geometry.contravariant3(
-        getidx(space, 𝓊_field, idx, hidx),
-        Geometry.LocalGeometry(space, idx, hidx),
-    )
-    # cell face variables
-    Aⱼ₊₁₂ = Geometry.contravariant3(
-        getidx(space, A_field, idx, hidx),
-        Geometry.LocalGeometry(space, idx, hidx),
-    )
-    Aⱼ₋₁₂ = Geometry.contravariant3(
-        getidx(space, A_field, idx - 1, hidx),
-        Geometry.LocalGeometry(space, idx - 1, hidx),
-    )
-    # See filter options below
-    rⱼ₊₁₂ = compute_slope_ratio(ϕⱼ, ϕⱼ₋₁, ϕⱼ₊₁, ϕⱼ₊₂, 𝓊)
+    ϕ₋₃₂ = getidx(space, Φ_field, idx - half - 1, hidx)
+    ϕ₋₁₂ = getidx(space, Φ_field, idx - half, hidx)
+    ϕ₊₁₂ = getidx(space, Φ_field, idx + half, hidx)
+    ϕ₊₃₂ = getidx(space, Φ_field, idx + half + 1, hidx)
 
-    return Geometry.Contravariant3Vector(
-        tvd_limited_flux(Aⱼ₋₁₂, Aⱼ₊₁₂, ϕⱼ₋₁, ϕⱼ, ϕⱼ₊₁, ϕⱼ₊₂, rⱼ₊₁₂, op.method),
-    )
-end
+    lg = Geometry.LocalGeometry(space, idx, hidx)
+    𝓊 = Geometry.contravariant3(getidx(space, 𝓊_field, idx, hidx), lg)
+    A = Geometry.contravariant3(getidx(space, A_field, idx, hidx), lg)
 
-@inline function compute_slope_ratio(ϕⱼ, ϕⱼ₋₁, ϕⱼ₊₁, ϕⱼ₊₂, 𝓊)
-    if 𝓊 >= 0
-        return (ϕⱼ - ϕⱼ₋₁) / (ϕⱼ₊₁ - ϕⱼ + eps(eltype(ϕⱼ)))
-    else
-        return (ϕⱼ₊₂ - ϕⱼ₊₁) / (ϕⱼ₊₁ - ϕⱼ + eps(eltype(ϕⱼ)))
-    end
+    Δϕ = ϕ₊₁₂ - ϕ₋₁₂ + eps(typeof(ϕ₋₁₂))
+    # Δϕ_clipped = sign(Δϕ) * max(abs(Δϕ), eps(typeof(Δϕ)))
+    r = ifelse(𝓊 >= 0, ϕ₋₁₂ - ϕ₋₃₂, ϕ₊₃₂ - ϕ₊₁₂) / Δϕ # Δϕ_clipped
+
+    return Geometry.Contravariant3Vector(limiter_coeff(r, op.method) * A)
 end
 
 boundary_width(::TVDLimitedFluxC2F, ::AbstractBoundaryCondition) = 2
@@ -2436,8 +2280,8 @@ Base.@propagate_inbounds function stencil_interior(
         getidx(space, velocity, idx, hidx),
         Geometry.LocalGeometry(space, idx, hidx),
     )
-    ∂θ₃ = RecursiveApply.rdiv(θ⁺ ⊟ θ⁻, 2)
-    return w³ ⊠ ∂θ₃
+    ∂θ₃ = (θ⁺ - θ⁻) / 2
+    return w³ * ∂θ₃
 end
 boundary_width(::AdvectionF2F, ::AbstractBoundaryCondition) = 1
 
@@ -2502,9 +2346,9 @@ Base.@propagate_inbounds function stencil_interior(
         getidx(space, velocity, idx - half, hidx),
         Geometry.LocalGeometry(space, idx - half, hidx),
     )
-    ∂θ₃⁺ = θ⁺ ⊟ θ
-    ∂θ₃⁻ = θ ⊟ θ⁻
-    return RecursiveApply.rdiv((w³⁺ ⊠ ∂θ₃⁺) ⊞ (w³⁻ ⊠ ∂θ₃⁻), 2)
+    ∂θ₃⁺ = θ⁺ - θ
+    ∂θ₃⁻ = θ - θ⁻
+    return (w³⁺ * ∂θ₃⁺ + w³⁻ * ∂θ₃⁻) / 2
 end
 
 boundary_width(::AdvectionC2C, ::AbstractBoundaryCondition) = 1
@@ -2529,9 +2373,9 @@ Base.@propagate_inbounds function stencil_left_boundary(
         getidx(space, velocity, idx - half, hidx),
         Geometry.LocalGeometry(space, idx - half, hidx),
     )
-    ∂θ₃⁺ = θ⁺ ⊟ θ
-    ∂θ₃⁻ = 2 ⊠ (θ ⊟ θ⁻)
-    return RecursiveApply.rdiv((w³⁺ ⊠ ∂θ₃⁺) ⊞ (w³⁻ ⊠ ∂θ₃⁻), 2)
+    ∂θ₃⁺ = θ⁺ - θ
+    ∂θ₃⁻ = 2 * (θ - θ⁻)
+    return (w³⁺ * ∂θ₃⁺ + w³⁻ * ∂θ₃⁻) / 2
 end
 Base.@propagate_inbounds function stencil_right_boundary(
     ::AdvectionC2C,
@@ -2554,9 +2398,9 @@ Base.@propagate_inbounds function stencil_right_boundary(
         getidx(space, velocity, idx - half, hidx),
         Geometry.LocalGeometry(space, idx - half, hidx),
     )
-    ∂θ₃⁺ = 2 ⊠ (θ⁺ ⊟ θ)
-    ∂θ₃⁻ = θ ⊟ θ⁻
-    return RecursiveApply.rdiv((w³⁺ ⊠ ∂θ₃⁺) ⊞ (w³⁻ ⊠ ∂θ₃⁻), 2)
+    ∂θ₃⁺ = 2 * (θ⁺ - θ)
+    ∂θ₃⁻ = θ - θ⁻
+    return (w³⁺ * ∂θ₃⁺ + w³⁻ * ∂θ₃⁻) / 2
 end
 
 Base.@propagate_inbounds function stencil_left_boundary(
@@ -2575,8 +2419,8 @@ Base.@propagate_inbounds function stencil_left_boundary(
         getidx(space, velocity, idx + half, hidx),
         Geometry.LocalGeometry(space, idx + half, hidx),
     )
-    ∂θ₃⁺ = θ⁺ ⊟ θ
-    return (w³⁺ ⊠ ∂θ₃⁺)
+    ∂θ₃⁺ = θ⁺ - θ
+    return (w³⁺ * ∂θ₃⁺)
 end
 Base.@propagate_inbounds function stencil_right_boundary(
     ::AdvectionC2C,
@@ -2594,8 +2438,8 @@ Base.@propagate_inbounds function stencil_right_boundary(
         getidx(space, velocity, idx - half, hidx),
         Geometry.LocalGeometry(space, idx - half, hidx),
     )
-    ∂θ₃⁻ = θ ⊟ θ⁻
-    return (w³⁻ ⊠ ∂θ₃⁻)
+    ∂θ₃⁻ = θ - θ⁻
+    return (w³⁻ * ∂θ₃⁻)
 end
 
 """
@@ -2651,9 +2495,9 @@ Base.@propagate_inbounds function stencil_interior(
         getidx(space, velocity, idx - half, hidx),
         Geometry.LocalGeometry(space, idx - half, hidx),
     )
-    ∂θ₃⁺ = θ⁺ ⊟ θ
-    ∂θ₃⁻ = θ ⊟ θ⁻
-    return (abs(w³⁺) ⊠ ∂θ₃⁺) ⊟ (abs(w³⁻) ⊠ ∂θ₃⁻)
+    ∂θ₃⁺ = θ⁺ - θ
+    ∂θ₃⁻ = θ - θ⁻
+    return abs(w³⁺) * ∂θ₃⁺ - abs(w³⁻) * ∂θ₃⁻
 end
 
 boundary_width(::FluxCorrectionC2C, ::AbstractBoundaryCondition) = 1
@@ -2673,8 +2517,8 @@ Base.@propagate_inbounds function stencil_left_boundary(
         getidx(space, velocity, idx + half, hidx),
         Geometry.LocalGeometry(space, idx + half, hidx),
     )
-    ∂θ₃⁺ = θ⁺ ⊟ θ
-    return (abs(w³⁺) ⊠ ∂θ₃⁺)
+    ∂θ₃⁺ = θ⁺ - θ
+    return abs(w³⁺) * ∂θ₃⁺
 end
 Base.@propagate_inbounds function stencil_right_boundary(
     ::FluxCorrectionC2C,
@@ -2692,8 +2536,8 @@ Base.@propagate_inbounds function stencil_right_boundary(
         getidx(space, velocity, idx - half, hidx),
         Geometry.LocalGeometry(space, idx - half, hidx),
     )
-    ∂θ₃⁻ = θ ⊟ θ⁻
-    return ⊟(abs(w³⁻) ⊠ ∂θ₃⁻)
+    ∂θ₃⁻ = θ - θ⁻
+    return -abs(w³⁻) * ∂θ₃⁻
 end
 
 """
@@ -2749,9 +2593,9 @@ Base.@propagate_inbounds function stencil_interior(
         getidx(space, velocity, idx - half, hidx),
         Geometry.LocalGeometry(space, idx - half, hidx),
     )
-    ∂θ₃⁺ = θ⁺ ⊟ θ
-    ∂θ₃⁻ = θ ⊟ θ⁻
-    return (abs(w³⁺) ⊠ ∂θ₃⁺) ⊟ (abs(w³⁻) ⊠ ∂θ₃⁻)
+    ∂θ₃⁺ = θ⁺ - θ
+    ∂θ₃⁻ = θ - θ⁻
+    return abs(w³⁺) * ∂θ₃⁺ - abs(w³⁻) * ∂θ₃⁻
 end
 
 boundary_width(::FluxCorrectionF2F, ::AbstractBoundaryCondition) = 1
@@ -2771,8 +2615,8 @@ Base.@propagate_inbounds function stencil_left_boundary(
         getidx(space, velocity, idx + half, hidx),
         Geometry.LocalGeometry(space, idx + half, hidx),
     )
-    ∂θ₃⁺ = θ⁺ ⊟ θ
-    return (abs(w³⁺) ⊠ ∂θ₃⁺)
+    ∂θ₃⁺ = θ⁺ - θ
+    return abs(w³⁺) * ∂θ₃⁺
 end
 Base.@propagate_inbounds function stencil_right_boundary(
     ::FluxCorrectionF2F,
@@ -2790,8 +2634,8 @@ Base.@propagate_inbounds function stencil_right_boundary(
         getidx(space, velocity, idx - half, hidx),
         Geometry.LocalGeometry(space, idx - half, hidx),
     )
-    ∂θ₃⁻ = θ ⊟ θ⁻
-    return ⊟(abs(w³⁻) ⊠ ∂θ₃⁻)
+    ∂θ₃⁻ = θ - θ⁻
+    return -abs(w³⁻) * ∂θ₃⁻
 end
 
 
@@ -2905,7 +2749,7 @@ Base.@propagate_inbounds function stencil_interior(
     arg,
 )
     Geometry.Covariant3Vector(1) ⊗ (
-        getidx(space, arg, idx + half, hidx) ⊟
+        getidx(space, arg, idx + half, hidx) -
         getidx(space, arg, idx - half, hidx)
     )
 end
@@ -2923,7 +2767,7 @@ Base.@propagate_inbounds function stencil_left_boundary(
 )
     @assert idx == left_center_boundary_idx(space)
     Geometry.Covariant3Vector(1) ⊗ (
-        getidx(space, arg, idx + half, hidx) ⊟
+        getidx(space, arg, idx + half, hidx) -
         getidx(space, bc.val, nothing, hidx)
     )
 end
@@ -2937,7 +2781,7 @@ Base.@propagate_inbounds function stencil_right_boundary(
 )
     @assert idx == right_center_boundary_idx(space)
     Geometry.Covariant3Vector(1) ⊗ (
-        getidx(space, bc.val, nothing, hidx) ⊟
+        getidx(space, bc.val, nothing, hidx) -
         getidx(space, arg, idx - half, hidx)
     )
 end
@@ -3017,7 +2861,7 @@ Base.@propagate_inbounds function stencil_interior(
     arg,
 )
     Geometry.Covariant3Vector(1) ⊗ (
-        getidx(space, arg, idx + half, hidx) ⊟
+        getidx(space, arg, idx + half, hidx) -
         getidx(space, arg, idx - half, hidx)
     )
 end
@@ -3034,7 +2878,7 @@ Base.@propagate_inbounds function stencil_left_boundary(
     @assert idx == left_face_boundary_idx(space)
     # ∂x[i] = 2(∂x[i + half] - val)
     Geometry.Covariant3Vector(2) ⊗ (
-        getidx(space, arg, idx + half, hidx) ⊟
+        getidx(space, arg, idx + half, hidx) -
         getidx(space, bc.val, nothing, hidx)
     )
 end
@@ -3048,7 +2892,7 @@ Base.@propagate_inbounds function stencil_right_boundary(
 )
     @assert idx == right_face_boundary_idx(space)
     Geometry.Covariant3Vector(2) ⊗ (
-        getidx(space, bc.val, nothing, hidx) ⊟
+        getidx(space, bc.val, nothing, hidx) -
         getidx(space, arg, idx - half, hidx)
     )
 end
@@ -3156,7 +3000,7 @@ Base.@propagate_inbounds function stencil_interior(
         Geometry.LocalGeometry(space, idx, hidx),
     )
     return Geometry.Covariant3Vector(1) ⊗
-           ((1 - sign(v)) / 2 ⊠ a⁺ + sign(v) ⊠ a - (1 + sign(v)) / 2 ⊠ a⁻)
+           ((1 - sign(v)) / 2 * a⁺ + sign(v) * a - (1 + sign(v)) / 2 * a⁻)
 end
 
 boundary_width(::UpwindBiasedGradient, ::AbstractBoundaryCondition) = 1
@@ -3171,7 +3015,7 @@ Base.@propagate_inbounds function stencil_left_boundary(
     @assert idx == left_face_boundary_idx(space)
     a⁺ = getidx(space, arg, idx + 1, hidx)
     a = getidx(space, arg, idx, hidx)
-    return Geometry.Covariant3Vector(1) ⊗ (a⁺ ⊟ a)
+    return Geometry.Covariant3Vector(1) ⊗ (a⁺ - a)
 end
 Base.@propagate_inbounds function stencil_right_boundary(
     ::UpwindBiasedGradient,
@@ -3184,7 +3028,7 @@ Base.@propagate_inbounds function stencil_right_boundary(
     @assert idx == right_face_boundary_idx(space)
     a = getidx(space, arg, idx, hidx)
     a⁻ = getidx(space, arg, idx - 1, hidx)
-    return Geometry.Covariant3Vector(1) ⊗ (a ⊟ a⁻)
+    return Geometry.Covariant3Vector(1) ⊗ (a - a⁻)
 end
 
 abstract type DivergenceOperator <: FiniteDifferenceOperator end
@@ -3253,7 +3097,7 @@ Base.@propagate_inbounds function stencil_interior(
         getidx(space, arg, idx - half, hidx),
         Geometry.LocalGeometry(space, idx - half, hidx),
     )
-    (Ju³₊ ⊟ Ju³₋) ⊠ local_geometry.invJ
+    (Ju³₊ - Ju³₋) * local_geometry.invJ
 end
 
 boundary_width(::DivergenceF2C, ::AbstractBoundaryCondition) = 0
@@ -3276,7 +3120,7 @@ Base.@propagate_inbounds function stencil_left_boundary(
         getidx(space, bc.val, nothing, hidx),
         Geometry.LocalGeometry(space, idx - half, hidx),
     )
-    (Ju³₊ ⊟ Ju³₋) ⊠ local_geometry.invJ
+    (Ju³₊ - Ju³₋) * local_geometry.invJ
 end
 Base.@propagate_inbounds function stencil_right_boundary(
     ::DivergenceF2C,
@@ -3296,7 +3140,7 @@ Base.@propagate_inbounds function stencil_right_boundary(
         getidx(space, arg, idx - half, hidx),
         Geometry.LocalGeometry(space, idx - half, hidx),
     )
-    (Ju³₊ ⊟ Ju³₋) ⊠ local_geometry.invJ
+    (Ju³₊ - Ju³₋) * local_geometry.invJ
 end
 
 boundary_width(::DivergenceF2C, ::SetDivergence) = 1
@@ -3425,7 +3269,7 @@ Base.@propagate_inbounds function stencil_interior(
         getidx(space, arg, idx - half, hidx),
         Geometry.LocalGeometry(space, idx - half, hidx),
     )
-    (Ju³₊ ⊟ Ju³₋) ⊠ local_geometry.invJ
+    (Ju³₊ - Ju³₋) * local_geometry.invJ
 end
 
 boundary_width(::DivergenceC2F, ::AbstractBoundaryCondition) = 1
@@ -3448,7 +3292,7 @@ Base.@propagate_inbounds function stencil_left_boundary(
         getidx(space, bc.val, nothing, hidx),
         local_geometry,
     )
-    (Ju³₊ ⊟ Ju³) ⊠ (2 * local_geometry.invJ)
+    (Ju³₊ - Ju³) * (2 * local_geometry.invJ)
 end
 Base.@propagate_inbounds function stencil_right_boundary(
     ::DivergenceC2F,
@@ -3468,7 +3312,7 @@ Base.@propagate_inbounds function stencil_right_boundary(
         getidx(space, arg, idx - half, hidx),
         Geometry.LocalGeometry(space, idx - half, hidx),
     )
-    (Ju³ ⊟ Ju³₋) ⊠ (2 * local_geometry.invJ)
+    (Ju³ - Ju³₋) * (2 * local_geometry.invJ)
 end
 
 # left / right SetDivergence boundary conditions
@@ -3770,8 +3614,7 @@ end
 @inline function should_call_left_boundary(idx, space, op, args...)
     Topologies.isperiodic(space) && return false
     loc = left_boundary_window(space)
-    return Operators.has_boundary(op, loc) &&
-           idx < Operators.left_interior_idx(
+    return idx < Operators.left_interior_idx(
         space,
         op,
         Operators.get_boundary(op, loc),
@@ -3782,8 +3625,7 @@ end
 @inline function should_call_right_boundary(idx, space, op, args...)
     Topologies.isperiodic(space) && return false
     loc = right_boundary_window(space)
-    return Operators.has_boundary(op, loc) &&
-           idx > Operators.right_interior_idx(
+    return idx > Operators.right_interior_idx(
         space,
         op,
         Operators.get_boundary(op, loc),
@@ -3845,20 +3687,6 @@ Base.@propagate_inbounds function getidx(
     end
 end
 
-@inline getidx_return_type(scalar::Tuple{<:Any}) = eltype(scalar)
-@inline getidx_return_type(scalar::Ref) = eltype(scalar)
-@inline getidx_return_type(x::T) where {T} = T
-@inline getidx_return_type(f::Fields.Field) = eltype(f)
-
-@inline getidx_return_type(bc::Base.Broadcast.Broadcasted) =
-    Base.promote_op(bc.f, map(getidx_return_type, bc.args)...)
-
-@inline getidx_return_type(op::AbstractOperator, args...) =
-    stencil_return_type(bc.op, bc.args...)
-
-@inline getidx_return_type(bc::StencilBroadcasted) =
-    stencil_return_type(bc.op, bc.args...)
-
 # broadcasting a ColumnStencilStyle gives the StencilBroadcasted's style
 Base.Broadcast.BroadcastStyle(
     ::Type{<:StencilBroadcasted{Style}},
@@ -3910,15 +3738,14 @@ Base.@propagate_inbounds function getidx(
     return @inbounds field_data[CartesianIndex(i, j, 1, v, h)]
 end
 
-
 # unwap boxed scalars
 @inline getidx(parent_space, scalar::Tuple{T}, idx, hidx) where {T} = scalar[1]
 @inline getidx(parent_space, scalar::Ref, idx, hidx) = scalar[]
 @inline getidx(parent_space, field::Fields.PointField, idx, hidx) = field[]
 @inline getidx(parent_space, field::Fields.PointField, idx) = field[]
 
-# recursive fallback for scalar, just return
-@inline getidx(parent_space, scalar, idx, hidx) = scalar
+# enable automatic nested broadcasting over single-valued boundary conditions
+@inline getidx(parent_space, scalar, idx, hidx) = add_auto_broadcasters(scalar)
 
 # getidx error fallbacks
 @noinline inferred_getidx_error(idx_type::Type, space_type::Type) =
@@ -3979,7 +3806,12 @@ function Base.Broadcast.broadcasted(
     # TODO: we should probably disallow this, as it
     # may help with latency.
     FT = Spaces.undertype(axes(StencilBroadcasted{Style}(op, args)))
-    StencilBroadcasted{Style}(promote_bcs(op, FT), args)
+    args′ =
+        unrolled_map(args) do arg
+            is_auto_broadcastable(eltype(arg)) ?
+            Base.Broadcast.broadcasted(add_auto_broadcasters, arg) : arg
+        end
+    return StencilBroadcasted{Style}(promote_bcs(op, FT), args′)
 end
 
 # check that inferred output field space is equal to dest field space
@@ -4030,17 +3862,6 @@ if hasfield(Method, :recursion_relation)
     end
 end
 
-function Base.similar(
-    bc::Base.Broadcast.Broadcasted{S},
-    ::Type{Eltype},
-) where {Eltype, S <: AbstractStencilStyle}
-    sp = axes(bc)
-    return Field(Eltype, sp)
-end
-
-Base.similar(bc::Base.Broadcast.Broadcasted{<:AbstractStencilStyle}) =
-    Base.similar(bc, eltype(bc))
-
 function _serial_copyto!(field_out::Field, bc, Ni::Int, Nj::Int, Nh::Int)
     space = axes(field_out)
     bounds = window_bounds(space, bc)
@@ -4138,17 +3959,6 @@ Base.@propagate_inbounds function apply_stencil!(
     end
     return field_out
 end
-# Compute slope ratio 𝜃 and limiter coefficient 𝜙
-#𝜃 = compute_slope_ratio(a⁻, a⁻⁻, a⁺, a⁺⁺, v)
-#𝜙 = compute_limiter_coeff(𝜃, method)
-
-
-#@assert 0 <= 𝜙 <= 2
-#if v >= 0
-#    return v ⊠ (a⁻ ⊞ RecursiveApply.rdiv((a⁺ - a⁻) ⊠ 𝜙 ,2))
-#else
-#    return v ⊠ (a⁺ ⊟ RecursiveApply.rdiv((a⁺ - a⁻) ⊠ 𝜙 ,2)) # Current working solution
-#end
 
 """
     fd_shmem_is_supported(bc::Base.Broadcast.AbstractBroadcasted)
diff --git a/src/Operators/integrals.jl b/src/Operators/integrals.jl
index 13df0d18d6..20faaa8ed9 100644
--- a/src/Operators/integrals.jl
+++ b/src/Operators/integrals.jl
@@ -1,7 +1,8 @@
-import ..RecursiveApply: rzero, ⊠, ⊞
 import RootSolvers
 import ClimaComms
 
+broadcast_zero(field) = zero(eltype(Base.broadcastable(field)))
+
 """
     column_integral_definite!(ϕ_top, ᶜ∂ϕ∂z, [ϕ_bot])
 
@@ -12,14 +13,14 @@ area differential `J/Δz`, with `J` denoting the metric Jacobian. The input
 `ᶜ∂ϕ∂z` should be a cell-center `Field` or `AbstractBroadcasted`, and the output
 `ϕ_top` should be a horizontal `Field`. The default value of `ϕ_bot` is 0.
 """
-function column_integral_definite!(ϕ_top, ᶜ∂ϕ∂z, ϕ_bot = rzero(eltype(ϕ_top)))
+function column_integral_definite!(ϕ_top, ᶜ∂ϕ∂z, ϕ_bot = broadcast_zero(ϕ_top))
     ᶜJ = Fields.local_geometry_field(axes(ᶜ∂ϕ∂z)).J
     f_space = Spaces.face_space(axes(ᶜ∂ϕ∂z))
     J_bot = Fields.level(Fields.local_geometry_field(f_space).J, half)
     Δz_bot = Fields.level(Fields.Δz_field(f_space), half)
     ΔA_bot = Base.broadcasted(/, J_bot, Δz_bot)
-    ᶜΔϕ = Base.broadcasted(⊠, ᶜ∂ϕ∂z, Base.broadcasted(/, ᶜJ, ΔA_bot))
-    column_reduce!(⊞, ϕ_top, ᶜΔϕ; init = ϕ_bot)
+    ᶜΔϕ = Base.broadcasted(*, ᶜ∂ϕ∂z, Base.broadcasted(/, ᶜJ, ΔA_bot))
+    column_reduce!(+, ϕ_top, ᶜΔϕ; init = ϕ_bot)
 end
 
 """
@@ -43,19 +44,19 @@ is used, `ΔA = ΔA_{bot}` at all values of `z`, and the output `ᶠϕ` satisfie
 is used, the vertical gradient is replaced with an area-weighted gradient. The
 default value of `ϕ_bot` is 0, and the default value of `rtol` is 0.001.
 """
-function column_integral_indefinite!(ᶠϕ, ᶜ∂ϕ∂z, ϕ_bot = rzero(eltype(ᶠϕ)))
+function column_integral_indefinite!(ᶠϕ, ᶜ∂ϕ∂z, ϕ_bot = broadcast_zero(ᶠϕ))
     ᶜJ = Fields.local_geometry_field(axes(ᶜ∂ϕ∂z)).J
     J_bot = Fields.level(Fields.local_geometry_field(ᶠϕ).J, half)
     Δz_bot = Fields.level(Fields.Δz_field(ᶠϕ), half)
     ΔA_bot = Base.broadcasted(/, J_bot, Δz_bot)
-    ᶜΔϕ = Base.broadcasted(⊠, ᶜ∂ϕ∂z, Base.broadcasted(/, ᶜJ, ΔA_bot))
-    column_accumulate!(⊞, ᶠϕ, ᶜΔϕ; init = ϕ_bot)
+    ᶜΔϕ = Base.broadcasted(*, ᶜ∂ϕ∂z, Base.broadcasted(/, ᶜJ, ΔA_bot))
+    column_accumulate!(+, ᶠϕ, ᶜΔϕ; init = ϕ_bot)
 end
 function column_integral_indefinite!(
     ∂ϕ∂z::F,
     ᶠϕ,
-    ϕ_bot = eltype(ᶠϕ)(0),
-    rtol = eltype(ᶠϕ)(0.001),
+    ϕ_bot = broadcast_zero(ᶠϕ),
+    rtol = eltype(parent(ᶠϕ))(0.001),
 ) where {F <: Function}
     device = ClimaComms.device(ᶠϕ)
     c_space = Spaces.center_space(axes(ᶠϕ))
diff --git a/src/Operators/numericalflux.jl b/src/Operators/numericalflux.jl
index 40fdeb2bd4..f0ae1ba03c 100644
--- a/src/Operators/numericalflux.jl
+++ b/src/Operators/numericalflux.jl
@@ -28,17 +28,20 @@ function add_numerical_flux_internal!(fn, dydt, args...)
     Nq = Quadratures.degrees_of_freedom(Spaces.quadrature_style(space))
     topology = Spaces.topology(space)
     internal_surface_geometry = Spaces.grid(space).internal_surface_geometry
+    dydt_bc = Base.broadcastable(dydt)
+    args_bc =
+        map(arg -> arg isa Fields.Field ? Base.broadcastable(arg) : arg, args)
 
     for (iface, (elem⁻, face⁻, elem⁺, face⁺, reversed)) in
         enumerate(Topologies.interior_faces(topology))
 
         internal_surface_geometry_slab = slab(internal_surface_geometry, iface)
 
-        arg_slabs⁻ = map(arg -> slab(Fields.todata(arg), elem⁻), args)
-        arg_slabs⁺ = map(arg -> slab(Fields.todata(arg), elem⁺), args)
+        arg_slabs⁻ = map(arg -> slab(Fields.todata(arg), elem⁻), args_bc)
+        arg_slabs⁺ = map(arg -> slab(Fields.todata(arg), elem⁺), args_bc)
 
-        dydt_slab⁻ = slab(Fields.field_values(dydt), elem⁻)
-        dydt_slab⁺ = slab(Fields.field_values(dydt), elem⁺)
+        dydt_slab⁻ = slab(Fields.field_values(dydt_bc), elem⁻)
+        dydt_slab⁺ = slab(Fields.field_values(dydt_bc), elem⁺)
 
         for q in 1:Nq
             sgeom⁻ = internal_surface_geometry_slab[slab_index(q)]
@@ -46,24 +49,21 @@ function add_numerical_flux_internal!(fn, dydt, args...)
             i⁻, j⁻ = Topologies.face_node_index(face⁻, Nq, q, false)
             i⁺, j⁺ = Topologies.face_node_index(face⁺, Nq, q, reversed)
 
-            numflux⁻ = fn(
-                sgeom⁻.normal,
-                map(
-                    slab ->
-                        slab isa DataSlab2D ? slab[slab_index(i⁻, j⁻)] : slab,
-                    arg_slabs⁻,
-                ),
-                map(
-                    slab ->
-                        slab isa DataSlab2D ? slab[slab_index(i⁺, j⁺)] : slab,
-                    arg_slabs⁺,
-                ),
+            argvals⁻ = map(
+                slab -> slab isa DataSlab2D ? slab[slab_index(i⁻, j⁻)] : slab,
+                arg_slabs⁻,
             )
+            argvals⁺ = map(
+                slab -> slab isa DataSlab2D ? slab[slab_index(i⁺, j⁺)] : slab,
+                arg_slabs⁺,
+            )
+            numflux⁻ =
+                add_auto_broadcasters(fn(sgeom⁻.normal, argvals⁻, argvals⁺))
 
             dydt_slab⁻[slab_index(i⁻, j⁻)] =
-                dydt_slab⁻[slab_index(i⁻, j⁻)] ⊟ (sgeom⁻.sWJ ⊠ numflux⁻)
+                dydt_slab⁻[slab_index(i⁻, j⁻)] - (sgeom⁻.sWJ * numflux⁻)
             dydt_slab⁺[slab_index(i⁺, j⁺)] =
-                dydt_slab⁺[slab_index(i⁺, j⁺)] ⊞ (sgeom⁻.sWJ ⊠ numflux⁻)
+                dydt_slab⁺[slab_index(i⁺, j⁺)] + (sgeom⁻.sWJ * numflux⁻)
         end
     end
 end
@@ -78,9 +78,9 @@ struct CentralNumericalFlux{F}
 end
 
 function (fn::CentralNumericalFlux)(normal, argvals⁻, argvals⁺)
-    Favg =
-        RecursiveApply.rdiv(fn.fluxfn(argvals⁻...) ⊞ fn.fluxfn(argvals⁺...), 2)
-    return RecursiveApply.rmap(f -> f' * normal, Favg)
+    flux⁻ = add_auto_broadcasters(fn.fluxfn(argvals⁻...))
+    flux⁺ = add_auto_broadcasters(fn.fluxfn(argvals⁺...))
+    return ((flux⁻ + flux⁺) / 2)' * normal
 end
 
 """
@@ -96,10 +96,10 @@ end
 function (fn::RusanovNumericalFlux)(normal, argvals⁻, argvals⁺)
     y⁻ = argvals⁻[1]
     y⁺ = argvals⁺[1]
-    Favg =
-        RecursiveApply.rdiv(fn.fluxfn(argvals⁻...) ⊞ fn.fluxfn(argvals⁺...), 2)
+    flux⁻ = add_auto_broadcasters(fn.fluxfn(argvals⁻...))
+    flux⁺ = add_auto_broadcasters(fn.fluxfn(argvals⁺...))
     λ = max(fn.wavespeedfn(argvals⁻...), fn.wavespeedfn(argvals⁺...))
-    return RecursiveApply.rmap(f -> f' * normal, Favg) ⊞ (λ / 2) ⊠ (y⁻ ⊟ y⁺)
+    return ((flux⁻ + flux⁺) / 2)' * normal + (λ / 2) * (y⁻ - y⁺)
 end
 
 
@@ -108,6 +108,9 @@ function add_numerical_flux_boundary!(fn, dydt, args...)
     Nq = Quadratures.degrees_of_freedom(Spaces.quadrature_style(space))
     topology = Spaces.topology(space)
     boundary_surface_geometries = Spaces.grid(space).boundary_surface_geometries
+    dydt_bc = Base.broadcastable(dydt)
+    args_bc =
+        map(arg -> arg isa Fields.Field ? Base.broadcastable(arg) : arg, args)
 
     for (iboundary, boundarytag) in
         enumerate(Topologies.boundary_tags(topology))
@@ -117,22 +120,19 @@ function add_numerical_flux_boundary!(fn, dydt, args...)
                 surface_geometry_slab =
                     slab(boundary_surface_geometries[iboundary], iface)
 
-            arg_slabs⁻ = map(arg -> slab(Fields.todata(arg), elem⁻), args)
-            dydt_slab⁻ = slab(Fields.field_values(dydt), elem⁻)
+            arg_slabs⁻ = map(arg -> slab(Fields.todata(arg), elem⁻), args_bc)
+            dydt_slab⁻ = slab(Fields.field_values(dydt_bc), elem⁻)
             for q in 1:Nq
                 sgeom⁻ = boundary_surface_geometry_slab[slab_index(q)]
                 i⁻, j⁻ = Topologies.face_node_index(face⁻, Nq, q, false)
-                numflux⁻ = fn(
-                    sgeom⁻.normal,
-                    map(
-                        slab ->
-                            slab isa DataSlab2D ? slab[slab_index(i⁻, j⁻)] :
-                            slab,
-                        arg_slabs⁻,
-                    ),
+                argvals⁻ = map(
+                    slab ->
+                        slab isa DataSlab2D ? slab[slab_index(i⁻, j⁻)] : slab,
+                    arg_slabs⁻,
                 )
+                numflux⁻ = add_auto_broadcasters(fn(sgeom⁻.normal, argvals⁻))
                 dydt_slab⁻[slab_index(i⁻, j⁻)] =
-                    dydt_slab⁻[slab_index(i⁻, j⁻)] ⊟ (sgeom⁻.sWJ ⊠ numflux⁻)
+                    dydt_slab⁻[slab_index(i⁻, j⁻)] - (sgeom⁻.sWJ * numflux⁻)
             end
         end
     end
diff --git a/src/Operators/spectralelement.jl b/src/Operators/spectralelement.jl
index da4d25b38f..93090b2010 100644
--- a/src/Operators/spectralelement.jl
+++ b/src/Operators/spectralelement.jl
@@ -111,11 +111,18 @@ function Base.Broadcast.broadcasted(op::SpectralElementOperator, args...)
     Base.Broadcast.broadcasted(style, op, args′...)
 end
 
-Base.Broadcast.broadcasted(
+function Base.Broadcast.broadcasted(
     ::SpectralStyle,
     op::SpectralElementOperator,
     args...,
-) = SpectralBroadcasted{SpectralStyle}(op, args)
+)
+    wrapped_args =
+        unrolled_map(args) do arg
+            is_auto_broadcastable(eltype(arg)) || return arg
+            return Base.Broadcast.broadcasted(add_auto_broadcasters, arg)
+        end
+    return SpectralBroadcasted{SpectralStyle}(op, wrapped_args)
+end
 
 Base.eltype(sbc::SpectralBroadcasted) =
     operator_return_eltype(sbc.op, map(eltype, sbc.args)...)
@@ -175,16 +182,6 @@ function Base.Broadcast.instantiate(
     end
 end
 
-function Base.similar(
-    bc::Base.Broadcast.Broadcasted{<:AbstractSpectralStyle},
-    ::Type{Eltype},
-) where {Eltype}
-    space = axes(bc)
-    return Field(Eltype, space)
-end
-
-
-
 # Functions for SlabBlockSpectralStyle
 function Base.copyto!(
     out::Field,
@@ -548,7 +545,7 @@ Divergence() = Divergence{()}()
 Divergence{()}(space) = Divergence{operator_axes(space)}()
 
 operator_return_eltype(op::Divergence{I}, ::Type{S}) where {I, S} =
-    RecursiveApply.rmaptype(Geometry.divergence_result_type, S)
+    Geometry.divergence_result_type(S)
 
 function apply_operator(op::Divergence{(1,)}, space, slabidx, arg)
     FT = Spaces.undertype(space)
@@ -563,20 +560,15 @@ function apply_operator(op::Divergence{(1,)}, space, slabidx, arg)
         ij = CartesianIndex((i,))
         local_geometry = get_local_geometry(space, ij, slabidx)
         v = get_node(space, arg, ij, slabidx)
-        Jv¹ =
-            local_geometry.J ⊠ RecursiveApply.rmap(
-                v -> Geometry.contravariant1(v, local_geometry),
-                v,
-            )
+        Jv¹ = local_geometry.J * Geometry.contravariant1(v, local_geometry)
         for ii in 1:Nq
-            out[slab_index(ii)] = out[slab_index(ii)] ⊞ (D[ii, i] ⊠ Jv¹)
+            out[slab_index(ii)] += D[ii, i] * Jv¹
         end
     end
     @inbounds for i in 1:Nq
         ij = CartesianIndex((i,))
         local_geometry = get_local_geometry(space, ij, slabidx)
-        out[slab_index(i)] =
-            RecursiveApply.rmul(out[slab_index(i)], local_geometry.invJ)
+        out[slab_index(i)] *= local_geometry.invJ
     end
     return Field(SArray(out), space)
 end
@@ -599,28 +591,19 @@ Base.@propagate_inbounds function apply_operator(
         ij = CartesianIndex((i, j))
         local_geometry = get_local_geometry(space, ij, slabidx)
         v = get_node(space, arg, ij, slabidx)
-        Jv¹ =
-            local_geometry.J ⊠ RecursiveApply.rmap(
-                v -> Geometry.contravariant1(v, local_geometry),
-                v,
-            )
+        Jv¹ = local_geometry.J * Geometry.contravariant1(v, local_geometry)
         for ii in 1:Nq
-            out[slab_index(ii, j)] = out[slab_index(ii, j)] ⊞ (D[ii, i] ⊠ Jv¹)
+            out[slab_index(ii, j)] += D[ii, i] * Jv¹
         end
-        Jv² =
-            local_geometry.J ⊠ RecursiveApply.rmap(
-                v -> Geometry.contravariant2(v, local_geometry),
-                v,
-            )
+        Jv² = local_geometry.J * Geometry.contravariant2(v, local_geometry)
         for jj in 1:Nq
-            out[slab_index(i, jj)] = out[slab_index(i, jj)] ⊞ (D[jj, j] ⊠ Jv²)
+            out[slab_index(i, jj)] += D[jj, j] * Jv²
         end
     end
     @inbounds for j in 1:Nq, i in 1:Nq
         ij = CartesianIndex((i, j))
         local_geometry = get_local_geometry(space, ij, slabidx)
-        out[slab_index(i, j)] =
-            RecursiveApply.rmul(out[slab_index(i, j)], local_geometry.invJ)
+        out[slab_index(i, j)] *= local_geometry.invJ
     end
     return Field(SArray(out), space)
 end
@@ -713,11 +696,12 @@ struct SplitDivergence{I} <: SpectralElementOperator{I} end
 SplitDivergence() = SplitDivergence{()}()
 SplitDivergence{()}(space) = SplitDivergence{operator_axes(space)}()
 
-operator_return_eltype(::SplitDivergence{I}, ::Type{S1}, ::Type{S2}) where {I, S1, S2} =
-    Geometry.rmul_return_type(
-        RecursiveApply.rmaptype(Geometry.divergence_result_type, S1),
-        S2,
-    )
+operator_return_eltype(
+    ::SplitDivergence{I},
+    ::Type{S1},
+    ::Type{S2},
+) where {I, S1, S2} =
+    Geometry.mul_return_type(Geometry.divergence_result_type(S1), S2)
 
 function apply_operator(op::SplitDivergence{(1,)}, space, slabidx, arg1, arg2)
     FT = Spaces.undertype(space)
@@ -732,11 +716,9 @@ function apply_operator(op::SplitDivergence{(1,)}, space, slabidx, arg1, arg2)
     @inbounds for i in 1:Nq
         ij = CartesianIndex((i,))
         local_geometry = get_local_geometry(space, ij, slabidx)
+        u = get_node(space, arg1, ij, slabidx)
         Ju1[slab_index(i)] =
-            local_geometry.J ⊠ RecursiveApply.rmap(
-                u -> Geometry.contravariant1(u, local_geometry),
-                get_node(space, arg1, ij, slabidx),
-            )
+            local_geometry.J * Geometry.contravariant1(u, local_geometry)
         psi[slab_index(i)] = get_node(space, arg2, ij, slabidx)
     end
 
@@ -744,19 +726,19 @@ function apply_operator(op::SplitDivergence{(1,)}, space, slabidx, arg1, arg2)
     fill!(parent(out), zero(FT))
     @inbounds for i in 1:Nq
         for j in 1:(i - 1) # loop over half the indices, since F1[i,j] = F1[j,i]
-            F1 = RecursiveApply.rdiv(
-                (Ju1[slab_index(i)] ⊞ Ju1[slab_index(j)]) ⊠
-                (psi[slab_index(i)] ⊞ psi[slab_index(j)]),
-                2,
-            )
-            out[slab_index(i)] = out[slab_index(i)] ⊞ D[i, j] ⊠ F1
-            out[slab_index(j)] = out[slab_index(j)] ⊞ D[j, i] ⊠ F1
+            F1 =
+                (
+                    (Ju1[slab_index(i)] + Ju1[slab_index(j)]) *
+                    (psi[slab_index(i)] + psi[slab_index(j)])
+                ) / 2
+            out[slab_index(i)] += D[i, j] * F1
+            out[slab_index(j)] += D[j, i] * F1
         end
     end
     @inbounds for i in 1:Nq
         ij = CartesianIndex((i,))
         local_geometry = get_local_geometry(space, ij, slabidx)
-        out[slab_index(i)] = out[slab_index(i)] ⊠ local_geometry.invJ
+        out[slab_index(i)] *= local_geometry.invJ
     end
 
     return Field(SArray(out), space)
@@ -778,15 +760,9 @@ function apply_operator(op::SplitDivergence{(1, 2)}, space, slabidx, arg1, arg2)
         local_geometry = get_local_geometry(space, ij, slabidx)
         u = get_node(space, arg1, ij, slabidx)
         Ju1[slab_index(i, j)] =
-            local_geometry.J ⊠ RecursiveApply.rmap(
-                u -> Geometry.contravariant1(u, local_geometry),
-                u,
-            )
+            local_geometry.J * Geometry.contravariant1(u, local_geometry)
         Ju2[slab_index(i, j)] =
-            local_geometry.J ⊠ RecursiveApply.rmap(
-                u -> Geometry.contravariant2(u, local_geometry),
-                u,
-            )
+            local_geometry.J * Geometry.contravariant2(u, local_geometry)
         psi[slab_index(i, j)] = get_node(space, arg2, ij, slabidx)
     end
 
@@ -794,28 +770,28 @@ function apply_operator(op::SplitDivergence{(1, 2)}, space, slabidx, arg1, arg2)
     fill!(parent(out), zero(FT))
     @inbounds for j in 1:Nq, i in 1:Nq
         for k in 1:(i - 1) # loop over half the indices, since F1[i,k] = F1[k,i]
-            F1 = RecursiveApply.rdiv(
-                (Ju1[slab_index(i, j)] ⊞ Ju1[slab_index(k, j)]) ⊠
-                (psi[slab_index(i, j)] ⊞ psi[slab_index(k, j)]),
-                2,
-            )
-            out[slab_index(i, j)] = out[slab_index(i, j)] ⊞ D[i, k] ⊠ F1
-            out[slab_index(k, j)] = out[slab_index(k, j)] ⊞ D[k, i] ⊠ F1
+            F1 =
+                (
+                    (Ju1[slab_index(i, j)] + Ju1[slab_index(k, j)]) *
+                    (psi[slab_index(i, j)] + psi[slab_index(k, j)])
+                ) / 2
+            out[slab_index(i, j)] += D[i, k] * F1
+            out[slab_index(k, j)] += D[k, i] * F1
         end
         for k in 1:(j - 1) # loop over half the indices, since F2[j,k] = F2[k,j]
-            F2 = RecursiveApply.rdiv(
-                (Ju2[slab_index(i, j)] ⊞ Ju2[slab_index(i, k)]) ⊠
-                (psi[slab_index(i, j)] ⊞ psi[slab_index(i, k)]),
-                2,
-            )
-            out[slab_index(i, j)] = out[slab_index(i, j)] ⊞ D[j, k] ⊠ F2
-            out[slab_index(i, k)] = out[slab_index(i, k)] ⊞ D[k, j] ⊠ F2
+            F2 =
+                (
+                    (Ju2[slab_index(i, j)] + Ju2[slab_index(i, k)]) *
+                    (psi[slab_index(i, j)] + psi[slab_index(i, k)])
+                ) / 2
+            out[slab_index(i, j)] += D[j, k] * F2
+            out[slab_index(i, k)] += D[k, j] * F2
         end
     end
     @inbounds for j in 1:Nq, i in 1:Nq
         ij = CartesianIndex((i, j))
         local_geometry = get_local_geometry(space, ij, slabidx)
-        out[slab_index(i, j)] = out[slab_index(i, j)] ⊠ local_geometry.invJ
+        out[slab_index(i, j)] *= local_geometry.invJ
     end
 
     return Field(SArray(out), space)
@@ -863,7 +839,7 @@ WeakDivergence() = WeakDivergence{()}()
 WeakDivergence{()}(space) = WeakDivergence{operator_axes(space)}()
 
 operator_return_eltype(::WeakDivergence{I}, ::Type{S}) where {I, S} =
-    RecursiveApply.rmaptype(Geometry.divergence_result_type, S)
+    Geometry.divergence_result_type(S)
 
 function apply_operator(op::WeakDivergence{(1,)}, space, slabidx, arg)
     FT = Spaces.undertype(space)
@@ -878,20 +854,15 @@ function apply_operator(op::WeakDivergence{(1,)}, space, slabidx, arg)
         ij = CartesianIndex((i,))
         local_geometry = get_local_geometry(space, ij, slabidx)
         v = get_node(space, arg, ij, slabidx)
-        WJv¹ =
-            local_geometry.WJ ⊠ RecursiveApply.rmap(
-                v -> Geometry.contravariant1(v, local_geometry),
-                v,
-            )
+        WJv¹ = local_geometry.WJ * Geometry.contravariant1(v, local_geometry)
         for ii in 1:Nq
-            out[slab_index(ii)] = out[slab_index(ii)] ⊞ (D[i, ii] ⊠ WJv¹)
+            out[slab_index(ii)] += D[i, ii] * WJv¹
         end
     end
     @inbounds for i in 1:Nq
         ij = CartesianIndex((i,))
         local_geometry = get_local_geometry(space, ij, slabidx)
-        out[slab_index(i)] =
-            RecursiveApply.rdiv(out[slab_index(i)], ⊟(local_geometry.WJ))
+        out[slab_index(i)] /= -local_geometry.WJ
     end
     return Field(SArray(out), space)
 end
@@ -909,28 +880,19 @@ function apply_operator(op::WeakDivergence{(1, 2)}, space, slabidx, arg)
         ij = CartesianIndex((i, j))
         local_geometry = get_local_geometry(space, ij, slabidx)
         v = get_node(space, arg, ij, slabidx)
-        WJv¹ =
-            local_geometry.WJ ⊠ RecursiveApply.rmap(
-                v -> Geometry.contravariant1(v, local_geometry),
-                v,
-            )
+        WJv¹ = local_geometry.WJ * Geometry.contravariant1(v, local_geometry)
         for ii in 1:Nq
-            out[slab_index(ii, j)] = out[slab_index(ii, j)] ⊞ (D[i, ii] ⊠ WJv¹)
+            out[slab_index(ii, j)] += D[i, ii] * WJv¹
         end
-        WJv² =
-            local_geometry.WJ ⊠ RecursiveApply.rmap(
-                v -> Geometry.contravariant2(v, local_geometry),
-                v,
-            )
+        WJv² = local_geometry.WJ * Geometry.contravariant2(v, local_geometry)
         for jj in 1:Nq
-            out[slab_index(i, jj)] = out[slab_index(i, jj)] ⊞ (D[j, jj] ⊠ WJv²)
+            out[slab_index(i, jj)] += D[j, jj] * WJv²
         end
     end
     @inbounds for j in 1:Nq, i in 1:Nq
         ij = CartesianIndex((i, j))
         local_geometry = get_local_geometry(space, ij, slabidx)
-        out[slab_index(i, j)] =
-            RecursiveApply.rdiv(out[slab_index(i, j)], ⊟(local_geometry.WJ))
+        out[slab_index(i, j)] /= -local_geometry.WJ
     end
     return Field(SArray(out), space)
 end
@@ -962,7 +924,7 @@ Gradient() = Gradient{()}()
 Gradient{()}(space) = Gradient{operator_axes(space)}()
 
 operator_return_eltype(::Gradient{I}, ::Type{S}) where {I, S} =
-    RecursiveApply.rmaptype(T -> Geometry.gradient_result_type(Val(I), T), S)
+    Geometry.gradient_result_type(Val(I), S)
 
 function apply_operator(op::Gradient{(1,)}, space, slabidx, arg)
     FT = Spaces.undertype(space)
@@ -1004,11 +966,11 @@ Base.@propagate_inbounds function apply_operator(
         x = get_node(space, arg, ij, slabidx)
         for ii in 1:Nq
             ∂f∂ξ₁ = Geometry.Covariant12Vector(D[ii, i], zero(eltype(D))) ⊗ x
-            out[slab_index(ii, j)] = out[slab_index(ii, j)] ⊞ ∂f∂ξ₁
+            out[slab_index(ii, j)] += ∂f∂ξ₁
         end
         for jj in 1:Nq
             ∂f∂ξ₂ = Geometry.Covariant12Vector(zero(eltype(D)), D[jj, j]) ⊗ x
-            out[slab_index(i, jj)] = out[slab_index(i, jj)] ⊞ ∂f∂ξ₂
+            out[slab_index(i, jj)] += ∂f∂ξ₂
         end
     end
     return Field(SArray(out), space)
@@ -1053,7 +1015,7 @@ WeakGradient() = WeakGradient{()}()
 WeakGradient{()}(space) = WeakGradient{operator_axes(space)}()
 
 operator_return_eltype(::WeakGradient{I}, ::Type{S}) where {I, S} =
-    RecursiveApply.rmaptype(T -> Geometry.gradient_result_type(Val(I), T), S)
+    Geometry.gradient_result_type(Val(I), S)
 
 function apply_operator(op::WeakGradient{(1,)}, space, slabidx, arg)
     FT = Spaces.undertype(space)
@@ -1068,17 +1030,17 @@ function apply_operator(op::WeakGradient{(1,)}, space, slabidx, arg)
         ij = CartesianIndex((i,))
         local_geometry = get_local_geometry(space, ij, slabidx)
         W = local_geometry.WJ * local_geometry.invJ
-        Wx = W ⊠ get_node(space, arg, ij, slabidx)
+        Wx = W * get_node(space, arg, ij, slabidx)
         for ii in 1:Nq
             Dᵀ₁Wf = Geometry.Covariant1Vector(D[i, ii]) ⊗ Wx
-            out[slab_index(ii)] = out[slab_index(ii)] ⊟ Dᵀ₁Wf
+            out[slab_index(ii)] -= Dᵀ₁Wf
         end
     end
     @inbounds for i in 1:Nq
         ij = CartesianIndex((i,))
         local_geometry = get_local_geometry(space, ij, slabidx)
         W = local_geometry.WJ * local_geometry.invJ
-        out[slab_index(i)] = RecursiveApply.rdiv(out[slab_index(i)], W)
+        out[slab_index(i)] /= W
     end
     return Field(SArray(out), space)
 end
@@ -1097,21 +1059,21 @@ function apply_operator(op::WeakGradient{(1, 2)}, space, slabidx, arg)
         ij = CartesianIndex((i, j))
         local_geometry = get_local_geometry(space, ij, slabidx)
         W = local_geometry.WJ * local_geometry.invJ
-        Wx = W ⊠ get_node(space, arg, ij, slabidx)
+        Wx = W * get_node(space, arg, ij, slabidx)
         for ii in 1:Nq
             Dᵀ₁Wf = Geometry.Covariant12Vector(D[i, ii], zero(eltype(D))) ⊗ Wx
-            out[slab_index(ii, j)] = out[slab_index(ii, j)] ⊟ Dᵀ₁Wf
+            out[slab_index(ii, j)] -= Dᵀ₁Wf
         end
         for jj in 1:Nq
             Dᵀ₂Wf = Geometry.Covariant12Vector(zero(eltype(D)), D[j, jj]) ⊗ Wx
-            out[slab_index(i, jj)] = out[slab_index(i, jj)] ⊟ Dᵀ₂Wf
+            out[slab_index(i, jj)] -= Dᵀ₂Wf
         end
     end
     @inbounds for j in 1:Nq, i in 1:Nq
         ij = CartesianIndex((i, j))
         local_geometry = get_local_geometry(space, ij, slabidx)
         W = local_geometry.WJ * local_geometry.invJ
-        out[slab_index(i, j)] = RecursiveApply.rdiv(out[slab_index(i, j)], W)
+        out[slab_index(i, j)] /= W
     end
     return Field(SArray(out), space)
 end
@@ -1164,7 +1126,7 @@ Curl() = Curl{()}()
 Curl{()}(space) = Curl{operator_axes(space)}()
 
 operator_return_eltype(::Curl{I}, ::Type{S}) where {I, S} =
-    RecursiveApply.rmaptype(T -> Geometry.curl_result_type(Val(I), T), S)
+    Geometry.curl_result_type(Val(I), S)
 
 function apply_operator(op::Curl{(1,)}, space, slabidx, arg)
     FT = Spaces.undertype(space)
@@ -1182,9 +1144,8 @@ function apply_operator(op::Curl{(1,)}, space, slabidx, arg)
             v = get_node(space, arg, ij, slabidx)
             v₃ = Geometry.covariant3(v, local_geometry)
             for ii in 1:Nq
-                D₁v₃ = D[ii, i] ⊠ v₃
-                out[slab_index(ii)] =
-                    out[slab_index(ii)] ⊞ Geometry.Contravariant2Vector(⊟(D₁v₃))
+                D₁v₃ = D[ii, i] * v₃
+                out[slab_index(ii)] += Geometry.Contravariant2Vector(-D₁v₃)
             end
         end
     elseif RT <: Geometry.Contravariant3Vector
@@ -1194,9 +1155,8 @@ function apply_operator(op::Curl{(1,)}, space, slabidx, arg)
             v = get_node(space, arg, ij, slabidx)
             v₂ = Geometry.covariant2(v, local_geometry)
             for ii in 1:Nq
-                D₁v₂ = D[ii, i] ⊠ v₂
-                out[slab_index(ii)] =
-                    out[slab_index(ii)] ⊞ Geometry.Contravariant3Vector(D₁v₂)
+                D₁v₂ = D[ii, i] * v₂
+                out[slab_index(ii)] += Geometry.Contravariant3Vector(D₁v₂)
             end
         end
     elseif RT <: Geometry.Contravariant23Vector
@@ -1207,11 +1167,10 @@ function apply_operator(op::Curl{(1,)}, space, slabidx, arg)
             v₂ = Geometry.covariant2(v, local_geometry)
             v₃ = Geometry.covariant3(v, local_geometry)
             for ii in 1:Nq
-                D₁v₃ = D[ii, i] ⊠ v₃
-                D₁v₂ = D[ii, i] ⊠ v₂
-                out[slab_index(ii)] =
-                    out[slab_index(ii)] ⊞
-                    Geometry.Contravariant23Vector(⊟(D₁v₃), D₁v₂)
+                D₁v₃ = D[ii, i] * v₃
+                D₁v₂ = D[ii, i] * v₂
+                out[slab_index(ii)] +=
+                    Geometry.Contravariant23Vector(-D₁v₃, D₁v₂)
             end
         end
     else
@@ -1220,8 +1179,7 @@ function apply_operator(op::Curl{(1,)}, space, slabidx, arg)
     @inbounds for i in 1:Nq
         ij = CartesianIndex((i,))
         local_geometry = get_local_geometry(space, ij, slabidx)
-        out[slab_index(i)] =
-            RecursiveApply.rmul(out[slab_index(i)], local_geometry.invJ)
+        out[slab_index(i)] *= local_geometry.invJ
     end
     return Field(SArray(out), space)
 end
@@ -1244,16 +1202,13 @@ function apply_operator(op::Curl{(1, 2)}, space, slabidx, arg)
             v = get_node(space, arg, ij, slabidx)
             v₁ = Geometry.covariant1(v, local_geometry)
             for jj in 1:Nq
-                D₂v₁ = D[jj, j] ⊠ v₁
-                out[slab_index(i, jj)] =
-                    out[slab_index(i, jj)] ⊞
-                    Geometry.Contravariant3Vector(⊟(D₂v₁))
+                D₂v₁ = D[jj, j] * v₁
+                out[slab_index(i, jj)] += Geometry.Contravariant3Vector(-D₂v₁)
             end
             v₂ = Geometry.covariant2(v, local_geometry)
             for ii in 1:Nq
-                D₁v₂ = D[ii, i] ⊠ v₂
-                out[slab_index(ii, j)] =
-                    out[slab_index(ii, j)] ⊞ Geometry.Contravariant3Vector(D₁v₂)
+                D₁v₂ = D[ii, i] * v₂
+                out[slab_index(ii, j)] += Geometry.Contravariant3Vector(D₁v₂)
             end
         end
         # input data is a Covariant3Vector field
@@ -1264,15 +1219,13 @@ function apply_operator(op::Curl{(1, 2)}, space, slabidx, arg)
             v = get_node(space, arg, ij, slabidx)
             v₃ = Geometry.covariant3(v, local_geometry)
             for ii in 1:Nq
-                D₁v₃ = D[ii, i] ⊠ v₃
-                out[slab_index(ii, j)] =
-                    out[slab_index(ii, j)] ⊞
-                    Geometry.Contravariant12Vector(zero(D₁v₃), ⊟(D₁v₃))
+                D₁v₃ = D[ii, i] * v₃
+                out[slab_index(ii, j)] +=
+                    Geometry.Contravariant12Vector(zero(D₁v₃), -D₁v₃)
             end
             for jj in 1:Nq
-                D₂v₃ = D[jj, j] ⊠ v₃
-                out[slab_index(i, jj)] =
-                    out[slab_index(i, jj)] ⊞
+                D₂v₃ = D[jj, j] * v₃
+                out[slab_index(i, jj)] +=
                     Geometry.Contravariant12Vector(D₂v₃, zero(D₂v₃))
             end
         end
@@ -1285,18 +1238,16 @@ function apply_operator(op::Curl{(1, 2)}, space, slabidx, arg)
             v₂ = Geometry.covariant2(v, local_geometry)
             v₃ = Geometry.covariant3(v, local_geometry)
             for ii in 1:Nq
-                D₁v₃ = D[ii, i] ⊠ v₃
-                D₁v₂ = D[ii, i] ⊠ v₂
-                out[slab_index(ii, j)] =
-                    out[slab_index(ii, j)] ⊞
-                    Geometry.Contravariant123Vector(zero(D₁v₃), ⊟(D₁v₃), D₁v₂)
+                D₁v₃ = D[ii, i] * v₃
+                D₁v₂ = D[ii, i] * v₂
+                out[slab_index(ii, j)] +=
+                    Geometry.Contravariant123Vector(zero(D₁v₃), -D₁v₃, D₁v₂)
             end
             for jj in 1:Nq
-                D₂v₃ = D[jj, j] ⊠ v₃
-                D₂v₁ = D[jj, j] ⊠ v₁
-                out[slab_index(i, jj)] =
-                    out[slab_index(i, jj)] ⊞
-                    Geometry.Contravariant123Vector(D₂v₃, zero(D₂v₃), ⊟(D₂v₁))
+                D₂v₃ = D[jj, j] * v₃
+                D₂v₁ = D[jj, j] * v₁
+                out[slab_index(i, jj)] +=
+                    Geometry.Contravariant123Vector(D₂v₃, zero(D₂v₃), -D₂v₁)
             end
         end
     else
@@ -1305,8 +1256,7 @@ function apply_operator(op::Curl{(1, 2)}, space, slabidx, arg)
     @inbounds for j in 1:Nq, i in 1:Nq
         ij = CartesianIndex((i, j))
         local_geometry = get_local_geometry(space, ij, slabidx)
-        out[slab_index(i, j)] =
-            RecursiveApply.rmul(out[slab_index(i, j)], local_geometry.invJ)
+        out[slab_index(i, j)] *= local_geometry.invJ
     end
     return Field(SArray(out), space)
 end
@@ -1352,7 +1302,7 @@ WeakCurl() = WeakCurl{()}()
 WeakCurl{()}(space) = WeakCurl{operator_axes(space)}()
 
 operator_return_eltype(::WeakCurl{I}, ::Type{S}) where {I, S} =
-    RecursiveApply.rmaptype(T -> Geometry.curl_result_type(Val(I), T), S)
+    Geometry.curl_result_type(Val(I), S)
 
 function apply_operator(op::WeakCurl{(1,)}, space, slabidx, arg)
     FT = Spaces.undertype(space)
@@ -1370,11 +1320,10 @@ function apply_operator(op::WeakCurl{(1,)}, space, slabidx, arg)
             local_geometry = get_local_geometry(space, ij, slabidx)
             v = get_node(space, arg, ij, slabidx)
             W = local_geometry.WJ * local_geometry.invJ
-            Wv₃ = W ⊠ Geometry.covariant3(v, local_geometry)
+            Wv₃ = W * Geometry.covariant3(v, local_geometry)
             for ii in 1:Nq
-                Dᵀ₁Wv₃ = D[i, ii] ⊠ Wv₃
-                out[slab_index(ii)] =
-                    out[slab_index(ii)] ⊞ Geometry.Contravariant2Vector(Dᵀ₁Wv₃)
+                Dᵀ₁Wv₃ = D[i, ii] * Wv₃
+                out[slab_index(ii)] += Geometry.Contravariant2Vector(Dᵀ₁Wv₃)
             end
         end
     elseif RT <: Geometry.Contravariant3Vector
@@ -1383,12 +1332,10 @@ function apply_operator(op::WeakCurl{(1,)}, space, slabidx, arg)
             local_geometry = get_local_geometry(space, ij, slabidx)
             v = get_node(space, arg, ij, slabidx)
             W = local_geometry.WJ * local_geometry.invJ
-            Wv₂ = W ⊠ Geometry.covariant2(v, local_geometry)
+            Wv₂ = W * Geometry.covariant2(v, local_geometry)
             for ii in 1:Nq
-                Dᵀ₁Wv₂ = D[i, ii] ⊠ Wv₂
-                out[slab_index(ii)] =
-                    out[slab_index(ii)] ⊞
-                    Geometry.Contravariant3Vector(⊟(Dᵀ₁Wv₂))
+                Dᵀ₁Wv₂ = D[i, ii] * Wv₂
+                out[slab_index(ii)] += Geometry.Contravariant3Vector(-Dᵀ₁Wv₂)
             end
         end
     elseif RT <: Geometry.Contravariant23Vector
@@ -1397,14 +1344,13 @@ function apply_operator(op::WeakCurl{(1,)}, space, slabidx, arg)
             local_geometry = get_local_geometry(space, ij, slabidx)
             v = get_node(space, arg, ij, slabidx)
             W = local_geometry.WJ * local_geometry.invJ
-            Wv₃ = W ⊠ Geometry.covariant3(v, local_geometry)
-            Wv₂ = W ⊠ Geometry.covariant2(v, local_geometry)
+            Wv₃ = W * Geometry.covariant3(v, local_geometry)
+            Wv₂ = W * Geometry.covariant2(v, local_geometry)
             for ii in 1:Nq
-                Dᵀ₁Wv₃ = D[i, ii] ⊠ Wv₃
-                Dᵀ₁Wv₂ = D[i, ii] ⊠ Wv₂
-                out[slab_index(ii)] =
-                    out[slab_index(ii)] ⊞
-                    Geometry.Contravariant23Vector(Dᵀ₁Wv₃, ⊟(Dᵀ₁Wv₂))
+                Dᵀ₁Wv₃ = D[i, ii] * Wv₃
+                Dᵀ₁Wv₂ = D[i, ii] * Wv₂
+                out[slab_index(ii)] +=
+                    Geometry.Contravariant23Vector(Dᵀ₁Wv₃, -Dᵀ₁Wv₂)
             end
         end
     else
@@ -1413,8 +1359,7 @@ function apply_operator(op::WeakCurl{(1,)}, space, slabidx, arg)
     @inbounds for i in 1:Nq
         ij = CartesianIndex((i,))
         local_geometry = get_local_geometry(space, ij, slabidx)
-        out[slab_index(i)] =
-            RecursiveApply.rdiv(out[slab_index(i)], local_geometry.WJ)
+        out[slab_index(i)] /= local_geometry.WJ
     end
     return Field(SArray(out), space)
 end
@@ -1436,19 +1381,17 @@ function apply_operator(op::WeakCurl{(1, 2)}, space, slabidx, arg)
             local_geometry = get_local_geometry(space, ij, slabidx)
             v = get_node(space, arg, ij, slabidx)
             W = local_geometry.WJ * local_geometry.invJ
-            Wv₁ = W ⊠ Geometry.covariant1(v, local_geometry)
+            Wv₁ = W * Geometry.covariant1(v, local_geometry)
             for jj in 1:Nq
-                Dᵀ₂Wv₁ = D[j, jj] ⊠ Wv₁
-                out[slab_index(i, jj)] =
-                    out[slab_index(i, jj)] ⊞
+                Dᵀ₂Wv₁ = D[j, jj] * Wv₁
+                out[slab_index(i, jj)] +=
                     Geometry.Contravariant3Vector(Dᵀ₂Wv₁)
             end
-            Wv₂ = W ⊠ Geometry.covariant2(v, local_geometry)
+            Wv₂ = W * Geometry.covariant2(v, local_geometry)
             for ii in 1:Nq
-                Dᵀ₁Wv₂ = D[i, ii] ⊠ Wv₂
-                out[slab_index(ii, j)] =
-                    out[slab_index(ii, j)] ⊞
-                    Geometry.Contravariant3Vector(⊟(Dᵀ₁Wv₂))
+                Dᵀ₁Wv₂ = D[i, ii] * Wv₂
+                out[slab_index(ii, j)] +=
+                    Geometry.Contravariant3Vector(-Dᵀ₁Wv₂)
             end
         end
         # input data is a Covariant3Vector field
@@ -1458,18 +1401,16 @@ function apply_operator(op::WeakCurl{(1, 2)}, space, slabidx, arg)
             local_geometry = get_local_geometry(space, ij, slabidx)
             v = get_node(space, arg, ij, slabidx)
             W = local_geometry.WJ * local_geometry.invJ
-            Wv₃ = W ⊠ Geometry.covariant3(v, local_geometry)
+            Wv₃ = W * Geometry.covariant3(v, local_geometry)
             for ii in 1:Nq
-                Dᵀ₁Wv₃ = D[i, ii] ⊠ Wv₃
-                out[slab_index(ii, j)] =
-                    out[slab_index(ii, j)] ⊞
+                Dᵀ₁Wv₃ = D[i, ii] * Wv₃
+                out[slab_index(ii, j)] +=
                     Geometry.Contravariant12Vector(zero(Dᵀ₁Wv₃), Dᵀ₁Wv₃)
             end
             for jj in 1:Nq
-                Dᵀ₂Wv₃ = D[j, jj] ⊠ Wv₃
-                out[slab_index(i, jj)] =
-                    out[slab_index(i, jj)] ⊞
-                    Geometry.Contravariant12Vector(⊟(Dᵀ₂Wv₃), zero(Dᵀ₂Wv₃))
+                Dᵀ₂Wv₃ = D[j, jj] * Wv₃
+                out[slab_index(i, jj)] +=
+                    Geometry.Contravariant12Vector(-Dᵀ₂Wv₃, zero(Dᵀ₂Wv₃))
             end
         end
     elseif RT <: Geometry.Contravariant123Vector
@@ -1478,25 +1419,24 @@ function apply_operator(op::WeakCurl{(1, 2)}, space, slabidx, arg)
             local_geometry = get_local_geometry(space, ij, slabidx)
             v = get_node(space, arg, ij, slabidx)
             W = local_geometry.WJ * local_geometry.invJ
-            Wv₁ = W ⊠ Geometry.covariant1(v, local_geometry)
-            Wv₂ = W ⊠ Geometry.covariant2(v, local_geometry)
-            Wv₃ = W ⊠ Geometry.covariant3(v, local_geometry)
+            Wv₁ = W * Geometry.covariant1(v, local_geometry)
+            Wv₂ = W * Geometry.covariant2(v, local_geometry)
+            Wv₃ = W * Geometry.covariant3(v, local_geometry)
             for ii in 1:Nq
-                Dᵀ₁Wv₃ = D[i, ii] ⊠ Wv₃
-                Dᵀ₁Wv₂ = D[i, ii] ⊠ Wv₂
-                out[slab_index(ii, j)] =
-                    out[slab_index(ii, j)] ⊞ Geometry.Contravariant123Vector(
-                        zero(Dᵀ₁Wv₃),
-                        Dᵀ₁Wv₃,
-                        ⊟(Dᵀ₁Wv₂),
-                    )
+                Dᵀ₁Wv₃ = D[i, ii] * Wv₃
+                Dᵀ₁Wv₂ = D[i, ii] * Wv₂
+                out[slab_index(ii, j)] += Geometry.Contravariant123Vector(
+                    zero(Dᵀ₁Wv₃),
+                    Dᵀ₁Wv₃,
+                    -Dᵀ₁Wv₂,
+                )
             end
             for jj in 1:Nq
-                Dᵀ₂Wv₃ = D[j, jj] ⊠ Wv₃
-                Dᵀ₂Wv₁ = D[j, jj] ⊠ Wv₁
-                out[slab_index(i, jj)] =
-                    out[slab_index(i, jj)] ⊞ Geometry.Contravariant123Vector(
-                        ⊟(Dᵀ₂Wv₃),
+                Dᵀ₂Wv₃ = D[j, jj] * Wv₃
+                Dᵀ₂Wv₁ = D[j, jj] * Wv₁
+                out[slab_index(i, jj)] +=
+                    Geometry.Contravariant123Vector(
+                        -Dᵀ₂Wv₃,
                         zero(Dᵀ₂Wv₃),
                         Dᵀ₂Wv₁,
                     )
@@ -1508,8 +1448,7 @@ function apply_operator(op::WeakCurl{(1, 2)}, space, slabidx, arg)
     @inbounds for j in 1:Nq, i in 1:Nq
         ij = CartesianIndex((i, j))
         local_geometry = get_local_geometry(space, ij, slabidx)
-        out[slab_index(i, j)] =
-            RecursiveApply.rdiv(out[slab_index(i, j)], local_geometry.WJ)
+        out[slab_index(i, j)] /= local_geometry.WJ
     end
     return Field(SArray(out), space)
 end
@@ -1554,10 +1493,10 @@ function apply_operator(op::Interpolate{(1,)}, space_out, slabidx, arg)
     @inbounds for i in 1:Nq_out
         # manually inlined rmatmul with slab_getnode
         ij = CartesianIndex((1,))
-        r = Imat[i, 1] ⊠ get_node(space_in, arg, ij, slabidx)
+        r = Imat[i, 1] * get_node(space_in, arg, ij, slabidx)
         for ii in 2:Nq_in
             ij = CartesianIndex((ii,))
-            r = RecursiveApply.rmuladd(
+            r = muladd(
                 Imat[i, ii],
                 get_node(space_in, arg, ij, slabidx),
                 r,
@@ -1584,10 +1523,10 @@ function apply_operator(op::Interpolate{(1, 2)}, space_out, slabidx, arg)
         # manually inlined rmatmul1 with slab get_node
         # we do this to remove one allocated intermediate array
         ij = CartesianIndex((1, j))
-        r = Imat[i, 1] ⊠ get_node(space_in, arg, ij, slabidx)
+        r = Imat[i, 1] * get_node(space_in, arg, ij, slabidx)
         for ii in 2:Nq_in
             ij = CartesianIndex((ii, j))
-            r = RecursiveApply.rmuladd(
+            r = muladd(
                 Imat[i, ii],
                 get_node(space_in, arg, ij, slabidx),
                 r,
@@ -1645,19 +1584,19 @@ function apply_operator(op::Restrict{(1,)}, space_out, slabidx, arg)
         # manually inlined rmatmul with slab get_node
         ij = CartesianIndex((1,))
         WJ = get_local_geometry(space_in, ij, slabidx).WJ
-        r = ImatT[i, 1] ⊠ (WJ ⊠ get_node(space_in, arg, ij, slabidx))
+        r = ImatT[i, 1] * (WJ * get_node(space_in, arg, ij, slabidx))
         for ii in 2:Nq_in
             ij = CartesianIndex((ii,))
             WJ = get_local_geometry(space_in, ij, slabidx).WJ
-            r = RecursiveApply.rmuladd(
+            r = muladd(
                 ImatT[i, ii],
-                WJ ⊠ get_node(space_in, arg, ij, slabidx),
+                WJ * get_node(space_in, arg, ij, slabidx),
                 r,
             )
         end
         ij_out = CartesianIndex((i,))
         WJ_out = get_local_geometry(space_out, ij_out, slabidx).WJ
-        out[slab_index(i)] = RecursiveApply.rdiv(r, WJ_out)
+        out[slab_index(i)] = r / WJ_out
     end
     return Field(SArray(out), space_out)
 end
@@ -1678,13 +1617,13 @@ function apply_operator(op::Restrict{(1, 2)}, space_out, slabidx, arg)
         # manually inlined rmatmul1 with slab get_node
         ij = CartesianIndex((1, j))
         WJ = get_local_geometry(space_in, ij, slabidx).WJ
-        r = ImatT[i, 1] ⊠ (WJ ⊠ get_node(space_in, arg, ij, slabidx))
+        r = ImatT[i, 1] * (WJ * get_node(space_in, arg, ij, slabidx))
         for ii in 2:Nq_in
             ij = CartesianIndex((ii, j))
             WJ = get_local_geometry(space_in, ij, slabidx).WJ
-            r = RecursiveApply.rmuladd(
+            r = muladd(
                 ImatT[i, ii],
-                WJ ⊠ get_node(space_in, arg, ij, slabidx),
+                WJ * get_node(space_in, arg, ij, slabidx),
                 r,
             )
         end
@@ -1693,8 +1632,7 @@ function apply_operator(op::Restrict{(1, 2)}, space_out, slabidx, arg)
     @inbounds for j in 1:Nq_out, i in 1:Nq_out
         ij_out = CartesianIndex((i, j))
         WJ_out = get_local_geometry(space_out, ij_out, slabidx).WJ
-        out[slab_index(i, j)] =
-            RecursiveApply.rdiv(rmatmul2(ImatT, temp, i, j), WJ_out)
+        out[slab_index(i, j)] = rmatmul2(ImatT, temp, i, j) / WJ_out
     end
     return Field(SArray(out), space_out)
 end
@@ -1721,9 +1659,9 @@ function tensor_product!(
         in_slab = slab(indata, v, h)
         out_slab = slab(out, v, h)
         for i in 1:Ni_out
-            r = M[i, 1] ⊠ in_slab[slab_index(1)]
+            r = M[i, 1] * in_slab[slab_index(1)]
             for ii in 2:Ni_in
-                r = RecursiveApply.rmuladd(M[i, ii], in_slab[slab_index(ii)], r)
+                r = muladd(M[i, ii], in_slab[slab_index(ii)], r)
             end
             out_slab[slab_index(i)] = r
         end
@@ -1776,7 +1714,8 @@ function tensor_product!(
     inout::Data2D{S, Nij},
     M::SMatrix{Nij, Nij},
 ) where {S, Nij}
-    tensor_product!(inout, inout, M)
+    inout_bc = Base.broadcastable(inout)
+    tensor_product!(inout_bc, inout_bc, M)
 end
 
 """
@@ -1836,14 +1775,14 @@ import .Spaces: slab_type
 
 Recursive matrix product along the 1st dimension of `S`. Equivalent to:
 
-    mapreduce(⊠, ⊞, W[i,:], S[:,j])
+    mapreduce(*, +, W[i,:], S[:,j])
 
 """
 function rmatmul1(W, S, i, j)
     Nq = size(W, 2)
-    @inbounds r = W[i, 1] ⊠ S[slab_index(1, j)]
+    @inbounds r = W[i, 1] * S[slab_index(1, j)]
     @inbounds for ii in 2:Nq
-        r = RecursiveApply.rmuladd(W[i, ii], S[slab_index(ii, j)], r)
+        r = muladd(W[i, ii], S[slab_index(ii, j)], r)
     end
     return r
 end
@@ -1853,13 +1792,13 @@ end
 
 Recursive matrix product along the 2nd dimension `S`. Equivalent to:
 
-    mapreduce(⊠, ⊞, W[j,:], S[i, :])
+    mapreduce(*, +, W[j,:], S[i, :])
 """
 function rmatmul2(W, S, i, j)
     Nq = size(W, 2)
-    @inbounds r = W[j, 1] ⊠ S[slab_index(i, 1)]
+    @inbounds r = W[j, 1] * S[slab_index(i, 1)]
     @inbounds for jj in 2:Nq
-        r = RecursiveApply.rmuladd(W[j, jj], S[slab_index(i, jj)], r)
+        r = muladd(W[j, jj], S[slab_index(i, jj)], r)
     end
     return r
 end
diff --git a/src/RecursiveApply/RecursiveApply.jl b/src/RecursiveApply/RecursiveApply.jl
deleted file mode 100755
index f57b3105da..0000000000
--- a/src/RecursiveApply/RecursiveApply.jl
+++ /dev/null
@@ -1,206 +0,0 @@
-"""
-    RecursiveApply
-
-This module contains operators to recurse over nested `Tuple`s or `NamedTuple`s.
-
-To extend to another type `T`, define `RecursiveApply.rmap(fn, args::T...)`
-"""
-module RecursiveApply
-
-export ⊞, ⊠, ⊟
-
-# These functions need to be generated for type stability (since T.parameters is
-# a SimpleVector, the compiler cannot always infer its size and elements).
-@generated first_param(::Type{T}) where {T} = :($(first(T.parameters)))
-@generated tail_params(::Type{T}) where {T} =
-    :($(Tuple{Base.tail((T.parameters...,))...}))
-
-# Applying `rmaptype` returns `Tuple{...}` for tuple
-# types, which cannot follow the recursion pattern as
-# it cannot be splatted, so we add a separate method,
-# `rmaptype_Tuple`, for the part of the recursion.
-rmaptype_Tuple(fn::F, ::Type{Tuple{}}) where {F} = ()
-rmaptype_Tuple(fn::F, ::Type{T}) where {F, E, T <: Tuple{E}} =
-    (rmaptype(fn, first_param(T)),)
-rmaptype_Tuple(fn::F, ::Type{T}) where {F, T <: Tuple} =
-    (rmaptype(fn, first_param(T)), rmaptype_Tuple(fn, tail_params(T))...)
-
-rmaptype_Tuple(_, ::Type{Tuple{}}, ::Type{Tuple{}}) = ()
-rmaptype_Tuple(_, ::Type{Tuple{}}, ::Type{T}) where {T <: Tuple} = ()
-rmaptype_Tuple(_, ::Type{T}, ::Type{Tuple{}}) where {T <: Tuple} = ()
-rmaptype_Tuple(
-    fn::F,
-    ::Type{T1},
-    ::Type{T2},
-) where {F, T1 <: Tuple, T2 <: Tuple} = (
-    rmaptype(fn, first_param(T1), first_param(T2)),
-    rmaptype_Tuple(fn, tail_params(T1), tail_params(T2))...,
-)
-
-"""
-    rmap(fn, X...)
-
-Recursively apply `fn` to each element of `X`
-"""
-rmap(fn::F, X) where {F} = fn(X)
-rmap(fn::F, X::Tuple{}) where {F} = ()
-rmap(fn::F, X::Tuple) where {F} =
-    (rmap(fn, first(X)), rmap(fn, Base.tail(X))...)
-rmap(fn::F, X::NamedTuple) where {F} =
-    NamedTuple{nt_names(X)}(rmap(fn, Tuple(X)))
-
-rmap(fn::F, X, Y) where {F} = fn(X, Y)
-rmap(fn::F, X::Tuple{}, Y::Tuple{}) where {F} = ()
-rmap(fn::F, X::Tuple{}, Y) where {F} = ()
-rmap(fn::F, X, Y::Tuple{}) where {F} = ()
-rmap(fn::F, X::Tuple, Y::Tuple) where {F} =
-    (rmap(fn, first(X), first(Y)), rmap(fn, Base.tail(X), Base.tail(Y))...)
-rmap(fn::F, X::Tuple, Y::Tuple{}) where {F} =
-    (rmap(fn, first(X)), rmap(fn, Base.tail(X))...)
-
-rmap(fn::F, X::Tuple{}, Y::Tuple) where {F} =
-    (rmap(fn, first(Y)), rmap(fn, Base.tail(Y))...)
-rmap(fn::F, X, Y::Tuple) where {F} =
-    (rmap(fn, X, first(Y)), rmap(fn, X, Base.tail(Y))...)
-
-rmap(fn::F, X::Tuple, Y) where {F} =
-    (rmap(fn, first(X), Y), rmap(fn, Base.tail(X), Y)...)
-
-function rmap(fn::F, X::NamedTuple, Y::NamedTuple) where {F}
-    @assert nt_names(X) === nt_names(Y)
-    return NamedTuple{nt_names(X)}(rmap(fn, Tuple(X), Tuple(Y)))
-end
-rmap(fn::F, X::NamedTuple, Y) where {F} =
-    NamedTuple{nt_names(X)}(rmap(fn, Tuple(X), Y))
-rmap(fn::F, X::NamedTuple, Y::Tuple) where {F} =
-    NamedTuple{nt_names(X)}(rmap(fn, Tuple(X), Y))
-rmap(fn::F, X::NamedTuple, Y::Tuple{}) where {F} =
-    NamedTuple{nt_names(X)}(rmap(fn, Tuple(X)))
-rmap(fn::F, X, Y::NamedTuple) where {F} =
-    NamedTuple{nt_names(Y)}(rmap(fn, X, Tuple(Y)))
-rmap(fn::F, X::Tuple, Y::NamedTuple) where {F} =
-    NamedTuple{nt_names(Y)}(rmap(fn, X, Tuple(Y)))
-rmap(fn::F, X::Tuple{}, Y::NamedTuple) where {F} =
-    NamedTuple{nt_names(Y)}(rmap(fn, Tuple(Y)))
-
-nt_names(::NamedTuple{names}) where {names} = names
-
-rmin(X, Y) = rmap(min, X, Y)
-rmax(X, Y) = rmap(max, X, Y)
-
-
-"""
-    rmaptype(fn, T)
-    rmaptype(fn, T1, T2)
-
-Recursively apply `fn` to each type parameter of the type `T`, or to each type
-parameter of the types `T1` and `T2`, where `fn` returns a type.
-"""
-rmaptype(fn::F, ::Type{T}) where {F, T} = fn(T)
-rmaptype(fn::F, ::Type{T}) where {F, T <: Tuple} =
-    Tuple{rmaptype_Tuple(fn, T)...}
-rmaptype(fn::F, ::Type{T}) where {F, names, Tup, T <: NamedTuple{names, Tup}} =
-    NamedTuple{names, rmaptype(fn, Tup)}
-
-rmaptype(fn::F, ::Type{T1}, ::Type{T2}) where {F, T1, T2} = fn(T1, T2)
-rmaptype(fn::F, ::Type{T1}, ::Type{T2}) where {F, T1 <: Tuple, T2 <: Tuple} =
-    Tuple{rmaptype_Tuple(fn, T1, T2)...}
-rmaptype(
-    fn::F,
-    ::Type{T1},
-    ::Type{T2},
-) where {
-    F,
-    names,
-    Tup1,
-    Tup2,
-    T1 <: NamedTuple{names, Tup1},
-    T2 <: NamedTuple{names, Tup2},
-} = NamedTuple{names, rmaptype(fn, Tup1, Tup2)}
-
-"""
-    rpromote_type(Ts...)
-
-Recursively apply `promote_type` to the input types.
-"""
-rpromote_type(Ts...) = reduce((T1, T2) -> rmaptype(promote_type, T1, T2), Ts)
-rpromote_type() = Union{}
-
-"""
-    rzero(X)
-
-Recursively zero out each element of `X`.
-"""
-rzero(X) = rzero(typeof(X))
-rzero(::Type{T}) where {T} = zero(T)
-rzero(::Type{Tuple{}}) = ()
-rzero(::Type{T}) where {E, T <: Tuple{E}} = (rzero(E),)
-rzero(::Type{T}) where {T <: Tuple} =
-    (rzero(first_param(T)), rzero(tail_params(T))...)
-rzero(::Type{Tup}) where {names, T, Tup <: NamedTuple{names, T}} =
-    NamedTuple{names}(rzero(T))
-
-"""
-    rconvert(T, X)
-
-Identical to `convert(T, X)`, but with improved type stability for nested types.
-"""
-rconvert(::Type{T}, X::T) where {T} = X
-rconvert(::Type{T}, X) where {T} =
-    rmap((zero_value, x) -> convert(typeof(zero_value), x), rzero(T), X)
-# TODO: Remove this function once Julia's default convert function is
-# type-stable for nested Tuple/NamedTuple types.
-
-"""
-    rmul(X, Y)
-    X ⊠ Y
-
-Recursively scale each element of `X` by `Y`.
-"""
-rmul(X, Y) = rmap(*, X, Y)
-const ⊠ = rmul
-
-"""
-    radd(X, Y)
-    X ⊞ Y
-
-Recursively add elements of `X` and `Y`.
-"""
-radd(X) = X
-radd(X, Y) = rmap(+, X, Y)
-const ⊞ = radd
-
-# Adapted from Base/operators.jl for general nary operator fallbacks
-for op in (:rmul, :radd)
-    @eval begin
-        ($op)(a, b, c, xs...) = Base.afoldl($op, ($op)(($op)(a, b), c), xs...)
-    end
-end
-
-"""
-    rsub(X, Y)
-    X ⊟ Y
-
-Recursively subtract elements of `Y` from `X`.
-"""
-rsub(X) = rmap(-, X)
-rsub(X, Y) = rmap(-, X, Y)
-const ⊟ = rsub
-
-"""
-    rdiv(X, Y)
-
-Recursively divide each element of `X` by `Y`
-"""
-rdiv(X, Y) = rmap(/, X, Y)
-
-"""
-    rmuladd(w, X, Y)
-
-Recursively add elements of `w * X + Y`.
-"""
-rmuladd(w::Number, X, Y) = rmap((x, y) -> muladd(w, x, y), X, Y)
-rmuladd(X, w::Number, Y) = rmap((x, y) -> muladd(x, w, y), X, Y)
-rmuladd(x::Number, w::Number, Y) = rmap(y -> muladd(x, w, y), Y)
-
-end # module
diff --git a/src/Remapping/Remapping.jl b/src/Remapping/Remapping.jl
index 48bab928c8..465e89f418 100644
--- a/src/Remapping/Remapping.jl
+++ b/src/Remapping/Remapping.jl
@@ -19,8 +19,6 @@ import ..DataLayouts,
 import ClimaCore.Utilities: half
 import ClimaCore.Spaces: cuda_synchronize
 
-using ..RecursiveApply
-
 include("remapping_utils.jl")
 include("interpolate_array.jl")
 include("distributed_remapping.jl")
diff --git a/src/Spaces/dss.jl b/src/Spaces/dss.jl
index 2c117eace8..bd204a2b60 100644
--- a/src/Spaces/dss.jl
+++ b/src/Spaces/dss.jl
@@ -1,7 +1,6 @@
 import ..Topologies:
     DSSBuffer,
     create_dss_buffer,
-    assert_same_eltype,
     dss!,
     dss_1d!,
     dss_transform!,
@@ -89,7 +88,6 @@ function weighted_dss!(
     space::Union{AbstractSpectralElementSpace, ExtrudedFiniteDifferenceSpace},
     dss_buffer::Union{DSSBuffer, Nothing},
 )
-    assert_same_eltype(data, dss_buffer)
     weighted_dss_start!(data, space, dss_buffer)
     weighted_dss_internal!(data, space, dss_buffer)
     weighted_dss_ghost!(data, space, dss_buffer)
@@ -109,7 +107,6 @@ function weighted_dss_prepare!(
     },
     dss_buffer::DSSBuffer,
 )
-    assert_same_eltype(data, dss_buffer)
     device = ClimaComms.device(topology(space))
     hspace = horizontal_space(space)
     dss_transform!(
@@ -228,12 +225,11 @@ function weighted_dss_internal!(
 )
     Quadratures.requires_dss(quadrature_style(space)) || return nothing
     sizeof(eltype(data)) > 0 || return nothing
-    assert_same_eltype(data, dss_buffer)
     device = ClimaComms.device(topology(hspace))
     if hspace isa SpectralElementSpace1D
         dss_1d!(
             device,
-            data,
+            Base.broadcastable(data),
             topology(hspace),
             local_geometry_data(space),
             dss_weights(space),
@@ -313,7 +309,6 @@ function weighted_dss_ghost!(
 )
     Quadratures.requires_dss(quadrature_style(space)) || return data
     sizeof(eltype(data)) > 0 || return data
-    assert_same_eltype(data, dss_buffer)
     ClimaComms.finish(dss_buffer.graph_context)
     device = ClimaComms.device(topology(hspace))
     load_from_recv_buffer!(device, dss_buffer)
diff --git a/src/Topologies/Topologies.jl b/src/Topologies/Topologies.jl
index e99690aa26..1fbb2b7067 100644
--- a/src/Topologies/Topologies.jl
+++ b/src/Topologies/Topologies.jl
@@ -3,7 +3,7 @@ module Topologies
 import ClimaComms, Adapt
 
 import ..ClimaCore
-import ..Utilities: Cache, cart_ind, linear_ind
+import ..Utilities: Cache, cart_ind, linear_ind, AutoBroadcaster, nested_broadcast
 import ..Geometry
 import ..Domains: Domains, coordinate_type
 import ..Meshes: Meshes, domain, coordinates
diff --git a/src/Topologies/dss.jl b/src/Topologies/dss.jl
index 3f0b1c03ba..c637f4e2eb 100644
--- a/src/Topologies/dss.jl
+++ b/src/Topologies/dss.jl
@@ -66,7 +66,7 @@ create_dss_buffer(
     local_geometry::Union{DSSTypes2D, Nothing} = nothing,
     dss_weights::Union{DSSTypes2D, Nothing} = nothing,
 ) = create_dss_buffer(
-    data,
+    Base.broadcastable(data),
     topology,
     DataLayouts.VIFH,
     local_geometry,
@@ -168,12 +168,6 @@ end
 
 Base.eltype(::DSSBuffer{S}) where {S} = S
 
-assert_same_eltype(::DataLayouts.AbstractData, ::DSSBuffer) =
-    error("Incorrect buffer eltype")
-assert_same_eltype(::DataLayouts.AbstractData{S}, ::DSSBuffer{S}) where {S} =
-    nothing
-assert_same_eltype(::DataLayouts.AbstractData, ::Nothing) = nothing
-
 """
     dss_transform!(
         device::ClimaComms.AbstractDevice,
@@ -213,7 +207,7 @@ function dss_transform!(
         dss_transform!(
             device,
             dss_buffer.perimeter_data,
-            data,
+            Base.broadcastable(data),
             perimeter,
             local_geometry,
             dss_weights,
@@ -223,27 +217,6 @@ function dss_transform!(
     return nothing
 end
 
-# `dss_transform` of a `Covariant12Vector` returns a
-# `UVWVector`, however, we only need to store a `UVVector`
-# in `perimeter_data`. Therefore, we drop the vertical dimension:
-# via `drop_vert_dim`
-"""
-    drop_vert_dim(::Type{T}, X)
-
-Convert the type of `X` to type `T` recursively
-using `_drop_vert_dim`, which converts from `UVWVector`
-to `UVVector` if `T <: UVVector`.
-"""
-@inline drop_vert_dim(::Type{T}, X) where {T} =
-    RecursiveApply.rmap(RecursiveApply.rzero(T), X) do zero_value, x
-        _drop_vert_dim(typeof(zero_value), x)
-    end
-@inline _drop_vert_dim(
-    ::Type{T},
-    x::Geometry.UVWVector,
-) where {T <: Geometry.UVVector} = Geometry.UVVector(x.u, x.v)
-@inline _drop_vert_dim(::Type{T}, x::T) where {T} = x
-
 """
     function dss_transform!(
         ::ClimaComms.AbstractCPUDevice,
@@ -289,9 +262,7 @@ function dss_transform!(
                     local_geometry[loc],
                     dss_weights[loc],
                 )
-                perimeter_data[CI(p, 1, 1, level, elem)] =
-                    drop_vert_dim(eltype(perimeter_data), src)
-
+                perimeter_data[CI(p, 1, 1, level, elem)] = src
             end
         end
     end
@@ -334,7 +305,7 @@ function dss_untransform!(
     dss_untransform!(
         device,
         perimeter_data,
-        data,
+        Base.broadcastable(data),
         local_geometry,
         perimeter,
         localelems,
@@ -466,9 +437,9 @@ function dss_local_vertices!(
         for level in 1:Nv
             # gather: compute sum over shared vertices
             sum_data = mapreduce(
-                ⊞,
+                +,
                 vertex;
-                init = RecursiveApply.rzero(eltype(slab(perimeter_data, 1, 1))),
+                init = zero(eltype(slab(perimeter_data, 1, 1))),
             ) do (lidx, vert)
                 ip = perimeter_vertex_node_index(vert)
                 perimeter_slab = slab(perimeter_data, level, lidx)
@@ -502,7 +473,7 @@ function dss_local_faces!(
             perimeter_slab2 = slab(perimeter_data, level, lidx2)
             for (ip1, ip2) in zip(pr1, pr2)
                 val =
-                    perimeter_slab1[slab_index(ip1)] ⊞
+                    perimeter_slab1[slab_index(ip1)] +
                     perimeter_slab2[slab_index(ip2)]
                 perimeter_slab1[slab_index(ip1)] = val
                 perimeter_slab2[slab_index(ip2)] = val
@@ -538,11 +509,9 @@ function dss_local_ghost!(
             for level in 1:Nv
                 # gather: compute sum over shared vertices
                 sum_data = mapreduce(
-                    ⊞,
+                    +,
                     vertex;
-                    init = RecursiveApply.rzero(
-                        eltype(slab(perimeter_data, 1, 1)),
-                    ),
+                    init = zero(eltype(slab(perimeter_data, 1, 1))),
                 ) do (isghost, idx, vert)
                     ip = perimeter_vertex_node_index(vert)
                     if !isghost
@@ -550,10 +519,7 @@ function dss_local_ghost!(
                         perimeter_slab = slab(perimeter_data, level, lidx)
                         perimeter_slab[slab_index(ip)]
                     else
-                        RecursiveApply.rmap(
-                            zero,
-                            slab(perimeter_data, 1, 1)[slab_index(1)],
-                        )
+                        zero(slab(perimeter_data, 1, 1)[slab_index(1)])
                     end
                 end
                 for (isghost, idx, vert) in vertex
@@ -680,7 +646,7 @@ Computed unweighted/pure DSS of `data`.
 function dss!(data::DSSTypes1D, topology::IntervalTopology)
     sizeof(eltype(data)) > 0 || return nothing
     device = ClimaComms.device(topology)
-    dss_1d!(device, data, topology)
+    dss_1d!(device, Base.broadcastable(data), topology)
     return nothing
 end
 function dss!(data::DSSTypes2D, topology::Topology2D)
@@ -726,7 +692,7 @@ function dss_1d!(
         left_idx = CartesianIndex(Ni, 1, 1, level, left_face_elem)
         right_idx = CartesianIndex(1, 1, 1, level, right_face_elem)
         val =
-            dss_transform(data, local_geometry, dss_weights, left_idx) ⊞
+            dss_transform(data, local_geometry, dss_weights, left_idx) +
             dss_transform(data, local_geometry, dss_weights, right_idx)
         data[left_idx] = dss_untransform(T, val, local_geometry, left_idx)
         data[right_idx] = dss_untransform(T, val, local_geometry, right_idx)
diff --git a/src/Topologies/dss_transform.jl b/src/Topologies/dss_transform.jl
index c20a219093..1e68e69511 100644
--- a/src/Topologies/dss_transform.jl
+++ b/src/Topologies/dss_transform.jl
@@ -1,5 +1,4 @@
 import ..Topologies: Topology2D
-using ..RecursiveApply
 import UnrolledUtilities: unrolled_map
 
 """
@@ -28,63 +27,28 @@ Base.@propagate_inbounds dss_transform(
     I,
 ) = arg[I]
 
-@inline function dss_transform(
-    arg::Tuple{},
-    local_geometry::Geometry.LocalGeometry,
-    weight,
-)
-    ()
-end
-@inline function dss_transform(
-    args::Tuple,
-    local_geometry::Geometry.LocalGeometry,
-    weight,
-)
-    unrolled_map(arg -> dss_transform(arg, local_geometry, weight), args)
-end
-@inline function dss_transform(
-    arg::NamedTuple{names},
-    local_geometry::Geometry.LocalGeometry,
-    weight,
-) where {names}
-    NamedTuple{names}(dss_transform(Tuple(arg), local_geometry, weight))
-end
 @inline dss_transform(
-    arg::Number,
+    arg,
     local_geometry::Geometry.LocalGeometry,
     weight,
 ) = arg * weight
 @inline dss_transform(
-    arg::Geometry.AxisTensor{T, N, <:Tuple{Vararg{Geometry.CartesianAxis}}},
+    arg::AutoBroadcaster,
     local_geometry::Geometry.LocalGeometry,
     weight,
-) where {T, N} = arg * weight
+) = nested_broadcast(arg -> dss_transform(arg, local_geometry, weight), arg)
+
+const NonTransformedAxis = Union{
+    Geometry.LocalAxis,
+    Geometry.CartesianAxis,
+    Geometry.Covariant3Axis,
+    Geometry.Contravariant3Axis,
+}
 @inline dss_transform(
-    arg::Geometry.CartesianVector,
+    arg::Geometry.AxisVector{<:Any, <:NonTransformedAxis},
     local_geometry::Geometry.LocalGeometry,
     weight,
 ) = arg * weight
-@inline dss_transform(
-    arg::Geometry.AxisTensor{T, N, <:Tuple{Vararg{Geometry.LocalAxis}}},
-    local_geometry::Geometry.LocalGeometry,
-    weight,
-) where {T, N} = arg * weight
-@inline dss_transform(
-    arg::Geometry.AxisTensor{T, N, <:Tuple{}},
-    local_geometry::Geometry.LocalGeometry,
-    weight,
-) where {T, N} = arg * weight
-@inline dss_transform(
-    arg::Geometry.LocalVector,
-    local_geometry::Geometry.LocalGeometry,
-    weight,
-) = arg * weight
-@inline dss_transform(
-    arg::Geometry.Covariant3Vector,
-    local_geometry::Geometry.LocalGeometry,
-    weight,
-) = arg * weight
-
 @inline function dss_transform(
     arg::Geometry.AxisVector,
     local_geometry::Geometry.LocalGeometry,
@@ -137,38 +101,17 @@ Base.@propagate_inbounds dss_untransform(
 ) where {T} = dss_untransform(T, targ, local_geometry[I])
 @inline dss_untransform(::Type{T}, targ, local_geometry::Nothing, I) where {T} =
     dss_untransform(T, targ, local_geometry)
-@inline function dss_untransform(
-    ::Type{NamedTuple{names, T}},
-    targ::NamedTuple{names},
-    local_geometry,
-) where {names, T}
-    NamedTuple{names}(dss_untransform(T, Tuple(targ), local_geometry))
-end
+
+@inline dss_untransform(::Type{T}, targ::T, local_geometry) where {T} = targ
 @inline dss_untransform(
-    ::Type{Tuple{}},
-    targ::Tuple{},
-    local_geometry::Geometry.LocalGeometry,
-) = ()
-@inline function dss_untransform(
     ::Type{T},
-    targ::Tuple,
+    targ::AutoBroadcaster,
     local_geometry::Geometry.LocalGeometry,
-) where {T <: Tuple}
-    (
-        dss_untransform(
-            Base.tuple_type_head(T),
-            Base.first(targ),
-            local_geometry,
-        ),
-        dss_untransform(
-            Base.tuple_type_tail(T),
-            Base.tail(targ),
-            local_geometry,
-        )...,
-    )
-end
+) where {T <: AutoBroadcaster} =
+    nested_broadcast(zero(T), targ) do zero_value, targ
+        dss_untransform(typeof(zero_value), targ, local_geometry)
+    end
 
-@inline dss_untransform(::Type{T}, targ::T, local_geometry) where {T} = targ
 @inline dss_untransform(
     ::Type{T},
     targ::T,
diff --git a/src/Utilities/Utilities.jl b/src/Utilities/Utilities.jl
index 390c89c98f..4c3e334c0c 100644
--- a/src/Utilities/Utilities.jl
+++ b/src/Utilities/Utilities.jl
@@ -1,8 +1,12 @@
 module Utilities
 
-import UnrolledUtilities: unrolled_map
+using UnrolledUtilities
+
+import ForwardDiff
+import InteractiveUtils
 
 include("plushalf.jl")
+include("auto_broadcaster.jl")
 include("cache.jl")
 
 module Unrolled # TODO: Move all of these functions into UnrolledUtilities.jl
@@ -118,18 +122,104 @@ to ensure that recursive functions over nested types have inferrable outputs.
 @inline fieldtype_vals(::Type{T}) where {T} =
     ntuple(Val ∘ Base.Fix1(fieldtype, T), Val(fieldcount(T)))
 
+# :new may be called with uninitialized fields as of JuliaLang/julia#52169, but
+# this leads to segfaults or other compiler errors for immutable DataType fields
+@inline can_alloc_uninitialized(::Tuple{Bool, Val{T}}) where {T <: Type} =
+    throw(ArgumentError("Cannot allocate unspecified $T"))
+@inline can_alloc_uninitialized((mutable, _)::Tuple{Bool, Val{Type{T}}}) where {T} =
+    mutable
+@inline can_alloc_uninitialized((mutable, _)::Tuple{Bool, Val{T}}) where {T} =
+    if T isa Union{Union, UnionAll}
+        throw(ArgumentError("Cannot allocate value of ambiguous type $T"))
+    else
+        mutable_flags = ntuple(Base.Fix1(!isconst, T), Val(fieldcount(T)))
+        flags_and_type_vals = zip(mutable_flags, fieldtype_vals(T))
+        mutable || unrolled_all(can_alloc_uninitialized, flags_and_type_vals)
+    end
+
 """
     new(T, [fields])
 
-Exposes the `new` pseudo-function that allocates a value of type `T` with the
-specified fields. Can also be called without a second argument to leave the
-allocated value with uninitialized fields.
+Exposes the `new` pseudo-function that allocates a value of type `T`, which can
+otherwise only be explicitly called from inner constructors.
+
+If provided, the second argument is used to initialize fields of the new value
+(unlike the lowered pseudo-function, this will not automatically convert to the
+`fieldtypes` of `T`). Otherwise, the fields are initialized with arbitrary data,
+with special handling of `DataType` fields to avoid errors during compilation.
+
+# Examples
+```jldoctest; setup = :(import ClimaCore.Utilities: new), filter = r"\\d+"
+julia> new(Int)
+4889520192
+
+julia> new(Complex{Int}, (1, 2))
+1 + 2im
+
+julia> new(@NamedTuple{a::Type{Int}, b::Int, c::Complex{Int}})
+(a = Int64, b = 4889520192, c = 6162822528 + 8036417625im)
+
+julia> new(@NamedTuple{a::DataType, b::Int, c::Complex{Int}}, (Int, 1, 1 + 2im))
+(a = Int64, b = 1, c = 1 + 2im)
+```
+"""
+@inline new(::Type{T}) where {T} = maybe_nested_new(Val(T))
+@eval @inline new(::Type{T}, fields) where {T} = $(Expr(:splatnew, :T, :fields))
+
+# Wrap each type in a Val to guarantee recursive inlining
+@inline maybe_nested_new(::Val{Type{T}}) where {T} = T
+@eval @inline maybe_nested_new(val::Val{T}) where {T} =
+    can_alloc_uninitialized((false, val)) ? $(Expr(:new, :T)) : nested_new(val)
+
+# A Tuple{Type{T}, ...} turns into a Tuple{DataType, ...} when it is allocated;
+# a @NamedTuple{_::Type{T}, ...} also turns into a @NamedTuple{_::DataType, ...}
+@inline nested_new(::Val{T}) where {T} =
+    new(T, unrolled_map(maybe_nested_new, fieldtype_vals(T)))
+@inline nested_new(::Val{T}) where {T <: Tuple} =
+    unrolled_map(maybe_nested_new, fieldtype_vals(T))
+@inline nested_new(::Val{T}) where {names, T <: NamedTuple{names}} =
+    NamedTuple{names}(unrolled_map(maybe_nested_new, fieldtype_vals(T)))
+
+"""
+    unsafe_eltype(itr)
+
+Analogue of `eltype` with support for un-materialized broadcast expressions,
+adapted from `Base.Broadcast.combine_eltypes`. Does not perform any safety
+checks, and may return non-concrete types (such as the empty type `Union{}`).
+"""
+@inline unsafe_eltype(itr) = eltype(itr)
+@inline unsafe_eltype((; f, args)::Base.Broadcast.Broadcasted) =
+    unrolled_any(has_inferred_error, args) ? Union{} :
+    Core.Compiler.return_type(f, Tuple{unrolled_map(unsafe_eltype, args)...})
+
+@inline has_inferred_error(itr) = unsafe_eltype(itr) == Union{}
+
+struct InferenceError <: Exception
+    f::Any
+    args_type::Type{<:Tuple}
+end
+function Base.showerror(io::IO, (; f, args_type)::InferenceError)
+    println(io, "Concrete type of result could not be inferred:\n")
+    InteractiveUtils.code_warntype(io, f, args_type)
+end
+
+"""
+    safe_eltype(itr)
 
-In contrast to the pseudo-function, this only asserts that all fields match the
-`fieldtypes` of `T`, rather than automatically converting them to those types.
+Analogue of `eltype` with support for un-materialized broadcast expressions,
+adapted from `Base.Broadcast.combine_eltypes`. Throws an error when the concrete
+element type of a broadcast expression cannot be inferred, indicating which part
+of the expression first encounters a type instability or error during inference.
 """
-@generated new(::Type{T}) where {T} = Expr(:new, :T)
-@generated new(::Type{T}, fields) where {T} =
-    Expr(:splatnew, :T, :(fields::$(Tuple{fieldtypes(T)...})))
+@inline safe_eltype(itr) =
+    has_inferred_error(itr) ||
+    !(isconcretetype(unsafe_eltype(itr)) || unsafe_eltype(itr) <: Type) ?
+    eltype_error(itr) : unsafe_eltype(itr)
+
+eltype_error(itr) = throw(InferenceError(eltype, Tuple{typeof(itr)}))
+eltype_error(bc::Base.Broadcast.Broadcasted) =
+    has_inferred_error(bc) ?
+    bc.f(unrolled_map(new ∘ safe_eltype, bc.args)...) : # f throws runtime error
+    throw(InferenceError(bc.f, Tuple{unrolled_map(safe_eltype, bc.args)...}))
 
 end # module
diff --git a/src/Utilities/auto_broadcaster.jl b/src/Utilities/auto_broadcaster.jl
new file mode 100644
index 0000000000..670484067f
--- /dev/null
+++ b/src/Utilities/auto_broadcaster.jl
@@ -0,0 +1,404 @@
+# Default types that can be used as arguments to auto-broadcasted math functions
+const DefaultBroadcastable = Union{Tuple, NamedTuple}
+const DefaultNonAutoBroadcaster =
+    Union{DefaultBroadcastable, Number, AbstractArray}
+
+"""
+    AutoBroadcaster(itr)
+
+Wrapper for an iterator that forces certain functions to be broadcasted over the
+iterator's elements. This allows different types of broadcasting to be applied
+simultaneously; e.g., ClimaCore's `Field`s and similar types use the standard
+dot syntax to denote parallelized iteration over spatial locations, and they
+wrap their values in `AutoBroadcaster`s for unrolled iteration over subfields.
+All statically-sized iterators for which [`is_auto_broadcastable`](@ref) is true
+are compatible with `AutoBroadcaster`s.
+
+In the context of `AutoBroadcaster`s, broadcasting a function applies it with
+[`unrolled_map`](https://clima.github.io/UnrolledUtilities.jl/dev/#Package-Features),
+iterating over all arguments for which `is_auto_broadcastable` is true
+(including those not wrapped in `AutoBroadcaster`s), while other arguments are
+passed to the function directly. This behavior is triggered by using
+`AutoBroadcaster`s, optionally in conjunction with compatible iterators that are
+not wrapped in `AutoBroadcaster`s, in the following ways:
+  - passing them to standard math functions or constructors
+  - passing them to `ifelse` (for iterating over conditional values)
+  - applying them as function calls (for iterating over functions)
+  - explicitly calling [`nested_broadcast`](@ref)
+
+Nested `AutoBroadcaster`s constructed with [`add_auto_broadcasters`](@ref)
+evaluate broadcasts recursively, mapping across every layer of nested iterators
+so that broadcasted functions are only applied to non-iterators in the innermost
+layers. Aside from automatic broadcasting, `AutoBroadcaster`s are essentially
+identical to their underlying iterators, with support for common operations like
+`iterate`, `propertynames`, `getindex`, and `reduce`.
+
+# Examples
+```jldoctest; setup = :(import ClimaCore.Utilities, ClimaCore.Geometry.StaticArrays)
+julia> x = Utilities.AutoBroadcaster((1, 2.0, StaticArrays.SVector(3, 4)))
+(1, 2.0, [3, 4])
+
+julia> zero(typeof(x))
+(0, 0.0, [0, 0])
+
+julia> 2 * x - (2, 3, [4, 5])
+(0, 1.0, [2, 3])
+
+julia> y = Utilities.add_auto_broadcasters((1, 2, (a = 3, b = 4, c = (5, 6, (7, 8)))))
+(1, 2, (a = 3, b = 4, c = (5, 6, (7, 8))))
+
+julia> min(y, abs(5 - y))
+(1, 2, (a = 2, b = 1, c = (0, 1, (2, 3))))
+
+julia> x' * y * x ÷ 5
+(0, 1.0, (a = 15, b = 20, c = (25, 30, (35, 40))))
+```
+"""
+struct AutoBroadcaster{I}
+    itr::I
+end
+
+unwrap(::Type{AutoBroadcaster{I}}) where {I} = I
+unwrap(x::AutoBroadcaster) = getfield(x, :itr) # getproperty is overwritten below
+unwrap(x) = x
+
+"""
+    is_auto_broadcastable(::Type)
+    is_auto_broadcastable(itr)
+
+Indicates whether an [`AutoBroadcaster`](@ref) should broadcast over iterators
+of the given type. By default, this is only true for `Tuple` and `NamedTuple`
+types, but it can be extended to any statically-sized type compatible with
+[UnrolledUtilities.jl](https://github.com/CliMA/UnrolledUtilities.jl).
+
+For convenience, `is_auto_broadcastable` also accepts an iterator value in
+place of its type, but this method should not be extended directly.
+"""
+is_auto_broadcastable(::Type{<:DefaultBroadcastable}) = true
+is_auto_broadcastable(::Type) = false
+is_auto_broadcastable(::Type{Union{}}) = false # to resolve ambiguity
+is_auto_broadcastable(itr) = is_auto_broadcastable(typeof(itr))
+
+"""
+    add_auto_broadcasters(itr)
+    add_auto_broadcasters(::Type)
+
+Recursively applies the [`AutoBroadcaster`](@ref) constructor to iterators for
+which [`is_auto_broadcastable`](@ref) is true, as well as their elements for
+which it is true, while leaving values for which it is false unmodified. Can
+also be passed an iterator's type to infer the result type for such an iterator.
+"""
+add_auto_broadcasters(itr) =
+    itr isa AutoBroadcaster || is_auto_broadcastable(itr) ?
+    AutoBroadcaster(unrolled_map(add_auto_broadcasters, unwrap(itr))) : itr
+add_auto_broadcasters(::Type{T}) where {T} =
+    Core.Compiler.return_type(add_auto_broadcasters, Tuple{T})
+
+"""
+    drop_auto_broadcasters(itr)
+    drop_auto_broadcasters(::Type)
+
+Recursively unwraps constructors applied by [`add_auto_broadcasters`](@ref),
+extracting the iterator from every [`AutoBroadcaster`](@ref) in `itr`. Can also
+be passed an iterator's type to infer the result type for such an iterator.
+"""
+drop_auto_broadcasters(itr) =
+    itr isa AutoBroadcaster || is_auto_broadcastable(itr) ?
+    unrolled_map(drop_auto_broadcasters, unwrap(itr)) : itr
+drop_auto_broadcasters(::Type{T}) where {T} =
+    Core.Compiler.return_type(drop_auto_broadcasters, Tuple{T})
+
+"""
+    auto_broadcasted([style], f, args, [axes])
+
+Analogue of `Base.Broadcast.Broadcasted(style, f, args, axes)` that can pass the
+arguments of `f` through either [`add_auto_broadcasters`](@ref) or
+[`drop_auto_broadcasters`](@ref) if doing so will help avoid an inferred error.
+
+When the [`unsafe_eltype`](@ref) of `Broadcasted(style, f, args, axes)`
+indicates that `f` will throw an error, a new `Broadcasted` wrapper is
+constructed with `add_auto_broadcasters` applied to every argument, and then
+another is constructed with `drop_auto_broadcasters` applied to every argument.
+If one of the new wrappers no longer corresponds to a guaranteed error, it is
+returned instead of the original wrapper. Otherwise, the default result of
+`Broadcasted(style, f, args, axes)` is returned without modifications.
+
+# Examples
+```jldoctest; setup = :(import ClimaCore.Utilities), filter = r"\\{.+\\}"
+julia> x = (im, (1, 2.0), [3, 4])
+(im, (1, 2.0), [3, 4])
+
+julia> y = [x, x, x, x];
+
+julia> bc = Base.Broadcast.Broadcasted(*, (Base.Broadcast.Broadcasted(adjoint, (y,)), y));
+
+julia> sum(Base.materialize(bc))
+ERROR: MethodError: no method matching adjoint(::Tuple{...})
+[...]
+
+julia> bc = Utilities.auto_broadcasted(*, (Utilities.auto_broadcasted(adjoint, (y,)), y));
+
+julia> sum(Base.materialize(bc))
+(4 + 0im, (4, 16.0), 100)
+```
+"""
+auto_broadcasted(f::F, args, axes...) where {F} =
+    auto_broadcasted(Base.Broadcast.combine_styles(args...), f, args, axes...)
+function auto_broadcasted(style::Base.BroadcastStyle, f::F, args, axes...) where {F}
+    wrapped_f(args...) = f(unrolled_map(add_auto_broadcasters, args)...)
+    unwrapped_f(args...) = f(unrolled_map(drop_auto_broadcasters, args)...)
+    bc = Base.Broadcast.Broadcasted(style, f, args, axes...)
+    unsafe_eltype(bc) != Union{} && return bc
+    bc′ = Base.Broadcast.Broadcasted(style, wrapped_f, args, axes...)
+    unsafe_eltype(bc′) != Union{} && return bc′
+    bc′′ = Base.Broadcast.Broadcasted(style, unwrapped_f, args, axes...)
+    unsafe_eltype(bc′′) != Union{} && return bc′′
+    return bc # error in bc is not caused by missing or extra AutoBroadcasters
+end
+
+"""
+    nested_broadcast(f, args...)
+
+Analogue of `broadcast` that is applied recursively over nested iterators, as
+long as at least one argument is an [`AutoBroadcaster`](@ref). All loops over
+iterator elements are unrolled and inlined to optimize performance.
+
+This function is automatically called when an [`AutoBroadcaster`](@ref) is
+passed to any standard math function or constructor, but for generic operations
+it must be called explicitly.
+
+# Examples
+```jldoctest; setup = :(import ClimaCore.Utilities)
+julia> x = Utilities.add_auto_broadcasters(((:a, :b, :c), (:d, :e, :f), :g))
+((:a, :b, :c), (:d, :e, :f), :g)
+
+julia> Utilities.nested_broadcast(string, x)
+(("a", "b", "c"), ("d", "e", "f"), "g")
+
+julia> y = Utilities.add_auto_broadcasters((1, 11, (111, 1111, 11111)))
+(1, 11, (111, 1111, 11111))
+
+julia> Utilities.nested_broadcast(Symbol, x, y * y)
+((:a1, :b1, :c1), (:d121, :e121, :f121), (:g12321, :g1234321, :g123454321))
+```
+"""
+nested_broadcast(f::F, args...) where {F} = _nested_broadcast(f, args)
+
+# Zip the arguments instead of splatting them to guarantee recursive inlining
+function _nested_broadcast(f::F, args) where {F}
+    unrolled_any(Base.Fix2(isa, AutoBroadcaster), args) || return f(args...)
+    unwrapped_args = unrolled_map(unwrap, args)
+    broadcastable_args = unrolled_filter(is_auto_broadcastable, unwrapped_args)
+    lengths = unrolled_map(length, broadcastable_args)
+    if !unrolled_allequal(lengths) # broadcastable arguments must match in length
+        lengths_str = join(unique(lengths), ", ", " and ")
+        throw(DimensionMismatch("Arguments have unequal lengths $lengths_str"))
+    end
+    broadcast_axis = StaticOneTo(first(lengths))
+    uniform_length_args = unrolled_map(unwrapped_args) do x
+        is_auto_broadcastable(x) ? x : Iterators.map(Returns(x), broadcast_axis)
+    end # every other argument is repeated once per index of broadcast_axis
+    zipped_args = unrolled_map(tuple, uniform_length_args...)
+    result_itr = unrolled_map(Base.Fix1(_nested_broadcast, f), zipped_args)
+    return AutoBroadcaster(result_itr) # the result is rewrapped at every level
+end
+
+# Wrap each Type in a struct (Base.Fix1) to guarantee recursive inlining
+nested_broadcast(::Type{T}, args...) where {T} =
+    nested_broadcast(Base.Fix1((T, args) -> T(args...), T) ∘ tuple, args...)
+
+# Nested version of f.(typeof.(x), typeof.(y), ...) for x::type1, y::type2, etc.
+nested_broadcast_over_types(f::F, types...) where {F} = nested_broadcast(
+    (args...) -> f(unrolled_map(typeof, args)...),
+    unrolled_map(new, types)..., # NOTE(review): new presumably makes an instance
+)
+
+# Nested version of typeof(new.(f.(typeof.(x), typeof.(y), ...))) for x::type1...
+nested_broadcast_result_type(f::F, types...) where {F} =
+    typeof(nested_broadcast_over_types((types...) -> new(f(types...)), types...))
+
+#########################################
+## Automatic Unwrapping and Rewrapping ##
+#########################################
+
+Base.eltype(::Type{X}) where {X <: AutoBroadcaster} = eltype(unwrap(X))
+
+Base.Tuple(x::AutoBroadcaster) = Tuple(unwrap(x)) # drops the outer wrapper
+Base.NamedTuple{names}(x::AutoBroadcaster) where {names} =
+    NamedTuple{names}(unwrap(x))
+
+Base.propertynames(x::AutoBroadcaster) = propertynames(unwrap(x))
+Base.getproperty(x::AutoBroadcaster, name::Symbol) = getproperty(unwrap(x), name)
+
+for f in (:keys, :values, :pairs, :isempty, :length, :firstindex, :lastindex)
+    @eval Base.$f(x::AutoBroadcaster) = $f(unwrap(x))
+end # each of these is forwarded to the unwrapped value, and is not rewrapped
+Base.show(io::IO, x::AutoBroadcaster) = show(io, unwrap(x)) # shown unwrapped
+Base.axes(x::AutoBroadcaster, dim...) = axes(unwrap(x), dim...)
+Base.size(x::AutoBroadcaster, dim...) = size(unwrap(x), dim...)
+Base.iterate(x::AutoBroadcaster, state...) = iterate(unwrap(x), state...)
+Base.merge(args::AutoBroadcaster...) =
+    AutoBroadcaster(merge(unrolled_map(unwrap, args)...)) # result is rewrapped
+Base.@propagate_inbounds Base.getindex(x::AutoBroadcaster, index) =
+    getindex(unwrap(x), index)
+Base.@propagate_inbounds Base.setindex(x::AutoBroadcaster, value, index) =
+    AutoBroadcaster(Base.setindex(unwrap(x), value, index)) # result is rewrapped
+
+# Broadcasts/maps/reductions are not recursive, unlike the math operations below
+Base.broadcastable(x::AutoBroadcaster) = Base.broadcastable(unwrap(x))
+Base.map(f::F, arg::AutoBroadcaster, args::AutoBroadcaster...) where {F} =
+    AutoBroadcaster(map(f, unwrap(arg), unrolled_map(unwrap, args)...))
+Base.mapreduce(
+    f::F,
+    op::O,
+    arg::AutoBroadcaster,
+    args::AutoBroadcaster...;
+    init..., # collects every keyword argument, not only init
+) where {F, O} =
+    mapreduce(f, op, unwrap(arg), unrolled_map(unwrap, args)...; init...)
+
+# Circumvent the built-in convert function, which can introduce type
+# instabilities for nested Tuples and NamedTuples on Julia 1.10
+Base.convert(::Type{X}, x::X) where {X <: AutoBroadcaster} = x # already of type X
+Base.convert(::Type{I}, x::AutoBroadcaster) where {I <: DefaultBroadcastable} =
+    nested_convert(I, x)
+Base.convert(::Type{X}, itr) where {X <: AutoBroadcaster} =
+    nested_convert(X, itr)
+nested_convert(::Type{T}, arg) where {T} = _nested_convert((new(T), arg))
+
+# Turn types into values and zip the arguments to guarantee recursive inlining
+_nested_convert((x, y)) = # x stands in for the target type, y is the value
+    x isa AutoBroadcaster ? AutoBroadcaster(_nested_convert((unwrap(x), y))) :
+    is_auto_broadcastable(x) ? # convert element by element
+    unrolled_map(_nested_convert, unrolled_map(tuple, x, unwrap(y))) :
+    convert(typeof(x), unwrap(y)) # leaf: use the built-in convert
+
+###############################################
+## Automatic Broadcasting of Math Operations ##
+###############################################
+
+const AutoBroadcasterOrSimilar = Union{AutoBroadcaster, DefaultBroadcastable}
+
+# Type functions extended in ForwardDiff.jl that map types to values
+for f in (:zero, :one, :eps)
+    @eval Base.$f(::Type{X}) where {X <: AutoBroadcaster} =
+        nested_broadcast_over_types($f, X)
+end # e.g., zero(X) is an AutoBroadcaster whose elements are zeros
+Base.precision(::Type{X}; base...) where {X <: AutoBroadcaster} =
+    nested_broadcast_over_types(x -> precision(x; base...), X)
+Base.promote_rule(
+    ::Type{X},
+    ::Type{Y},
+) where {X <: AutoBroadcaster, Y <: AutoBroadcasterOrSimilar} =
+    nested_broadcast_result_type(Base.promote_type, X, Y)
+
+# Functions that map types to types (only float is extended in ForwardDiff.jl)
+for f in (:float, :big, :real, :complex, :widen)
+    @eval Base.$f(::Type{X}) where {X <: AutoBroadcaster} =
+        nested_broadcast_result_type($f, X)
+end # e.g., float(X) is an AutoBroadcaster type, matching typeof(float(zero(X)))
+
+# Gather the constructor types of T, of its partially unwrapped UnionAll bodies,
+# and (recursively) of its subtypes that already accept an AutoBroadcaster, since
+# they are ambiguous with the method (::Type{<:T})(::AutoBroadcaster) = ...
+function ambiguous_constructor_types(T)
+    found = []
+    for subtype in (isabstracttype(T) ? InteractiveUtils.subtypes(T) : ())
+        append!(found, ambiguous_constructor_types(subtype))
+    end
+    vars = []
+    dummy_var = TypeVar(Symbol())
+    body = T
+    while true
+        type_of_body = reduce((S, var) -> UnionAll(var, S), vars; init = Type{body})
+        constructor = reduce((S, _) -> UnionAll(dummy_var, S), vars; init = body)
+        hasmethod(constructor, Tuple{AutoBroadcaster}) && push!(found, type_of_body)
+        body isa DataType && break
+        push!(vars, body.var)
+        body = body.body
+    end
+    return found
+end
+
+# All Number constructors (only defined for Integer and Dual in ForwardDiff.jl),
+# with constructors for a few subtypes defined separately to avoid ambiguities
+for constructor_type in ambiguous_constructor_types(Number)
+    @eval (T::$constructor_type)(x::AutoBroadcaster) = nested_broadcast(T, x)
+end # one method per type returned by ambiguous_constructor_types(Number)
+(T::Type{<:Number})(x::AutoBroadcaster) = nested_broadcast(T, x) # generic method
+
+# Every way of constraining n arguments so that at least one is :AutoBroadcaster
+function constraint_permutations(n)
+    options = map(Returns((:AutoBroadcaster, :DefaultNonAutoBroadcaster)), 1:n)
+    has_auto_broadcaster = Base.Fix1(any, ==(:AutoBroadcaster))
+    return Iterators.filter(has_auto_broadcaster, Iterators.product(options...))
+end
+
+# Boolean functions extended in ForwardDiff.jl
+for f in ForwardDiff.UNARY_PREDICATES
+    @eval Base.$f(x::AutoBroadcaster) = nested_broadcast($f, x)
+end
+for f in (:<, :<=, :(==), :isless), (X, Y) in constraint_permutations(2)
+    @eval Base.$f(x::$X, y::$Y) = nested_broadcast($f, x, y)
+end # FIXME: Adding a method for isequal here causes invalidations
+
+# Continuously differentiable functions from Base extended in ForwardDiff.jl
+const base_function_diff_rules =
+    Iterators.filter(==(:Base) ∘ first, ForwardDiff.DiffRules.diffrules())
+for (_, f, n) in base_function_diff_rules, types in constraint_permutations(n)
+    args = map(Base.Fix1(Symbol, :arg), 1:n) # arg1, arg2, ..., argn
+    typed_args = map(((arg, type),) -> :($arg::$type), zip(args, types))
+    @eval Base.$f($(typed_args...)) = nested_broadcast(Base.$f, $(args...))
+end # one method of f for every permutation of its n argument constraints
+
+# Other math functions from Base extended in ForwardDiff.jl, excluding those
+# that return pairs of values (e.g., sincos or sincospi), so we avoid having to
+# distinguish a Tuple of 2 AutoBroadcasters from an AutoBroadcaster of 2 Tuples
+for f in (:zero, :one, :eps, :float, :nextfloat, :prevfloat, :exponent)
+    @eval Base.$f(x::AutoBroadcaster) = nested_broadcast($f, x)
+end # these methods take values, not types
+for f in (:floor, :ceil, :trunc, :round)
+    @eval Base.$f(x::AutoBroadcaster) = nested_broadcast($f, x)
+    @eval Base.$f(::Type{T}, x::AutoBroadcaster) where {T} =
+        nested_broadcast(Base.Fix1($f, T), x) # e.g., floor(Int, x)
+end
+Base.precision(x::AutoBroadcaster; base...) =
+    nested_broadcast(x -> precision(x; base...), x) # base... holds all keywords
+Base.literal_pow(::typeof(^), x::AutoBroadcaster, p::Val) =
+    nested_broadcast(x -> Base.literal_pow(^, x, p), x) # p is forwarded as a Val
+for (X, Y) in constraint_permutations(2)
+    @eval Base.div(x::$X, y::$Y, r::RoundingMode) =
+        nested_broadcast((x, y) -> div(x, y, r), x, y)
+    @eval Base.fld(x::$X, y::$Y) = nested_broadcast(fld, x, y)
+    @eval Base.cld(x::$X, y::$Y) = nested_broadcast(cld, x, y)
+end # only defined when at least one of x and y is an AutoBroadcaster
+for (X, Y, Z) in constraint_permutations(3)
+    @eval Base.fma(x::$X, y::$Y, z::$Z) = nested_broadcast(fma, x, y, z)
+    @eval Base.muladd(x::$X, y::$Y, z::$Z) =
+        nested_broadcast(muladd, x, y, z)
+end # only defined when at least one of x, y, and z is an AutoBroadcaster
+
+# Common math functions absent from ForwardDiff.jl, excluding those that return
+# pairs of values (e.g., minmax, divrem, or fldmod), so we avoid having to
+# distinguish a Tuple of 2 AutoBroadcasters from an AutoBroadcaster of 2 Tuples
+for f in (:!, :~, :adjoint, :angle, :cis, :cispi, :conj, :sign)
+    @eval Base.$f(x::AutoBroadcaster) = nested_broadcast($f, x)
+end
+for f in (://, :&, :|, :xor, :fld1, :mod1), (X, Y) in constraint_permutations(2)
+    @eval Base.$f(x::$X, y::$Y) = nested_broadcast($f, x, y)
+end
+
+# Internal functions called by Base.sum and Base.prod
+for f in (:add_sum, :mul_prod), (X, Y) in constraint_permutations(2)
+    @eval Base.$f(x::$X, y::$Y) = nested_broadcast(Base.$f, x, y)
+end # unlike the loops above, the body also qualifies f with the Base prefix
+
+# Using AutoBroadcasters/DefaultBroadcastables as if-else statement conditionals
+for (X, Y) in constraint_permutations(2)
+    @eval Base.ifelse(cond::AutoBroadcasterOrSimilar, x::$X, y::$Y) =
+        nested_broadcast(ifelse, cond, x, y)
+end # only defined when at least one of x and y is an AutoBroadcaster
+
+# Applying AutoBroadcasters like functions (f is broadcast along with its args)
+(f::AutoBroadcaster)(args...) =
+    nested_broadcast((f, args...) -> f(args...), f, args...)
diff --git a/src/recursive_apply.jl b/src/recursive_apply.jl
new file mode 100644
index 0000000000..eda6fc75a8
--- /dev/null
+++ b/src/recursive_apply.jl
@@ -0,0 +1,27 @@
+# This module is for backwards compatibility with previous versions of ClimaCore
+module RecursiveApply
+
+using ..Utilities: add_auto_broadcasters, drop_auto_broadcasters
+export ⊞, ⊟, ⊠ # lets `using ClimaCore.RecursiveApply` provide the infix operators
+
+struct WithAutoBroadcasters{F}
+    f::F
+end
+
+# Call f with all arguments wrapped in AutoBroadcasters, then unwrap the result
+((; f)::WithAutoBroadcasters)(x) = drop_auto_broadcasters(f(add_auto_broadcasters(x)))
+((; f)::WithAutoBroadcasters)(x, y) =
+    drop_auto_broadcasters(f(add_auto_broadcasters(x), add_auto_broadcasters(y)))
+
+for (f, rf) in ((:+, :radd), (:-, :rsub), (:*, :rmul), (:/, :rdiv))
+    @eval const $rf = WithAutoBroadcasters($f)
+end # defines radd, rsub, rmul, and rdiv
+for f in (:zero, :min, :max, :promote_type)
+    @eval const $(Symbol(:r, f)) = WithAutoBroadcasters($f)
+end # defines rzero, rmin, rmax, and rpromote_type
+
+const ⊞ = radd
+const ⊟ = rsub
+const ⊠ = rmul
+
+end
diff --git a/test/Geometry/axistensors.jl b/test/Geometry/axistensors.jl
index 85898b9e33..ffa50de776 100644
--- a/test/Geometry/axistensors.jl
+++ b/test/Geometry/axistensors.jl
@@ -43,6 +43,7 @@ import ClimaCore
     @test -x + x * 2 - x / 2 == -x + 2 * x - 2 \ x == x / 2
     @test -x' + x' * 2 - x' / 2 == -x' + 2 * x' - 2 \ x' == (x / 2)'
 
+    @test x * 3 == x ⊗ 3 == Geometry.Covariant12Vector(3.0, 6.0)
     @test x * y' ==
           x ⊗ y ==
           Geometry.AxisTensor(
@@ -53,16 +54,6 @@ import ClimaCore
     @test Geometry.components(M * inv(M)) == @SMatrix [1.0 0.0; 0.0 1.0]
     @test Geometry.components(inv(M) * M) == @SMatrix [1.0 0.0; 0.0 1.0]
 
-    @test x ⊗ 3 == Geometry.Covariant12Vector(3.0, 6.0)
-    @test x ⊗ (1, (a = 2, b = 3)) == (
-        Geometry.Covariant12Vector(1.0, 2.0),
-        (
-            a = Geometry.Covariant12Vector(2.0, 4.0),
-            b = Geometry.Covariant12Vector(3.0, 6.0),
-        ),
-    )
-
-
     @test Geometry.components(M * inv(M)) == @SMatrix [1.0 0.0; 0.0 1.0]
     @test Geometry.components(inv(M) * M) == @SMatrix [1.0 0.0; 0.0 1.0]
 
diff --git a/test/Geometry/rmul_with_projection.jl b/test/Geometry/mul_with_projection.jl
similarity index 58%
rename from test/Geometry/rmul_with_projection.jl
rename to test/Geometry/mul_with_projection.jl
index c996e19d7e..29e91114d8 100644
--- a/test/Geometry/rmul_with_projection.jl
+++ b/test/Geometry/mul_with_projection.jl
@@ -1,6 +1,6 @@
 #=
 julia --project=.buildkite
-using Revise; include(joinpath("test", "Geometry", "rmul_with_projection.jl"))
+using Revise; include(joinpath("test", "Geometry", "mul_with_projection.jl"))
 =#
 using Test
 using JET
@@ -8,15 +8,16 @@ import Random
 using StaticArrays: @SMatrix
 
 import ClimaCore.Geometry
-import ClimaCore.Geometry: rmul_with_projection, rmul_return_type
+import ClimaCore.Geometry: mul_with_projection, mul_return_type
+import ClimaCore.Utilities: add_auto_broadcasters
 
 nested_type(value) = nested_type(value, value, value)
 nested_type(value1, value2, value3) =
     (; a = (), b = value1, c = (value2, (; d = (value3,)), (;)))
 
-function test_rmul_with_projection(x::X, y::Y, lg, expected_result) where {X, Y}
-    result = rmul_with_projection(x, y, lg)
-    result_type = rmul_return_type(X, Y)
+function test_mul_with_projection(x::X, y::Y, lg, expected_result) where {X, Y}
+    result = mul_with_projection(x, y, lg)
+    result_type = mul_return_type(X, Y)
 
     # Compute the maximum error as an integer multiple of machine epsilon.
     FT = Geometry.undertype(typeof(lg))
@@ -29,15 +30,15 @@ function test_rmul_with_projection(x::X, y::Y, lg, expected_result) where {X, Y}
     )
 
     @test max_error <= 1                                   # correctness
-    @test (@allocated rmul_with_projection(x, y, lg)) == 0 # allocations
-    @test_opt rmul_with_projection(x, y, lg)               # type instabilities
+    @test (@allocated mul_with_projection(x, y, lg)) == 0  # allocations
+    @test_opt mul_with_projection(x, y, lg)                # type instabilities
 
     @test result_type == typeof(result)                    # correctness
-    @test (@allocated rmul_return_type(X, Y)) == 0         # allocations
-    @test_opt rmul_return_type(X, Y)                       # type instabilities
+    @test (@allocated mul_return_type(X, Y)) == 0          # allocations
+    @test_opt mul_return_type(X, Y)                        # type instabilities
 end
 
-@testset "rmul_with_projection Unit Tests" begin
+@testset "mul_with_projection Unit Tests" begin
     Random.seed!(1) # ensures reproducibility
 
     FT = Float64
@@ -60,65 +61,65 @@ end
     projected_tensor = Geometry.project(dual_axis, tensor, lg)
 
     # Test all valid combinations of single values.
-    test_rmul_with_projection(number, number, lg, number * number)
-    test_rmul_with_projection(number, vector, lg, number * vector)
-    test_rmul_with_projection(number, tensor, lg, number * tensor)
-    test_rmul_with_projection(number, covector, lg, number * covector)
-    test_rmul_with_projection(number, cotensor, lg, number * cotensor)
-    test_rmul_with_projection(vector, number, lg, vector * number)
-    test_rmul_with_projection(vector, covector, lg, vector * covector)
-    test_rmul_with_projection(tensor, number, lg, tensor * number)
-    test_rmul_with_projection(tensor, vector, lg, tensor * projected_vector)
-    test_rmul_with_projection(tensor, tensor, lg, tensor * projected_tensor)
-    test_rmul_with_projection(tensor, cotensor, lg, tensor * cotensor)
-    test_rmul_with_projection(covector, number, lg, covector * number)
-    test_rmul_with_projection(covector, vector, lg, covector * projected_vector)
-    test_rmul_with_projection(covector, tensor, lg, covector * projected_tensor)
-    test_rmul_with_projection(covector, cotensor, lg, covector * cotensor)
-    test_rmul_with_projection(cotensor, number, lg, cotensor * number)
-    test_rmul_with_projection(cotensor, vector, lg, cotensor * projected_vector)
-    test_rmul_with_projection(cotensor, tensor, lg, cotensor * projected_tensor)
-    test_rmul_with_projection(cotensor, cotensor, lg, cotensor * cotensor)
+    test_mul_with_projection(number, number, lg, number * number)
+    test_mul_with_projection(number, vector, lg, number * vector)
+    test_mul_with_projection(number, tensor, lg, number * tensor)
+    test_mul_with_projection(number, covector, lg, number * covector)
+    test_mul_with_projection(number, cotensor, lg, number * cotensor)
+    test_mul_with_projection(vector, number, lg, vector * number)
+    test_mul_with_projection(vector, covector, lg, vector * covector)
+    test_mul_with_projection(tensor, number, lg, tensor * number)
+    test_mul_with_projection(tensor, vector, lg, tensor * projected_vector)
+    test_mul_with_projection(tensor, tensor, lg, tensor * projected_tensor)
+    test_mul_with_projection(tensor, cotensor, lg, tensor * cotensor)
+    test_mul_with_projection(covector, number, lg, covector * number)
+    test_mul_with_projection(covector, vector, lg, covector * projected_vector)
+    test_mul_with_projection(covector, tensor, lg, covector * projected_tensor)
+    test_mul_with_projection(covector, cotensor, lg, covector * cotensor)
+    test_mul_with_projection(cotensor, number, lg, cotensor * number)
+    test_mul_with_projection(cotensor, vector, lg, cotensor * projected_vector)
+    test_mul_with_projection(cotensor, tensor, lg, cotensor * projected_tensor)
+    test_mul_with_projection(cotensor, cotensor, lg, cotensor * cotensor)
 
     # Test some combinations of complicated nested values.
-    T = nested_type
-    test_rmul_with_projection(
+    T = add_auto_broadcasters ∘ nested_type
+    test_mul_with_projection(
         number,
         T(covector, vector, tensor),
         lg,
         T(number * covector, number * vector, number * tensor),
     )
-    test_rmul_with_projection(
+    test_mul_with_projection(
         T(covector, vector, tensor),
         number,
         lg,
         T(covector * number, vector * number, tensor * number),
     )
-    test_rmul_with_projection(
+    test_mul_with_projection(
         vector,
         T(number, number, number),
         lg,
         T(vector * number, vector * number, vector * number),
     )
-    test_rmul_with_projection(
+    test_mul_with_projection(
         T(number, number, number),
         covector,
         lg,
         T(number * covector, number * covector, number * covector),
     )
-    test_rmul_with_projection(
+    test_mul_with_projection(
         T(number, vector, number),
         T(covector, number, tensor),
         lg,
         T(number * covector, vector * number, number * tensor),
     )
-    test_rmul_with_projection(
+    test_mul_with_projection(
         T(covector, number, tensor),
         T(number, vector, number),
         lg,
         T(covector * number, number * vector, tensor * number),
     )
-    test_rmul_with_projection(
+    test_mul_with_projection(
         covector,
         T(vector, number, tensor),
         lg,
@@ -128,7 +129,7 @@ end
             covector * projected_tensor,
         ),
     )
-    test_rmul_with_projection(
+    test_mul_with_projection(
         T(covector, number, covector),
         vector,
         lg,
@@ -138,7 +139,7 @@ end
             covector * projected_vector,
         ),
     )
-    test_rmul_with_projection(
+    test_mul_with_projection(
         T(covector, number, covector),
         T(number, vector, tensor),
         lg,
diff --git a/test/Limiters/distributed/dlimiter.jl b/test/Limiters/distributed/dlimiter.jl
index 8eca406f45..9e1f733d99 100644
--- a/test/Limiters/distributed/dlimiter.jl
+++ b/test/Limiters/distributed/dlimiter.jl
@@ -8,7 +8,6 @@ using ClimaCore:
     Spaces,
     Limiters,
     Quadratures
-using ClimaCore.RecursiveApply
 using ClimaCore: slab
 using Test
 
@@ -67,12 +66,12 @@ q = map(
     coord -> (x = 1.2 * coord.x, y = 1.5 * coord.y),
     Fields.coordinate_field(hv_center_space),
 )
-ρq = ρ .⊠ q
+ρq = ρ .* q
 q_ref = map(
     coord -> (x = coord.x, y = coord.y),
     Fields.coordinate_field(hv_center_space),
 )
-ρq_ref = ρ .⊠ q_ref
+ρq_ref = ρ .* q_ref
 
 total_ρq = sum(ρq)
 
@@ -80,7 +79,7 @@ limiter = Limiters.QuasiMonotoneLimiter(ρq)
 
 Limiters.compute_bounds!(limiter, ρq_ref, ρ)
 Limiters.apply_limiter!(ρq, ρ, limiter)
-q = RecursiveApply.rdiv.(ρq, ρ)
+q = ρq ./ ρ
 
 @test sum(ρq.x) ≈ total_ρq.x
 @test sum(ρq.y) ≈ total_ρq.y
diff --git a/test/Limiters/limiter.jl b/test/Limiters/limiter.jl
index 7e03d28c07..a0b783ca4c 100644
--- a/test/Limiters/limiter.jl
+++ b/test/Limiters/limiter.jl
@@ -14,7 +14,6 @@ using ClimaCore:
     Spaces,
     Limiters,
     Quadratures
-using ClimaCore.RecursiveApply
 import ClimaCore.DataLayouts: slab_index
 using ClimaCore: slab
 using Test
@@ -167,7 +166,7 @@ end
             FT[i + f for i in 1:5, j in 1:5, f in 1:2],
         )
         ρ = DataLayouts.IJF{FT, 5}(FT[j / 2 for i in 1:5, j in 1:5, f in 1:1])
-        ρq = ρ .⊠ q
+        ρq = ρ .* q
         WJ = DataLayouts.IJF{FT, 5}(ones(FT, 5, 5, 1))
         q_min = (FT(3.2), FT(3.0))
         q_max = (FT(5.2), FT(5.0))
@@ -179,7 +178,7 @@ end
         ρq_new = deepcopy(ρq)
         Limiters.apply_limit_slab!(ρq_new, ρ, WJ, q_bounds, eps(FT))
 
-        q_new = RecursiveApply.rdiv.(ρq_new, ρ)
+        q_new = ρq_new ./ ρ
         for j in 1:5, i in 1:5
             @test q_min[1] <= q_new[si(i, j)][1] <= q_max[1]
             @test q_min[2] <= q_new[si(i, j)][2] <= q_max[2]
@@ -245,12 +244,12 @@ end
         q₀(coords, x_scale, y_scale) =
             (x = x_scale * coords.x, y = y_scale * coords.y)
         q = @. q₀(coords, x_scale, y_scale)
-        ρq = ρ .⊠ q
+        ρq = ρ .* q
         q_ref = map(
             coord -> (x = coord.x, y = coord.y),
             Fields.coordinate_field(space),
         )
-        ρq_ref = ρ .⊠ q_ref
+        ρq_ref = ρ .* q_ref
 
         total_ρq = (; x = sum(ρq.x), y = sum(ρq.y))
 
@@ -258,7 +257,7 @@ end
 
         Limiters.compute_bounds!(limiter, ρq_ref, ρ)
         Limiters.apply_limiter!(ρq, ρ, limiter)
-        q = RecursiveApply.rdiv.(ρq, ρ)
+        q = ρq ./ ρ
 
         @test sum(ρq.x) ≈ total_ρq.x
         @test sum(ρq.y) ≈ total_ρq.y
@@ -300,12 +299,12 @@ end
         q₀(coords, x_scale, y_scale) =
             (x = x_scale * coords.x, y = y_scale * coords.y)
         q = @. q₀(coords, x_scale, y_scale)
-        ρq = ρ .⊠ q
+        ρq = ρ .* q
         q_ref = map(
             coord -> (x = coord.x, y = coord.y),
             Fields.coordinate_field(hv_center_space),
         )
-        ρq_ref = ρ .⊠ q_ref
+        ρq_ref = ρ .* q_ref
 
         total_ρq = (; x = sum(ρq.x), y = sum(ρq.y))
 
@@ -313,7 +312,7 @@ end
 
         Limiters.compute_bounds!(limiter, ρq_ref, ρ)
         Limiters.apply_limiter!(ρq, ρ, limiter)
-        q = RecursiveApply.rdiv.(ρq, ρ)
+        q = ρq ./ ρ
 
         @test sum(ρq.x) ≈ total_ρq.x
         @test sum(ρq.y) ≈ total_ρq.y
diff --git a/test/Limiters/vertical_mass_borrowing_limiter.jl b/test/Limiters/vertical_mass_borrowing_limiter.jl
index e6ea17ffd9..30f451bc68 100644
--- a/test/Limiters/vertical_mass_borrowing_limiter.jl
+++ b/test/Limiters/vertical_mass_borrowing_limiter.jl
@@ -5,7 +5,6 @@ using Revise; include(joinpath("test", "Limiters", "vertical_mass_borrowing_limi
 using ClimaComms
 ClimaComms.@import_required_backends
 using ClimaCore: Fields, Spaces, Limiters
-using ClimaCore.RecursiveApply
 using ClimaCore.Geometry
 using ClimaCore.Grids
 using ClimaCore.CommonGrids
@@ -63,7 +62,7 @@ end
     (; z) = coords
     perturb_field!(q; perturb_radius = perturb_q)
     perturb_field!(ρ; perturb_radius = perturb_ρ)
-    ρq_init = ρ .⊠ q
+    ρq_init = ρ .* q
     sum_ρq_init = sum(ρq_init)
 
     # Test that the minimum is below 0
@@ -74,7 +73,7 @@ end
     limiter = Limiters.VerticalMassBorrowingLimiter((0.0,))
     Limiters.apply_limiter!(q, ρ, limiter)
     @test 0 ≤ minimum(q)
-    ρq = ρ .⊠ q
+    ρq = ρ .* q
     @test isapprox(sum(ρq), sum_ρq_init; atol = 1e-15)
     @test isapprox(sum(ρq), sum_ρq_init; rtol = 1e-10)
     plot_results(ClimaCore.to_cpu(ρq), ClimaCore.to_cpu(ρq_init))
@@ -113,7 +112,7 @@ end
     perturb_field!(scalar_field; perturb_radius = perturb_q)
     q.b .= scalar_field
     perturb_field!(ρ; perturb_radius = perturb_ρ)
-    ρq_init = ρ .⊠ q
+    ρq_init = ρ .* q
     sum_ρq_init = sum(ρq_init)
 
     # Test that the minimum is below 0
@@ -124,7 +123,7 @@ end
     limiter = Limiters.VerticalMassBorrowingLimiter((0.0, 0.0))
     Limiters.apply_limiter!(q, ρ, limiter)
     @test 0 ≤ minimum(parent(q))
-    ρq = ρ .⊠ q
+    ρq = ρ .* q
     @test isapprox(sum(ρq.a), sum_ρq_init.a; atol = 0.07)
     @test isapprox(sum(ρq.a), sum_ρq_init.a; rtol = 0.07)
     @test isapprox(sum(ρq.b), sum_ρq_init.b; atol = 0.07)
@@ -161,7 +160,7 @@ end
 
     perturb_field!(q; perturb_radius = perturb_q)
     perturb_field!(ρ; perturb_radius = perturb_ρ)
-    ρq_init = ρ .⊠ q
+    ρq_init = ρ .* q
     sum_ρq_init = sum(ρq_init)
 
     # Test that the minimum is below 0
@@ -172,7 +171,7 @@ end
     limiter = Limiters.VerticalMassBorrowingLimiter((0.0,))
     Limiters.apply_limiter!(q, ρ, limiter)
     @test 0 ≤ minimum(q)
-    ρq = ρ .⊠ q
+    ρq = ρ .* q
     @test isapprox(sum(ρq), sum_ρq_init; atol = 0.1)
     @test isapprox(sum(ρq), sum_ρq_init; rtol = 0.001)
 end
diff --git a/test/MatrixFields/band_matrix_row.jl b/test/MatrixFields/band_matrix_row.jl
index ac5a55bd1c..affe69667b 100644
--- a/test/MatrixFields/band_matrix_row.jl
+++ b/test/MatrixFields/band_matrix_row.jl
@@ -1,4 +1,5 @@
 using LinearAlgebra: I
+using ClimaCore.Utilities: add_auto_broadcasters
 
 include("matrix_field_test_utils.jl")
 
@@ -29,7 +30,7 @@ include("matrix_field_test_utils.jl")
               TridiagonalMatrixRow(1, 0, 1) / 2 - I ==
               zero(PentadiagonalMatrixRow{Int})
 
-    NT = nested_type
+    NT = add_auto_broadcasters ∘ nested_type
     @test_all QuaddiagonalMatrixRow(NT(0.5), NT(1), NT(1), NT(1 // 2)) +
               BidiagonalMatrixRow(NT(-0.5), NT(-1 // 2)) ==
               QuaddiagonalMatrixRow(NT(1), NT(1), NT(1), NT(1)) / 2
diff --git a/test/MatrixFields/field_matrix_solvers.jl b/test/MatrixFields/field_matrix_solvers.jl
index 4f65861fd5..6b3fd987b9 100644
--- a/test/MatrixFields/field_matrix_solvers.jl
+++ b/test/MatrixFields/field_matrix_solvers.jl
@@ -7,7 +7,6 @@ import Logging: Debug
 import LinearAlgebra: I, norm, ldiv!, mul!
 import ClimaComms
 import ClimaCore.Utilities: half
-import ClimaCore.RecursiveApply: ⊠
 import ClimaCore.MatrixFields: @name
 import ClimaCore:
     Spaces, MatrixFields, Fields, Domains, Meshes, Topologies, Geometry
@@ -379,9 +378,7 @@ end
     ᶠᶜmat2_u₃_scalar = ᶠᶜmat2 .* (e³,)
     ᶜᶠmat2_scalar_u₃ = ᶜᶠmat2 .* (e₃',)
     ᶠᶠmat3_u₃_u₃ = ᶠᶠmat3 .* (e³ * e₃',)
-    ᶜᶠmat2_ρχ_u₃ = map(Base.Fix1(map, Base.Fix2(⊠, ρχ_unit ⊠ e₃')), ᶜᶠmat2)
-    # We need to use Fix1 and Fix2 instead of defining anonymous functions in
-    # order for the result of map to be inferrable.
+    ᶜᶠmat2_ρχ_u₃ = ᶜᶠmat2 .* (ρχ_unit,) .* (e₃',)
 
     b_dry_dycore = Fields.FieldVector(;
         c = ᶜvec .* (dry_center_gs_unit,),
diff --git a/test/MatrixFields/field_names.jl b/test/MatrixFields/field_names.jl
index 3d0849ec5c..401680f589 100644
--- a/test/MatrixFields/field_names.jl
+++ b/test/MatrixFields/field_names.jl
@@ -1,6 +1,4 @@
-import LinearAlgebra: I
-import ClimaCore.RecursiveApply: rzero
-import ClimaCore.Utilities: replace_type_parameter
+import ClimaCore.Utilities: replace_type_parameter, new
 import ClimaCore.MatrixFields: @name, is_subset_that_covers_set
 
 include("matrix_field_test_utils.jl")
@@ -11,9 +9,6 @@ end
 Base.propertynames(::FooFieldName) = (:value,)
 Base.getproperty(foo::FooFieldName, s::Symbol) =
     s == :value ? getfield(foo, :_value) : error("Invalid property name")
-Base.convert(::Type{FooFieldName{T}}, foo::FooFieldName) where {T} =
-    FooFieldName{T}(foo.value)
-Base.zero(::Type{FooFieldName{T}}) where {T} = FooFieldName(zero(T))
 
 get_x() =
     (; foo = FooFieldName(0), a = (; b = 1, c = ((; d = 2), (;), (3, ()))))
@@ -717,7 +712,7 @@ end
 @testset "FieldNameDict Unit Tests" begin
     x = get_x()
     FT = Float64
-    x_FT = convert(replace_type_parameter(typeof(x), Int, FT), x)
+    x_FT = new(replace_type_parameter(typeof(x), Int, FT))
 
     C3 = Geometry.Covariant3Vector{FT}
     C12 = Geometry.Covariant12Vector{FT}
@@ -727,10 +722,10 @@ end
     CT3XC3 = typeof(zero(CT3) * zero(C3)')
     C12XCT12 = typeof(zero(C12) * zero(CT12)')
     CT3XCT12 = typeof(zero(CT3) * zero(CT12)')
-    x_C12 = rzero(replace_type_parameter(typeof(x), Int, C12))
-    x_CT3 = rzero(replace_type_parameter(typeof(x), Int, CT3))
-    x_C12XC3 = rzero(replace_type_parameter(typeof(x), Int, C12XC3))
-    x_CT3XCT12 = rzero(replace_type_parameter(typeof(x), Int, CT3XCT12))
+    x_C12 = new(replace_type_parameter(typeof(x), Int, C12))
+    x_CT3 = new(replace_type_parameter(typeof(x), Int, CT3))
+    x_C12XC3 = new(replace_type_parameter(typeof(x), Int, C12XC3))
+    x_CT3XCT12 = new(replace_type_parameter(typeof(x), Int, CT3XCT12))
     I_CT3XC3 = DiagonalMatrixRow(Geometry.AxisTensor(axes(CT3XC3), I))
     I_C12XCT12 = DiagonalMatrixRow(Geometry.AxisTensor(axes(C12XCT12), I))
 
diff --git a/test/MatrixFields/gpu_compat_bidiag_matrix_row.jl b/test/MatrixFields/gpu_compat_bidiag_matrix_row.jl
index 3ed6a44319..f6eb090198 100644
--- a/test/MatrixFields/gpu_compat_bidiag_matrix_row.jl
+++ b/test/MatrixFields/gpu_compat_bidiag_matrix_row.jl
@@ -70,8 +70,6 @@ const ᶜright_bias_matrix = MatrixFields.operator_matrix(ᶜright_bias)
 one_C3xACT3(::Type{_FT}) where {_FT} = C3(_FT(1)) * CT3(_FT(1))'
 get_I_u₃(::Type{_FT}) where {_FT} = DiagonalMatrixRow(one_C3xACT3(_FT))
 
-conv(::Type{_FT}, ᶜbias_matrix) where {_FT} =
-    convert(BidiagonalMatrixRow{_FT}, ᶜbias_matrix)
 function foo(c, f)
     (; ᶠtridiagonal_matrix_c3, ᶠu₃, ∂ᶠu₃ʲ_err_∂ᶠu₃ʲ, adj_u₃) = f
     (; ᶜu₃ʲ, bdmr_l, bdmr_r, bdmr) = c
@@ -79,6 +77,7 @@ function foo(c, f)
     FT = Spaces.undertype(space)
     I_u₃ = get_I_u₃(FT)
     dtγ = FT(1)
+    to_bidiagonal_row = Base.Fix1(convert, BidiagonalMatrixRow{FT})
 
     @. ∂ᶠu₃ʲ_err_∂ᶠu₃ʲ =
         dtγ * ᶠtridiagonal_matrix_c3 * DiagonalMatrixRow(adjoint(CT3(ᶠu₃))) -
@@ -90,14 +89,14 @@ function foo(c, f)
     @. ᶠtridiagonal_matrix_c3 =
         -(ᶠgradᵥ_matrix()) * ifelse(
             ᶜu₃ʲ.components.data.:1 > 0,
-            convert(BidiagonalMatrixRow{FT}, ᶜleft_bias_matrix()),
-            convert(BidiagonalMatrixRow{FT}, ᶜright_bias_matrix()),
+            to_bidiagonal_row(ᶜleft_bias_matrix()),
+            to_bidiagonal_row(ᶜright_bias_matrix()),
         )
 
     # However, this can be decomposed into simpler broadcast
     # expressions that will run on gpus:
-    @. bdmr_l = convert(BidiagonalMatrixRow{FT}, ᶜleft_bias_matrix())
-    @. bdmr_r = convert(BidiagonalMatrixRow{FT}, ᶜright_bias_matrix())
+    @. bdmr_l = to_bidiagonal_row(ᶜleft_bias_matrix())
+    @. bdmr_r = to_bidiagonal_row(ᶜright_bias_matrix())
     @. bdmr = ifelse(ᶜu₃ʲ.components.data.:1 > 0, bdmr_l, bdmr_r)
     @. ᶠtridiagonal_matrix_c3 = -(ᶠgradᵥ_matrix()) * bdmr
 
diff --git a/test/MatrixFields/matrix_field_test_utils.jl b/test/MatrixFields/matrix_field_test_utils.jl
index eab74039f7..364fc3cf39 100644
--- a/test/MatrixFields/matrix_field_test_utils.jl
+++ b/test/MatrixFields/matrix_field_test_utils.jl
@@ -23,7 +23,6 @@ import ClimaCore:
     Quadratures
 using ClimaCore.MatrixFields
 import ClimaCore.Utilities: half
-import ClimaCore.RecursiveApply: ⊠
 import LinearAlgebra: I, norm, ldiv!, mul!
 import ClimaCore.MatrixFields: @name
 
@@ -182,7 +181,7 @@ function dycore_prognostic_EDMF_FieldMatrix(
     ᶠᶜmat2_u₃_scalar = ᶠᶜmat2 .* (e³,)
     ᶜᶠmat2_scalar_u₃ = ᶜᶠmat2 .* (e₃',)
     ᶠᶠmat3_u₃_u₃ = ᶠᶠmat3 .* (e³ * e₃',)
-    ᶜᶠmat2_ρχ_u₃ = map(Base.Fix1(map, Base.Fix2(⊠, ρχ_unit ⊠ e₃')), ᶜᶠmat2)
+    ᶜᶠmat2_ρχ_u₃ = ᶜᶠmat2 .* (ρχ_unit,) .* (e₃',)
     ᶜᶜmat3_uₕ_scalar = ᶜᶜmat3 .* (e¹²,)
     ᶜᶜmat3_uₕ_uₕ =
         ᶜᶜmat3 .* (
@@ -192,9 +191,9 @@ function dycore_prognostic_EDMF_FieldMatrix(
             Geometry.Contravariant12Vector(0, 1)',
         )
     ᶜᶠmat2_uₕ_u₃ = ᶜᶠmat2 .* (e¹² * e₃',)
-    ᶜᶜmat3_ρχ_scalar = map(Base.Fix1(map, Base.Fix2(⊠, ρχ_unit)), ᶜᶜmat3)
-    ᶜᶜmat3_ρaχ_scalar = map(Base.Fix1(map, Base.Fix2(⊠, ρaχ_unit)), ᶜᶜmat3)
-    ᶜᶠmat2_ρaχ_u₃ = map(Base.Fix1(map, Base.Fix2(⊠, ρaχ_unit ⊠ e₃')), ᶜᶠmat2)
+    ᶜᶜmat3_ρχ_scalar = ᶜᶜmat3 .* (ρχ_unit,)
+    ᶜᶜmat3_ρaχ_scalar = ᶜᶜmat3 .* (ρaχ_unit,)
+    ᶜᶠmat2_ρaχ_u₃ = ᶜᶠmat2 .* (ρaχ_unit,) .* (e₃',)
 
     dry_center_gs_unit = (; ρ = 1, ρe_tot = 1, uₕ = e¹²)
     center_gs_unit = (; dry_center_gs_unit..., ρatke = 1, ρχ = ρχ_unit)
@@ -389,7 +388,7 @@ function random_field(::Type{T}, space) where {T}
     return field
 end
 
-# Construct a highly nested type for testing integration with RecursiveApply.
+# Construct a nested Tuple/NamedTuple value for testing compatibility with generic data types.
 nested_type(value) = nested_type(value, value, value)
 nested_type(value1, value2, value3) =
     (; a = (), b = value1, c = (value2, (; d = (value3,)), (;)))
diff --git a/test/MatrixFields/operator_matrices.jl b/test/MatrixFields/operator_matrices.jl
index 725742bb90..e06e811e2d 100644
--- a/test/MatrixFields/operator_matrices.jl
+++ b/test/MatrixFields/operator_matrices.jl
@@ -36,8 +36,7 @@ import ClimaCore.Operators:
     GradientF2C,
     DivergenceC2F,
     DivergenceF2C,
-    CurlC2F,
-    return_eltype
+    CurlC2F
 
 include("matrix_field_test_utils.jl")
 
@@ -87,7 +86,9 @@ function test_op_matrix(
     # This boundary condition doesn't matter, since it's applied after the
     # operator. It is zeroed out for simplicity, but it does not need to be.
     boundary_op = if requires_boundary_values
-        boundary_op_bc = SetValue(rzero(return_eltype(op, args...)))
+        boundary_op_bc = SetValue(
+            rzero(eltype(Base.Broadcast.broadcasted(op, args...))),
+        )
         SetBoundaryOperator(; bottom = boundary_op_bc, top = boundary_op_bc)
     else
         nothing
diff --git a/test/Operators/finitedifference/tensor.jl b/test/Operators/finitedifference/tensor.jl
index afac9b459c..c244abda9b 100644
--- a/test/Operators/finitedifference/tensor.jl
+++ b/test/Operators/finitedifference/tensor.jl
@@ -2,7 +2,6 @@ using Test
 
 using ClimaComms
 using ClimaCore:
-    Geometry,
     Domains,
     Meshes,
     Topologies,
@@ -10,6 +9,7 @@ using ClimaCore:
     Fields,
     Operators,
     Quadratures
+using ClimaCore.Geometry
 using LinearAlgebra
 
 for FT in (Float32, Float64)
@@ -40,20 +40,17 @@ for FT in (Float32, Float64)
         )
     end
 
-    ∇ᵥuvw_boundary = Geometry.outer(
-        Geometry.WVector(FT(1)),
-        Geometry.UVWVector(FT(1), FT(2), FT(3)),
-    )
+    ∇ᵥuvw_boundary =
+        Geometry.WVector(FT(1)) ⊗ Geometry.UVWVector(FT(1), FT(2), FT(3))
 
     gradc2f = Operators.GradientC2F(
         bottom = Operators.SetGradient(∇ᵥuvw_boundary),
         top = Operators.SetGradient(∇ᵥuvw_boundary),
     )
     ∇ᵥuvw = Geometry.project.(Ref(Geometry.UVWAxis()), gradc2f.(uvw))
-    ∇ᵥuvw_scalar = Geometry.outer(
-        Geometry.UVWVector(FT(0), FT(0), FT(1)),
-        Geometry.UVWVector(FT(1), FT(2), FT(3)),
-    )
+    ∇ᵥuvw_scalar =
+        Geometry.UVWVector(FT(0), FT(0), FT(1)) ⊗
+        Geometry.UVWVector(FT(1), FT(2), FT(3))
     ∇ᵥuvw_ref = fill(∇ᵥuvw_scalar, fspace)
     @test ∇ᵥuvw ≈ ∇ᵥuvw_ref
 
diff --git a/test/Operators/finitedifference/unit_column.jl b/test/Operators/finitedifference/unit_column.jl
index 4a8ce8cfd7..deb6bf0fb7 100644
--- a/test/Operators/finitedifference/unit_column.jl
+++ b/test/Operators/finitedifference/unit_column.jl
@@ -203,17 +203,6 @@ end
         # test that broadcasting into incorrect field space throws an error
         empty_faces = zeros(FT, face_space)
         @test_throws Exception empty_faces .= ∂.(w .* I.(θ))
-
-        # 5) we set boundaries on neither
-        I = Operators.InterpolateC2F()
-        ∂ = Operators.GradientF2C()
-
-        # TODO: should we throw something else?
-        if are_boundschecks_forced && !(device isa ClimaComms.CUDADevice)
-            @test_throws BoundsError ∂.(w .* I.(θ))
-        else
-            @warn "Bounds check on BoundsError ∂.(w .* I.(θ)) not verified."
-        end
     end
 end
 
diff --git a/test/Operators/finitedifference/unit_fd_ops_shared_memory.jl b/test/Operators/finitedifference/unit_fd_ops_shared_memory.jl
index f46e5c84f4..79eba774fb 100644
--- a/test/Operators/finitedifference/unit_fd_ops_shared_memory.jl
+++ b/test/Operators/finitedifference/unit_fd_ops_shared_memory.jl
@@ -89,7 +89,7 @@ end
         top = Operators.Extrapolate(),
     )
     bc = @. lazy(div(Geometry.WVector(ᶠwinterp(ϕ, ρ))))
-    test_center_windows(bc)
+    test_face_windows(bc)
     # highly nested cases
     ᶜinterp = Operators.InterpolateF2C()
     ᶠinterp = Operators.InterpolateC2F(
diff --git a/test/Operators/integrals.jl b/test/Operators/integrals.jl
index 59f3d91669..4c113c3e42 100644
--- a/test/Operators/integrals.jl
+++ b/test/Operators/integrals.jl
@@ -35,9 +35,7 @@ function test_column_integral_definite!(center_space)
     ᶜz = Fields.coordinate_field(center_space).z
     ᶠz = Fields.coordinate_field(face_space).z
     z_top = Fields.level(ᶠz, Operators.right_idx(face_space))
-    ᶜu = Base.Broadcast.broadcasted(ᶜz) do z
-        (; one = one(z), powers = (z, z^2, z^3))
-    end
+    ᶜu = map(z -> (; one = one(z), powers = (z, z^2, z^3)), ᶜz)
     ∫u_ref = map(z -> (; one = z, powers = (z^2 / 2, z^3 / 3, z^4 / 4)), z_top)
     ∫u_test = similar(∫u_ref)
 
@@ -57,9 +55,7 @@ function test_column_integral_indefinite!(center_space)
     face_space = center_to_face_space(center_space)
     ᶜz = Fields.coordinate_field(center_space).z
     ᶠz = Fields.coordinate_field(face_space).z
-    ᶜu = Base.Broadcast.broadcasted(ᶜz) do z
-        (; one = one(z), powers = (z, z^2, z^3))
-    end
+    ᶜu = map(z -> (; one = one(z), powers = (z, z^2, z^3)), ᶜz)
     ᶠ∫u_ref = map(z -> (; one = z, powers = (z^2 / 2, z^3 / 3, z^4 / 4)), ᶠz)
     ᶠ∫u_test = similar(ᶠ∫u_ref)
 
diff --git a/test/Operators/spectralelement/opt.jl b/test/Operators/spectralelement/opt.jl
index c2b93468f5..1074ce57f2 100644
--- a/test/Operators/spectralelement/opt.jl
+++ b/test/Operators/spectralelement/opt.jl
@@ -61,14 +61,8 @@ function opt_WeakDivergence(field)
     return wdiv.(field)
 end
 
-function opt_ScalarDSS(field)
-    Spaces.weighted_dss!(@. opt_Gradient(field))
-    return grad
-end
-
-function opt_VectorDss_Curl(field)
-    return Spaces.weighted_dss!(@. opt_Curl(field))
-end
+opt_ScalarDSS(field) = Spaces.weighted_dss!(opt_Gradient(field))
+opt_VectorDss_Curl(field) = Spaces.weighted_dss!(opt_Curl(field))
 
 function opt_VectorDss_DivGrad(field)
     sdiv = Operators.Divergence()
diff --git a/test/RecursiveApply/unit_recursive_apply.jl b/test/RecursiveApply/unit_recursive_apply.jl
deleted file mode 100644
index 1ccc7cf49f..0000000000
--- a/test/RecursiveApply/unit_recursive_apply.jl
+++ /dev/null
@@ -1,100 +0,0 @@
-using JET
-using Test
-
-using ClimaCore.RecursiveApply
-using ClimaCore.Geometry
-
-@static if @isdefined(var"@test_opt") # v1.7 and higher
-    @testset "RecursiveApply optimization test" begin
-        for x in [
-            1.0,
-            1.0f0,
-            (1.0, 2.0),
-            (1.0f0, 2.0f0),
-            (a = 1.0, b = (x1 = 2.0, x2 = 3.0)),
-            (a = 1.0f0, b = (x1 = 2.0f0, x2 = 3.0f0)),
-        ]
-            @test_opt 2 ⊠ x
-            @test_opt x ⊞ x
-            @test_opt RecursiveApply.rdiv(x, 3)
-        end
-    end
-end
-
-@testset "RecursiveApply nary ops" begin
-    for x in [
-        1.0,
-        1.0f0,
-        (1.0, 2.0),
-        (1.0f0, 2.0f0),
-        (a = 1.0, b = (x1 = 2.0, x2 = 3.0)),
-        (a = 1.0f0, b = (x1 = 2.0f0, x2 = 3.0f0)),
-    ]
-        FT = eltype(x[1])
-        @test RecursiveApply.rmul(x, one(FT), one(FT), one(FT)) == x
-        @test RecursiveApply.rmul(x, one(FT), x, one(FT)) ==
-              RecursiveApply.rmul(x, x)
-        @test RecursiveApply.radd(x, zero(FT), zero(FT), zero(FT)) == x
-        @test RecursiveApply.radd(x, zero(FT), x, zero(FT)) ==
-              RecursiveApply.rmul(x, FT(2))
-    end
-end
-
-@testset "Highly nested types" begin
-    FT = Float64
-    nested_types = [
-        FT,
-        Tuple{FT, FT},
-        NamedTuple{(:ϕ, :ψ), Tuple{FT, FT}},
-        Tuple{
-            NamedTuple{(:ϕ, :ψ), Tuple{FT, FT}},
-            NamedTuple{(:ϕ, :ψ), Tuple{FT, FT}},
-        },
-        Tuple{FT, FT},
-        NamedTuple{
-            (:ρ, :uₕ, :ρe_tot, :ρq_tot, :sgs⁰, :sgsʲs),
-            Tuple{
-                FT,
-                Tuple{FT, FT},
-                FT,
-                FT,
-                NamedTuple{(:ρatke,), Tuple{FT}},
-                Tuple{NamedTuple{(:ρa, :ρae_tot, :ρaq_tot), Tuple{FT, FT, FT}}},
-            },
-        },
-        NamedTuple{
-            (:u₃, :sgsʲs),
-            Tuple{Tuple{FT}, Tuple{NamedTuple{(:u₃,), Tuple{Tuple{FT}}}}},
-        },
-    ]
-    for nt in nested_types
-        rz = RecursiveApply.rmap(RecursiveApply.rzero, nt)
-        @test typeof(rz) == nt
-        @inferred RecursiveApply.rmap(RecursiveApply.rzero, nt)
-
-        rz = RecursiveApply.rmap((x, y) -> RecursiveApply.rzero(x), nt, nt)
-        @test typeof(rz) == nt
-        @inferred RecursiveApply.rmap((x, y) -> RecursiveApply.rzero(x), nt, nt)
-
-        rz = RecursiveApply.rmaptype(identity, nt)
-        @test rz == nt
-        @inferred RecursiveApply.rmaptype(zero, nt)
-
-        rz = RecursiveApply.rmaptype((x, y) -> identity(x), nt, nt)
-        @test rz == nt
-        @inferred RecursiveApply.rmaptype((x, y) -> zero(x), nt, nt)
-    end
-end
-
-@testset "NamedTuples and axis tensors" begin
-    FT = Float64
-    nt = (; a = FT(1), b = FT(2))
-    uv = Geometry.UVVector(FT(1), FT(2))
-    rz = RecursiveApply.rmap(*, nt, uv)
-    @test typeof(rz) == NamedTuple{(:a, :b), Tuple{UVVector{FT}, UVVector{FT}}}
-    @inferred RecursiveApply.rmap(*, nt, uv)
-    @test rz.a.u == 1
-    @test rz.a.v == 2
-    @test rz.b.u == 2
-    @test rz.b.v == 4
-end
diff --git a/test/Spaces/ddss1_cs.jl b/test/Spaces/ddss1_cs.jl
index 3fe1fa29c3..465408f754 100644
--- a/test/Spaces/ddss1_cs.jl
+++ b/test/Spaces/ddss1_cs.jl
@@ -78,27 +78,6 @@ end
     Spaces.weighted_dss!(x)
 
     @test Array(parent(x)) ≈ ones(size(parent(x))) # TODO: improve the quality of this test
-
-    wrong_field = map(Fields.coordinate_field(space)) do cf
-        (; a = Float64(0))
-    end
-    wrong_buffer = Spaces.create_dss_buffer(wrong_field)
-    @test_throws ErrorException("Incorrect buffer eltype") Spaces.weighted_dss!(
-        x,
-        wrong_buffer,
-    )
-    @test_throws ErrorException("Incorrect buffer eltype") Spaces.weighted_dss_start!(
-        x,
-        wrong_buffer,
-    )
-    @test_throws ErrorException("Incorrect buffer eltype") Spaces.weighted_dss_internal!(
-        x,
-        wrong_buffer,
-    )
-    @test_throws ErrorException("Incorrect buffer eltype") Spaces.weighted_dss_ghost!(
-        x,
-        wrong_buffer,
-    )
 end
 
 @testset "DSS of Covariant12Vector & Covariant123Vector on extruded Cubed Sphere mesh (ne = 3, serial run)" begin
diff --git a/test/Utilities/unit_auto_broadcaster.jl b/test/Utilities/unit_auto_broadcaster.jl
new file mode 100644
index 0000000000..4c1ac55efd
--- /dev/null
+++ b/test/Utilities/unit_auto_broadcaster.jl
@@ -0,0 +1,54 @@
+using Test
+
+using ClimaCore.Utilities: add_auto_broadcasters, nested_broadcast
+using ClimaCore.Geometry: UVVector
+
+@testset "Simple AutoBroadcasters" begin
+    for itr in (1, (1, 2), (a = 1, b = (c = 2, d = 3)))
+        zeros_only = @inferred add_auto_broadcasters(itr) + 0 + 0 + 0
+        zeros_and_itr = @inferred add_auto_broadcasters(itr) + 0 + itr + 0
+        @test zeros_only + itr === zeros_and_itr
+        # Same comparison for multiplication, with 1 in place of 0.
+        ones_only = @inferred add_auto_broadcasters(itr) * 1 * 1 * 1
+        ones_and_itr = @inferred add_auto_broadcasters(itr) * 1 * itr * 1
+        @test ones_only * itr === ones_and_itr
+    end
+end
+
+@testset "AutoBroadcasters of AxisTensors" begin
+    scaled = @inferred add_auto_broadcasters((; a = 1, b = 2)) * UVVector(1, 2)
+    expected = @inferred add_auto_broadcasters((; a = UVVector(1, 2), b = UVVector(2, 4)))
+    @test scaled === expected
+end
+
+@testset "Highly nested AutoBroadcasters" begin
+    FT = Float64
+    for T in ( # each T is a nested Tuple/NamedTuple type whose leaves are FT
+        typeof(∘(ntuple(Returns(tup -> (tup,)), 20)...)(zero(FT))), # 1-tuples, 20 levels deep
+        typeof(∘(ntuple(Returns(tup -> (tup, tup)), 10)...)(zero(FT))), # 2-tuples, 10 levels deep
+        typeof(∘(ntuple(Returns(tup -> (tup, tup, tup)), 5)...)(zero(FT))), # 3-tuples, 5 levels deep
+        NamedTuple{
+            (:ρ, :uₕ, :ρe_tot, :ρq_tot, :sgs⁰, :sgsʲs),
+            Tuple{
+                FT,
+                Tuple{FT, FT},
+                FT,
+                FT,
+                NamedTuple{(:ρatke,), Tuple{FT}},
+                Tuple{NamedTuple{(:ρa, :ρae_tot, :ρaq_tot), Tuple{FT, FT, FT}}},
+            },
+        }, # similar to the prognostic state used in ClimaAtmos.jl
+    )
+        X = @inferred add_auto_broadcasters(T) # @inferred errors if the return type is not inferred
+        @test zero(X) isa X
+        for x in ( # every entry must pass @inferred and be identical (===) to zero(X)
+            (@inferred zero(X)),
+            (@inferred FT(Integer(zero(X)))),
+            (@inferred min(Integer(zero(X)), cos(zero(X)), abs(eps(X)))),
+            (@inferred nested_broadcast(Returns(-), zero(X))(Int(one(X)), one(FT))),
+            (@inferred nested_broadcast(Returns(zero(FT)), ntuple(Returns(one(X)), 40)...)), # 40 splatted arguments
+        )
+            @test x === zero(X)
+        end
+    end
+end
diff --git a/test/aqua.jl b/test/aqua.jl
index b748e8f463..351e801f59 100644
--- a/test/aqua.jl
+++ b/test/aqua.jl
@@ -20,7 +20,7 @@ using Aqua
     # then please lower the limit based on the new number of ambiguities.
     # We're trying to drive this number down to zero to reduce latency.
     # Uncomment for debugging:
-    n_existing_ambiguities = 25
+    n_existing_ambiguities = 23
     if !(length(ambs) ≤ n_existing_ambiguities)
         for method_ambiguity in ambs
             @show method_ambiguity
diff --git a/test/runtests.jl b/test/runtests.jl
index d993837649..273ae3bda8 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -15,8 +15,8 @@ UnitTest("DataLayouts get/set_index_field"         ,"DataLayouts/unit_cartesian_
 UnitTest("DataLayouts has_uniform_datalayouts"     ,"DataLayouts/unit_has_uniform_datalayouts.jl"),
 UnitTest("DataLayouts non_extruded_broadcast"      ,"DataLayouts/unit_non_extruded_broadcast.jl"),
 UnitTest("DataLayouts linear indexing"             ,"DataLayouts/unit_linear_indexing.jl"),
-UnitTest("Recursive"                               ,"RecursiveApply/unit_recursive_apply.jl"),
 UnitTest("PlusHalf"                                ,"Utilities/unit_plushalf.jl"),
+UnitTest("AutoBroadcaster"                         ,"Utilities/unit_auto_broadcaster.jl"),
 UnitTest("DataLayouts 0D"                          ,"DataLayouts/data0d.jl"),
 UnitTest("DataLayouts 1D"                          ,"DataLayouts/data1d.jl"),
 UnitTest("DataLayouts 2D"                          ,"DataLayouts/data2d.jl"),
@@ -24,7 +24,7 @@ UnitTest("DataLayouts 1dx"                         ,"DataLayouts/data1dx.jl"),
 UnitTest("DataLayouts 2dx"                         ,"DataLayouts/data2dx.jl"),
 UnitTest("DataLayouts mapreduce"                   ,"DataLayouts/unit_mapreduce.jl"),
 UnitTest("Geometry"                                ,"Geometry/geometry.jl"),
-UnitTest("rmul_with_projection"                    ,"Geometry/rmul_with_projection.jl"),
+UnitTest("mul_with_projection"                     ,"Geometry/mul_with_projection.jl"),
 UnitTest("AxisTensors"                             ,"Geometry/axistensors.jl"),
 UnitTest("Interval mesh"                           ,"Meshes/interval.jl"),
 UnitTest("Rectangle mesh"                          ,"Meshes/rectangle.jl"),