diff --git a/AbstractNFFTs/src/interface.jl b/AbstractNFFTs/src/interface.jl index 6f10b7cd..e543432f 100644 --- a/AbstractNFFTs/src/interface.jl +++ b/AbstractNFFTs/src/interface.jl @@ -189,7 +189,7 @@ The transformation is applied along `D-R+1` dimensions specified in the plan `p` size_in(p) Size of the input array for the plan p (NFFT/NFCT/NFST/NNFFT). -The returned tuple has `R` entries. +The returned tuple has `D` entries. Note that this will be the output size for the transposed / adjoint operator. """ @mustimplement size_in(p::AbstractFTPlan{T,D,R}) where {T,D,R} @@ -222,10 +222,10 @@ Change nodes `k` in the plan `p` operation and return the plan. convolve!(p::AbstractNFFTPlan{T,D,R}, g::AbstractArray, fHat::AbstractArray) Overwrite `R`-dimensional array `fHat` -(often R=1, and `fHat` has length `p.J`) +(often R=1, and `fHat` has length `only(size_out(p))`) with the result of "convolution" (i.e., interpolation) between `D`-dimensional equispaced input array `g` -(often of size `p.Ñ`) +(often of size `p.Ñ` which is not part of the abstract interface) and the interpolation kernel in NFFT plan `p`. 
""" @mustimplement convolve!(p::AbstractNFFTPlan, g::AbstractArray, fHat::AbstractArray) diff --git a/NFFTTools/Project.toml b/NFFTTools/Project.toml index 025de674..1b818d7f 100644 --- a/NFFTTools/Project.toml +++ b/NFFTTools/Project.toml @@ -1,16 +1,18 @@ name = "NFFTTools" uuid = "7424e34d-94f7-41d6-98a0-85abaf1b6c91" -author = ["Tobias Knopp "] -version = "0.2.7" +author = ["Tobias Knopp and contributors"] +version = "0.3" [deps] AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" AbstractNFFTs = "7f219486-4aa7-41d6-80a7-e08ef20ceed7" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" +NFFT = "efe261a4-0d2b-5849-be55-fc731d526b0d" [compat] julia = "1.6" AbstractFFTs = "1.0" AbstractNFFTs = "0.6, 0.7, 0.8, 0.9" FFTW = "1" +NFFT = "0.14" diff --git a/NFFTTools/src/NFFTTools.jl b/NFFTTools/src/NFFTTools.jl index a4f99f60..0ff5a3dc 100644 --- a/NFFTTools/src/NFFTTools.jl +++ b/NFFTTools/src/NFFTTools.jl @@ -3,11 +3,12 @@ module NFFTTools export sdc export calculateToeplitzKernel, calculateToeplitzKernel!, convolveToeplitzKernel! -using AbstractNFFTs: AbstractNFFTPlan, plan_nfft, nodes! +using AbstractNFFTs: AbstractNFFTPlan, plan_nfft, nodes!, size_in, size_out using AbstractNFFTs: convolve!, convolve_transpose! +#using NFFT: NFFTPlan using FFTW: fftshift, plan_fft, plan_ifft using LinearAlgebra: adjoint, mul! -import FFTW # ESTIMATE (which is currently non-public) +import FFTW # ESTIMATE include("samplingDensity.jl") include("Toeplitz.jl") diff --git a/NFFTTools/src/samplingDensity.jl b/NFFTTools/src/samplingDensity.jl index e8e9d66e..b332eaba 100644 --- a/NFFTTools/src/samplingDensity.jl +++ b/NFFTTools/src/samplingDensity.jl @@ -1,49 +1,155 @@ - #= using AbstractNFFTs: AbstractNFFTPlan, convolve!, convolve_transpose! +using AbstractNFFTs: size_in, size_out using LinearAlgebra: mul! =# -function sdc(p::AbstractNFFTPlan{T,D,1}; iters=20) where {T,D} - # Weights for sample density compensation. 
- # Uses method of Pipe & Menon, 1999. Mag Reson Med, 186, 179. - weights = similar(p.tmpVec, Complex{T}, p.J) - weights .= one(Complex{T}) - weights_tmp = similar(weights) - scaling_factor = zero(T) +""" + out = _fill_similar(array, v::T, dims) where T +Return an allocated array similar to `array` filled with value `v`. + +This 2-step initialization helper function +is needed to accommodate GPU array types. +The more obvious statement `weights = fill(v, dims)` +can lead to the wrong array type and cause GPU tests to fail. +""" +function _fill_similar(array::AbstractArray, v::T, dims::Union{Integer,Dims}) where T + weights = similar(array, T, dims) + fill!(weights, v) + return weights +end + +#= +It would be desirable to use the memory pointed to by p.tmpVec +as working buffer for sdc iterations, +but this attempt led to intermittent corrupted outputs and errors. +So it is left commented out for now. +=# +#= +function _reinterpret_real(g::StridedArray{Complex{T}}) where {T <: Real} + r1 = reinterpret(T, g) + r2 = @view r1[1:2:end,:] + return r2 +end +=# + +""" + weights = sdc(plan::{Abstract}NFFTPlan; iters=20, ...) + +Compute weights for sample density compensation for the given NFFT `plan`. +Uses method of Pipe & Menon, Mag Reson Med, 41(1):179-186, Jan. 1999. +DOI: 10.1002/(SICI)1522-2594(199901)41:1<179::AID-MRM25>3.0.CO;2-V +The function applies `iters` iterations of that method. + +The scaling here is such that if the plan were 1D with N nodes equispaced +from -0.5 to 0.5-1/N, then the returned weights are ≈ 1/N. + +The weights are scaled such that ``A' diag(w) A 1_N ≈ 1_N``. + +The returned vector contains real, positive values of length `plan.J`. + +This method _almost_ conforms to the `AbstractNFFT` interface +except that it uses `p.Ñ` and `p.tmpVec` that are not part of that interface. + +There are several named keyword arguments that are work buffers +that are all mutated: `weights weights_tmp workg workf workv`. 
+If the caller provides all of those, +then this function should make only small allocations. +""" +function sdc( + p::AbstractNFFTPlan{T,D,1}; + iters::Int = 20, + # the type of p.tmpVec (if present) is needed for GPU arrays (JLArray): + _array::AbstractArray = hasfield(typeof(p), :tmpVec) ? + p.tmpVec : Array{T,D}(undef, zeros(Int, D)...), + # the following are working buffers that are all mutated: + weights::AbstractVector{T} = _fill_similar(_array, one(T), only(size_out(p))), + weights_tmp::AbstractVector = similar(weights), +# workg::AbstractArray = _reinterpret_real(p.tmpVec), # todo + workg::AbstractArray = similar(_array, T, p.Ñ), + workf::AbstractVector = similar(_array, Complex{T}, only(size_out(p))), + workv::AbstractArray = similar(_array, Complex{T}, size_in(p)), +) where {T <: Real, D} + + return sdc!( + p, + iters, + weights, + weights_tmp, + workg, + workf, + workv, + ) +end + + +""" + weights = sdc!( p::AbstractNFFTPlan, iters, weights, weights_tmp, workg) +Compute sampling density compensation `weights` +without performing any final scaling step. + +Ideally this function should be (nearly) non-allocating. 
+""" +function sdc!( + p::AbstractNFFTPlan{T,D,1}, + iters::Int, + # the following are working buffers that are all mutated: + weights::AbstractVector{T}, + weights_tmp::AbstractVector, + workg::AbstractArray, +) where {T <: Real, D} + + scaling_factor = missing # will be set below # Pre-weighting to correct non-uniform sample density for i in 1:iters - convolve_transpose!(p, weights, p.tmpVec) - if i==1 - scaling_factor = maximum(abs.(p.tmpVec)) + convolve_transpose!(p, weights, workg) + if i == 1 + scaling_factor = maximum(workg) end - p.tmpVec ./= scaling_factor - convolve!(p, p.tmpVec, weights_tmp) + workg ./= scaling_factor + convolve!(p, workg, weights_tmp) weights_tmp ./= scaling_factor - weights ./= (abs.(weights_tmp) .+ eps(T)) + any(≤(0), weights_tmp) && throw("non-positive weights") + weights ./= weights_tmp end - # Post weights to correct image scaling - # This finds c, where ||u - c*v||_2^2 = 0 and then uses - # c to scale all weights by a scalar factor. - u = similar(weights, Complex{T}, p.N) - u .= one(Complex{T}) - - # conversion to Array is a workaround for CuNFFT. Without it we get strange - # results that indicate some synchronization issue - #f = Array( p * u ) - #b = f .* Array(weights) # apply weights from above - #v = Array( adjoint(p) * convert(typeof(weights), b) ) - #c = vec(v) \ vec(Array(u)) # least squares diff - #return abs.(convert(typeof(weights), c * Array(weights))) - - # non converting version - f = similar(p.tmpVec, Complex{T}, p.J) - mul!(f, p, u) - f .*= weights # apply weights from above - v = similar(p.tmpVec, Complex{T}, p.N) - mul!(v, adjoint(p), f) - c = vec(v) \ vec(u) # least squares diff - return abs.(c * weights) + return weights +end + + +""" + sdc!( ... ) + +This version scales the weights such that +``A' D(w) A 1_N ≈ 1_N``. +Find ``c ∈ ℝ`` that minimizes ``‖u - c * v‖₂`` +where ``u ≜ 1_N`` (vector of ones) +and ``v ≜ A' D(w) A 1_N``, and then scale `w` by that `c`. 
+The analytical solution is +``c = real(u'v) / ‖v‖² = real(sum(v)) / ‖v‖².`` + +Ideally this function should be (nearly) non-allocating. +""" +function sdc!( + p::AbstractNFFTPlan{T,D,1}, + iters::Int, + # the following are working buffers that are all mutated: + weights::AbstractVector{T}, + weights_tmp::AbstractVector, + workg::AbstractArray, + workf::AbstractVector, + workv::AbstractArray, +) where {T <: Real, D} + + sdc!(p, iters, weights, weights_tmp, workg) + + u = workv # trick to save memory + fill!(u, one(T)) + mul!(workf, p, u) + workf .*= weights # apply weights from above + mul!(workv, adjoint(p), workf) + c = real(sum(workv)) / sum(abs2, workv) + weights .*= c + return weights end diff --git a/Project.toml b/Project.toml index dc228061..a58d53d5 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "NFFT" uuid = "efe261a4-0d2b-5849-be55-fc731d526b0d" authors = ["Tobias Knopp and contributors"] -version = "0.14.2" +version = "0.14.3" [deps] AbstractNFFTs = "7f219486-4aa7-41d6-80a7-e08ef20ceed7" diff --git a/ext/NFFTGPUArraysExt/implementation.jl b/ext/NFFTGPUArraysExt/implementation.jl index 43712980..2468ab9a 100644 --- a/ext/NFFTGPUArraysExt/implementation.jl +++ b/ext/NFFTGPUArraysExt/implementation.jl @@ -1,3 +1,5 @@ +const RealOrComplex = Union{T, Complex{T}} where {T <: Real} + mutable struct GPU_NFFTPlan{T,D, arrTc <: AbstractGPUArray{Complex{T}, D}, vecI <: AbstractGPUVector{Int32}, FP, BP, INV, SM} <: AbstractNFFTPlan{T,D,1} N::NTuple{D,Int64} NOut::NTuple{1,Int64} @@ -60,14 +62,21 @@ AbstractNFFTs.size_in(p::GPU_NFFTPlan) = p.N AbstractNFFTs.size_out(p::GPU_NFFTPlan) = p.NOut -function AbstractNFFTs.convolve!(p::GPU_NFFTPlan{T,D, arrTc}, g::arrTc, fHat::arrH) where {D,T,arr<: AbstractGPUArray, arrTc <: arr, arrH <: arr} +function AbstractNFFTs.convolve!( + p::GPU_NFFTPlan{<:Real, D, <:AbstractGPUArray}, + g::AbstractGPUArray{<:RealOrComplex, D}, + fHat::AbstractGPUVector{<:RealOrComplex}, +) where {D} mul!(fHat, transpose(p.B), 
vec(g)) - return end -function AbstractNFFTs.convolve_transpose!(p::GPU_NFFTPlan{T,D, arrTc}, fHat::arrH, g::arrTc) where {D,T,arr<: AbstractGPUArray, arrTc <: arr, arrH <: arr} +function AbstractNFFTs.convolve_transpose!( + p::GPU_NFFTPlan{<:Real, D, <:AbstractGPUArray}, + fHat::AbstractGPUVector{<:RealOrComplex}, + g::AbstractGPUArray{<:RealOrComplex, D}, +) where {D} mul!(vec(g), p.B, fHat) - return + return g end function AbstractNFFTs.deconvolve!(p::GPU_NFFTPlan{T,D, arrTc}, f::arrF, g::arrTc) where {D,T,arr<: AbstractGPUArray, arrTc <: arr, arrF <: arr} @@ -125,4 +134,3 @@ function LinearAlgebra.mul!(f::arrF, pl::Adjoint{Complex{T},<:GPU_NFFTPlan{T,D, return f end - diff --git a/test/convolve.jl b/test/convolve.jl index 220809fa..9e517e47 100644 --- a/test/convolve.jl +++ b/test/convolve.jl @@ -8,11 +8,13 @@ that exceed the 6e4 maximum of Float16. FFTW does not support Float16 plans as of 2026-01-01. =# +using JLArrays: JLArray using NFFT: plan_nfft using AbstractNFFTs: convolve!, convolve_transpose! import NFFT # LINEAR, FULL, TENSOR, POLYNOMIAL using Test: @test, @testset, @test_throws + # Determine worst precision of two types # (Does Julia already have such a method?) function worst(T1::Type{<:Number}, T2::Type{<:Number}) @@ -20,11 +22,16 @@ function worst(T1::Type{<:Number}, T2::Type{<:Number}) end worst(a, b, c) = worst(worst(a, b), c) +@testset "worst" begin + @test worst(Float64, Float16, Float32) == Float16 + @test worst(Complex{Float16}, Complex{BigFloat}) == Complex{Float16} +end -N = (9, 8) # 9×8 grid of equally spaced sampling points -Tb = BigFloat -fun = N -> @. Tb((0:(N-1)) / N - 0.5) # 1D grid -nodes = hcat([[x; y] for x in fun(N[1]), y in fun(N[2])]...) 
+ +N = (9, 8) # grid +Ñ = 2 .* N # for g +J = 101 # nodes for "fHat" +nodes = rand(BigFloat, length(N), J) .- 0.5 F16 = Float16 F32 = Float32 @@ -32,13 +39,6 @@ F64 = Float64 C16 = Complex{F16} C32 = Complex{F32} -@testset "worst" begin - @test worst(F64, F16, F32) == F16 - @test worst(C16, C32) == C16 -end - -J = prod(N) # for f -Ñ = 2 .* N # for g @testset "convolve-throws" begin # throw error when input is complex and output is real @@ -57,78 +57,90 @@ end @testset "convolve!" begin for pre in [NFFT.LINEAR, NFFT.FULL, NFFT.TENSOR, NFFT.POLYNOMIAL] - p32 = plan_nfft(Float32.(nodes), N, m = 5, σ = 2.0, precompute=pre) - p64 = plan_nfft(Float64.(nodes), N, m = 5, σ = 2.0, precompute=pre) + for arr in (Array, JLArray) + + p32 = plan_nfft(arr, Float32.(nodes), N, m = 5, σ = 2.0, precompute=pre) + p64 = plan_nfft(arr, Float64.(nodes), N, m = 5, σ = 2.0, precompute=pre) @assert Ñ == p64.Ñ + _similar = (T, dims) -> similar(p64.tmpVec, T, dims) + for tfun in (identity, t -> Complex{t}) # test both Real and Complex T16 = tfun(F16) T32 = tfun(F32) T64 = tfun(F64) - g64 = rand(T64, Ñ) - fff = Vector{T64}(undef, J) # ground-truth reference - @test fff == convolve!(p64, g64, fff) + g64 = _similar(T64, Ñ) + g64 .= rand(T64, Ñ) + fff = _similar(T64, J) # ground-truth reference + @test fff === convolve!(p64, g64, fff) fmax = maximum(abs, fff) # need to avoid avoid overflow with Float16 for Tg in (T16, T32, T64), Tf in (T32, T64) - f = Vector{Tf}(undef, J) - @test f == convolve!(p64, Tg.(g64), f) + f = _similar(Tf, J) + @test f === convolve!(p64, Tg.(g64), f) T = worst(Tg, Tf) @test f/fmax ≈ T.(fff/fmax) - @test f == convolve!(p32, Tg.(g64), f) + @test f === convolve!(p32, Tg.(g64), f) T = worst(Tg, Tf, T32) @test f / fmax ≈ T.(fff / fmax) end # Float16 test with careful scaling: - f16 = Vector{T16}(undef, J) - @test f16 == convolve!(p64, g64/fmax, f16) + f16 = _similar(T16, J) + @test f16 === convolve!(p64, g64/fmax, f16) @test f16 ≈ fff / fmax - @test f16 == convolve!(p32, 
g64/fmax, f16) + @test f16 === convolve!(p32, g64/fmax, f16) @test f16 ≈ fff / fmax end # tfun + end # arr end # pre + end # @testset @testset "convolve_transpose!" begin for pre in [NFFT.LINEAR, NFFT.FULL, NFFT.TENSOR, NFFT.POLYNOMIAL] + for arr in (Array, JLArray) p32 = plan_nfft(Float32.(nodes), N, m = 5, σ = 2.0, precompute=pre) p64 = plan_nfft(Float64.(nodes), N, m = 5, σ = 2.0, precompute=pre) @assert Ñ == p64.Ñ + _similar = (T, dims) -> similar(p64.tmpVec, T, dims) + for tfun in (identity, t -> Complex{t}) # test both Real and Complex T16 = tfun(F16) T32 = tfun(F32) T64 = tfun(F64) - f64 = rand(T64, J) - ggg = Array{T64}(undef, Ñ) # ground-truth reference - @test ggg == convolve_transpose!(p64, f64, ggg) + f64 = _similar(T64, J) + f64 .= rand(T64, J) + ggg = _similar(T64, Ñ) # ground-truth reference + @test ggg === convolve_transpose!(p64, f64, ggg) gmax = maximum(abs, ggg) # need to avoid avoid overflow with Float16 for Tg in (T32, T64), Tf in (T16, T32, T64) g = similar(ggg, Tg) - @test g == convolve_transpose!(p64, Tf.(f64), g) + @test g === convolve_transpose!(p64, Tf.(f64), g) T = worst(Tg, Tf) @test g / gmax ≈ T.(ggg / gmax) - @test g == convolve_transpose!(p32, Tf.(f64), g) + @test g === convolve_transpose!(p32, Tf.(f64), g) T = worst(Tg, Tf, T32) @test g / gmax ≈ T.(ggg / gmax) end # Float16 test with careful scaling: - g16 = Array{T16}(undef, Ñ) - @test g16 == convolve_transpose!(p64, f64/gmax, g16) + g16 = _similar(T16, Ñ) + @test g16 === convolve_transpose!(p64, f64/gmax, g16) @test g16 ≈ ggg / gmax - @test g16 == convolve_transpose!(p32, f64/gmax, g16) + @test g16 === convolve_transpose!(p32, f64/gmax, g16) @test g16 ≈ ggg / gmax - end # pre - end # tfun + end # tfun + end # arr + end # pre end # @testset diff --git a/test/gpu.jl b/test/gpu.jl index c8903c26..2561b1ef 100644 --- a/test/gpu.jl +++ b/test/gpu.jl @@ -1,3 +1,16 @@ +#= +# uncomment this block for stand-alone testing +using JLArrays: JLArray +arrayTypes = [JLArray] +=# + +import FFTW # 
ESTIMATE +using LinearAlgebra: norm +using NFFT: plan_nfft, NDFTPlan +import NFFT # LINEAR, FULL, TENSOR, POLYNOMIAL +using NFFTTools: sdc +using Test: @test, @testset, @test_throws + m = 5 σ = 2.0 @@ -38,21 +51,17 @@ m = 5 @testset "GPU_NFFT Sampling Density" begin + N = (9, 8) # 9×8 grid of equally spaced sampling points # create a 10x10 grid of unit spaced sampling points - N = 10 - g = (0:(N-1)) ./ N .- 0.5 - x = vec(ones(N) * g') - y = vec(g * ones(N)') - nodes = cat(x', y', dims=1) + T = Float32 + fun = N -> @. T((0:(N-1)) / N - 0.5) # 1D grid + nodes = hcat([[x; y] for x in fun(N[1]), y in fun(N[2])]...) # approximate the density weights - p = plan_nfft(arrayType, nodes, (N, N); m=5, σ=2.0) - weights = Array(sdc(p, iters=5)) - - @info extrema(vec(weights)) - - @test all((≈).(vec(weights), 1 / (N * N), rtol=1e-7)) - + p = plan_nfft(arrayType, nodes, N; m=5, σ=2.0) + weights = sdc(p, iters=5) + weights = Vector(weights) + @test all(≈(1/prod(N)), weights) end end -end \ No newline at end of file +end diff --git a/test/samplingDensity.jl b/test/samplingDensity.jl index 3f2c7bf3..71ed4a4e 100644 --- a/test/samplingDensity.jl +++ b/test/samplingDensity.jl @@ -4,25 +4,57 @@ Test sampling density compensation using NFFT: plan_nfft using NFFTTools: sdc import NFFT # LINEAR, FULL, TENSOR, POLYNOMIAL +import NFFTTools # sdc! using Test: @test, @testset @testset "Sampling Density" begin -# create a 10×8 grid of unit spaced sampling points -N = (10, 8) +# create a 9×8 grid of unit spaced sampling points +N = (9, 8) T = Float32 -g = N -> @. T((0:(N-1)) / N - 0.5) -nodes = hcat([[x; y] for x in g(N[1]), y in g(N[2])]...) +fun = N -> @. T((0:(N-1)) / N - 0.5) # 1D grid +nodes = hcat([[x; y] for x in fun(N[1]), y in fun(N[2])]...) 
for pre in [NFFT.LINEAR, NFFT.FULL, NFFT.TENSOR, NFFT.POLYNOMIAL] # approximate the density weights p = plan_nfft(nodes, N, m = 5, σ = 2.0, precompute=pre) - weights = sdc(p, iters = 10) +# @show @allocated weights = sdc(p; iters = 10) + weights = sdc(p; iters = 10) @test T == eltype(weights) - @test all(isreal, weights) + @test isreal(weights) @test all(>(0), weights) @test all(≈(1/prod(N)), weights) -end + + # Test the version with provided work buffers + w2 = deepcopy(weights) + fill!(w2, one(T)) + sdc(p; iters = 10, + weights = w2, + weights_tmp = similar(w2), + workg = Array{real(eltype(p.tmpVec))}(undef, p.Ñ), + workf = similar(p.tmpVec, Complex{T}, p.J), + workv = similar(p.tmpVec, Complex{T}, p.N), + ) + @test w2 == weights + + # pre-allocate + weights_tmp = similar(w2) + workg = Array{real(eltype(p.tmpVec))}(undef, p.Ñ) + workf = similar(p.tmpVec, Complex{T}, p.J) + workv = similar(p.tmpVec, Complex{T}, p.N) + fill!(w2, one(T)) + + # the following test should have pretty small allocations + @show @allocated NFFTTools.sdc!(p, 10, + w2, + weights_tmp, + workg, + workf, + workv, + ) + @test w2 == weights + +end # pre end # @testset