JuliaMath · nHackel · Jan 21, 2026 · Dec 30, 2025 · Jan 2, 2026 · Jan 2, 2026
diff --git a/AbstractNFFTs/src/interface.jl b/AbstractNFFTs/src/interface.jl
@@ -189,7 +189,7 @@ The transformation is applied along `D-R+1` dimensions specified in the plan `p`
     size_in(p)
 
 Size of the input array for the plan p (NFFT/NFCT/NFST/NNFFT). 
-The returned tuple has `R` entries. 
+The returned tuple has `D` entries.
 Note that this will be the output size for the transposed / adjoint operator.
 """
 @mustimplement size_in(p::AbstractFTPlan{T,D,R}) where {T,D,R}
@@ -222,10 +222,10 @@ Change nodes `k` in the plan `p` operation and return the plan.
    convolve!(p::AbstractNFFTPlan{T,D,R}, g::AbstractArray, fHat::AbstractArray)
 
 Overwrite `R`-dimensional array `fHat`
-(often R=1, and `fHat` has length `p.J`)
+(often R=1, and `fHat` has length `only(p.size_in)`)
 with the result of "convolution" (i.e., interpolation)
 between `D`-dimensional equispaced input array `g`
-(often of size `p.Ñ`)
+(often of size `p.Ñ` which is not part of abstract interface)
 and the interpolation kernel in NFFT plan `p`.
 """
 @mustimplement convolve!(p::AbstractNFFTPlan, g::AbstractArray, fHat::AbstractArray)

diff --git a/NFFTTools/Project.toml b/NFFTTools/Project.toml
@@ -1,16 +1,18 @@
 name = "NFFTTools"
 uuid = "7424e34d-94f7-41d6-98a0-85abaf1b6c91"
-author = ["Tobias Knopp <[email protected]>"]
-version = "0.2.7"
+author = ["Tobias Knopp <[email protected]> and contributors"]
+version = "0.3"
 
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" 
 AbstractNFFTs = "7f219486-4aa7-41d6-80a7-e08ef20ceed7"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
+NFFT = "efe261a4-0d2b-5849-be55-fc731d526b0d"
 
 [compat]
 julia = "1.6"
 AbstractFFTs = "1.0"
 AbstractNFFTs = "0.6, 0.7, 0.8, 0.9"
 FFTW = "1"
+NFFT = "0.14"
diff --git a/NFFTTools/src/NFFTTools.jl b/NFFTTools/src/NFFTTools.jl
@@ -3,11 +3,12 @@ module NFFTTools
 export sdc
 export calculateToeplitzKernel, calculateToeplitzKernel!, convolveToeplitzKernel!
 
-using AbstractNFFTs: AbstractNFFTPlan, plan_nfft, nodes!
+using AbstractNFFTs: AbstractNFFTPlan, plan_nfft, nodes!, size_in, size_out
 using AbstractNFFTs: convolve!, convolve_transpose!
+#using NFFT: NFFTPlan
 using FFTW: fftshift, plan_fft, plan_ifft
 using LinearAlgebra: adjoint, mul!
-import FFTW # ESTIMATE (which is currently non-public)
+import FFTW # ESTIMATE
 
 include("samplingDensity.jl")
 include("Toeplitz.jl")

diff --git a/NFFTTools/src/samplingDensity.jl b/NFFTTools/src/samplingDensity.jl
@@ -1,49 +1,155 @@
-
 #=
 using AbstractNFFTs: AbstractNFFTPlan, convolve!, convolve_transpose!
+using AbstractNFFTs: size_in, size_out
 using LinearAlgebra: mul!
 =#
 
-function sdc(p::AbstractNFFTPlan{T,D,1}; iters=20) where {T,D}
-    # Weights for sample density compensation.
-    # Uses method of Pipe & Menon, 1999. Mag Reson Med, 186, 179.
-    weights = similar(p.tmpVec, Complex{T}, p.J)
-    weights .= one(Complex{T})
-    weights_tmp = similar(weights)
-    scaling_factor = zero(T)
+"""
+    out = _fill_similar(array, v::T, dims) where T
+Return an allocated array similar to `array` filled with value `v`.
+
+This 2-step initialization helper function
+is needed to accommodate GPU array types.
+The more obvious statement `weights = fill(v, dims)`
+can lead to the wrong array type and cause GPU tests to fail.
+"""
+function _fill_similar(array::AbstractArray, v::T, dims::Union{Integer,Dims}) where T
+    weights = similar(array, T, dims)
+    fill!(weights, v)
+    return weights
+end
+
+#=
+It would be desirable to use the memory pointed to by p.tmpVec
+as working buffer for sdc iterations,
+but this attempt led to intermittent corrupted outputs and errors.
+So it is left commented out for now.
+=#
+#=
+function _reinterpret_real(g::StridedArray{Complex{T}}) where {T <: Real}
+    r1 = reinterpret(T, g)
+    r2 = @view r1[1:2:end,:]
+    return r2
+end
+=#
+
+"""
+   weights = sdc(plan::{Abstract}NFFTPlan; iters=20, ...)
+
+Compute weights for sample density compensation for given NFFT `plan`
+Uses method of Pipe & Menon, Mag Reson Med, 44(1):179-186, Jan. 1999.
+DOI: 10.1002/(SICI)1522-2594(199901)41:1<179::AID-MRM25>3.0.CO;2-V
+The function applies `iters` iterations of that method.
+
+The scaling here such that if the plan were 1D with N nodes equispaced
+from -0.5 to 0.5-1/N, then the returned weights are ≈ 1/N.
+
+The weights are scaled such that ``A' diag(w) A 1_N ≈ 1_N``.
+
+The returned vector is real, positive values of length `plan.J`.
+
+This method _almost_ conforms to the `AbstractNFFT` interface
+except that it uses `p.Ñ` and `p.tmpVec` that are not part of that interface.
+
+There are several named keyword arguments that are work buffers
+that are all mutated: `weights weights_tmp workg workf workv`.
+If the caller provides all of those,
+then this function should make only small allocations.
+"""
+function sdc(
+    p::AbstractNFFTPlan{T,D,1};
+    iters::Int = 20,
+    # the type of p.tmpVec (if present) is needed for GPU arrays (JLArray):
+    _array::AbstractArray = hasfield(typeof(p), :tmpVec) ?
+        p.tmpVec : Array{T,D}(undef, zeros(Int, D)...),
+    # the following are working buffers that are all mutated:
+    weights::AbstractVector{T} = _fill_similar(_array, one(T), only(size_out(p))),
+    weights_tmp::AbstractVector = similar(weights),
+#   workg::AbstractArray = _reinterpret_real(p.tmpVec), # todo
+    workg::AbstractArray = similar(_array, T, p.Ñ),
+    workf::AbstractVector = similar(_array, Complex{T}, only(size_out(p))),
+    workv::AbstractArray = similar(_array, Complex{T}, size_in(p)),
+) where {T <: Real, D}
+
+  return sdc!(
+    p,
+    iters,
+    weights,
+    weights_tmp,
+    workg,
+    workf,
+    workv,
+  )
+end
+
+
+"""
+   weights = sdc!( p::AbstractNFFTPlan, iters, weights, weights_tmp, workg)
+Compute sampling density compensation `weights`
+without performing any final scaling step.
+
+Ideally this function should be (nearly) non-allocating.
+"""
+function sdc!(
+    p::AbstractNFFTPlan{T,D,1},
+    iters::Int,
+    # the following are working buffers that are all mutated:
+    weights::AbstractVector{T},
+    weights_tmp::AbstractVector,
+    workg::AbstractArray,
+) where {T <: Real, D}
+
+    scaling_factor = missing # will be set below
 
     # Pre-weighting to correct non-uniform sample density
     for i in 1:iters
-        convolve_transpose!(p, weights, p.tmpVec)
-        if i==1
-         scaling_factor = maximum(abs.(p.tmpVec))
+        convolve_transpose!(p, weights, workg)
+        if i == 1
+            scaling_factor = maximum(workg)
         end
 
-        p.tmpVec ./= scaling_factor
-        convolve!(p, p.tmpVec, weights_tmp)
+        workg ./= scaling_factor
+        convolve!(p, workg, weights_tmp)
         weights_tmp ./= scaling_factor
-        weights ./= (abs.(weights_tmp) .+ eps(T))
+        any(≤(0), weights_tmp) && throw("non-positive weights")
+        weights ./= weights_tmp
     end
-    # Post weights to correct image scaling
-    # This finds c, where ||u - c*v||_2^2 = 0 and then uses
-    # c to scale all weights by a scalar factor.
-    u = similar(weights, Complex{T}, p.N) 
-    u .= one(Complex{T})
-
-    # conversion to Array is a workaround for CuNFFT. Without it we get strange
-    # results that indicate some synchronization issue
-    #f = Array( p * u ) 
-    #b = f .* Array(weights) # apply weights from above
-    #v = Array( adjoint(p) * convert(typeof(weights), b) )
-    #c = vec(v) \ vec(Array(u))  # least squares diff
-    #return abs.(convert(typeof(weights), c * Array(weights))) 
-
-    # non converting version
-    f = similar(p.tmpVec, Complex{T}, p.J)
-    mul!(f, p, u)
-    f .*= weights # apply weights from above
-    v = similar(p.tmpVec, Complex{T}, p.N)
-    mul!(v, adjoint(p), f)
-    c = vec(v) \ vec(u)  # least squares diff
-    return abs.(c * weights) 
+    return weights
+end
+
+
+"""
+    sdc!( ... )
+
+This version scales the weights such that
+``A' D(w) A 1_N ≈ 1_N``.
+Find ``c ∈ ℝ`` that minimizes ``‖u - c * v‖₂``
+where ``u ≜ 1_N`` (vector of ones)
+and ``v ≜ A' D(w) A 1_N``, and then scale `w` by that `c`.
+The analytical solution is
+``c = real(u'v) / ‖v‖² = real(sum(v)) / ‖v‖².``
+
+Ideally this function should be (nearly) non-allocating.
+"""
+function sdc!(
+    p::AbstractNFFTPlan{T,D,1},
+    iters::Int,
+    # the following are working buffers that are all mutated:
+    weights::AbstractVector{T},
+    weights_tmp::AbstractVector,
+    workg::AbstractArray,
+    workf::AbstractVector,
+    workv::AbstractArray,
+) where {T <: Real, D}
+
+    sdc!(p, iters, weights, weights_tmp, workg)
+
+    u = workv # trick to save memory
+    fill!(u, one(T))
+    mul!(workf, p, u)
+    workf .*= weights # apply weights from above
+    mul!(workv, adjoint(p), workf)
+    c = real(sum(workv)) / sum(abs2, workv)
+    weights .*= c
+    return weights
 end
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "NFFT"
 uuid = "efe261a4-0d2b-5849-be55-fc731d526b0d"
 authors = ["Tobias Knopp <[email protected]> and contributors"]
-version = "0.14.2"
+version = "0.14.3"
 
 [deps]
 AbstractNFFTs = "7f219486-4aa7-41d6-80a7-e08ef20ceed7"

diff --git a/ext/NFFTGPUArraysExt/implementation.jl b/ext/NFFTGPUArraysExt/implementation.jl
@@ -1,3 +1,5 @@
+const RealOrComplex = Union{T, Complex{T}} where {T <: Real}
+
 mutable struct GPU_NFFTPlan{T,D, arrTc <: AbstractGPUArray{Complex{T}, D}, vecI <: AbstractGPUVector{Int32}, FP, BP, INV, SM} <: AbstractNFFTPlan{T,D,1} 
   N::NTuple{D,Int64}
   NOut::NTuple{1,Int64}
@@ -60,14 +62,21 @@ AbstractNFFTs.size_in(p::GPU_NFFTPlan) = p.N
 AbstractNFFTs.size_out(p::GPU_NFFTPlan) = p.NOut
 
 
-function AbstractNFFTs.convolve!(p::GPU_NFFTPlan{T,D, arrTc}, g::arrTc, fHat::arrH) where {D,T,arr<: AbstractGPUArray, arrTc <: arr, arrH <: arr}
+function AbstractNFFTs.convolve!(
+  p::GPU_NFFTPlan{<:Real, D, <:AbstractGPUArray},
+  g::AbstractGPUArray{<:RealOrComplex, D},
+  fHat::AbstractGPUVector{<:RealOrComplex},
+) where {D}
   mul!(fHat, transpose(p.B), vec(g)) 
-  return
 end
 
-function AbstractNFFTs.convolve_transpose!(p::GPU_NFFTPlan{T,D, arrTc}, fHat::arrH, g::arrTc) where {D,T,arr<: AbstractGPUArray, arrTc <: arr, arrH <: arr}
+function AbstractNFFTs.convolve_transpose!(
+  p::GPU_NFFTPlan{<:Real, D, <:AbstractGPUArray},
+  fHat::AbstractGPUVector{<:RealOrComplex},
+  g::AbstractGPUArray{<:RealOrComplex, D},
+) where {D}
   mul!(vec(g), p.B, fHat)
-  return
+  return g
 end
 
 function AbstractNFFTs.deconvolve!(p::GPU_NFFTPlan{T,D, arrTc}, f::arrF, g::arrTc) where {D,T,arr<: AbstractGPUArray, arrTc <: arr, arrF <: arr}
@@ -125,4 +134,3 @@ function LinearAlgebra.mul!(f::arrF, pl::Adjoint{Complex{T},<:GPU_NFFTPlan{T,D,
 
     return f
 end
-