From 15dfc61a0cfe0e958538240d3569a64044f975e3 Mon Sep 17 00:00:00 2001
From: Jeff Fessler <fessler@umich.edu>
Date: Tue, 30 Dec 2025 13:57:06 -0500
Subject: [PATCH 01/10] Allow `convolve!` with real arrays

---
 src/convolution.jl | 41 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/src/convolution.jl b/src/convolution.jl
index b4a8df19..71975852 100644
--- a/src/convolution.jl
+++ b/src/convolution.jl
@@ -1,5 +1,25 @@
+#=
+convolution.jl
+Convolution (aka interpolation) methods and their adjoints.
+These are core NFFT operations.
+
+The `convolve!` method allows both real and complex types
+because sampling density compensation works with real vectors.
+=#
+
+#=
+using NFFT: NFFTPlan, @cthreads, @nloops_
+using LinearAlgebra: mul!
+import AbstractNFFTs
+=#
+
+
+function AbstractNFFTs.convolve!(
+  p::NFFTPlan{Tp,D,1},
+  g::AbstractArray{<: Union{Tg, Complex{Tg}}, D},
+  fHat::StridedVector{U},
+) where {D, Tp <: Real, Tg <: Real, U}
 
-function AbstractNFFTs.convolve!(p::NFFTPlan{T,D,1}, g::AbstractArray{Complex{T},D}, fHat::StridedVector{U}) where {D,T,U}
   if isempty(p.B)
     if p.params.blocking
       convolve_blocking!(p, g, fHat)
@@ -27,11 +47,20 @@ function _convolve_nonblocking!(p::NFFTPlan, g, fHat, L, winPoly)
 end
 
 
-@generated function _convolve_nonblocking(p::NFFTPlan{T,D,1}, g::AbstractArray{Complex{T},D}, L::Val{Z}, winPoly, scale, j) where {D,T,Z}
+@generated function _convolve_nonblocking(
+  p::NFFTPlan{Tp,D,1},
+  g::AbstractArray{<: Union{Tg, Complex{Tg}}, D},
+  L::Val{Z},
+  winPoly,
+  scale,
+  j,
+) where {D, Tp <: Real, Tg <: Real, Z}
+
   quote
     @nexprs $(D) d -> ((tmpIdx_d, tmpWin_d) = precomputeOneNode(p.windowLinInterp, winPoly, p.k, p.Ñ, 
                                                  p.params.m, p.params.σ, scale, j, d, L, p.params.LUTSize)  )
   
+    T = promote_type(Tp, Tg)
     fHat = zero(Complex{T})
 
     @nexprs 1 d -> prodWin_{$D} = one(T)
@@ -47,11 +76,17 @@ end
   end
 end
 
-function convolve_sparse_matrix!(p::NFFTPlan{T,D,1}, g::AbstractArray{Complex{T},D}, fHat::StridedVector{U}) where {D,T,U}
+function convolve_sparse_matrix!(
+  p::NFFTPlan{Tp,D,1},
+  g::AbstractArray{<: Union{Tg, Complex{Tg}}, D},
+  fHat::StridedVector{U},
+) where {D, Tp <: Real, Tg <: Real, U}
+
   threaded_mul!(fHat, transpose(p.B), vec(g))
   #mul!(fHat, transpose(p.B), vec(g))
 end
 
+
 ########## convolve adjoint ##########
 
 function AbstractNFFTs.convolve_transpose!(p::NFFTPlan{T,D,1}, fHat::AbstractVector{U}, g::StridedArray{Complex{T},D}) where {D,T,U}

From 9f7655f2aae384b190c2e9730053946f967b3efe Mon Sep 17 00:00:00 2001
From: Jeff Fessler <fessler@umich.edu>
Date: Tue, 30 Dec 2025 14:17:33 -0500
Subject: [PATCH 02/10] Allow adjoint to work with real vectors too

---
 src/convolution.jl | 154 ++++++++++++++++++++++++++-------------------
 1 file changed, 91 insertions(+), 63 deletions(-)

diff --git a/src/convolution.jl b/src/convolution.jl
index 71975852..9c391302 100644
--- a/src/convolution.jl
+++ b/src/convolution.jl
@@ -3,8 +3,8 @@ convolution.jl
 Convolution (aka interpolation) methods and their adjoints.
 These are core NFFT operations.
 
-The `convolve!` method allows both real and complex types
-because sampling density compensation works with real vectors.
+The methods allow both real and complex types
+because sampling density compensation vectors are real.
 =#
 
 #=
@@ -29,15 +29,17 @@ function AbstractNFFTs.convolve!(
   else
     convolve_sparse_matrix!(p, g, fHat)
   end
-  return
+  return fHat
 end
 
+
 function convolve_nonblocking!(p::NFFTPlan, g, fHat)
   L = Val(2*p.params.m)
   winPoly = makePolyArrayStatic(p)
   _convolve_nonblocking!(p, g, fHat, L, winPoly)
 end
 
+
 function _convolve_nonblocking!(p::NFFTPlan, g, fHat, L, winPoly)
   scale = Int(p.params.LUTSize/(p.params.m))
 
@@ -57,9 +59,9 @@ end
 ) where {D, Tp <: Real, Tg <: Real, Z}
 
   quote
-    @nexprs $(D) d -> ((tmpIdx_d, tmpWin_d) = precomputeOneNode(p.windowLinInterp, winPoly, p.k, p.Ñ, 
+    @nexprs $(D) d -> ((tmpIdx_d, tmpWin_d) = precomputeOneNode(p.windowLinInterp, winPoly, p.k, p.Ñ,
                                                  p.params.m, p.params.σ, scale, j, d, L, p.params.LUTSize)  )
-  
+
     T = promote_type(Tp, Tg)
     fHat = zero(Complex{T})
 
@@ -67,7 +69,7 @@ end
     @nloops_ $D l d -> 1:$Z d->begin
       # preexpr
       prodWin_{d-1} = prodWin_d * tmpWin_d[l_d]
-      gidx_d = tmpIdx_d[l_d] 
+      gidx_d = tmpIdx_d[l_d]
     end begin
       # bodyexpr
       fHat += prodWin_0 * @nref $D g gidx
@@ -76,6 +78,7 @@ end
   end
 end
 
+
 function convolve_sparse_matrix!(
   p::NFFTPlan{Tp,D,1},
   g::AbstractArray{<: Union{Tg, Complex{Tg}}, D},
@@ -89,7 +92,12 @@ end
 
 ########## convolve adjoint ##########
 
-function AbstractNFFTs.convolve_transpose!(p::NFFTPlan{T,D,1}, fHat::AbstractVector{U}, g::StridedArray{Complex{T},D}) where {D,T,U}
+function AbstractNFFTs.convolve_transpose!(
+  p::NFFTPlan{Tp,D,1},
+  fHat::AbstractVector{U},
+  g::StridedArray{<: Union{Tg, Complex{Tg}}, D},
+) where {D, Tp <: Real, Tg <: Real, U}
+
   if isempty(p.B)
     if p.params.blocking
       convolve_transpose_blocking!(p, fHat, g)
@@ -99,6 +107,7 @@ function AbstractNFFTs.convolve_transpose!(p::NFFTPlan{T,D,1}, fHat::AbstractVec
   else
     convolve_transpose_sparse_matrix!(p, fHat, g)
   end
+  return g
 end
 
 
@@ -109,7 +118,7 @@ function convolve_transpose_nonblocking!(p::NFFTPlan, fHat, g)
 end
 
 
-function _convolve_transpose_nonblocking!(p::NFFTPlan{T,D,1}, fHat, g, L, winPoly)  where {D,T} 
+function _convolve_transpose_nonblocking!(p::NFFTPlan{T,D,1}, fHat, g, L, winPoly) where {D,T}
   fill!(g, zero(T))
   scale = Int(p.params.LUTSize/(p.params.m))
 
@@ -119,25 +128,40 @@ function _convolve_transpose_nonblocking!(p::NFFTPlan{T,D,1}, fHat, g, L, winPol
 end
 
 
-@generated function __convolve_transpose_nonblocking!(p::NFFTPlan{T,D,1}, fHat::AbstractVector{U}, g::StridedArray{Complex{T},D}, L::Val{Z}, winPoly, scale, j) where {D,T,U,Z}
+@generated function __convolve_transpose_nonblocking!(
+  p::NFFTPlan{Tp,D,1},
+  fHat::AbstractVector{U},
+  g::StridedArray{<: Union{Tg, Complex{Tg}}, D},
+  L::Val{Z},
+  winPoly,
+  scale,
+  j,
+) where {D, Tp <: Real, Tg <: Real, U, Z}
+
   quote
-    @nexprs $(D) d -> ((tmpIdx_d, tmpWin_d) = precomputeOneNode(p.windowLinInterp, winPoly, p.k, p.Ñ, 
+    @nexprs $(D) d -> ((tmpIdx_d, tmpWin_d) = precomputeOneNode(p.windowLinInterp, winPoly, p.k, p.Ñ,
                                                  p.params.m, p.params.σ, scale, j, d, L, p.params.LUTSize)  )
 
-    @nexprs 1 d -> prodWin_{$D} = one(T)
+    T = promote_type(Tp, Tg)
+    @nexprs 1 d -> prodWin_{$D} = one(Tp)
     @nloops_ $D l d -> 1:$Z d->begin
       # preexpr
       prodWin_{d-1} = prodWin_d * tmpWin_d[l_d]
-      gidx_d = tmpIdx_d[l_d] 
+      gidx_d = tmpIdx_d[l_d]
     end begin
       # bodyexpr
-      (@nref $D g gidx) += prodWin_0 * fHat[j] 
+      (@nref $D g gidx) += prodWin_0 * fHat[j]
     end
   end
 end
 
-function convolve_transpose_sparse_matrix!(p::NFFTPlan{T,D,1},
-                        fHat::AbstractVector{U}, g::StridedArray{Complex{T},D}) where {D,T,U}
+
+function convolve_transpose_sparse_matrix!(
+  p::NFFTPlan{Tp,D,1},
+  fHat::AbstractVector{U},
+  g::StridedArray{<:Union{Tg, Complex{Tg}}, D},
+) where {D, Tp <: Real, Tg <: Real, U}
+
   #threaded_mul!(vec(g), p.B, fHat)
   mul!(vec(g), p.B, fHat)
 end
@@ -171,7 +195,7 @@ function _convolve_blocking!(p::NFFTPlan{T,D,1}, g, fHat, L, winPoly) where {D,T
     if !isempty(p.nodesInBlock[l])
       toBlock!(p, g, p.blocks[l], p.blockOffsets[l])
       winTensor = (p.params.precompute == TENSOR) ? p.windowTensor[l] : nothing
-      calcOneBlock!(p, fHat, p.nodesInBlock[l], p.blocks[l], p.blockOffsets[l], L, 
+      calcOneBlock!(p, fHat, p.nodesInBlock[l], p.blocks[l], p.blockOffsets[l], L,
                    scale, p.idxInBlock[l], winTensor, winPoly)
     end
   end
@@ -180,8 +204,8 @@ end
 @generated function toBlock!(p::NFFTPlan{T,D,1}, g, block, off) where {T,D}
 quote
   if off[1] + 2 < 1
-    LA = -(off[1] + 2) + 1 
-    offA = size(g,1) - LA 
+    LA = -(off[1] + 2) + 1
+    offA = size(g,1) - LA
     offB = -LA
   else
     LA = 0
@@ -189,7 +213,7 @@ quote
   end
 
   if offB + size(block,1) > size(g,1)
-    LB = size(g,1) - (offB )  
+    LB = size(g,1) - (offB )
     offC = -LB
   else
     LB = size(block,1)
@@ -197,31 +221,31 @@ quote
 
   if LB != size(block,1) && offC + size(block,1) > size(g,1)
     # out of bounds indices: second (or first) wrapping
-    LC = size(g,1) - (offC)  
+    LC = size(g,1) - (offC)
     offD = -LC
   else
     # no out of bounds indices
     LC = size(block,1)
   end
-  
+
   @nloops_ $(D-1)  (d->l_{d+1})  (d -> 1:size(block,d+1)) d->begin
     # preexpr
-    idx_{d+1} = ( mod(l_{d+1}  + off[d+1] + p.Ñ[d+1], p.Ñ[d+1]) + 1)
+    idx_{d+1} = ( mod(l_{d+1} + off[d+1] + p.Ñ[d+1], p.Ñ[d+1]) + 1)
   end begin
     # bodyexpr
-    @inbounds @simd for l_1 = 1:LA
+    @inbounds @simd for l_1 in 1:LA
       idx_1 = l_1 + offA
       (@nref $D block l) = (@nref $D g idx)
     end
-    @inbounds @simd for l_1 = (LA+1):LB 
+    @inbounds @simd for l_1 in (LA+1):LB
         idx_1 = l_1 + offB
       (@nref $D block l) = (@nref $D g idx)
     end
-    @inbounds @simd for l_1 = (LB+1):LC
+    @inbounds @simd for l_1 in (LB+1):LC
       idx_1 = l_1 + offC
       (@nref $D block l) = (@nref $D g idx)
     end
-    @inbounds @simd for l_1 = (LC+1):size(block,1) 
+    @inbounds @simd for l_1 in (LC+1):size(block,1)
       idx_1 = l_1 + offD
       (@nref $D block l) = (@nref $D g idx)
     end
@@ -233,41 +257,42 @@ end
 @noinline function calcOneBlock!(p::NFFTPlan{T,D,1}, fHat, nodesInBlock, block,
          off, L, scale, idxInBlock, winTensor, winPoly) where {D,T}
   for (jLocal,j) in enumerate(nodesInBlock)
-    fHat[j] = calcOneNode!(p, block, off, L, scale, j, jLocal, 
+    fHat[j] = calcOneNode!(p, block, off, L, scale, j, jLocal,
                                idxInBlock, winTensor, winPoly)
   end
   return
 end
 
 
-@generated function calcOneNode!(p::NFFTPlan{T,D,1}, block, off, L::Val{Z}, scale, 
+@generated function calcOneNode!(p::NFFTPlan{T,D,1}, block, off, L::Val{Z}, scale,
                    j, jLocal, idxInBlock, winTensor, winPoly) where {D,T,Z}
   quote
-    @nexprs $(D) d -> ((off_d, tmpWin_d) =  precomputeOneNodeBlocking(p.windowLinInterp, winTensor, winPoly, scale, 
+    @nexprs $(D) d -> ((off_d, tmpWin_d) = precomputeOneNodeBlocking(p.windowLinInterp, winTensor, winPoly, scale,
                       jLocal, d, L, idxInBlock) )
 
     @nexprs 1 d -> prodWin_{$D} = one(T)
     @nexprs 1 d -> fHat_{$D} = zero(Complex{T})
     @nloops_ $(D-1)  (d->l_{d+1})  (d -> 1:$Z) d->begin
       # preexpr
-      block_idx_{d+1} = off_{d+1} + l_{d+1} 
+      block_idx_{d+1} = off_{d+1} + l_{d+1}
       fHat_{d} = zero(Complex{T})
     end d->begin
       # postexpr
       fHat_{d+1} += tmpWin_{d+1}[l_{d+1}] * fHat_{d}
     end begin
       # bodyexpr
-      @inbounds @simd for l_1 = 1:$Z
+      @inbounds @simd for l_1 in 1:$Z
         block_idx_1 = off_1 + l_1
         fHat_1 += tmpWin_1[l_1] * (@nref $D block block_idx)
       end
     end
 
-    @nexprs 1 d -> fHat = fHat_{$D} 
+    @nexprs 1 d -> fHat = fHat_{$D}
     return fHat
   end
 end
 
+
 ##########################
 
 function convolve_transpose_blocking!(p::NFFTPlan, fHat, g)
@@ -276,11 +301,12 @@ function convolve_transpose_blocking!(p::NFFTPlan, fHat, g)
   _convolve_transpose_blocking!(p, fHat, g, L, winPoly)
 end
 
+
 function _convolve_transpose_blocking!(p::NFFTPlan{T,D,1}, fHat, g, L, winPoly) where {D,T}
   #g .= zero(T)
   ccall(:memset, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), g, 0, sizeof(g))
   scale = Int(p.params.LUTSize/(p.params.m))
-  
+
   lk = ReentrantLock()
   @cthreads for l in CartesianIndices(size(p.blocks))
     if !isempty(p.nodesInBlock[l])
@@ -288,9 +314,9 @@ function _convolve_transpose_blocking!(p::NFFTPlan{T,D,1}, fHat, g, L, winPoly)
       ccall(:memset, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), p.blocks[l], 0, sizeof(p.blocks[l]))
 
       winTensor = (p.params.precompute == TENSOR) ? p.windowTensor[l] : nothing
-      fillBlock!(p, fHat, p.blocks[l], p.nodesInBlock[l], p.blockOffsets[l], L, scale, 
+      fillBlock!(p, fHat, p.blocks[l], p.nodesInBlock[l], p.blockOffsets[l], L, scale,
                  p.idxInBlock[l], winTensor, winPoly)
- 
+
       lock(lk) do
         addBlock!(p, g, p.blocks[l], p.blockOffsets[l])
       end
@@ -298,6 +324,7 @@ function _convolve_transpose_blocking!(p::NFFTPlan{T,D,1}, fHat, g, L, winPoly)
   end
 end
 
+
 @generated function addBlock!(p::NFFTPlan{T,D,1}, g, block, off) where {T,D}
   quote
     # addBlock! needs to wrap indices. The easies is to apply a modulo operation (rem)
@@ -311,8 +338,8 @@ end
 
     if off[1] + 2 < 1
       # negative indices: first wrapping
-      LA = -(off[1] + 2) + 1 
-      offA = size(g,1) - LA 
+      LA = -(off[1] + 2) + 1
+      offA = size(g,1) - LA
       offB = -LA
     else
       # no negative indices: skip first sum
@@ -322,7 +349,7 @@ end
 
     if offB + size(block,1) > size(g,1)
       # out of bounds indices: second (or first) wrapping
-      LB = size(g,1) - (offB )  
+      LB = size(g,1) - (offB )
       offC = -LB
     else
       # no out of bounds indices
@@ -331,31 +358,31 @@ end
 
     if LB != size(block,1) && offC + size(block,1) > size(g,1)
       # out of bounds indices: second (or first) wrapping
-      LC = size(g,1) - (offC)  
+      LC = size(g,1) - (offC)
       offD = -LC
     else
       # no out of bounds indices
       LC = size(block,1)
     end
-    
+
     @nloops_ $(D-1)  (d->l_{d+1})  (d -> 1:size(block,d+1)) d->begin
       # preexpr
-      idx_{d+1} = ( mod(l_{d+1}  + off[d+1] + p.Ñ[d+1], p.Ñ[d+1]) + 1)
+      idx_{d+1} = ( mod(l_{d+1} + off[d+1] + p.Ñ[d+1], p.Ñ[d+1]) + 1)
     end begin
       # bodyexpr
-      @inbounds @simd for l_1 = 1:LA
+      @inbounds @simd for l_1 in 1:LA
         idx_1 = l_1 + offA
         (@nref $D g idx) += (@nref $D block l)
       end
-      @inbounds @simd for l_1 = (LA+1):LB 
+      @inbounds @simd for l_1 in (LA+1):LB
         idx_1 = l_1 + offB
         (@nref $D g idx) += (@nref $D block l)
       end
-      @inbounds @simd for l_1 = (LB+1):LC
+      @inbounds @simd for l_1 in (LB+1):LC
         idx_1 = l_1 + offC
         (@nref $D g idx) += (@nref $D block l)
       end
-      @inbounds @simd for l_1 = (LC+1):size(block,1) 
+      @inbounds @simd for l_1 in (LC+1):size(block,1)
         idx_1 = l_1 + offD
         (@nref $D g idx) += (@nref $D block l)
       end
@@ -364,10 +391,11 @@ end
   end
 end
 
-@noinline function fillBlock!(p::NFFTPlan{T,D,1}, fHat, block, nodesInBlock, off, L::Val{Z}, scale, 
+
+@noinline function fillBlock!(p::NFFTPlan{T,D,1}, fHat, block, nodesInBlock, off, L::Val{Z}, scale,
                               idxInBlock, winTensor, winPoly) where {T,D,Z}
   if (Threads.nthreads() == 1 || !NFFT._use_threads[]) &&
-      (D >= 3 && Z >= 16) || (D == 2 && Z >= 16) # magic 
+      (D >= 3 && Z >= 16) || (D == 2 && Z >= 16) # magic
     for (jLocal,j) in enumerate(nodesInBlock)
       fillOneNode2!(p, fHat, block, off, L, scale, j, jLocal, idxInBlock, winTensor, winPoly)
     end
@@ -380,7 +408,7 @@ end
 end
 
 
-@generated function fillOneNode!(p::NFFTPlan{T,D,1}, fHat, block, off, L::Val{Z}, scale, 
+@generated function fillOneNode!(p::NFFTPlan{T,D,1}, fHat, block, off, L::Val{Z}, scale,
               j, jLocal, idxInBlock, winTensor, winPoly) where {D,T,Z}
   quote
     fHat_ = fHat[j]
@@ -394,11 +422,11 @@ end
     @nloops_ $(D-1)  (d->l_{d+1})  (d -> 1:$Z) d->begin
       # preexpr
       prodWin_{d} = prodWin_{d+1} * tmpWin_{d+1}[l_{d+1}]
-      block_idx_{d+1} = off_{d+1} + l_{d+1} 
+      block_idx_{d+1} = off_{d+1} + l_{d+1}
     end begin
       # bodyexpr
-      @inbounds @simd for l_1 = 1:$(Z)
-        block_idx_1 = off_1 + l_1 
+      @inbounds @simd for l_1 in 1:$(Z)
+        block_idx_1 = off_1 + l_1
         (@nref $D block block_idx) += innerWin[l_1] * prodWin_1
       end
     end
@@ -406,7 +434,8 @@ end
   end
 end
 
-@generated function fillOneNode2!(p::NFFTPlan{T,D,1}, fHat, block, off, L::Val{Z}, scale, 
+
+@generated function fillOneNode2!(p::NFFTPlan{T,D,1}, fHat, block, off, L::Val{Z}, scale,
               j, jLocal, idxInBlock, winTensor, winPoly) where {D,T,Z}
   quote
     fHat_ = fHat[j]
@@ -418,11 +447,11 @@ end
     @nloops_ $(D-1)  (d->l_{d+1})  (d -> 1:$Z) d->begin
       # preexpr
       prodWin_{d} = prodWin_{d+1} * tmpWin_{d+1}[l_{d+1}]
-      block_idx_{d+1} = off_{d+1} + l_{d+1} 
+      block_idx_{d+1} = off_{d+1} + l_{d+1}
     end begin
       # bodyexpr
-      @inbounds @simd for l_1 = 1:$(Z)
-        block_idx_1 = off_1 + l_1 
+      @inbounds @simd for l_1 in 1:$(Z)
+        block_idx_1 = off_1 + l_1
         (@nref $D block block_idx) += (tmpWin_1[l_1] * prodWin_1) * fHat_
       end
     end
@@ -431,17 +460,16 @@ end
 end
 
 
-
-@generated function fillOneNodeReal!(p::NFFTPlan{T,D,1}, fHat, block, off, L::Val{Z}, scale, j, 
+@generated function fillOneNodeReal!(p::NFFTPlan{T,D,1}, fHat, block, off, L::Val{Z}, scale, j,
                 jLocal, idxInBlock, winTensor, winPoly) where {D,T,Z}
   quote
     fHat_ = fHat[j]
 
-    @nexprs $(D) d -> ((off_d, tmpWin_d) =  precomputeOneNodeBlocking(p.windowLinInterp, winTensor, winPoly, scale, jLocal, 
+    @nexprs $(D) d -> ((off_d, tmpWin_d) =  precomputeOneNodeBlocking(p.windowLinInterp, winTensor, winPoly, scale, jLocal,
                                               d, L, idxInBlock) )
 
     innerWin = Vector{Float64}(undef,$(2*Z)) # Probably cache me somewhere
-    for l=1:$Z
+    for l in 1:$Z
       innerWin[2*(l-1)+1] = tmpWin_1[l] * real(fHat_)
       innerWin[2*(l-1)+2] = tmpWin_1[l] * imag(fHat_)
     end
@@ -450,7 +478,7 @@ end
     # Convert to real
     # We use unsafe_wrap here because reinterpret(T, block) is much slower
     # and reinterpret(T, reshape, block) is slightly slower than unsafe_wrap
-    # In fact the conversion to real only pays off when using unsafe_wrap, 
+    # In fact the conversion to real only pays off when using unsafe_wrap,
     # otherwise sticking with complex is faster
     U = @ntuple $(D) d -> d==1 ? 2*size(block,1) : size(block,d)
     blockReal = unsafe_wrap(Array,reinterpret(Ptr{T},pointer(block)), U)
@@ -460,11 +488,11 @@ end
     @nloops_ $(D-1)  (d->l_{d+1})  (d -> 1:$Z) d->begin
       # preexpr
       prodWin_{d} = prodWin_{d+1} * tmpWin_{d+1}[l_{d+1}]
-      block_idx_{d+1} = off_{d+1} + l_{d+1} 
+      block_idx_{d+1} = off_{d+1} + l_{d+1}
     end begin
       # bodyexpr
-      @inbounds @simd for l_1 = 1:$(2*Z)
-        block_idx_1 = off_1 + l_1 
+      @inbounds @simd for l_1 in 1:$(2*Z)
+        block_idx_1 = off_1 + l_1
         (@nref $(D) blockReal block_idx) += innerWin[l_1] * prodWin_1
       end
     end

From 52f9e12577f9e89ff99adf7aec99b287159d5128 Mon Sep 17 00:00:00 2001
From: Jeff Fessler <fessler@umich.edu>
Date: Wed, 31 Dec 2025 14:30:59 -0500
Subject: [PATCH 03/10] Refine type declarations

---
 src/convolution.jl | 141 ++++++++++++++++++++++++++++-----------------
 1 file changed, 87 insertions(+), 54 deletions(-)

diff --git a/src/convolution.jl b/src/convolution.jl
index 9c391302..2c4210c5 100644
--- a/src/convolution.jl
+++ b/src/convolution.jl
@@ -10,15 +10,20 @@ because sampling density compensation vectors are real.
 #=
 using NFFT: NFFTPlan, @cthreads, @nloops_
 using LinearAlgebra: mul!
+import Base.Cartesian: @nexprs, @nref
 import AbstractNFFTs
 =#
 
+const RealOrComplex = Union{T, Complex{T}} where {T <: Real}
 
 function AbstractNFFTs.convolve!(
-  p::NFFTPlan{Tp,D,1},
-  g::AbstractArray{<: Union{Tg, Complex{Tg}}, D},
-  fHat::StridedVector{U},
-) where {D, Tp <: Real, Tg <: Real, U}
+  p::NFFTPlan{<:Real,D,1},
+  g::AbstractArray{<:RealOrComplex, D},
+  fHat::StridedVector{<:RealOrComplex}, # mutated
+) where {D}
+
+  (eltype(g) <: Complex) && (eltype(fHat) <: Real) &&
+    throw(ArgumentError("Complex input g requires Complex output fHat"))
 
   if isempty(p.B)
     if p.params.blocking
@@ -41,7 +46,7 @@ end
 
 
 function _convolve_nonblocking!(p::NFFTPlan, g, fHat, L, winPoly)
-  scale = Int(p.params.LUTSize/(p.params.m))
+  scale = Int(p.params.LUTSize / p.params.m)
 
   @cthreads for j in 1:p.J
       fHat[j] = _convolve_nonblocking(p, g, L, winPoly, scale, j)
@@ -54,13 +59,13 @@ end
   g::AbstractArray{<: Union{Tg, Complex{Tg}}, D},
   L::Val{Z},
   winPoly,
-  scale,
-  j,
+  scale::Int,
+  j::Int,
 ) where {D, Tp <: Real, Tg <: Real, Z}
 
   quote
     @nexprs $(D) d -> ((tmpIdx_d, tmpWin_d) = precomputeOneNode(p.windowLinInterp, winPoly, p.k, p.Ñ,
-                                                 p.params.m, p.params.σ, scale, j, d, L, p.params.LUTSize)  )
+                                                 p.params.m, p.params.σ, scale, j, d, L, p.params.LUTSize) )
 
     T = promote_type(Tp, Tg)
     fHat = zero(Complex{T})
@@ -80,10 +85,10 @@ end
 
 
 function convolve_sparse_matrix!(
-  p::NFFTPlan{Tp,D,1},
-  g::AbstractArray{<: Union{Tg, Complex{Tg}}, D},
-  fHat::StridedVector{U},
-) where {D, Tp <: Real, Tg <: Real, U}
+  p::NFFTPlan{<:Real,D,1},
+  g::AbstractArray{<:RealOrComplex, D},
+  fHat::StridedVector{<:RealOrComplex},
+) where {D}
 
   threaded_mul!(fHat, transpose(p.B), vec(g))
   #mul!(fHat, transpose(p.B), vec(g))
@@ -93,10 +98,13 @@ end
 ########## convolve adjoint ##########
 
 function AbstractNFFTs.convolve_transpose!(
-  p::NFFTPlan{Tp,D,1},
-  fHat::AbstractVector{U},
-  g::StridedArray{<: Union{Tg, Complex{Tg}}, D},
-) where {D, Tp <: Real, Tg <: Real, U}
+  p::NFFTPlan{<:Real,D,1},
+  fHat::AbstractVector{<:RealOrComplex},
+  g::StridedArray{<:RealOrComplex, D}, # mutated
+) where {D}
+
+  (eltype(fHat) <: Complex) && (eltype(g) <: Real) &&
+    throw(ArgumentError("Complex input fHat requires Complex output g"))
 
   if isempty(p.B)
     if p.params.blocking
@@ -118,9 +126,16 @@ function convolve_transpose_nonblocking!(p::NFFTPlan, fHat, g)
 end
 
 
-function _convolve_transpose_nonblocking!(p::NFFTPlan{T,D,1}, fHat, g, L, winPoly) where {D,T}
-  fill!(g, zero(T))
-  scale = Int(p.params.LUTSize/(p.params.m))
+function _convolve_transpose_nonblocking!(
+  p::NFFTPlan{<:Real,D,1},
+  fHat::AbstractVector{<:RealOrComplex},
+  g::StridedArray{<: Union{Tg, Complex{Tg}}, D},
+  L::Val,
+  winPoly,
+) where {D, Tg <: Real}
+
+  fill!(g, zero(Tg))
+  scale = Int(p.params.LUTSize / p.params.m)
 
   @inbounds @simd for j in 1:p.J
     __convolve_transpose_nonblocking!(p, fHat, g, L, winPoly, scale, j)
@@ -129,21 +144,20 @@ end
 
 
 @generated function __convolve_transpose_nonblocking!(
-  p::NFFTPlan{Tp,D,1},
-  fHat::AbstractVector{U},
+  p::NFFTPlan{<:Real,D,1},
+  fHat::AbstractVector{<:RealOrComplex},
   g::StridedArray{<: Union{Tg, Complex{Tg}}, D},
   L::Val{Z},
   winPoly,
-  scale,
-  j,
-) where {D, Tp <: Real, Tg <: Real, U, Z}
+  scale::Int,
+  j::Int,
+) where {D, Tg <: Real, Z}
 
   quote
     @nexprs $(D) d -> ((tmpIdx_d, tmpWin_d) = precomputeOneNode(p.windowLinInterp, winPoly, p.k, p.Ñ,
-                                                 p.params.m, p.params.σ, scale, j, d, L, p.params.LUTSize)  )
+                                                 p.params.m, p.params.σ, scale, j, d, L, p.params.LUTSize) )
 
-    T = promote_type(Tp, Tg)
-    @nexprs 1 d -> prodWin_{$D} = one(Tp)
+    @nexprs 1 d -> prodWin_{$D} = one(Tg)
     @nloops_ $D l d -> 1:$Z d->begin
       # preexpr
       prodWin_{d-1} = prodWin_d * tmpWin_d[l_d]
@@ -157,10 +171,10 @@ end
 
 
 function convolve_transpose_sparse_matrix!(
-  p::NFFTPlan{Tp,D,1},
-  fHat::AbstractVector{U},
-  g::StridedArray{<:Union{Tg, Complex{Tg}}, D},
-) where {D, Tp <: Real, Tg <: Real, U}
+  p::NFFTPlan{<:Real,D,1},
+  fHat::AbstractVector{<:RealOrComplex},
+  g::StridedArray{<:RealOrComplex, D},
+) where {D}
 
   #threaded_mul!(vec(g), p.B, fHat)
   mul!(vec(g), p.B, fHat)
@@ -188,8 +202,8 @@ function convolve_blocking!(p::NFFTPlan, g, fHat)
   _convolve_blocking!(p, g, fHat, L, winPoly)
 end
 
-function _convolve_blocking!(p::NFFTPlan{T,D,1}, g, fHat, L, winPoly) where {D,T}
-  scale = Int(p.params.LUTSize/(p.params.m))
+function _convolve_blocking!(p::NFFTPlan{<:Real,D,1}, g, fHat, L, winPoly) where {D}
+  scale = Int(p.params.LUTSize / p.params.m)
 
   @cthreads for l in CartesianIndices(size(p.blocks))
     if !isempty(p.nodesInBlock[l])
@@ -201,7 +215,7 @@ function _convolve_blocking!(p::NFFTPlan{T,D,1}, g, fHat, L, winPoly) where {D,T
   end
 end
 
-@generated function toBlock!(p::NFFTPlan{T,D,1}, g, block, off) where {T,D}
+@generated function toBlock!(p::NFFTPlan{<:Real,D,1}, g, block, off) where {D}
 quote
   if off[1] + 2 < 1
     LA = -(off[1] + 2) + 1
@@ -213,7 +227,7 @@ quote
   end
 
   if offB + size(block,1) > size(g,1)
-    LB = size(g,1) - (offB )
+    LB = size(g,1) - offB
     offC = -LB
   else
     LB = size(block,1)
@@ -221,7 +235,7 @@ quote
 
   if LB != size(block,1) && offC + size(block,1) > size(g,1)
     # out of bounds indices: second (or first) wrapping
-    LC = size(g,1) - (offC)
+    LC = size(g,1) - offC
     offD = -LC
   else
     # no out of bounds indices
@@ -254,8 +268,12 @@ quote
  end
 end
 
-@noinline function calcOneBlock!(p::NFFTPlan{T,D,1}, fHat, nodesInBlock, block,
-         off, L, scale, idxInBlock, winTensor, winPoly) where {D,T}
+@noinline function calcOneBlock!(
+  p::NFFTPlan{<:Real,D,1},
+  fHat, nodesInBlock, block,
+  off, L, scale, idxInBlock, winTensor, winPoly,
+) where {D}
+
   for (jLocal,j) in enumerate(nodesInBlock)
     fHat[j] = calcOneNode!(p, block, off, L, scale, j, jLocal,
                                idxInBlock, winTensor, winPoly)
@@ -264,8 +282,11 @@ end
 end
 
 
-@generated function calcOneNode!(p::NFFTPlan{T,D,1}, block, off, L::Val{Z}, scale,
-                   j, jLocal, idxInBlock, winTensor, winPoly) where {D,T,Z}
+@generated function calcOneNode!(
+  p::NFFTPlan{T,D,1}, block, off, L::Val{Z}, scale,
+  j, jLocal, idxInBlock, winTensor, winPoly,
+) where {D, T <: Real, Z}
+
   quote
     @nexprs $(D) d -> ((off_d, tmpWin_d) = precomputeOneNodeBlocking(p.windowLinInterp, winTensor, winPoly, scale,
                       jLocal, d, L, idxInBlock) )
@@ -302,10 +323,10 @@ function convolve_transpose_blocking!(p::NFFTPlan, fHat, g)
 end
 
 
-function _convolve_transpose_blocking!(p::NFFTPlan{T,D,1}, fHat, g, L, winPoly) where {D,T}
+function _convolve_transpose_blocking!(p::NFFTPlan{T,D,1}, fHat, g, L, winPoly) where {D, T <: Real}
   #g .= zero(T)
   ccall(:memset, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), g, 0, sizeof(g))
-  scale = Int(p.params.LUTSize/(p.params.m))
+  scale = Int(p.params.LUTSize / p.params.m)
 
   lk = ReentrantLock()
   @cthreads for l in CartesianIndices(size(p.blocks))
@@ -325,7 +346,7 @@ function _convolve_transpose_blocking!(p::NFFTPlan{T,D,1}, fHat, g, L, winPoly)
 end
 
 
-@generated function addBlock!(p::NFFTPlan{T,D,1}, g, block, off) where {T,D}
+@generated function addBlock!(p::NFFTPlan{<:Real,D,1}, g, block, off) where {D}
   quote
     # addBlock! needs to wrap indices. The easies is to apply a modulo operation (rem)
     # but doing so for each index is slow. We thus only do this for the outer dimensions
@@ -349,7 +370,7 @@ end
 
     if offB + size(block,1) > size(g,1)
       # out of bounds indices: second (or first) wrapping
-      LB = size(g,1) - (offB )
+      LB = size(g,1) - offB
       offC = -LB
     else
       # no out of bounds indices
@@ -358,7 +379,7 @@ end
 
     if LB != size(block,1) && offC + size(block,1) > size(g,1)
       # out of bounds indices: second (or first) wrapping
-      LC = size(g,1) - (offC)
+      LC = size(g,1) - offC
       offD = -LC
     else
       # no out of bounds indices
@@ -392,8 +413,11 @@ end
 end
 
 
-@noinline function fillBlock!(p::NFFTPlan{T,D,1}, fHat, block, nodesInBlock, off, L::Val{Z}, scale,
-                              idxInBlock, winTensor, winPoly) where {T,D,Z}
+@noinline function fillBlock!(
+  p::NFFTPlan{<:Real,D,1}, fHat, block, nodesInBlock, off, L::Val{Z}, scale,
+  idxInBlock, winTensor, winPoly,
+) where {D, Z}
+
   if (Threads.nthreads() == 1 || !NFFT._use_threads[]) &&
       (D >= 3 && Z >= 16) || (D == 2 && Z >= 16) # magic
     for (jLocal,j) in enumerate(nodesInBlock)
@@ -408,8 +432,11 @@ end
 end
 
 
-@generated function fillOneNode!(p::NFFTPlan{T,D,1}, fHat, block, off, L::Val{Z}, scale,
-              j, jLocal, idxInBlock, winTensor, winPoly) where {D,T,Z}
+@generated function fillOneNode!(
+  p::NFFTPlan{T,D,1}, fHat, block, off, L::Val{Z}, scale,
+  j, jLocal, idxInBlock, winTensor, winPoly,
+) where {D, T <: Real, Z}
+
   quote
     fHat_ = fHat[j]
 
@@ -435,8 +462,11 @@ end
 end
 
 
-@generated function fillOneNode2!(p::NFFTPlan{T,D,1}, fHat, block, off, L::Val{Z}, scale,
-              j, jLocal, idxInBlock, winTensor, winPoly) where {D,T,Z}
+@generated function fillOneNode2!(
+  p::NFFTPlan{T,D,1}, fHat, block, off, L::Val{Z}, scale,
+  j, jLocal, idxInBlock, winTensor, winPoly,
+) where {D, T <: Real, Z}
+
   quote
     fHat_ = fHat[j]
 
@@ -460,12 +490,15 @@ end
 end
 
 
-@generated function fillOneNodeReal!(p::NFFTPlan{T,D,1}, fHat, block, off, L::Val{Z}, scale, j,
-                jLocal, idxInBlock, winTensor, winPoly) where {D,T,Z}
+@generated function fillOneNodeReal!(
+  p::NFFTPlan{T,D,1}, fHat, block, off, L::Val{Z}, scale, j,
+  jLocal, idxInBlock, winTensor, winPoly,
+) where {D, T <: Real, Z}
+
   quote
     fHat_ = fHat[j]
 
-    @nexprs $(D) d -> ((off_d, tmpWin_d) =  precomputeOneNodeBlocking(p.windowLinInterp, winTensor, winPoly, scale, jLocal,
+    @nexprs $(D) d -> ((off_d, tmpWin_d) = precomputeOneNodeBlocking(p.windowLinInterp, winTensor, winPoly, scale, jLocal,
                                               d, L, idxInBlock) )
 
     innerWin = Vector{Float64}(undef,$(2*Z)) # Probably cache me somewhere

From 975df61410a10ce8b66a7369f2957a5ce75fe234 Mon Sep 17 00:00:00 2001
From: Jeff Fessler <fessler@umich.edu>
Date: Wed, 31 Dec 2025 14:33:56 -0500
Subject: [PATCH 04/10] To v0.14.2

---
 Project.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Project.toml b/Project.toml
index 2535b12b..dc228061 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "NFFT"
 uuid = "efe261a4-0d2b-5849-be55-fc731d526b0d"
-authors = ["Tobias Knopp <tobias@knoppweb.de>"]
-version = "0.14.1"
+authors = ["Tobias Knopp <tobias@knoppweb.de> and contributors"]
+version = "0.14.2"
 
 [deps]
 AbstractNFFTs = "7f219486-4aa7-41d6-80a7-e08ef20ceed7"

From 0c1640fa2582b9bfe963bdd5a8756ac5be804737 Mon Sep 17 00:00:00 2001
From: Jeff Fessler <fessler@umich.edu>
Date: Thu, 1 Jan 2026 11:38:08 -0500
Subject: [PATCH 05/10] Test convolve!

---
 test/convolve.jl | 92 ++++++++++++++++++++++++++++++++++++++++++++++++
 test/runtests.jl |  1 +
 2 files changed, 93 insertions(+)
 create mode 100644 test/convolve.jl

diff --git a/test/convolve.jl b/test/convolve.jl
new file mode 100644
index 00000000..0248b60b
--- /dev/null
+++ b/test/convolve.jl
@@ -0,0 +1,92 @@
+#=
+Test convolution (interpolation) and its adjoint in isolation
+
+One must be careful with Float16 output arrays here,
+because plan.windowLinInterp has values above 1e8
+that exceed the 6e4 maximum of Float16.
+
+FFTW does not support Float16 plans as of 2026-01-01.
+=#
+
+using NFFT: plan_nfft
+using AbstractNFFTs: convolve!, convolve_transpose!
+import NFFT # LINEAR, FULL, TENSOR, POLYNOMIAL
+using Test: @test, @testset
+
+# determine worst precision of two types
+function worst(T1::Type{<:Number}, T2::Type{<:Number})
+    return promote_type(T1, T2) == T1 ? T2 : T1
+end
+worst(a, b, c) = worst(worst(a, b), c)
+
+
+@testset "convolve!" begin
+
+  N = (9, 8) # 9×8 grid of equally spaced sampling points
+  Tb = BigFloat
+  fun = N -> @. Tb((0:(N-1)) / N - 0.5) # 1D grid
+  nodes = hcat([[x; y] for x in fun(N[1]), y in fun(N[2])]...)
+
+  T16 = Complex{Float16}
+  T32 = Complex{Float32}
+  T64 = Complex{Float64}
+  @test worst(T16, T64) == T16
+
+  J = prod(N) # for f
+  Ñ = 2 .* N # for g
+
+  for pre in [NFFT.LINEAR, NFFT.FULL, NFFT.TENSOR, NFFT.POLYNOMIAL]
+    p32 = plan_nfft(Float32.(nodes), N, m = 5, σ = 2.0, precompute=pre)
+    p64 = plan_nfft(Float64.(nodes), N, m = 5, σ = 2.0, precompute=pre)
+
+    @assert Ñ == p64.Ñ
+    f16 = Vector{T16}(undef, J)
+
+    g64 = rand(T64, Ñ)
+    g32 = T32.(g64)
+    g16 = T16.(g64)
+    @test g32 ≈ g64
+    @test g16 ≈ g64
+
+    fff = Vector{T64}(undef, J) # ground-truth reference
+    @test fff == convolve!(p64, g64, fff)
+
+    fmax = maximum(abs, fff) # need to avoid avoid overflow with T16
+
+    for Tg in (T16, T32, T64), Tf in (T32, T64)
+      f = Vector{Tf}(undef, J)
+      @test f == convolve!(p64, Tg.(g64), f)
+      T = worst(Tg, Tf)
+      @test f/fmax ≈ T.(fff/fmax)
+
+      @test f == convolve!(p32, Tg.(g64), f)
+      T = worst(Tg, Tf, T32)
+      @test f/fmax ≈ T.(fff/fmax)
+    end
+
+    # Float16 test with careful scaling:
+    @test f16 == convolve!(p64, g64/fmax, f16)
+    @test f16 ≈ fff/fmax
+    @test f16 == convolve!(p32, g64/fmax, f16)
+    @test f16 ≈ fff/fmax
+
+  end
+
+end # @testset
+
+#=
+
+  for pre in [NFFT.LINEAR, NFFT.FULL, NFFT.TENSOR, NFFT.POLYNOMIAL]
+@show pre
+    p = plan_nfft(nodes, N, m = 5, σ = 2.0, precompute=pre)
+
+    # ensure that convolve! works with mixed precisions
+    for Tf in (Float16, Float64), Tg in (Float16, Float64)
+        f = rand(Complex{Tf}, p.J)
+        g = rand(Complex{Tg}, p.N)
+        @test f == convolve!(p, g, f)
+        @test g == convolve_transpose!(p, g, f)
+    end
+  end
+=#
+
diff --git a/test/runtests.jl b/test/runtests.jl
index 1b47482b..6b215d7c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -17,6 +17,7 @@ arrayTypes = areTypesDefined ? arrayTypes : [JLArray]
     include("issues.jl")
     include("accuracy.jl")
     include("constructors.jl")
+    include("convolve.jl")
     include("performance.jl")
     include("testToeplitz.jl")
     include("samplingDensity.jl")

From 21f0e22bfd1f16766751e9249021026050ea1eb0 Mon Sep 17 00:00:00 2001
From: Jeff Fessler <fessler@umich.edu>
Date: Thu, 1 Jan 2026 12:34:33 -0500
Subject: [PATCH 06/10] size checks

---
 src/convolution.jl | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/convolution.jl b/src/convolution.jl
index 2c4210c5..ea5c67c2 100644
--- a/src/convolution.jl
+++ b/src/convolution.jl
@@ -22,6 +22,10 @@ function AbstractNFFTs.convolve!(
   fHat::StridedVector{<:RealOrComplex}, # mutated
 ) where {D}
 
+  size(g) == p.Ñ ||
+    throw(DimensionMismatch("size(g)=$(size(g)) ≠ Ñ = $(p.Ñ)"))
+  size(fHat) == (p.J,) ||
+   throw(DimensionMismatch("size(fHat)=$(size(fHat)) ≠ J = $(p.J)"))
   (eltype(g) <: Complex) && (eltype(fHat) <: Real) &&
     throw(ArgumentError("Complex input g requires Complex output fHat"))
 
@@ -103,6 +107,10 @@ function AbstractNFFTs.convolve_transpose!(
   g::StridedArray{<:RealOrComplex, D}, # mutated
 ) where {D}
 
+  size(g) == p.Ñ ||
+    throw(DimensionMismatch("size(g)=$(size(g)) ≠ Ñ = $(p.Ñ)"))
+  size(fHat) == (p.J,) ||
+    throw(DimensionMismatch("size(fHat)=$(size(fHat)) ≠ J = $(p.J)"))
   (eltype(fHat) <: Complex) && (eltype(g) <: Real) &&
     throw(ArgumentError("Complex input fHat requires Complex output g"))
 

From 6294d39b7cc626b0a9d8987d6a732f9ed7fad5d4 Mon Sep 17 00:00:00 2001
From: Jeff Fessler <fessler@umich.edu>
Date: Thu, 1 Jan 2026 12:45:10 -0500
Subject: [PATCH 07/10] Add convolve_transpose tests and real tests

---
 test/convolve.jl | 158 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 100 insertions(+), 58 deletions(-)

diff --git a/test/convolve.jl b/test/convolve.jl
index 0248b60b..220809fa 100644
--- a/test/convolve.jl
+++ b/test/convolve.jl
@@ -11,82 +11,124 @@ FFTW does not support Float16 plans as of 2026-01-01.
 using NFFT: plan_nfft
 using AbstractNFFTs: convolve!, convolve_transpose!
 import NFFT # LINEAR, FULL, TENSOR, POLYNOMIAL
-using Test: @test, @testset
+using Test: @test, @testset, @test_throws
 
-# determine worst precision of two types
+# Determine worst precision of two types
+# (Does Julia already have such a method?)
 function worst(T1::Type{<:Number}, T2::Type{<:Number})
     return promote_type(T1, T2) == T1 ? T2 : T1
 end
 worst(a, b, c) = worst(worst(a, b), c)
 
 
-@testset "convolve!" begin
-
-  N = (9, 8) # 9×8 grid of equally spaced sampling points
-  Tb = BigFloat
-  fun = N -> @. Tb((0:(N-1)) / N - 0.5) # 1D grid
-  nodes = hcat([[x; y] for x in fun(N[1]), y in fun(N[2])]...)
-
-  T16 = Complex{Float16}
-  T32 = Complex{Float32}
-  T64 = Complex{Float64}
-  @test worst(T16, T64) == T16
-
-  J = prod(N) # for f
-  Ñ = 2 .* N # for g
-
-  for pre in [NFFT.LINEAR, NFFT.FULL, NFFT.TENSOR, NFFT.POLYNOMIAL]
-    p32 = plan_nfft(Float32.(nodes), N, m = 5, σ = 2.0, precompute=pre)
-    p64 = plan_nfft(Float64.(nodes), N, m = 5, σ = 2.0, precompute=pre)
+N = (9, 8) # 9×8 grid of equally spaced sampling points
+Tb = BigFloat
+fun = N -> @. Tb((0:(N-1)) / N - 0.5) # 1D grid
+nodes = hcat([[x; y] for x in fun(N[1]), y in fun(N[2])]...)
 
-    @assert Ñ == p64.Ñ
-    f16 = Vector{T16}(undef, J)
+F16 = Float16
+F32 = Float32
+F64 = Float64
+C16 = Complex{F16}
+C32 = Complex{F32}
 
-    g64 = rand(T64, Ñ)
-    g32 = T32.(g64)
-    g16 = T16.(g64)
-    @test g32 ≈ g64
-    @test g16 ≈ g64
+@testset "worst" begin
+  @test worst(F64, F16, F32) == F16
+  @test worst(C16, C32) == C16
+end
 
-    fff = Vector{T64}(undef, J) # ground-truth reference
-    @test fff == convolve!(p64, g64, fff)
+J = prod(N) # for f
+Ñ = 2 .* N # for g
 
-    fmax = maximum(abs, fff) # need to avoid avoid overflow with T16
+@testset "convolve-throws" begin
+  # throw error when input is complex and output is real
+  p = plan_nfft(Float32.(nodes), N, m = 5, σ = 2.0, precompute=NFFT.LINEAR)
+  @test_throws ArgumentError convolve!(p, ones(C32, Ñ), zeros(Float32, J))
+  @test_throws ArgumentError convolve_transpose!(p, ones(C32, J), zeros(Float32,  Ñ))
 
-    for Tg in (T16, T32, T64), Tf in (T32, T64)
-      f = Vector{Tf}(undef, J)
-      @test f == convolve!(p64, Tg.(g64), f)
-      T = worst(Tg, Tf)
-      @test f/fmax ≈ T.(fff/fmax)
+  # or when dimensions mismatch
+  @test_throws DimensionMismatch convolve!(p, ones(C32, Ñ), zeros(C32, 2))
+  @test_throws DimensionMismatch convolve!(p, ones(C32, N), zeros(C32, J))
+  @test_throws DimensionMismatch convolve_transpose!(p, ones(C32, 2), zeros(C32, Ñ))
+  @test_throws DimensionMismatch convolve_transpose!(p, ones(C32, J), zeros(C32, N))
+end
 
-      @test f == convolve!(p32, Tg.(g64), f)
-      T = worst(Tg, Tf, T32)
-      @test f/fmax ≈ T.(fff/fmax)
-    end
 
-    # Float16 test with careful scaling:
-    @test f16 == convolve!(p64, g64/fmax, f16)
-    @test f16 ≈ fff/fmax
-    @test f16 == convolve!(p32, g64/fmax, f16)
-    @test f16 ≈ fff/fmax
+@testset "convolve!" begin
 
-  end
+  for pre in [NFFT.LINEAR, NFFT.FULL, NFFT.TENSOR, NFFT.POLYNOMIAL]
+    p32 = plan_nfft(Float32.(nodes), N, m = 5, σ = 2.0, precompute=pre)
+    p64 = plan_nfft(Float64.(nodes), N, m = 5, σ = 2.0, precompute=pre)
+    @assert Ñ == p64.Ñ
 
+    for tfun in (identity, t -> Complex{t}) # test both Real and Complex
+      T16 = tfun(F16)
+      T32 = tfun(F32)
+      T64 = tfun(F64)
+
+      g64 = rand(T64, Ñ)
+      fff = Vector{T64}(undef, J) # ground-truth reference
+      @test fff == convolve!(p64, g64, fff)
+
+      fmax = maximum(abs, fff) # need to avoid avoid overflow with Float16
+
+      for Tg in (T16, T32, T64), Tf in (T32, T64)
+        f = Vector{Tf}(undef, J)
+        @test f == convolve!(p64, Tg.(g64), f)
+        T = worst(Tg, Tf)
+        @test f/fmax ≈ T.(fff/fmax)
+
+        @test f == convolve!(p32, Tg.(g64), f)
+        T = worst(Tg, Tf, T32)
+        @test f / fmax ≈ T.(fff / fmax)
+      end
+
+      # Float16 test with careful scaling:
+      f16 = Vector{T16}(undef, J)
+      @test f16 == convolve!(p64, g64/fmax, f16)
+      @test f16 ≈ fff / fmax
+      @test f16 == convolve!(p32, g64/fmax, f16)
+      @test f16 ≈ fff / fmax
+    end # tfun
+  end # pre
 end # @testset
 
-#=
+
+@testset "convolve_transpose!" begin
 
   for pre in [NFFT.LINEAR, NFFT.FULL, NFFT.TENSOR, NFFT.POLYNOMIAL]
-@show pre
-    p = plan_nfft(nodes, N, m = 5, σ = 2.0, precompute=pre)
-
-    # ensure that convolve! works with mixed precisions
-    for Tf in (Float16, Float64), Tg in (Float16, Float64)
-        f = rand(Complex{Tf}, p.J)
-        g = rand(Complex{Tg}, p.N)
-        @test f == convolve!(p, g, f)
-        @test g == convolve_transpose!(p, g, f)
-    end
-  end
-=#
+    p32 = plan_nfft(Float32.(nodes), N, m = 5, σ = 2.0, precompute=pre)
+    p64 = plan_nfft(Float64.(nodes), N, m = 5, σ = 2.0, precompute=pre)
+    @assert Ñ == p64.Ñ
 
+    for tfun in (identity, t -> Complex{t}) # test both Real and Complex
+      T16 = tfun(F16)
+      T32 = tfun(F32)
+      T64 = tfun(F64)
+
+      f64 = rand(T64, J)
+      ggg = Array{T64}(undef, Ñ) # ground-truth reference
+      @test ggg == convolve_transpose!(p64, f64, ggg)
+
+      gmax = maximum(abs, ggg) # need to avoid avoid overflow with Float16
+
+      for Tg in (T32, T64), Tf in (T16, T32, T64)
+        g = similar(ggg, Tg)
+        @test g == convolve_transpose!(p64, Tf.(f64), g)
+        T = worst(Tg, Tf)
+        @test g / gmax ≈ T.(ggg / gmax)
+
+        @test g == convolve_transpose!(p32, Tf.(f64), g)
+        T = worst(Tg, Tf, T32)
+        @test g / gmax ≈ T.(ggg / gmax)
+      end
+
+      # Float16 test with careful scaling:
+      g16 = Array{T16}(undef, Ñ)
+      @test g16 == convolve_transpose!(p64, f64/gmax, g16)
+      @test g16 ≈ ggg / gmax
+      @test g16 == convolve_transpose!(p32, f64/gmax, g16)
+      @test g16 ≈ ggg / gmax
+    end # pre
+  end # tfun
+end # @testset

From de750c59b9d05a655a6120828dcedfa38cf01a98 Mon Sep 17 00:00:00 2001
From: Jeff Fessler <fessler@umich.edu>
Date: Thu, 1 Jan 2026 12:51:14 -0500
Subject: [PATCH 08/10] =?UTF-8?q?Fix=20size=20to=20=C3=91=20in=20comment?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 AbstractNFFTs/src/interface.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/AbstractNFFTs/src/interface.jl b/AbstractNFFTs/src/interface.jl
index fd8c7913..6f10b7cd 100644
--- a/AbstractNFFTs/src/interface.jl
+++ b/AbstractNFFTs/src/interface.jl
@@ -225,7 +225,7 @@ Overwrite `R`-dimensional array `fHat`
 (often R=1, and `fHat` has length `p.J`)
 with the result of "convolution" (i.e., interpolation)
 between `D`-dimensional equispaced input array `g`
-(often of size `p.N`)
+(often of size `p.Ñ`)
 and the interpolation kernel in NFFT plan `p`.
 """
 @mustimplement convolve!(p::AbstractNFFTPlan, g::AbstractArray, fHat::AbstractArray)

From 18e76ec5ca7e9622d82f780fd430895ea1d76efd Mon Sep 17 00:00:00 2001
From: Jeff Fessler <fessler@umich.edu>
Date: Fri, 2 Jan 2026 09:34:12 -0500
Subject: [PATCH 09/10] Multiple dispatch for complex/real mismatch

---
 src/convolution.jl | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/src/convolution.jl b/src/convolution.jl
index ea5c67c2..c771a54e 100644
--- a/src/convolution.jl
+++ b/src/convolution.jl
@@ -5,6 +5,7 @@ These are core NFFT operations.
 
 The methods allow both real and complex types
 because sampling density compensation vectors are real.
+However, a Complex input with a Real output is verboten.
 =#
 
 #=
@@ -26,8 +27,10 @@ function AbstractNFFTs.convolve!(
     throw(DimensionMismatch("size(g)=$(size(g)) ≠ Ñ = $(p.Ñ)"))
   size(fHat) == (p.J,) ||
    throw(DimensionMismatch("size(fHat)=$(size(fHat)) ≠ J = $(p.J)"))
-  (eltype(g) <: Complex) && (eltype(fHat) <: Real) &&
-    throw(ArgumentError("Complex input g requires Complex output fHat"))
+
+# multiple dispatch (next function) obviates need for this test
+# (eltype(g) <: Complex) && (eltype(fHat) <: Real) &&
+#   throw(ArgumentError("Complex input g requires Complex output fHat"))
 
   if isempty(p.B)
     if p.params.blocking
@@ -41,6 +44,14 @@ function AbstractNFFTs.convolve!(
   return fHat
 end
 
+function AbstractNFFTs.convolve!(
+  ::NFFTPlan{<:Real,D,1},
+  ::AbstractArray{<:Complex, D},
+  ::StridedVector{<:Real},
+) where {D}
+  throw(ArgumentError("Complex input g requires Complex output fHat"))
+end
+
 
 function convolve_nonblocking!(p::NFFTPlan, g, fHat)
   L = Val(2*p.params.m)
@@ -111,8 +122,10 @@ function AbstractNFFTs.convolve_transpose!(
     throw(DimensionMismatch("size(g)=$(size(g)) ≠ Ñ = $(p.Ñ)"))
   size(fHat) == (p.J,) ||
     throw(DimensionMismatch("size(fHat)=$(size(fHat)) ≠ J = $(p.J)"))
-  (eltype(fHat) <: Complex) && (eltype(g) <: Real) &&
-    throw(ArgumentError("Complex input fHat requires Complex output g"))
+
+# multiple dispatch (next function) obviates need for this test
+# (eltype(fHat) <: Complex) && (eltype(g) <: Real) &&
+#   throw(ArgumentError("Complex input fHat requires Complex output g"))
 
   if isempty(p.B)
     if p.params.blocking
@@ -127,6 +140,15 @@ function AbstractNFFTs.convolve_transpose!(
 end
 
 
+function AbstractNFFTs.convolve_transpose!(
+  ::NFFTPlan{<:Real,D,1},
+  ::StridedVector{<:Real},
+  ::AbstractArray{<:Complex, D},
+) where {D}
+  throw(ArgumentError("Complex input fHat requires Complex output g"))
+end
+
+
 function convolve_transpose_nonblocking!(p::NFFTPlan, fHat, g)
   L = Val(2*p.params.m)
   winPoly = makePolyArrayStatic(p)

From 47a8c92bc1fc4251b4f2f8834ee90a1b570a8c44 Mon Sep 17 00:00:00 2001
From: Jeff Fessler <fessler@umich.edu>
Date: Fri, 2 Jan 2026 10:33:45 -0500
Subject: [PATCH 10/10] Fix

---
 src/convolution.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/convolution.jl b/src/convolution.jl
index c771a54e..7106bb24 100644
--- a/src/convolution.jl
+++ b/src/convolution.jl
@@ -142,8 +142,8 @@ end
 
 function AbstractNFFTs.convolve_transpose!(
   ::NFFTPlan{<:Real,D,1},
-  ::StridedVector{<:Real},
-  ::AbstractArray{<:Complex, D},
+  ::AbstractVector{<:Complex},
+  ::StridedArray{<:Real, D},
 ) where {D}
   throw(ArgumentError("Complex input fHat requires Complex output g"))
 end