diff --git a/AbstractNFFTs/src/interface.jl b/AbstractNFFTs/src/interface.jl
index fd8c7913..6f10b7cd 100644
--- a/AbstractNFFTs/src/interface.jl
+++ b/AbstractNFFTs/src/interface.jl
@@ -225,7 +225,7 @@ Overwrite `R`-dimensional array `fHat`
 (often R=1, and `fHat` has length `p.J`)
 with the result of "convolution" (i.e., interpolation)
 between `D`-dimensional equispaced input array `g`
-(often of size `p.N`)
+(often of size `p.Ñ`)
 and the interpolation kernel in NFFT plan `p`.
 """
 @mustimplement convolve!(p::AbstractNFFTPlan, g::AbstractArray, fHat::AbstractArray)
diff --git a/Project.toml b/Project.toml
index 2535b12b..dc228061 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "NFFT"
 uuid = "efe261a4-0d2b-5849-be55-fc731d526b0d"
-authors = ["Tobias Knopp <tobias@knoppweb.de>"]
-version = "0.14.1"
+authors = ["Tobias Knopp <tobias@knoppweb.de> and contributors"]
+version = "0.14.2"
 
 [deps]
 AbstractNFFTs = "7f219486-4aa7-41d6-80a7-e08ef20ceed7"
diff --git a/src/convolution.jl b/src/convolution.jl
index b4a8df19..7106bb24 100644
--- a/src/convolution.jl
+++ b/src/convolution.jl
@@ -1,5 +1,37 @@
+#=
+convolution.jl
+Convolution (aka interpolation) methods and their adjoints.
+These are core NFFT operations.
+
+The methods allow both real and complex types
+because sampling density compensation vectors are real.
+However, a Complex input with a Real output is verboten.
+=#
+
+#=
+using NFFT: NFFTPlan, @cthreads, @nloops_
+using LinearAlgebra: mul!
+import Base.Cartesian: @nexprs, @nref
+import AbstractNFFTs
+=#
+
+const RealOrComplex = Union{T, Complex{T}} where {T <: Real}
+
+function AbstractNFFTs.convolve!(
+  p::NFFTPlan{<:Real,D,1},
+  g::AbstractArray{<:RealOrComplex, D},
+  fHat::StridedVector{<:RealOrComplex}, # mutated
+) where {D}
+
+  size(g) == p.Ñ ||
+    throw(DimensionMismatch("size(g)=$(size(g)) ≠ Ñ = $(p.Ñ)"))
+  size(fHat) == (p.J,) ||
+   throw(DimensionMismatch("size(fHat)=$(size(fHat)) ≠ J = $(p.J)"))
+
+# multiple dispatch (next function) obviates need for this test
+# (eltype(g) <: Complex) && (eltype(fHat) <: Real) &&
+#   throw(ArgumentError("Complex input g requires Complex output fHat"))
 
-function AbstractNFFTs.convolve!(p::NFFTPlan{T,D,1}, g::AbstractArray{Complex{T},D}, fHat::StridedVector{U}) where {D,T,U}
   if isempty(p.B)
     if p.params.blocking
       convolve_blocking!(p, g, fHat)
@@ -9,17 +41,27 @@ function AbstractNFFTs.convolve!(p::NFFTPlan{T,D,1}, g::AbstractArray{Complex{T}
   else
     convolve_sparse_matrix!(p, g, fHat)
   end
-  return
+  return fHat
+end
+
+function AbstractNFFTs.convolve!(
+  ::NFFTPlan{<:Real,D,1},
+  ::AbstractArray{<:Complex, D},
+  ::StridedVector{<:Real},
+) where {D}
+  throw(ArgumentError("Complex input g requires Complex output fHat"))
 end
 
+
 function convolve_nonblocking!(p::NFFTPlan, g, fHat)
   L = Val(2*p.params.m)
   winPoly = makePolyArrayStatic(p)
   _convolve_nonblocking!(p, g, fHat, L, winPoly)
 end
 
+
 function _convolve_nonblocking!(p::NFFTPlan, g, fHat, L, winPoly)
-  scale = Int(p.params.LUTSize/(p.params.m))
+  scale = Int(p.params.LUTSize / p.params.m)
 
   @cthreads for j in 1:p.J
       fHat[j] = _convolve_nonblocking(p, g, L, winPoly, scale, j)
@@ -27,18 +69,27 @@ function _convolve_nonblocking!(p::NFFTPlan, g, fHat, L, winPoly)
 end
 
 
-@generated function _convolve_nonblocking(p::NFFTPlan{T,D,1}, g::AbstractArray{Complex{T},D}, L::Val{Z}, winPoly, scale, j) where {D,T,Z}
+@generated function _convolve_nonblocking(
+  p::NFFTPlan{Tp,D,1},
+  g::AbstractArray{<: Union{Tg, Complex{Tg}}, D},
+  L::Val{Z},
+  winPoly,
+  scale::Int,
+  j::Int,
+) where {D, Tp <: Real, Tg <: Real, Z}
+
   quote
-    @nexprs $(D) d -> ((tmpIdx_d, tmpWin_d) = precomputeOneNode(p.windowLinInterp, winPoly, p.k, p.Ñ, 
-                                                 p.params.m, p.params.σ, scale, j, d, L, p.params.LUTSize)  )
-  
+    @nexprs $(D) d -> ((tmpIdx_d, tmpWin_d) = precomputeOneNode(p.windowLinInterp, winPoly, p.k, p.Ñ,
+                                                 p.params.m, p.params.σ, scale, j, d, L, p.params.LUTSize) )
+
+    T = promote_type(Tp, Tg)
     fHat = zero(Complex{T})
 
     @nexprs 1 d -> prodWin_{$D} = one(T)
     @nloops_ $D l d -> 1:$Z d->begin
       # preexpr
       prodWin_{d-1} = prodWin_d * tmpWin_d[l_d]
-      gidx_d = tmpIdx_d[l_d] 
+      gidx_d = tmpIdx_d[l_d]
     end begin
       # bodyexpr
       fHat += prodWin_0 * @nref $D g gidx
@@ -47,14 +98,35 @@ end
   end
 end
 
-function convolve_sparse_matrix!(p::NFFTPlan{T,D,1}, g::AbstractArray{Complex{T},D}, fHat::StridedVector{U}) where {D,T,U}
+
+function convolve_sparse_matrix!(
+  p::NFFTPlan{<:Real,D,1},
+  g::AbstractArray{<:RealOrComplex, D},
+  fHat::StridedVector{<:RealOrComplex},
+) where {D}
+
   threaded_mul!(fHat, transpose(p.B), vec(g))
   #mul!(fHat, transpose(p.B), vec(g))
 end
 
+
 ########## convolve adjoint ##########
 
-function AbstractNFFTs.convolve_transpose!(p::NFFTPlan{T,D,1}, fHat::AbstractVector{U}, g::StridedArray{Complex{T},D}) where {D,T,U}
+function AbstractNFFTs.convolve_transpose!(
+  p::NFFTPlan{<:Real,D,1},
+  fHat::AbstractVector{<:RealOrComplex},
+  g::StridedArray{<:RealOrComplex, D}, # mutated
+) where {D}
+
+  size(g) == p.Ñ ||
+    throw(DimensionMismatch("size(g)=$(size(g)) ≠ Ñ = $(p.Ñ)"))
+  size(fHat) == (p.J,) ||
+    throw(DimensionMismatch("size(fHat)=$(size(fHat)) ≠ J = $(p.J)"))
+
+# multiple dispatch (next function) obviates need for this test
+# (eltype(fHat) <: Complex) && (eltype(g) <: Real) &&
+#   throw(ArgumentError("Complex input fHat requires Complex output g"))
+
   if isempty(p.B)
     if p.params.blocking
       convolve_transpose_blocking!(p, fHat, g)
@@ -64,6 +136,16 @@ function AbstractNFFTs.convolve_transpose!(p::NFFTPlan{T,D,1}, fHat::AbstractVec
   else
     convolve_transpose_sparse_matrix!(p, fHat, g)
   end
+  return g
+end
+
+
+function AbstractNFFTs.convolve_transpose!(
+  ::NFFTPlan{<:Real,D,1},
+  ::AbstractVector{<:Complex},
+  ::StridedArray{<:Real, D},
+) where {D}
+  throw(ArgumentError("Complex input fHat requires Complex output g"))
 end
 
 
@@ -74,9 +156,16 @@ function convolve_transpose_nonblocking!(p::NFFTPlan, fHat, g)
 end
 
 
-function _convolve_transpose_nonblocking!(p::NFFTPlan{T,D,1}, fHat, g, L, winPoly)  where {D,T} 
-  fill!(g, zero(T))
-  scale = Int(p.params.LUTSize/(p.params.m))
+function _convolve_transpose_nonblocking!(
+  p::NFFTPlan{<:Real,D,1},
+  fHat::AbstractVector{<:RealOrComplex},
+  g::StridedArray{<: Union{Tg, Complex{Tg}}, D},
+  L::Val,
+  winPoly,
+) where {D, Tg <: Real}
+
+  fill!(g, zero(Tg))
+  scale = Int(p.params.LUTSize / p.params.m)
 
   @inbounds @simd for j in 1:p.J
     __convolve_transpose_nonblocking!(p, fHat, g, L, winPoly, scale, j)
@@ -84,25 +173,39 @@ function _convolve_transpose_nonblocking!(p::NFFTPlan{T,D,1}, fHat, g, L, winPol
 end
 
 
-@generated function __convolve_transpose_nonblocking!(p::NFFTPlan{T,D,1}, fHat::AbstractVector{U}, g::StridedArray{Complex{T},D}, L::Val{Z}, winPoly, scale, j) where {D,T,U,Z}
+@generated function __convolve_transpose_nonblocking!(
+  p::NFFTPlan{<:Real,D,1},
+  fHat::AbstractVector{<:RealOrComplex},
+  g::StridedArray{<: Union{Tg, Complex{Tg}}, D},
+  L::Val{Z},
+  winPoly,
+  scale::Int,
+  j::Int,
+) where {D, Tg <: Real, Z}
+
   quote
-    @nexprs $(D) d -> ((tmpIdx_d, tmpWin_d) = precomputeOneNode(p.windowLinInterp, winPoly, p.k, p.Ñ, 
-                                                 p.params.m, p.params.σ, scale, j, d, L, p.params.LUTSize)  )
+    @nexprs $(D) d -> ((tmpIdx_d, tmpWin_d) = precomputeOneNode(p.windowLinInterp, winPoly, p.k, p.Ñ,
+                                                 p.params.m, p.params.σ, scale, j, d, L, p.params.LUTSize) )
 
-    @nexprs 1 d -> prodWin_{$D} = one(T)
+    @nexprs 1 d -> prodWin_{$D} = one(Tg)
     @nloops_ $D l d -> 1:$Z d->begin
       # preexpr
       prodWin_{d-1} = prodWin_d * tmpWin_d[l_d]
-      gidx_d = tmpIdx_d[l_d] 
+      gidx_d = tmpIdx_d[l_d]
     end begin
       # bodyexpr
-      (@nref $D g gidx) += prodWin_0 * fHat[j] 
+      (@nref $D g gidx) += prodWin_0 * fHat[j]
     end
   end
 end
 
-function convolve_transpose_sparse_matrix!(p::NFFTPlan{T,D,1},
-                        fHat::AbstractVector{U}, g::StridedArray{Complex{T},D}) where {D,T,U}
+
+function convolve_transpose_sparse_matrix!(
+  p::NFFTPlan{<:Real,D,1},
+  fHat::AbstractVector{<:RealOrComplex},
+  g::StridedArray{<:RealOrComplex, D},
+) where {D}
+
   #threaded_mul!(vec(g), p.B, fHat)
   mul!(vec(g), p.B, fHat)
 end
@@ -129,24 +232,24 @@ function convolve_blocking!(p::NFFTPlan, g, fHat)
   _convolve_blocking!(p, g, fHat, L, winPoly)
 end
 
-function _convolve_blocking!(p::NFFTPlan{T,D,1}, g, fHat, L, winPoly) where {D,T}
-  scale = Int(p.params.LUTSize/(p.params.m))
+function _convolve_blocking!(p::NFFTPlan{<:Real,D,1}, g, fHat, L, winPoly) where {D}
+  scale = Int(p.params.LUTSize / p.params.m)
 
   @cthreads for l in CartesianIndices(size(p.blocks))
     if !isempty(p.nodesInBlock[l])
       toBlock!(p, g, p.blocks[l], p.blockOffsets[l])
       winTensor = (p.params.precompute == TENSOR) ? p.windowTensor[l] : nothing
-      calcOneBlock!(p, fHat, p.nodesInBlock[l], p.blocks[l], p.blockOffsets[l], L, 
+      calcOneBlock!(p, fHat, p.nodesInBlock[l], p.blocks[l], p.blockOffsets[l], L,
                    scale, p.idxInBlock[l], winTensor, winPoly)
     end
   end
 end
 
-@generated function toBlock!(p::NFFTPlan{T,D,1}, g, block, off) where {T,D}
+@generated function toBlock!(p::NFFTPlan{<:Real,D,1}, g, block, off) where {D}
 quote
   if off[1] + 2 < 1
-    LA = -(off[1] + 2) + 1 
-    offA = size(g,1) - LA 
+    LA = -(off[1] + 2) + 1
+    offA = size(g,1) - LA
     offB = -LA
   else
     LA = 0
@@ -154,7 +257,7 @@ quote
   end
 
   if offB + size(block,1) > size(g,1)
-    LB = size(g,1) - (offB )  
+    LB = size(g,1) - offB
     offC = -LB
   else
     LB = size(block,1)
@@ -162,31 +265,31 @@ quote
 
   if LB != size(block,1) && offC + size(block,1) > size(g,1)
     # out of bounds indices: second (or first) wrapping
-    LC = size(g,1) - (offC)  
+    LC = size(g,1) - offC
     offD = -LC
   else
     # no out of bounds indices
     LC = size(block,1)
   end
-  
+
   @nloops_ $(D-1)  (d->l_{d+1})  (d -> 1:size(block,d+1)) d->begin
     # preexpr
-    idx_{d+1} = ( mod(l_{d+1}  + off[d+1] + p.Ñ[d+1], p.Ñ[d+1]) + 1)
+    idx_{d+1} = ( mod(l_{d+1} + off[d+1] + p.Ñ[d+1], p.Ñ[d+1]) + 1)
   end begin
     # bodyexpr
-    @inbounds @simd for l_1 = 1:LA
+    @inbounds @simd for l_1 in 1:LA
       idx_1 = l_1 + offA
       (@nref $D block l) = (@nref $D g idx)
     end
-    @inbounds @simd for l_1 = (LA+1):LB 
+    @inbounds @simd for l_1 in (LA+1):LB
         idx_1 = l_1 + offB
       (@nref $D block l) = (@nref $D g idx)
     end
-    @inbounds @simd for l_1 = (LB+1):LC
+    @inbounds @simd for l_1 in (LB+1):LC
       idx_1 = l_1 + offC
       (@nref $D block l) = (@nref $D g idx)
     end
-    @inbounds @simd for l_1 = (LC+1):size(block,1) 
+    @inbounds @simd for l_1 in (LC+1):size(block,1)
       idx_1 = l_1 + offD
       (@nref $D block l) = (@nref $D g idx)
     end
@@ -195,44 +298,52 @@ quote
  end
 end
 
-@noinline function calcOneBlock!(p::NFFTPlan{T,D,1}, fHat, nodesInBlock, block,
-         off, L, scale, idxInBlock, winTensor, winPoly) where {D,T}
+@noinline function calcOneBlock!(
+  p::NFFTPlan{<:Real,D,1},
+  fHat, nodesInBlock, block,
+  off, L, scale, idxInBlock, winTensor, winPoly,
+) where {D}
+
   for (jLocal,j) in enumerate(nodesInBlock)
-    fHat[j] = calcOneNode!(p, block, off, L, scale, j, jLocal, 
+    fHat[j] = calcOneNode!(p, block, off, L, scale, j, jLocal,
                                idxInBlock, winTensor, winPoly)
   end
   return
 end
 
 
-@generated function calcOneNode!(p::NFFTPlan{T,D,1}, block, off, L::Val{Z}, scale, 
-                   j, jLocal, idxInBlock, winTensor, winPoly) where {D,T,Z}
+@generated function calcOneNode!(
+  p::NFFTPlan{T,D,1}, block, off, L::Val{Z}, scale,
+  j, jLocal, idxInBlock, winTensor, winPoly,
+) where {D, T <: Real, Z}
+
   quote
-    @nexprs $(D) d -> ((off_d, tmpWin_d) =  precomputeOneNodeBlocking(p.windowLinInterp, winTensor, winPoly, scale, 
+    @nexprs $(D) d -> ((off_d, tmpWin_d) = precomputeOneNodeBlocking(p.windowLinInterp, winTensor, winPoly, scale,
                       jLocal, d, L, idxInBlock) )
 
     @nexprs 1 d -> prodWin_{$D} = one(T)
     @nexprs 1 d -> fHat_{$D} = zero(Complex{T})
     @nloops_ $(D-1)  (d->l_{d+1})  (d -> 1:$Z) d->begin
       # preexpr
-      block_idx_{d+1} = off_{d+1} + l_{d+1} 
+      block_idx_{d+1} = off_{d+1} + l_{d+1}
       fHat_{d} = zero(Complex{T})
     end d->begin
       # postexpr
       fHat_{d+1} += tmpWin_{d+1}[l_{d+1}] * fHat_{d}
     end begin
       # bodyexpr
-      @inbounds @simd for l_1 = 1:$Z
+      @inbounds @simd for l_1 in 1:$Z
         block_idx_1 = off_1 + l_1
         fHat_1 += tmpWin_1[l_1] * (@nref $D block block_idx)
       end
     end
 
-    @nexprs 1 d -> fHat = fHat_{$D} 
+    @nexprs 1 d -> fHat = fHat_{$D}
     return fHat
   end
 end
 
+
 ##########################
 
 function convolve_transpose_blocking!(p::NFFTPlan, fHat, g)
@@ -241,11 +352,12 @@ function convolve_transpose_blocking!(p::NFFTPlan, fHat, g)
   _convolve_transpose_blocking!(p, fHat, g, L, winPoly)
 end
 
-function _convolve_transpose_blocking!(p::NFFTPlan{T,D,1}, fHat, g, L, winPoly) where {D,T}
+
+function _convolve_transpose_blocking!(p::NFFTPlan{T,D,1}, fHat, g, L, winPoly) where {D, T <: Real}
   #g .= zero(T)
   ccall(:memset, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), g, 0, sizeof(g))
-  scale = Int(p.params.LUTSize/(p.params.m))
-  
+  scale = Int(p.params.LUTSize / p.params.m)
+
   lk = ReentrantLock()
   @cthreads for l in CartesianIndices(size(p.blocks))
     if !isempty(p.nodesInBlock[l])
@@ -253,9 +365,9 @@ function _convolve_transpose_blocking!(p::NFFTPlan{T,D,1}, fHat, g, L, winPoly)
       ccall(:memset, Ptr{Cvoid}, (Ptr{Cvoid}, Cint, Csize_t), p.blocks[l], 0, sizeof(p.blocks[l]))
 
       winTensor = (p.params.precompute == TENSOR) ? p.windowTensor[l] : nothing
-      fillBlock!(p, fHat, p.blocks[l], p.nodesInBlock[l], p.blockOffsets[l], L, scale, 
+      fillBlock!(p, fHat, p.blocks[l], p.nodesInBlock[l], p.blockOffsets[l], L, scale,
                  p.idxInBlock[l], winTensor, winPoly)
- 
+
       lock(lk) do
         addBlock!(p, g, p.blocks[l], p.blockOffsets[l])
       end
@@ -263,7 +375,8 @@ function _convolve_transpose_blocking!(p::NFFTPlan{T,D,1}, fHat, g, L, winPoly)
   end
 end
 
-@generated function addBlock!(p::NFFTPlan{T,D,1}, g, block, off) where {T,D}
+
+@generated function addBlock!(p::NFFTPlan{<:Real,D,1}, g, block, off) where {D}
   quote
     # addBlock! needs to wrap indices. The easies is to apply a modulo operation (rem)
     # but doing so for each index is slow. We thus only do this for the outer dimensions
@@ -276,8 +389,8 @@ end
 
     if off[1] + 2 < 1
       # negative indices: first wrapping
-      LA = -(off[1] + 2) + 1 
-      offA = size(g,1) - LA 
+      LA = -(off[1] + 2) + 1
+      offA = size(g,1) - LA
       offB = -LA
     else
       # no negative indices: skip first sum
@@ -287,7 +400,7 @@ end
 
     if offB + size(block,1) > size(g,1)
       # out of bounds indices: second (or first) wrapping
-      LB = size(g,1) - (offB )  
+      LB = size(g,1) - offB
       offC = -LB
     else
       # no out of bounds indices
@@ -296,31 +409,31 @@ end
 
     if LB != size(block,1) && offC + size(block,1) > size(g,1)
       # out of bounds indices: second (or first) wrapping
-      LC = size(g,1) - (offC)  
+      LC = size(g,1) - offC
       offD = -LC
     else
       # no out of bounds indices
       LC = size(block,1)
     end
-    
+
     @nloops_ $(D-1)  (d->l_{d+1})  (d -> 1:size(block,d+1)) d->begin
       # preexpr
-      idx_{d+1} = ( mod(l_{d+1}  + off[d+1] + p.Ñ[d+1], p.Ñ[d+1]) + 1)
+      idx_{d+1} = ( mod(l_{d+1} + off[d+1] + p.Ñ[d+1], p.Ñ[d+1]) + 1)
     end begin
       # bodyexpr
-      @inbounds @simd for l_1 = 1:LA
+      @inbounds @simd for l_1 in 1:LA
         idx_1 = l_1 + offA
         (@nref $D g idx) += (@nref $D block l)
       end
-      @inbounds @simd for l_1 = (LA+1):LB 
+      @inbounds @simd for l_1 in (LA+1):LB
         idx_1 = l_1 + offB
         (@nref $D g idx) += (@nref $D block l)
       end
-      @inbounds @simd for l_1 = (LB+1):LC
+      @inbounds @simd for l_1 in (LB+1):LC
         idx_1 = l_1 + offC
         (@nref $D g idx) += (@nref $D block l)
       end
-      @inbounds @simd for l_1 = (LC+1):size(block,1) 
+      @inbounds @simd for l_1 in (LC+1):size(block,1)
         idx_1 = l_1 + offD
         (@nref $D g idx) += (@nref $D block l)
       end
@@ -329,10 +442,14 @@ end
   end
 end
 
-@noinline function fillBlock!(p::NFFTPlan{T,D,1}, fHat, block, nodesInBlock, off, L::Val{Z}, scale, 
-                              idxInBlock, winTensor, winPoly) where {T,D,Z}
+
+@noinline function fillBlock!(
+  p::NFFTPlan{<:Real,D,1}, fHat, block, nodesInBlock, off, L::Val{Z}, scale,
+  idxInBlock, winTensor, winPoly,
+) where {D, Z}
+
   if (Threads.nthreads() == 1 || !NFFT._use_threads[]) &&
-      (D >= 3 && Z >= 16) || (D == 2 && Z >= 16) # magic 
+      (D >= 3 && Z >= 16) || (D == 2 && Z >= 16) # magic
     for (jLocal,j) in enumerate(nodesInBlock)
       fillOneNode2!(p, fHat, block, off, L, scale, j, jLocal, idxInBlock, winTensor, winPoly)
     end
@@ -345,8 +462,11 @@ end
 end
 
 
-@generated function fillOneNode!(p::NFFTPlan{T,D,1}, fHat, block, off, L::Val{Z}, scale, 
-              j, jLocal, idxInBlock, winTensor, winPoly) where {D,T,Z}
+@generated function fillOneNode!(
+  p::NFFTPlan{T,D,1}, fHat, block, off, L::Val{Z}, scale,
+  j, jLocal, idxInBlock, winTensor, winPoly,
+) where {D, T <: Real, Z}
+
   quote
     fHat_ = fHat[j]
 
@@ -359,11 +479,11 @@ end
     @nloops_ $(D-1)  (d->l_{d+1})  (d -> 1:$Z) d->begin
       # preexpr
       prodWin_{d} = prodWin_{d+1} * tmpWin_{d+1}[l_{d+1}]
-      block_idx_{d+1} = off_{d+1} + l_{d+1} 
+      block_idx_{d+1} = off_{d+1} + l_{d+1}
     end begin
       # bodyexpr
-      @inbounds @simd for l_1 = 1:$(Z)
-        block_idx_1 = off_1 + l_1 
+      @inbounds @simd for l_1 in 1:$(Z)
+        block_idx_1 = off_1 + l_1
         (@nref $D block block_idx) += innerWin[l_1] * prodWin_1
       end
     end
@@ -371,8 +491,12 @@ end
   end
 end
 
-@generated function fillOneNode2!(p::NFFTPlan{T,D,1}, fHat, block, off, L::Val{Z}, scale, 
-              j, jLocal, idxInBlock, winTensor, winPoly) where {D,T,Z}
+
+@generated function fillOneNode2!(
+  p::NFFTPlan{T,D,1}, fHat, block, off, L::Val{Z}, scale,
+  j, jLocal, idxInBlock, winTensor, winPoly,
+) where {D, T <: Real, Z}
+
   quote
     fHat_ = fHat[j]
 
@@ -383,11 +507,11 @@ end
     @nloops_ $(D-1)  (d->l_{d+1})  (d -> 1:$Z) d->begin
       # preexpr
       prodWin_{d} = prodWin_{d+1} * tmpWin_{d+1}[l_{d+1}]
-      block_idx_{d+1} = off_{d+1} + l_{d+1} 
+      block_idx_{d+1} = off_{d+1} + l_{d+1}
     end begin
       # bodyexpr
-      @inbounds @simd for l_1 = 1:$(Z)
-        block_idx_1 = off_1 + l_1 
+      @inbounds @simd for l_1 in 1:$(Z)
+        block_idx_1 = off_1 + l_1
         (@nref $D block block_idx) += (tmpWin_1[l_1] * prodWin_1) * fHat_
       end
     end
@@ -396,17 +520,19 @@ end
 end
 
 
+@generated function fillOneNodeReal!(
+  p::NFFTPlan{T,D,1}, fHat, block, off, L::Val{Z}, scale, j,
+  jLocal, idxInBlock, winTensor, winPoly,
+) where {D, T <: Real, Z}
 
-@generated function fillOneNodeReal!(p::NFFTPlan{T,D,1}, fHat, block, off, L::Val{Z}, scale, j, 
-                jLocal, idxInBlock, winTensor, winPoly) where {D,T,Z}
   quote
     fHat_ = fHat[j]
 
-    @nexprs $(D) d -> ((off_d, tmpWin_d) =  precomputeOneNodeBlocking(p.windowLinInterp, winTensor, winPoly, scale, jLocal, 
+    @nexprs $(D) d -> ((off_d, tmpWin_d) = precomputeOneNodeBlocking(p.windowLinInterp, winTensor, winPoly, scale, jLocal,
                                               d, L, idxInBlock) )
 
     innerWin = Vector{Float64}(undef,$(2*Z)) # Probably cache me somewhere
-    for l=1:$Z
+    for l in 1:$Z
       innerWin[2*(l-1)+1] = tmpWin_1[l] * real(fHat_)
       innerWin[2*(l-1)+2] = tmpWin_1[l] * imag(fHat_)
     end
@@ -415,7 +541,7 @@ end
     # Convert to real
     # We use unsafe_wrap here because reinterpret(T, block) is much slower
     # and reinterpret(T, reshape, block) is slightly slower than unsafe_wrap
-    # In fact the conversion to real only pays off when using unsafe_wrap, 
+    # In fact the conversion to real only pays off when using unsafe_wrap,
     # otherwise sticking with complex is faster
     U = @ntuple $(D) d -> d==1 ? 2*size(block,1) : size(block,d)
     blockReal = unsafe_wrap(Array,reinterpret(Ptr{T},pointer(block)), U)
@@ -425,11 +551,11 @@ end
     @nloops_ $(D-1)  (d->l_{d+1})  (d -> 1:$Z) d->begin
       # preexpr
       prodWin_{d} = prodWin_{d+1} * tmpWin_{d+1}[l_{d+1}]
-      block_idx_{d+1} = off_{d+1} + l_{d+1} 
+      block_idx_{d+1} = off_{d+1} + l_{d+1}
     end begin
       # bodyexpr
-      @inbounds @simd for l_1 = 1:$(2*Z)
-        block_idx_1 = off_1 + l_1 
+      @inbounds @simd for l_1 in 1:$(2*Z)
+        block_idx_1 = off_1 + l_1
         (@nref $(D) blockReal block_idx) += innerWin[l_1] * prodWin_1
       end
     end
diff --git a/test/convolve.jl b/test/convolve.jl
new file mode 100644
index 00000000..220809fa
--- /dev/null
+++ b/test/convolve.jl
@@ -0,0 +1,134 @@
+#=
+Test convolution (interpolation) and its adjoint in isolation
+
+One must be careful with Float16 output arrays here,
+because plan.windowLinInterp has values above 1e8
+that exceed the 6e4 maximum of Float16.
+
+FFTW does not support Float16 plans as of 2026-01-01.
+=#
+
+using NFFT: plan_nfft
+using AbstractNFFTs: convolve!, convolve_transpose!
+import NFFT # LINEAR, FULL, TENSOR, POLYNOMIAL
+using Test: @test, @testset, @test_throws
+
+# Determine worst precision of two types
+# (Does Julia already have such a method?)
+function worst(T1::Type{<:Number}, T2::Type{<:Number})
+    return promote_type(T1, T2) == T1 ? T2 : T1
+end
+worst(a, b, c) = worst(worst(a, b), c)
+
+
+N = (9, 8) # 9×8 grid of equally spaced sampling points
+Tb = BigFloat
+fun = N -> @. Tb((0:(N-1)) / N - 0.5) # 1D grid
+nodes = hcat([[x; y] for x in fun(N[1]), y in fun(N[2])]...)
+
+F16 = Float16
+F32 = Float32
+F64 = Float64
+C16 = Complex{F16}
+C32 = Complex{F32}
+
+@testset "worst" begin
+  @test worst(F64, F16, F32) == F16
+  @test worst(C16, C32) == C16
+end
+
+J = prod(N) # for f
+Ñ = 2 .* N # for g
+
+@testset "convolve-throws" begin
+  # throw error when input is complex and output is real
+  p = plan_nfft(Float32.(nodes), N, m = 5, σ = 2.0, precompute=NFFT.LINEAR)
+  @test_throws ArgumentError convolve!(p, ones(C32, Ñ), zeros(Float32, J))
+  @test_throws ArgumentError convolve_transpose!(p, ones(C32, J), zeros(Float32,  Ñ))
+
+  # or when dimensions mismatch
+  @test_throws DimensionMismatch convolve!(p, ones(C32, Ñ), zeros(C32, 2))
+  @test_throws DimensionMismatch convolve!(p, ones(C32, N), zeros(C32, J))
+  @test_throws DimensionMismatch convolve_transpose!(p, ones(C32, 2), zeros(C32, Ñ))
+  @test_throws DimensionMismatch convolve_transpose!(p, ones(C32, J), zeros(C32, N))
+end
+
+
+@testset "convolve!" begin
+
+  for pre in [NFFT.LINEAR, NFFT.FULL, NFFT.TENSOR, NFFT.POLYNOMIAL]
+    p32 = plan_nfft(Float32.(nodes), N, m = 5, σ = 2.0, precompute=pre)
+    p64 = plan_nfft(Float64.(nodes), N, m = 5, σ = 2.0, precompute=pre)
+    @assert Ñ == p64.Ñ
+
+    for tfun in (identity, t -> Complex{t}) # test both Real and Complex
+      T16 = tfun(F16)
+      T32 = tfun(F32)
+      T64 = tfun(F64)
+
+      g64 = rand(T64, Ñ)
+      fff = Vector{T64}(undef, J) # ground-truth reference
+      @test fff == convolve!(p64, g64, fff)
+
+      fmax = maximum(abs, fff) # need to avoid avoid overflow with Float16
+
+      for Tg in (T16, T32, T64), Tf in (T32, T64)
+        f = Vector{Tf}(undef, J)
+        @test f == convolve!(p64, Tg.(g64), f)
+        T = worst(Tg, Tf)
+        @test f/fmax ≈ T.(fff/fmax)
+
+        @test f == convolve!(p32, Tg.(g64), f)
+        T = worst(Tg, Tf, T32)
+        @test f / fmax ≈ T.(fff / fmax)
+      end
+
+      # Float16 test with careful scaling:
+      f16 = Vector{T16}(undef, J)
+      @test f16 == convolve!(p64, g64/fmax, f16)
+      @test f16 ≈ fff / fmax
+      @test f16 == convolve!(p32, g64/fmax, f16)
+      @test f16 ≈ fff / fmax
+    end # tfun
+  end # pre
+end # @testset
+
+
+@testset "convolve_transpose!" begin
+
+  for pre in [NFFT.LINEAR, NFFT.FULL, NFFT.TENSOR, NFFT.POLYNOMIAL]
+    p32 = plan_nfft(Float32.(nodes), N, m = 5, σ = 2.0, precompute=pre)
+    p64 = plan_nfft(Float64.(nodes), N, m = 5, σ = 2.0, precompute=pre)
+    @assert Ñ == p64.Ñ
+
+    for tfun in (identity, t -> Complex{t}) # test both Real and Complex
+      T16 = tfun(F16)
+      T32 = tfun(F32)
+      T64 = tfun(F64)
+
+      f64 = rand(T64, J)
+      ggg = Array{T64}(undef, Ñ) # ground-truth reference
+      @test ggg == convolve_transpose!(p64, f64, ggg)
+
+      gmax = maximum(abs, ggg) # need to avoid avoid overflow with Float16
+
+      for Tg in (T32, T64), Tf in (T16, T32, T64)
+        g = similar(ggg, Tg)
+        @test g == convolve_transpose!(p64, Tf.(f64), g)
+        T = worst(Tg, Tf)
+        @test g / gmax ≈ T.(ggg / gmax)
+
+        @test g == convolve_transpose!(p32, Tf.(f64), g)
+        T = worst(Tg, Tf, T32)
+        @test g / gmax ≈ T.(ggg / gmax)
+      end
+
+      # Float16 test with careful scaling:
+      g16 = Array{T16}(undef, Ñ)
+      @test g16 == convolve_transpose!(p64, f64/gmax, g16)
+      @test g16 ≈ ggg / gmax
+      @test g16 == convolve_transpose!(p32, f64/gmax, g16)
+      @test g16 ≈ ggg / gmax
+    end # pre
+  end # tfun
+end # @testset
diff --git a/test/runtests.jl b/test/runtests.jl
index 1b47482b..6b215d7c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -17,6 +17,7 @@ arrayTypes = areTypesDefined ? arrayTypes : [JLArray]
     include("issues.jl")
     include("accuracy.jl")
     include("constructors.jl")
+    include("convolve.jl")
     include("performance.jl")
     include("testToeplitz.jl")
     include("samplingDensity.jl")