diff --git a/AbstractNFFTs/src/interface.jl b/AbstractNFFTs/src/interface.jl index 6f10b7cd..e543432f 100644 --- a/AbstractNFFTs/src/interface.jl +++ b/AbstractNFFTs/src/interface.jl @@ -189,7 +189,7 @@ The transformation is applied along `D-R+1` dimensions specified in the plan `p` size_in(p) Size of the input array for the plan p (NFFT/NFCT/NFST/NNFFT). -The returned tuple has `R` entries. +The returned tuple has `D` entries. Note that this will be the output size for the transposed / adjoint operator. """ @mustimplement size_in(p::AbstractFTPlan{T,D,R}) where {T,D,R} @@ -222,10 +222,10 @@ Change nodes `k` in the plan `p` operation and return the plan. convolve!(p::AbstractNFFTPlan{T,D,R}, g::AbstractArray, fHat::AbstractArray) Overwrite `R`-dimensional array `fHat` -(often R=1, and `fHat` has length `p.J`) +(often R=1, and `fHat` has length `only(size_out(p))`) with the result of "convolution" (i.e., interpolation) between `D`-dimensional equispaced input array `g` -(often of size `p.Ñ`) +(often of size `p.Ñ` which is not part of the abstract interface) and the interpolation kernel in NFFT plan `p`. 
""" @mustimplement convolve!(p::AbstractNFFTPlan, g::AbstractArray, fHat::AbstractArray) diff --git a/NFFTTools/Project.toml b/NFFTTools/Project.toml index 025de674..1b818d7f 100644 --- a/NFFTTools/Project.toml +++ b/NFFTTools/Project.toml @@ -1,16 +1,18 @@ name = "NFFTTools" uuid = "7424e34d-94f7-41d6-98a0-85abaf1b6c91" -author = ["Tobias Knopp "] -version = "0.2.7" +author = ["Tobias Knopp and contributors"] +version = "0.3" [deps] AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" AbstractNFFTs = "7f219486-4aa7-41d6-80a7-e08ef20ceed7" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" +NFFT = "efe261a4-0d2b-5849-be55-fc731d526b0d" [compat] julia = "1.6" AbstractFFTs = "1.0" AbstractNFFTs = "0.6, 0.7, 0.8, 0.9" FFTW = "1" +NFFT = "0.14" diff --git a/NFFTTools/src/NFFTTools.jl b/NFFTTools/src/NFFTTools.jl index a4f99f60..0ff5a3dc 100644 --- a/NFFTTools/src/NFFTTools.jl +++ b/NFFTTools/src/NFFTTools.jl @@ -3,11 +3,12 @@ module NFFTTools export sdc export calculateToeplitzKernel, calculateToeplitzKernel!, convolveToeplitzKernel! -using AbstractNFFTs: AbstractNFFTPlan, plan_nfft, nodes! +using AbstractNFFTs: AbstractNFFTPlan, plan_nfft, nodes!, size_in, size_out using AbstractNFFTs: convolve!, convolve_transpose! +#using NFFT: NFFTPlan using FFTW: fftshift, plan_fft, plan_ifft using LinearAlgebra: adjoint, mul! -import FFTW # ESTIMATE (which is currently non-public) +import FFTW # ESTIMATE include("samplingDensity.jl") include("Toeplitz.jl") diff --git a/NFFTTools/src/samplingDensity.jl b/NFFTTools/src/samplingDensity.jl index e8e9d66e..b332eaba 100644 --- a/NFFTTools/src/samplingDensity.jl +++ b/NFFTTools/src/samplingDensity.jl @@ -1,49 +1,155 @@ - #= using AbstractNFFTs: AbstractNFFTPlan, convolve!, convolve_transpose! +using AbstractNFFTs: size_in, size_out using LinearAlgebra: mul! =# -function sdc(p::AbstractNFFTPlan{T,D,1}; iters=20) where {T,D} - # Weights for sample density compensation. 
- # Uses method of Pipe & Menon, 1999. Mag Reson Med, 186, 179. - weights = similar(p.tmpVec, Complex{T}, p.J) - weights .= one(Complex{T}) - weights_tmp = similar(weights) - scaling_factor = zero(T) +""" + out = _fill_similar(array, v::T, dims) where T +Return an allocated array similar to `array` filled with value `v`. + +This 2-step initialization helper function +is needed to accommodate GPU array types. +The more obvious statement `weights = fill(v, dims)` +can lead to the wrong array type and cause GPU tests to fail. +""" +function _fill_similar(array::AbstractArray, v::T, dims::Union{Integer,Dims}) where T + weights = similar(array, T, dims) + fill!(weights, v) + return weights +end + +#= +It would be desirable to use the memory pointed to by p.tmpVec +as working buffer for sdc iterations, +but this attempt led to intermittent corrupted outputs and errors. +So it is left commented out for now. +=# +#= +function _reinterpret_real(g::StridedArray{Complex{T}}) where {T <: Real} + r1 = reinterpret(T, g) + r2 = @view r1[1:2:end,:] + return r2 +end +=# + +""" + weights = sdc(plan::{Abstract}NFFTPlan; iters=20, ...) + +Compute weights for sample density compensation for the given NFFT `plan`. +Uses method of Pipe & Menon, Mag Reson Med, 41(1):179-186, Jan. 1999. +DOI: 10.1002/(SICI)1522-2594(199901)41:1<179::AID-MRM25>3.0.CO;2-V +The function applies `iters` iterations of that method. + +The scaling here is such that if the plan were 1D with N nodes equispaced +from -0.5 to 0.5-1/N, then the returned weights are ≈ 1/N. + +The weights are scaled such that ``A' diag(w) A 1_N ≈ 1_N``. + +The returned vector contains real, positive values of length `plan.J`. + +This method _almost_ conforms to the `AbstractNFFT` interface +except that it uses `p.Ñ` and `p.tmpVec` that are not part of that interface. + +There are several named keyword arguments that are work buffers +that are all mutated: `weights weights_tmp workg workf workv`. 
+If the caller provides all of those, +then this function should make only small allocations. +""" +function sdc( + p::AbstractNFFTPlan{T,D,1}; + iters::Int = 20, + # the type of p.tmpVec (if present) is needed for GPU arrays (JLArray): + _array::AbstractArray = hasfield(typeof(p), :tmpVec) ? + p.tmpVec : Array{T,D}(undef, zeros(Int, D)...), + # the following are working buffers that are all mutated: + weights::AbstractVector{T} = _fill_similar(_array, one(T), only(size_out(p))), + weights_tmp::AbstractVector = similar(weights), +# workg::AbstractArray = _reinterpret_real(p.tmpVec), # todo + workg::AbstractArray = similar(_array, T, p.Ñ), + workf::AbstractVector = similar(_array, Complex{T}, only(size_out(p))), + workv::AbstractArray = similar(_array, Complex{T}, size_in(p)), +) where {T <: Real, D} + + return sdc!( + p, + iters, + weights, + weights_tmp, + workg, + workf, + workv, + ) +end + + +""" + weights = sdc!( p::AbstractNFFTPlan, iters, weights, weights_tmp, workg) +Compute sampling density compensation `weights` +without performing any final scaling step. + +Ideally this function should be (nearly) non-allocating. 
+""" +function sdc!( + p::AbstractNFFTPlan{T,D,1}, + iters::Int, + # the following are working buffers that are all mutated: + weights::AbstractVector{T}, + weights_tmp::AbstractVector, + workg::AbstractArray, +) where {T <: Real, D} + + scaling_factor = missing # will be set below # Pre-weighting to correct non-uniform sample density for i in 1:iters - convolve_transpose!(p, weights, p.tmpVec) - if i==1 - scaling_factor = maximum(abs.(p.tmpVec)) + convolve_transpose!(p, weights, workg) + if i == 1 + scaling_factor = maximum(workg) end - p.tmpVec ./= scaling_factor - convolve!(p, p.tmpVec, weights_tmp) + workg ./= scaling_factor + convolve!(p, workg, weights_tmp) weights_tmp ./= scaling_factor - weights ./= (abs.(weights_tmp) .+ eps(T)) + any(≤(0), weights_tmp) && throw("non-positive weights") + weights ./= weights_tmp end - # Post weights to correct image scaling - # This finds c, where ||u - c*v||_2^2 = 0 and then uses - # c to scale all weights by a scalar factor. - u = similar(weights, Complex{T}, p.N) - u .= one(Complex{T}) - - # conversion to Array is a workaround for CuNFFT. Without it we get strange - # results that indicate some synchronization issue - #f = Array( p * u ) - #b = f .* Array(weights) # apply weights from above - #v = Array( adjoint(p) * convert(typeof(weights), b) ) - #c = vec(v) \ vec(Array(u)) # least squares diff - #return abs.(convert(typeof(weights), c * Array(weights))) - - # non converting version - f = similar(p.tmpVec, Complex{T}, p.J) - mul!(f, p, u) - f .*= weights # apply weights from above - v = similar(p.tmpVec, Complex{T}, p.N) - mul!(v, adjoint(p), f) - c = vec(v) \ vec(u) # least squares diff - return abs.(c * weights) + return weights +end + + +""" + sdc!( ... ) + +This version scales the weights such that +``A' D(w) A 1_N ≈ 1_N``. +Find ``c ∈ ℝ`` that minimizes ``‖u - c * v‖₂`` +where ``u ≜ 1_N`` (vector of ones) +and ``v ≜ A' D(w) A 1_N``, and then scale `w` by that `c`. 
+The analytical solution is +``c = real(u'v) / ‖v‖² = real(sum(v)) / ‖v‖².`` + +Ideally this function should be (nearly) non-allocating. +""" +function sdc!( + p::AbstractNFFTPlan{T,D,1}, + iters::Int, + # the following are working buffers that are all mutated: + weights::AbstractVector{T}, + weights_tmp::AbstractVector, + workg::AbstractArray, + workf::AbstractVector, + workv::AbstractArray, +) where {T <: Real, D} + + sdc!(p, iters, weights, weights_tmp, workg) + + u = workv # trick to save memory + fill!(u, one(T)) + mul!(workf, p, u) + workf .*= weights # apply weights from above + mul!(workv, adjoint(p), workf) + c = real(sum(workv)) / sum(abs2, workv) + weights .*= c + return weights end diff --git a/Project.toml b/Project.toml index dc228061..a58d53d5 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "NFFT" uuid = "efe261a4-0d2b-5849-be55-fc731d526b0d" authors = ["Tobias Knopp and contributors"] -version = "0.14.2" +version = "0.14.3" [deps] AbstractNFFTs = "7f219486-4aa7-41d6-80a7-e08ef20ceed7" diff --git a/ext/NFFTGPUArraysExt/implementation.jl b/ext/NFFTGPUArraysExt/implementation.jl index 43712980..2468ab9a 100644 --- a/ext/NFFTGPUArraysExt/implementation.jl +++ b/ext/NFFTGPUArraysExt/implementation.jl @@ -1,3 +1,5 @@ +const RealOrComplex = Union{T, Complex{T}} where {T <: Real} + mutable struct GPU_NFFTPlan{T,D, arrTc <: AbstractGPUArray{Complex{T}, D}, vecI <: AbstractGPUVector{Int32}, FP, BP, INV, SM} <: AbstractNFFTPlan{T,D,1} N::NTuple{D,Int64} NOut::NTuple{1,Int64} @@ -60,14 +62,21 @@ AbstractNFFTs.size_in(p::GPU_NFFTPlan) = p.N AbstractNFFTs.size_out(p::GPU_NFFTPlan) = p.NOut -function AbstractNFFTs.convolve!(p::GPU_NFFTPlan{T,D, arrTc}, g::arrTc, fHat::arrH) where {D,T,arr<: AbstractGPUArray, arrTc <: arr, arrH <: arr} +function AbstractNFFTs.convolve!( + p::GPU_NFFTPlan{<:Real, D, <:AbstractGPUArray}, + g::AbstractGPUArray{<:RealOrComplex, D}, + fHat::AbstractGPUVector{<:RealOrComplex}, +) where {D} mul!(fHat, transpose(p.B), 
vec(g)) - return end -function AbstractNFFTs.convolve_transpose!(p::GPU_NFFTPlan{T,D, arrTc}, fHat::arrH, g::arrTc) where {D,T,arr<: AbstractGPUArray, arrTc <: arr, arrH <: arr} +function AbstractNFFTs.convolve_transpose!( + p::GPU_NFFTPlan{<:Real, D, <:AbstractGPUArray}, + fHat::AbstractGPUVector{<:RealOrComplex}, + g::AbstractGPUArray{<:RealOrComplex, D}, +) where {D} mul!(vec(g), p.B, fHat) - return + return g end function AbstractNFFTs.deconvolve!(p::GPU_NFFTPlan{T,D, arrTc}, f::arrF, g::arrTc) where {D,T,arr<: AbstractGPUArray, arrTc <: arr, arrF <: arr} @@ -125,4 +134,3 @@ function LinearAlgebra.mul!(f::arrF, pl::Adjoint{Complex{T},<:GPU_NFFTPlan{T,D, return f end - diff --git a/test/convolve.jl b/test/convolve.jl index 220809fa..9e517e47 100644 --- a/test/convolve.jl +++ b/test/convolve.jl @@ -8,11 +8,13 @@ that exceed the 6e4 maximum of Float16. FFTW does not support Float16 plans as of 2026-01-01. =# +using JLArrays: JLArray using NFFT: plan_nfft using AbstractNFFTs: convolve!, convolve_transpose! import NFFT # LINEAR, FULL, TENSOR, POLYNOMIAL using Test: @test, @testset, @test_throws + # Determine worst precision of two types # (Does Julia already have such a method?) function worst(T1::Type{<:Number}, T2::Type{<:Number}) @@ -20,11 +22,16 @@ function worst(T1::Type{<:Number}, T2::Type{<:Number}) end worst(a, b, c) = worst(worst(a, b), c) +@testset "worst" begin + @test worst(Float64, Float16, Float32) == Float16 + @test worst(Complex{Float16}, Complex{BigFloat}) == Complex{Float16} +end -N = (9, 8) # 9×8 grid of equally spaced sampling points -Tb = BigFloat -fun = N -> @. Tb((0:(N-1)) / N - 0.5) # 1D grid -nodes = hcat([[x; y] for x in fun(N[1]), y in fun(N[2])]...) 
+ +N = (9, 8) # grid +Ñ = 2 .* N # for g +J = 101 # nodes for "fHat" +nodes = rand(BigFloat, length(N), J) .- 0.5 F16 = Float16 F32 = Float32 @@ -32,13 +39,6 @@ F64 = Float64 C16 = Complex{F16} C32 = Complex{F32} -@testset "worst" begin - @test worst(F64, F16, F32) == F16 - @test worst(C16, C32) == C16 -end - -J = prod(N) # for f -Ñ = 2 .* N # for g @testset "convolve-throws" begin # throw error when input is complex and output is real @@ -57,78 +57,90 @@ end @testset "convolve!" begin for pre in [NFFT.LINEAR, NFFT.FULL, NFFT.TENSOR, NFFT.POLYNOMIAL] - p32 = plan_nfft(Float32.(nodes), N, m = 5, σ = 2.0, precompute=pre) - p64 = plan_nfft(Float64.(nodes), N, m = 5, σ = 2.0, precompute=pre) + for arr in (Array, JLArray) + + p32 = plan_nfft(arr, Float32.(nodes), N, m = 5, σ = 2.0, precompute=pre) + p64 = plan_nfft(arr, Float64.(nodes), N, m = 5, σ = 2.0, precompute=pre) @assert Ñ == p64.Ñ + _similar = (T, dims) -> similar(p64.tmpVec, T, dims) + for tfun in (identity, t -> Complex{t}) # test both Real and Complex T16 = tfun(F16) T32 = tfun(F32) T64 = tfun(F64) - g64 = rand(T64, Ñ) - fff = Vector{T64}(undef, J) # ground-truth reference - @test fff == convolve!(p64, g64, fff) + g64 = _similar(T64, Ñ) + g64 .= rand(T64, Ñ) + fff = _similar(T64, J) # ground-truth reference + @test fff === convolve!(p64, g64, fff) fmax = maximum(abs, fff) # need to avoid avoid overflow with Float16 for Tg in (T16, T32, T64), Tf in (T32, T64) - f = Vector{Tf}(undef, J) - @test f == convolve!(p64, Tg.(g64), f) + f = _similar(Tf, J) + @test f === convolve!(p64, Tg.(g64), f) T = worst(Tg, Tf) @test f/fmax ≈ T.(fff/fmax) - @test f == convolve!(p32, Tg.(g64), f) + @test f === convolve!(p32, Tg.(g64), f) T = worst(Tg, Tf, T32) @test f / fmax ≈ T.(fff / fmax) end # Float16 test with careful scaling: - f16 = Vector{T16}(undef, J) - @test f16 == convolve!(p64, g64/fmax, f16) + f16 = _similar(T16, J) + @test f16 === convolve!(p64, g64/fmax, f16) @test f16 ≈ fff / fmax - @test f16 == convolve!(p32, 
g64/fmax, f16) + @test f16 === convolve!(p32, g64/fmax, f16) @test f16 ≈ fff / fmax end # tfun + end # arr end # pre + end # @testset @testset "convolve_transpose!" begin for pre in [NFFT.LINEAR, NFFT.FULL, NFFT.TENSOR, NFFT.POLYNOMIAL] + for arr in (Array, JLArray) p32 = plan_nfft(Float32.(nodes), N, m = 5, σ = 2.0, precompute=pre) p64 = plan_nfft(Float64.(nodes), N, m = 5, σ = 2.0, precompute=pre) @assert Ñ == p64.Ñ + _similar = (T, dims) -> similar(p64.tmpVec, T, dims) + for tfun in (identity, t -> Complex{t}) # test both Real and Complex T16 = tfun(F16) T32 = tfun(F32) T64 = tfun(F64) - f64 = rand(T64, J) - ggg = Array{T64}(undef, Ñ) # ground-truth reference - @test ggg == convolve_transpose!(p64, f64, ggg) + f64 = _similar(T64, J) + f64 .= rand(T64, J) + ggg = _similar(T64, Ñ) # ground-truth reference + @test ggg === convolve_transpose!(p64, f64, ggg) gmax = maximum(abs, ggg) # need to avoid avoid overflow with Float16 for Tg in (T32, T64), Tf in (T16, T32, T64) g = similar(ggg, Tg) - @test g == convolve_transpose!(p64, Tf.(f64), g) + @test g === convolve_transpose!(p64, Tf.(f64), g) T = worst(Tg, Tf) @test g / gmax ≈ T.(ggg / gmax) - @test g == convolve_transpose!(p32, Tf.(f64), g) + @test g === convolve_transpose!(p32, Tf.(f64), g) T = worst(Tg, Tf, T32) @test g / gmax ≈ T.(ggg / gmax) end # Float16 test with careful scaling: - g16 = Array{T16}(undef, Ñ) - @test g16 == convolve_transpose!(p64, f64/gmax, g16) + g16 = _similar(T16, Ñ) + @test g16 === convolve_transpose!(p64, f64/gmax, g16) @test g16 ≈ ggg / gmax - @test g16 == convolve_transpose!(p32, f64/gmax, g16) + @test g16 === convolve_transpose!(p32, f64/gmax, g16) @test g16 ≈ ggg / gmax - end # pre - end # tfun + end # tfun + end # arr + end # pre end # @testset diff --git a/test/gpu.jl b/test/gpu.jl index c8903c26..2561b1ef 100644 --- a/test/gpu.jl +++ b/test/gpu.jl @@ -1,3 +1,16 @@ +#= +# uncomment this block for stand-alone testing +using JLArrays: JLArray +arrayTypes = [JLArray] +=# + +import FFTW # 
ESTIMATE +using LinearAlgebra: norm +using NFFT: plan_nfft, NDFTPlan +import NFFT # LINEAR, FULL, TENSOR, POLYNOMIAL +using NFFTTools: sdc +using Test: @test, @testset, @test_throws + m = 5 σ = 2.0 @@ -38,21 +51,17 @@ m = 5 @testset "GPU_NFFT Sampling Density" begin + N = (9, 8) # 9×8 grid of equally spaced sampling points # create a 10x10 grid of unit spaced sampling points - N = 10 - g = (0:(N-1)) ./ N .- 0.5 - x = vec(ones(N) * g') - y = vec(g * ones(N)') - nodes = cat(x', y', dims=1) + T = Float32 + fun = N -> @. T((0:(N-1)) / N - 0.5) # 1D grid + nodes = hcat([[x; y] for x in fun(N[1]), y in fun(N[2])]...) # approximate the density weights - p = plan_nfft(arrayType, nodes, (N, N); m=5, σ=2.0) - weights = Array(sdc(p, iters=5)) - - @info extrema(vec(weights)) - - @test all((≈).(vec(weights), 1 / (N * N), rtol=1e-7)) - + p = plan_nfft(arrayType, nodes, N; m=5, σ=2.0) + weights = sdc(p, iters=5) + weights = Vector(weights) + @test all(≈(1/prod(N)), weights) end end -end \ No newline at end of file +end diff --git a/test/samplingDensity.jl b/test/samplingDensity.jl index 3f2c7bf3..71ed4a4e 100644 --- a/test/samplingDensity.jl +++ b/test/samplingDensity.jl @@ -4,25 +4,57 @@ Test sampling density compensation using NFFT: plan_nfft using NFFTTools: sdc import NFFT # LINEAR, FULL, TENSOR, POLYNOMIAL +import NFFTTools # sdc! using Test: @test, @testset @testset "Sampling Density" begin -# create a 10×8 grid of unit spaced sampling points -N = (10, 8) +# create a 9×8 grid of unit spaced sampling points +N = (9, 8) T = Float32 -g = N -> @. T((0:(N-1)) / N - 0.5) -nodes = hcat([[x; y] for x in g(N[1]), y in g(N[2])]...) +fun = N -> @. T((0:(N-1)) / N - 0.5) # 1D grid +nodes = hcat([[x; y] for x in fun(N[1]), y in fun(N[2])]...) 
for pre in [NFFT.LINEAR, NFFT.FULL, NFFT.TENSOR, NFFT.POLYNOMIAL] # approximate the density weights p = plan_nfft(nodes, N, m = 5, σ = 2.0, precompute=pre) - weights = sdc(p, iters = 10) +# @show @allocated weights = sdc(p; iters = 10) + weights = sdc(p; iters = 10) @test T == eltype(weights) - @test all(isreal, weights) + @test isreal(weights) @test all(>(0), weights) @test all(≈(1/prod(N)), weights) -end + + # Test the version with provided work buffers + w2 = deepcopy(weights) + fill!(w2, one(T)) + sdc(p; iters = 10, + weights = w2, + weights_tmp = similar(w2), + workg = Array{real(eltype(p.tmpVec))}(undef, p.Ñ), + workf = similar(p.tmpVec, Complex{T}, p.J), + workv = similar(p.tmpVec, Complex{T}, p.N), + ) + @test w2 == weights + + # pre-allocate + weights_tmp = similar(w2) + workg = Array{real(eltype(p.tmpVec))}(undef, p.Ñ) + workf = similar(p.tmpVec, Complex{T}, p.J) + workv = similar(p.tmpVec, Complex{T}, p.N) + fill!(w2, one(T)) + + # the following test should have pretty small allocations + @show @allocated NFFTTools.sdc!(p, 10, + w2, + weights_tmp, + workg, + workf, + workv, + ) + @test w2 == weights + +end # pre end # @testset