From badced127851859a4827e0c6751661220bcd6d23 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Bignonnet?=
 <francois.bignonnet@univ-nantes.fr>
Date: Tue, 23 May 2023 15:59:30 +0200
Subject: [PATCH 01/22] allocate ManyPencilArrayInplaceRFFT for RFFT! plan

---
 src/allocate.jl | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/allocate.jl b/src/allocate.jl
index 5a23ba19..3642234b 100644
--- a/src/allocate.jl
+++ b/src/allocate.jl
@@ -12,9 +12,13 @@ size `dims`, and a tuple of `N` `PencilArray`s.
 
 !!! note "In-place plans"
 
-    If `p` is an in-place plan, a
+    If `p` is an in-place real-to-real or complex-to-complex plan, a
     [`ManyPencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.ManyPencilArray)
-    is allocated. This
+    is allocated. If `p` is an in-place real-to-complex plan, a
+    [`ManyPencilArrayInplaceRFFT`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.ManyPencilArray)
+    is allocated. 
+    
+    This
     type holds `PencilArray` wrappers for the input and output transforms (as
     well as for intermediate transforms) which share the same space in memory.
     The input and output `PencilArray`s should be respectively accessed by
@@ -61,6 +65,18 @@ function allocate_input(p::PencilFFTPlan{T,N,true} where {T,N})
     ManyPencilArray{T}(undef, pencils...; extra_dims=p.extra_dims)
 end
 
+# In-place version for real-to-complex RFFT!
+function allocate_input(plan::PencilFFTPlan{T,N,true, 
+    Nt,  # number of transformed dimensions
+    Nd,  # number of decomposed dimensions
+    Ne,  # number of extra dimensions
+    G }
+    ; unsafe::Bool = true, fill_with_zeros::Bool = false) where {N,Nd,Ne,G<:PencilFFTs.GlobalFFTParams{T,Nt,true,TR}} where {T<:FFTReal,Nt,TR<:Tuple{Transforms.RFFT!,Vararg{Transforms.FFT!}}}
+    plans = plan.plans
+    pencils = (first(plans).pencil_in, first(plans).pencil_out,  map(pp -> pp.pencil_in, plans[2:end])...)
+    ManyPencilArrayInplaceRFFT{T}(undef, pencils...; extra_dims=plan.extra_dims, unsafe=unsafe, fill_with_zeros = fill_with_zeros)
+end
+
 allocate_input(p::PencilFFTPlan, dims...) =
     _allocate_many(allocate_input, p, dims...)
 

From 5df2f7b26e5451a1f325a0127b3a190b425aab47 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Bignonnet?=
 <francois.bignonnet@univ-nantes.fr>
Date: Tue, 23 May 2023 16:02:05 +0200
Subject: [PATCH 02/22] definition of ManyPencilArrayInplaceRFFT for RFFT!

---
 src/multiarrays_r2c.jl | 155 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100644 src/multiarrays_r2c.jl

diff --git a/src/multiarrays_r2c.jl b/src/multiarrays_r2c.jl
new file mode 100644
index 00000000..a39c25af
--- /dev/null
+++ b/src/multiarrays_r2c.jl
@@ -0,0 +1,155 @@
+# copied and modified from https://github.com/jipolanco/PencilArrays.jl/blob/master/src/multiarrays.jl
+
+"""
+    ManyPencilArrayInplaceRFFT{T,N,M}
+
+Container holding `M` different [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray) views to the same
+underlying data buffer. All views share the same and dimensionality `N`.
+The element type `T` of the first view is real, that of subsequent views is 
+`Complex{T}`. 
+
+This can be used to perform in-place real-to-complex plan, see also[`Tranforms.RFFT!`](@ref). 
+It is used internally for such transforms by [`allocate_input`](@ref) and should not be constructed directly.
+
+---
+
+    ManyPencilArrayInplaceRFFT{T}(undef, pencils...; extra_dims=())
+
+Create a `ManyPencilArrayInplaceRFFT` container that can hold data of type `T` and `Complex{T}` associated
+to all the given [`Pencil`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.Pencil)s.
+
+The optional `extra_dims` argument is the same as for [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray).
+
+See also [`ManyPencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.ManyPencilArray)
+"""
+struct ManyPencilArrayInplaceRFFT{
+        T,  # element type of real array
+        N,  # number of dimensions of each array (including extra_dims)
+        M,  # number of arrays
+        Arrays <: Tuple{Vararg{PencilArray,M}},
+        DataVector <: AbstractVector{T},
+        DataVectorComplex <: AbstractVector{Complex{T}},
+    }
+    data   :: DataVector
+    data_complex   :: DataVectorComplex
+    arrays :: Arrays
+
+    function ManyPencilArrayInplaceRFFT{T}(
+            init, real_pencil::Pencil{Np}, complex_pencils::Vararg{Pencil{Np}};
+            extra_dims::Dims=()
+        ) where {Np,T}
+        # real_pencil is a Pencil with dimensions `dims` of a real array with no padding and no permutation 
+        # the padded dimensions are (2*(dims[1] ÷ 2 + 1), dims[2:end]...)  
+        # first(complex_pencils) is a Pencil with dimensions of a complex array (dims[1] ÷ 2 + 1, dims[2:end]...) and no permutation 
+        pencils = (real_pencil, complex_pencils...)
+        BufType = PencilArrays.typeof_array(real_pencil)
+        @assert all(p -> PencilArrays.typeof_array(p) === BufType, complex_pencils)
+        # TODO : checks dims of pencils
+        data_length = max(2 .* length.(complex_pencils)...) * prod(extra_dims)
+        data_real = BufType{T}(init, data_length)
+        fill_with_zeros && fill!(data_real, 0.0)
+        # we don't use data_complex = reinterpret(Complex{T}, data_real)
+        # since there is an issue with StridedView of ReinterpretArray, called by _permutedims in PencilArrays.Transpositions
+        ptr_complex = convert(Ptr{Complex{T}}, pointer(data_real)) 
+        data_complex = unsafe_wrap(BufType, ptr_complex, data_length ÷ 2)  
+        array_real = _make_real_array(data_real, extra_dims, real_pencil)
+        arrays_complex = _make_arrays(data_complex, extra_dims, complex_pencils...)
+        arrays = (array_real, arrays_complex...)
+        N = Np + length(extra_dims)
+        M = length(pencils)
+        new{T, N, M, typeof(arrays), typeof(data_real), typeof(data_complex)}(data_real, data_complex, arrays)
+    end
+end
+
+function _make_real_array(data, extra_dims, p)
+    dims_space_local = size_local(p, MemoryOrder())
+    dims_padded_local = (2*(dims_space_local[1] ÷ 2 + 1), dims_space_local[2:end]...)
+    dims = (dims_padded_local..., extra_dims...)
+    axes_local = (Base.OneTo.(dims_space_local)..., Base.OneTo.(extra_dims)...)
+    n = prod(dims)
+    vec = view(data, Base.OneTo(n))
+    parent_arr = reshape(vec, dims)
+    arr = view(parent_arr, axes_local...)
+    PencilArray(p, arr)
+end
+
+function _make_arrays(data::V, extra_dims::Dims, p::Pencil,
+    pens::Vararg{Pencil}) where {V<:Union{DV, Base.ReinterpretArray{Complex{T}, 1, T, DV}}} where {DV <: DenseVector{T}} where {T}
+    dims = (size_local(p, MemoryOrder())..., extra_dims...)
+    n = prod(dims)
+    @assert n == length_local(p) * prod(extra_dims)
+    vec = view(data, Base.OneTo(n))
+    arr = reshape(vec, dims)
+    A = PencilArray(p, arr)
+    (A, _make_arrays(data, extra_dims, pens...)...)
+end
+
+_make_arrays(::V, ::Dims) where {V<:Union{DV, Base.ReinterpretArray{Complex{T}, 1, T, DV}}} where {DV <: DenseVector{T}} where {T} = ()
+
+Base.eltype(::ManyPencilArrayInplaceRFFT{T}) where {T} = T
+Base.ndims(::ManyPencilArrayInplaceRFFT{T,N}) where {T,N} = N
+
+"""
+    Tuple(A::ManyPencilArrayInplaceRFFT) -> (u1, u2, …)
+
+Returns the [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray)s wrapped by `A`.
+
+This can be useful for iterating over all the wrapped arrays.
+"""
+@inline Base.Tuple(A::ManyPencilArrayInplaceRFFT) = A.arrays
+
+"""
+    length(A::ManyPencilArrayInplaceRFFT)
+
+Returns the number of [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray)s wrapped by `A`.
+"""
+Base.length(::ManyPencilArrayInplaceRFFT{T,N,M}) where {T,N,M} = M
+
+"""
+    first(A::ManyPencilArrayInplaceRFFT)
+
+Returns the first [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray) wrapped by `A`.
+"""
+Base.first(A::ManyPencilArrayInplaceRFFT) = first(A.arrays)
+
+"""
+    last(A::ManyPencilArrayInplaceRFFT)
+
+Returns the last [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray) wrapped by `A`.
+"""
+Base.last(A::ManyPencilArrayInplaceRFFT) = last(A.arrays)
+
+"""
+    getindex(A::ManyPencilArrayInplaceRFFT, ::Val{i})
+    getindex(A::ManyPencilArrayInplaceRFFT, i::Integer)
+
+Returns the i-th [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray) wrapped by `A`.
+
+If possible, the `Val{i}` form should be preferred, as it ensures that the full
+type of the returned `PencilArray` is known by the compiler.
+
+See also [`first(::ManyPencilArrayInplaceRFFT)`](@ref), [`last(::ManyPencilArrayInplaceRFFT)`](@ref).
+
+# Example
+
+```julia
+A = ManyPencilArrayInplaceRFFT(pencil1, pencil2, pencil3)
+
+# Get the PencilArray associated to `pencil2`.
+u2 = A[2]
+u2 = A[Val(2)]
+```
+"""
+Base.getindex(A::ManyPencilArrayInplaceRFFT, ::Val{i}) where {i} =
+    _getindex(Val(i), A.arrays...)
+@inline Base.getindex(A::ManyPencilArrayInplaceRFFT, i) = A[Val(i)]
+
+@inline function _getindex(::Val{i}, a, t::Vararg) where {i}
+    i :: Integer
+    i <= 0 && throw(BoundsError("index must be >= 1"))
+    i == 1 && return a
+    _getindex(Val(i - 1), t...)
+end
+
+# This will happen if the index `i` intially passed is too large.
+@inline _getindex(::Val) = throw(BoundsError("invalid index"))
\ No newline at end of file

From f4a153827c287109994eab7e8a034f0eb72a4e2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Bignonnet?=
 <francois.bignonnet@univ-nantes.fr>
Date: Tue, 23 May 2023 16:03:10 +0200
Subject: [PATCH 03/22] inplace RFFT! mul! and ldiv!  operations

---
 src/operations.jl | 55 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 51 insertions(+), 4 deletions(-)

diff --git a/src/operations.jl b/src/operations.jl
index 4bd4e485..155d9ac9 100644
--- a/src/operations.jl
+++ b/src/operations.jl
@@ -2,9 +2,9 @@ const RealOrComplex{T} = Union{T, Complex{T}} where T <: FFTReal
 const PlanArrayPair{P,A} = Pair{P,A} where {P <: PencilPlan1D, A <: PencilArray}
 
 # Types of array over which a PencilFFTPlan can operate.
-# PencilArray and ManyPencilArray are respectively for out-of-place and in-place
+# PencilArray, ManyPencilArray and ManyPencilArrayInplaceRFFT are respectively for out-of-place, in-place and in-place rfft
 # transforms.
-const FFTArray{T,N} = Union{PencilArray{T,N}, ManyPencilArray{T,N}} where {T,N}
+const FFTArray{T,N} = Union{PencilArray{T,N}, ManyPencilArray{T,N}, ManyPencilArrayInplaceRFFT{T,N}} where {T,N}
 
 # Collections of FFTArray (e.g. for vector components), for broadcasting plans
 # to each array. These types are basically those returned by `allocate_input`
@@ -12,6 +12,8 @@ const FFTArray{T,N} = Union{PencilArray{T,N}, ManyPencilArray{T,N}} where {T,N}
 const FFTArrayCollection =
     Union{Tuple{Vararg{A}}, AbstractArray{A}} where {A <: FFTArray}
 
+const PencilMultiarray{T,N} = Union{ManyPencilArray{T,N}, ManyPencilArrayInplaceRFFT{T,N}} where {T,N}
+
 # This allows to treat plans as scalars when broadcasting.
 # This means that, if u = (u1, u2, u3) is a tuple of PencilArrays
 # compatible with p, then p .* u does what one would expect, that is, it
@@ -54,7 +56,7 @@ _maybe_allocate(allocator::Function, p::PencilFFTPlan{T,N,false} where {T,N},
 
 # In-place version
 _maybe_allocate(::Function, ::PencilFFTPlan{T,N,true} where {T,N},
-                src::ManyPencilArray) = src
+                src::PencilMultiarray) = src
 
 # Fallback case.
 function _maybe_allocate(::Function, p::PencilFFTPlan, src::A) where {A}
@@ -76,7 +78,7 @@ end
 
 function _check_arrays(
         p::PencilFFTPlan{T,N,true} where {T,N},
-        Ain::ManyPencilArray, Aout::ManyPencilArray,
+        Ain::PencilMultiarray, Aout::PencilMultiarray,
     )
     if Ain !== Aout
         throw(ArgumentError(
@@ -171,6 +173,45 @@ function _apply_plans!(
     A
 end
 
+# In-place RFFT version
+function _apply_plans!(
+    dir::Val, full_plan::PencilFFTPlan{T,N,true},
+    A::ManyPencilArrayInplaceRFFT{T,N}, A_again::ManyPencilArrayInplaceRFFT{T,N}) where {T<:FFTReal,N}
+    @assert A === A_again
+    # pairs for 1D FFT! plans, RFFT! plan is treated separately
+    pairs = _make_pairs(full_plan.plans[2:end], A.arrays[3:end])
+
+    # Backward transforms are applied in reverse order.
+    pp = dir === Val(FFTW.BACKWARD) ? reverse(pairs) : pairs
+
+    if dir === Val(FFTW.FORWARD)
+        # apply separately first transform (RFFT!)
+        _apply_rfft_plan_in_place!(dir, full_plan, A.arrays[2], first(full_plan.plans), A.arrays[1])
+        # apply recursively all successive transforms (FFT!)
+        _apply_plans_in_place!(dir, full_plan, A.arrays[2], pp...)
+    elseif dir === Val(FFTW.BACKWARD)
+        # apply recursively all transforms but last (BFFT!)
+        _apply_plans_in_place!(dir, full_plan, nothing, pp...)
+        # transpose before last transform
+        t = if pp ==()
+                nothing
+            else
+                @assert Base.mightalias(A.arrays[3], A.arrays[2]) # they're aliased!
+                t = Transpositions.Transposition(A.arrays[2], A.arrays[3],
+                                         method=full_plan.transpose_method)
+                transpose!(t, waitall=false)
+            end
+        # apply separately last transform (BRFFT!)
+        _apply_rfft_plan_in_place!(dir, full_plan, A.arrays[1], first(full_plan.plans), A.arrays[2])
+
+        _wait_mpi_operations!(t, full_plan.timer)
+        # Scale transform.
+        first(A) ./= scale_factor(full_plan)
+    end
+
+    A
+end
+
 function _apply_plans_out_of_place!(
         dir::Val, full_plan::PencilFFTPlan, y::PencilArray, x::PencilArray,
         plan::PencilPlan1D, next_plans::Vararg{PencilPlan1D})
@@ -240,6 +281,12 @@ end
 
 _apply_plans_in_place!(::Val, ::PencilFFTPlan, u_prev::PencilArray) = u_prev
 
+function _apply_rfft_plan_in_place!(dir::Val, full_plan::PencilFFTPlan, A_out ::PencilArray{To,N}, p::PencilPlan1D{ti,to,Pi,Po,Tr}, A_in ::PencilArray{Ti,N}) where
+    {Ti<:RealOrComplex{T},To<:RealOrComplex{T},ti<:RealOrComplex{T},to<:RealOrComplex{T},Pi,Po,N,Tr<:Union{Transforms.RFFT!,Transforms.BRFFT!}} where T<:FFTReal
+   fft_plan = dir === Val(FFTW.FORWARD) ? p.fft_plan : p.bfft_plan
+   @timeit_debug full_plan.timer "FFT!" mul!(parent(A_out), fft_plan, parent(A_in)) #fft_plan * parent(A_in) # A_in_padded
+end
+
 _split_first(a, b...) = (a, b)  # (x, y, z, w) -> (x, (y, z, w))
 
 function _make_pairs(plans::Tuple{Vararg{PencilPlan1D,N}},

From 0079ee7505ba135998b78247f337ff6710d7cc6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Bignonnet?=
 <francois.bignonnet@univ-nantes.fr>
Date: Tue, 23 May 2023 16:04:16 +0200
Subject: [PATCH 04/22] define RFFT! and BRFFT! + wrappers to FFTW plans

---
 src/Transforms/r2c.jl | 99 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 88 insertions(+), 11 deletions(-)

diff --git a/src/Transforms/r2c.jl b/src/Transforms/r2c.jl
index 27af8a78..1f32bc6e 100644
--- a/src/Transforms/r2c.jl
+++ b/src/Transforms/r2c.jl
@@ -10,6 +10,13 @@ See also
 """
 struct RFFT <: AbstractTransform end
 
+"""
+    RFFT!()
+
+In-place version of [`RFFT`](@ref).
+"""
+struct RFFT! <: AbstractTransform end
+
 """
     BRFFT(d::Integer)
     BRFFT((d1, d2, ..., dN))
@@ -40,23 +47,40 @@ struct BRFFT <: AbstractTransform
     even_output :: Bool
 end
 
-_show_extra_info(io::IO, tr::BRFFT) = print(io, tr.even_output ? "{even}" : "{odd}")
+"""
+    BRFFT!(d::Integer)
+    BRFFT!((d1, d2, ..., dN))
+
+In-place version of [`BRFFT`](@ref).
+"""
+struct BRFFT! <: AbstractTransform
+    even_output :: Bool
+end
+
+const TransformR2C = Union{RFFT, RFFT!}
+const TransformC2R = Union{BRFFT, BRFFT!}
+
+_show_extra_info(io::IO, tr::TransformC2R) = print(io, tr.even_output ? "{even}" : "{odd}")
 
 BRFFT(d::Integer) = BRFFT(iseven(d))
 BRFFT(ts::Tuple) = BRFFT(last(ts))  # c2r transform is applied along the **last** dimension (opposite of FFTW)
+BRFFT!(d::Integer) = BRFFT!(iseven(d))
+BRFFT!(ts::Tuple) = BRFFT!(last(ts))  # c2r transform is applied along the **last** dimension (opposite of FFTW)
 
 is_inplace(::Union{RFFT, BRFFT}) = false
+is_inplace(::Union{RFFT!, BRFFT!}) = true
 
-length_output(::RFFT, length_in::Integer) = div(length_in, 2) + 1
-length_output(tr::BRFFT, length_in::Integer) = 2 * length_in - 1 - tr.even_output
+length_output(::TransformR2C, length_in::Integer) = div(length_in, 2) + 1
+length_output(tr::TransformC2R, length_in::Integer) = 2 * length_in - 1 - tr.even_output
 
-eltype_output(::RFFT, ::Type{T}) where {T <: FFTReal} = Complex{T}
-eltype_output(::BRFFT, ::Type{Complex{T}}) where {T <: FFTReal} = T
+eltype_output(::TransformR2C, ::Type{T}) where {T <: FFTReal} = Complex{T}
+eltype_output(::TransformC2R, ::Type{Complex{T}}) where {T <: FFTReal} = T
 
-eltype_input(::RFFT, ::Type{T}) where {T <: FFTReal} = T
-eltype_input(::BRFFT, ::Type{T}) where {T <: FFTReal} = Complex{T}
+eltype_input(::TransformR2C, ::Type{T}) where {T <: FFTReal} = T
+eltype_input(::TransformC2R, ::Type{T}) where {T <: FFTReal} = Complex{T}
 
 plan(::RFFT, A::AbstractArray, args...; kwargs...) = FFTW.plan_rfft(A, args...; kwargs...)
+plan(::RFFT!, A::AbstractArray, args...; kwargs...) = plan_rfft!(A, args...; kwargs...)
 
 # NOTE: unlike most FFTW plans, this function also requires the length `d` of
 # the transform output along the first transformed dimension.
@@ -65,23 +89,76 @@ function plan(tr::BRFFT, A::AbstractArray, dims; kwargs...)
     d = length_output(tr, Nin)
     FFTW.plan_brfft(A, d, dims; kwargs...)
 end
+function plan(tr::BRFFT!, A::AbstractArray, dims; kwargs...)
+    Nin = size(A, first(dims))  # input length along first dimension
+    d = length_output(tr, Nin)
+    plan_brfft!(A, d, dims; kwargs...) 
+end
 
 binv(::RFFT, d) = BRFFT(d)
 binv(::BRFFT, d) = RFFT()
+binv(::RFFT!, d) = BRFFT!(d)
+binv(::BRFFT!, d) = RFFT!()
 
-function scale_factor(tr::BRFFT, A::ComplexArray, dims)
+function scale_factor(tr::TransformC2R, A::ComplexArray, dims)
     prod(dims; init = one(Int)) do i
         n = size(A, i)
         i == last(dims) ? length_output(tr, n) : n
     end
 end
 
-scale_factor(::RFFT, A::RealArray, dims) = _prod_dims(A, dims)
+scale_factor(::TransformR2C, A::RealArray, dims) = _prod_dims(A, dims)
 
 # r2c along the first dimension, then c2c for the other dimensions.
 expand_dims(tr::RFFT, ::Val{N}) where {N} =
     N === 0 ? () : (tr, expand_dims(FFT(), Val(N - 1))...)
+expand_dims(tr::RFFT!, ::Val{N}) where {N} =
+    N === 0 ? () : (tr, expand_dims(FFT!(), Val(N - 1))...)
 
 expand_dims(tr::BRFFT, ::Val{N}) where {N} = (BFFT(), expand_dims(tr, Val(N - 1))...)
-expand_dims(tr::BRFFT, ::Val{1}) = (tr, )
-expand_dims(tr::BRFFT, ::Val{0}) = ()
+expand_dims(tr::BRFFT!, ::Val{N}) where {N} = (BFFT!(), expand_dims(tr, Val(N - 1))...)
+expand_dims(tr::TransformC2R, ::Val{1}) = (tr, )
+expand_dims(tr::TransformC2R, ::Val{0}) = ()
+
+## FFTW wrappers for inplace RFFT plans
+
+function plan_rfft!(X::StridedArray{T,N}, region;
+    flags::Integer=FFTW.ESTIMATE,
+    timelimit::Real=FFTW.NO_TIMELIMIT) where {T<:FFTW.fftwReal,N}
+    sz = size(X) # physical input size (real)
+    osize = FFTW.rfft_output_size(sz, region) # output size (complex)
+    isize = ntuple(i -> i == first(region) ? 2osize[i] : osize[i], Val(N)) # padded input size (real)
+    X_padded = flags&FFTW.ESTIMATE != 0 ? # is time measurement not required ?
+        FFTW.FakeArray{T,N}(sz, FFTW.colmajorstrides(isize)) : # fake allocation, only size and strides matter
+        view(Array{T}(undef, isize), Base.OneTo.(sz)...) # allocation
+    Y = flags&FFTW.ESTIMATE != 0 ? 
+        FFTW.FakeArray{Complex{T}}(osize) : # fake allocation, only size and strides matter
+        Array{Complex{T}}(undef, osize) # allocation
+    return FFTW.rFFTWPlan{T,FFTW.FORWARD,true,N}(X_padded, Y, region, flags, timelimit)
+end
+
+function plan_brfft!(X::StridedArray{Complex{T},N}, d, region;
+    flags::Integer=FFTW.ESTIMATE,
+    timelimit::Real=FFTW.NO_TIMELIMIT) where {T<:FFTW.fftwReal,N}
+    isize = size(X) # input size (complex)
+    osize = ntuple(i -> i == first(region) ? 2isize[i] : isize[i], Val(N)) # padded output size (real)
+    sz = FFTW.brfft_output_size(X, d, region) # physical output size (real)
+    # Y is padded
+    Y = flags&FFTW.ESTIMATE != 0 ?  # is time measurement not required ?
+        FFTW.FakeArray{T,N}(sz, FFTW.colmajorstrides(osize)) : # fake allocation, only size and strides matter
+        view(Array{T}(undef, osize), Base.OneTo.(sz)...) # allocation
+    return FFTW.rFFTWPlan{Complex{T},FFTW.BACKWARD,true,N}(X, Y, region, flags, timelimit)
+end
+#= function Base.:*(p::FFTW.rFFTWPlan{T,FFTW.FORWARD,true,N}, X::StridedArray{T,N}) where {T<:FFTW.fftwReal,N}
+    ptr_complex = convert(Ptr{Complex{T}}, pointer(X)) 
+    Y = unsafe_wrap(Array, ptr_complex, p.osz) # reinterpretation of X as a complex Array
+    mul!(Y, p, X) # assesses size, strides and memory alignment
+end
+
+function Base.:*(p::FFTW.rFFTWPlan{Complex{T},FFTW.BACKWARD,true,N}, X::StridedArray{Complex{T},N}) where {T<:FFTW.fftwReal,N}
+    ptr_real = convert(Ptr{T}, pointer(X))
+    osize = ntuple(i -> i == first(p.region) ? 2p.sz[i] : p.sz[i], Val(N)) # padded real size
+    Y_parent = unsafe_wrap(Array, ptr_real, osize) # reinterpretation of X as a real Array
+    Y = view(Y_parent, Base.OneTo.(p.osz)...) # skip padded values, view to physical real size only
+    mul!(Y, p, X) # assesses size, strides and memory alignment
+end =#
\ No newline at end of file

From fd51debba80524ae4b8c17779b7e5885905578c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Bignonnet?=
 <francois.bignonnet@univ-nantes.fr>
Date: Tue, 23 May 2023 16:04:47 +0200
Subject: [PATCH 05/22] updated tests for rfft!

---
 test/rfft.jl | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)

diff --git a/test/rfft.jl b/test/rfft.jl
index d955ca59..cec24ba9 100644
--- a/test/rfft.jl
+++ b/test/rfft.jl
@@ -5,6 +5,7 @@ import MPI
 
 using BenchmarkTools
 using LinearAlgebra
+using FFTW
 using Printf
 using Random
 using Test
@@ -190,6 +191,78 @@ function test_rfft(size_in; benchmark=true)
     MPI.Barrier(comm)
 end
 
+function test_rfft!(size_in)
+    comm = MPI.COMM_WORLD
+    rank = MPI.Comm_rank(comm)
+
+    rank == 0 && @info "Input data size: $size_in"
+
+    # Test creating Pencil and creating plan.
+    pen = Pencil(size_in, comm)
+
+    inplace_plan = @inferred PencilFFTPlan(pen, Transforms.RFFT!())
+    outofplace_place = @inferred PencilFFTPlan(pen, Transforms.RFFT())
+
+    # Allocate and initialise scalar fields 
+    u = allocate_input(inplace_plan) #, Val(3))
+    x = first(u) ; x̂ = last(u) # Real and Complex views
+
+    v = allocate_input(outofplace_place)
+    v̂ = allocate_output(outofplace_place)
+
+    fill!(x, 0.0) ; rank == 0 && (x[1] = 1.0) ; rank == 0 && (x[2] = 2.0) ;
+    fill!(v, 0.0) ; rank == 0 && (v[1] = 1.0) ; rank == 0 && (v[2] = 2.0) ;
+
+    @testset "RFFT! vs RFFT" begin
+        mul!(u, inplace_plan, u)
+        mul!(v̂, outofplace_place, v)
+        @test all(isapprox.(x̂, v̂, atol=1e-8))
+
+        ldiv!(u, inplace_plan, u)
+        ldiv!(v, outofplace_place, v̂)
+        @test all(isapprox.(x, v, atol=1e-8))
+        @test all(isapprox.(x[1:3], [1.0, 2.0, 0.0], atol = 1e-8))
+    end
+
+    MPI.Barrier(comm)
+end
+
+function test_1D_rfft!(size_in; flags=FFTW.ESTIMATE)
+    dims = (size_in,) ; 
+    dims_padded = (2(dims[1] ÷ 2 + 1), dims[2:end]...) ; 
+    dims_fourier = ((dims[1] ÷ 2 + 1), dims[2:end]...)
+
+    A = zeros(Float64, dims_padded) ; 
+    a = view(A, Base.OneTo.(dims)...) ; 
+    â = reinterpret(Complex{Float64}, A)
+
+    â2 = zeros(Complex{Float64}, dims_fourier) ;
+    a2 = zeros(Float64, dims) ;
+    
+    p = Transforms.plan_rfft!(a, 1, flags=flags) ; #display(p)
+    p2 = FFTW.plan_rfft(a2, 1, flags=flags) ;
+    bp = Transforms.plan_brfft!(â, dims[1], 1, flags=flags) ; 
+    bp2 = FFTW.plan_brfft(â, dims[1], 1, flags=flags) ; 
+
+    fill!(a2, 0.0) ; a2[1] = 1 ; a2[2] = 2 ;
+    fill!(a, 0.0) ; a[1] = 1 ; a[2] = 2 ;
+    
+    @testset "1D RFFT! vs RFFT" begin
+        mul!(â, p, a) ; 
+        mul!(â2, p2, a2) ;
+        @test all(isapprox.(â2, â, atol = 1e-8))
+        
+        mul!(a, bp, â) ; 
+        mul!(a2, bp2, â2) ;
+        @test all(isapprox.(a2, a, atol = 1e-8))
+        
+        a /= size_in ; a2 /= size_in ;
+        @test all(isapprox.(a[1:3], [1.0, 2.0, 0.0], atol = 1e-8))
+    end
+
+    MPI.Barrier(comm)
+end
+
 MPI.Init()
 comm = MPI.COMM_WORLD
 rank = MPI.Comm_rank(comm)
@@ -198,3 +271,11 @@ rank == 0 || redirect_stdout(devnull)
 test_rfft(DATA_DIMS_EVEN)
 println()
 test_rfft(DATA_DIMS_ODD, benchmark=false)
+
+test_1D_rfft!(first(DATA_DIMS_ODD))
+test_1D_rfft!(first(DATA_DIMS_EVEN))
+# test_1D_rfft!(first(DATA_DIMS_EVEN), flags = FFTW.MEASURE) # test fails for this specific dimension, possibly due to a FFTW bug
+
+test_rfft!(DATA_DIMS_ODD)
+# test_rfft!(DATA_DIMS_EVEN) # test fails for this specific dimension, possibly due to a FFTW bug
+test_rfft!(DATA_DIMS_EVEN .+ 2)

From 1618d71ea24e31ec745eacbfbecb9059d9914906 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Bignonnet?=
 <francois.bignonnet@univ-nantes.fr>
Date: Tue, 23 May 2023 16:05:23 +0200
Subject: [PATCH 06/22] include ManyPencilArrayInplaceRFFT

---
 src/PencilFFTs.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/PencilFFTs.jl b/src/PencilFFTs.jl
index f2784cea..8c70555a 100644
--- a/src/PencilFFTs.jl
+++ b/src/PencilFFTs.jl
@@ -27,6 +27,7 @@ const AbstractTransformList{N} = NTuple{N, AbstractTransform} where N
 
 include("global_params.jl")
 include("plans.jl")
+include("multiarrays_r2c.jl")
 include("allocate.jl")
 include("operations.jl")
 

From a0ab500533623bb280ceaeb2dbb2c2d6070e10f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Bignonnet?=
 <francois.bignonnet@univ-nantes.fr>
Date: Wed, 24 May 2023 13:23:47 +0200
Subject: [PATCH 07/22] changes to ManyPencilArrayRFFT!

---
 src/allocate.jl        | 11 +++----
 src/multiarrays_r2c.jl | 72 +++++++++++++++++++-----------------------
 2 files changed, 38 insertions(+), 45 deletions(-)

diff --git a/src/allocate.jl b/src/allocate.jl
index 3642234b..e1698dcf 100644
--- a/src/allocate.jl
+++ b/src/allocate.jl
@@ -15,11 +15,11 @@ size `dims`, and a tuple of `N` `PencilArray`s.
     If `p` is an in-place real-to-real or complex-to-complex plan, a
     [`ManyPencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.ManyPencilArray)
     is allocated. If `p` is an in-place real-to-complex plan, a
-    [`ManyPencilArrayInplaceRFFT`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.ManyPencilArray)
+    []`ManyPencilArrayRFFT!`](@ref)
     is allocated. 
     
-    This
-    type holds `PencilArray` wrappers for the input and output transforms (as
+    These
+    types hold `PencilArray` wrappers for the input and output transforms (as
     well as for intermediate transforms) which share the same space in memory.
     The input and output `PencilArray`s should be respectively accessed by
     calling [`first(::ManyPencilArray)`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#Base.first-Tuple{ManyPencilArray}) and
@@ -70,11 +70,10 @@ function allocate_input(plan::PencilFFTPlan{T,N,true,
     Nt,  # number of transformed dimensions
     Nd,  # number of decomposed dimensions
     Ne,  # number of extra dimensions
-    G }
-    ; unsafe::Bool = true, fill_with_zeros::Bool = false) where {N,Nd,Ne,G<:PencilFFTs.GlobalFFTParams{T,Nt,true,TR}} where {T<:FFTReal,Nt,TR<:Tuple{Transforms.RFFT!,Vararg{Transforms.FFT!}}}
+    G }) where {N,Nd,Ne,G<:PencilFFTs.GlobalFFTParams{T,Nt,true,TR}} where {T<:FFTReal,Nt,TR<:Tuple{Transforms.RFFT!,Vararg{Transforms.FFT!}}}
     plans = plan.plans
     pencils = (first(plans).pencil_in, first(plans).pencil_out,  map(pp -> pp.pencil_in, plans[2:end])...)
-    ManyPencilArrayInplaceRFFT{T}(undef, pencils...; extra_dims=plan.extra_dims, unsafe=unsafe, fill_with_zeros = fill_with_zeros)
+    ManyPencilArrayRFFT!{T}(undef, pencils...; extra_dims=plan.extra_dims)
 end
 
 allocate_input(p::PencilFFTPlan, dims...) =
diff --git a/src/multiarrays_r2c.jl b/src/multiarrays_r2c.jl
index a39c25af..2a04de3c 100644
--- a/src/multiarrays_r2c.jl
+++ b/src/multiarrays_r2c.jl
@@ -1,7 +1,7 @@
 # copied and modified from https://github.com/jipolanco/PencilArrays.jl/blob/master/src/multiarrays.jl
-
+import PencilArrays:ManyPencilArray, _make_arrays
 """
-    ManyPencilArrayInplaceRFFT{T,N,M}
+    ManyPencilArrayRFFT!{T,N,M}
 
 Container holding `M` different [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray) views to the same
 underlying data buffer. All views share the same and dimensionality `N`.
@@ -13,16 +13,16 @@ It is used internally for such transforms by [`allocate_input`](@ref) and should
 
 ---
 
-    ManyPencilArrayInplaceRFFT{T}(undef, pencils...; extra_dims=())
+    ManyPencilArrayRFFT!{T}(undef, pencils...; extra_dims=())
 
-Create a `ManyPencilArrayInplaceRFFT` container that can hold data of type `T` and `Complex{T}` associated
+Create a `ManyPencilArrayRFFT!` container that can hold data of type `T` and `Complex{T}` associated
 to all the given [`Pencil`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.Pencil)s.
 
 The optional `extra_dims` argument is the same as for [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray).
 
 See also [`ManyPencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.ManyPencilArray)
 """
-struct ManyPencilArrayInplaceRFFT{
+struct ManyPencilArrayRFFT!{ ## TODO: redefine PencilArrays.ManyPencilArray instead or creating new struct, with eltype RealOrComplex{T}
         T,  # element type of real array
         N,  # number of dimensions of each array (including extra_dims)
         M,  # number of arrays
@@ -34,27 +34,31 @@ struct ManyPencilArrayInplaceRFFT{
     data_complex   :: DataVectorComplex
     arrays :: Arrays
 
-    function ManyPencilArrayInplaceRFFT{T}(
+    function ManyPencilArrayRFFT!{T}(
             init, real_pencil::Pencil{Np}, complex_pencils::Vararg{Pencil{Np}};
             extra_dims::Dims=()
-        ) where {Np,T}
+        ) where {Np,T<:FFTReal}
         # real_pencil is a Pencil with dimensions `dims` of a real array with no padding and no permutation 
         # the padded dimensions are (2*(dims[1] ÷ 2 + 1), dims[2:end]...)  
         # first(complex_pencils) is a Pencil with dimensions of a complex array (dims[1] ÷ 2 + 1, dims[2:end]...) and no permutation 
         pencils = (real_pencil, complex_pencils...)
         BufType = PencilArrays.typeof_array(real_pencil)
         @assert all(p -> PencilArrays.typeof_array(p) === BufType, complex_pencils)
-        # TODO : checks dims of pencils
+        @assert size_global(real_pencil)[2:end] ==  size_global(first(complex_pencils))[2:end] 
+        @assert first(size_global(real_pencil)) ÷ 2 + 1 ==  first(size_global(first(complex_pencils)))
+
         data_length = max(2 .* length.(complex_pencils)...) * prod(extra_dims)
         data_real = BufType{T}(init, data_length)
-        fill_with_zeros && fill!(data_real, 0.0)
+
         # we don't use data_complex = reinterpret(Complex{T}, data_real)
         # since there is an issue with StridedView of ReinterpretArray, called by _permutedims in PencilArrays.Transpositions
         ptr_complex = convert(Ptr{Complex{T}}, pointer(data_real)) 
         data_complex = unsafe_wrap(BufType, ptr_complex, data_length ÷ 2)  
+        
         array_real = _make_real_array(data_real, extra_dims, real_pencil)
-        arrays_complex = _make_arrays(data_complex, extra_dims, complex_pencils...)
+        arrays_complex = PencilArrays._make_arrays(data_complex, extra_dims, complex_pencils...)
         arrays = (array_real, arrays_complex...)
+        
         N = Np + length(extra_dims)
         M = length(pencils)
         new{T, N, M, typeof(arrays), typeof(data_real), typeof(data_complex)}(data_real, data_complex, arrays)
@@ -73,76 +77,66 @@ function _make_real_array(data, extra_dims, p)
     PencilArray(p, arr)
 end
 
-function _make_arrays(data::V, extra_dims::Dims, p::Pencil,
-    pens::Vararg{Pencil}) where {V<:Union{DV, Base.ReinterpretArray{Complex{T}, 1, T, DV}}} where {DV <: DenseVector{T}} where {T}
-    dims = (size_local(p, MemoryOrder())..., extra_dims...)
-    n = prod(dims)
-    @assert n == length_local(p) * prod(extra_dims)
-    vec = view(data, Base.OneTo(n))
-    arr = reshape(vec, dims)
-    A = PencilArray(p, arr)
-    (A, _make_arrays(data, extra_dims, pens...)...)
-end
-
-_make_arrays(::V, ::Dims) where {V<:Union{DV, Base.ReinterpretArray{Complex{T}, 1, T, DV}}} where {DV <: DenseVector{T}} where {T} = ()
-
-Base.eltype(::ManyPencilArrayInplaceRFFT{T}) where {T} = T
-Base.ndims(::ManyPencilArrayInplaceRFFT{T,N}) where {T,N} = N
+# all methods redefinitions below could be avoided by using a 
+# ManyPencilArray{ComplexOrReal{T}} instead of a ManyPencilArrayRFFT!{T}
+# or defining a supertype in PencilArrays/src/multiarrays.jl
+Base.eltype(::ManyPencilArrayRFFT!{T}) where {T} = T
+Base.ndims(::ManyPencilArrayRFFT!{T,N}) where {T,N} = N
 
 """
-    Tuple(A::ManyPencilArrayInplaceRFFT) -> (u1, u2, …)
+    Tuple(A::ManyPencilArrayRFFT!) -> (u1, u2, …)
 
 Returns the [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray)s wrapped by `A`.
 
 This can be useful for iterating over all the wrapped arrays.
 """
-@inline Base.Tuple(A::ManyPencilArrayInplaceRFFT) = A.arrays
+@inline Base.Tuple(A::ManyPencilArrayRFFT!) = A.arrays
 
 """
-    length(A::ManyPencilArrayInplaceRFFT)
+    length(A::ManyPencilArrayRFFT!)
 
 Returns the number of [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray)s wrapped by `A`.
 """
-Base.length(::ManyPencilArrayInplaceRFFT{T,N,M}) where {T,N,M} = M
+Base.length(::ManyPencilArrayRFFT!{T,N,M}) where {T,N,M} = M
 
 """
-    first(A::ManyPencilArrayInplaceRFFT)
+    first(A::ManyPencilArrayRFFT!)
 
 Returns the first [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray) wrapped by `A`.
 """
-Base.first(A::ManyPencilArrayInplaceRFFT) = first(A.arrays)
+Base.first(A::ManyPencilArrayRFFT!) = first(A.arrays)
 
 """
-    last(A::ManyPencilArrayInplaceRFFT)
+    last(A::ManyPencilArrayRFFT!)
 
 Returns the last [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray) wrapped by `A`.
 """
-Base.last(A::ManyPencilArrayInplaceRFFT) = last(A.arrays)
+Base.last(A::ManyPencilArrayRFFT!) = last(A.arrays)
 
 """
-    getindex(A::ManyPencilArrayInplaceRFFT, ::Val{i})
-    getindex(A::ManyPencilArrayInplaceRFFT, i::Integer)
+    getindex(A::ManyPencilArrayRFFT!, ::Val{i})
+    getindex(A::ManyPencilArrayRFFT!, i::Integer)
 
 Returns the i-th [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray) wrapped by `A`.
 
 If possible, the `Val{i}` form should be preferred, as it ensures that the full
 type of the returned `PencilArray` is known by the compiler.
 
-See also [`first(::ManyPencilArrayInplaceRFFT)`](@ref), [`last(::ManyPencilArrayInplaceRFFT)`](@ref).
+See also [`first(::ManyPencilArrayRFFT!)`](@ref), [`last(::ManyPencilArrayRFFT!)`](@ref).
 
 # Example
 
 ```julia
-A = ManyPencilArrayInplaceRFFT(pencil1, pencil2, pencil3)
+A = ManyPencilArrayRFFT!(pencil1, pencil2, pencil3)
 
 # Get the PencilArray associated to `pencil2`.
 u2 = A[2]
 u2 = A[Val(2)]
 ```
 """
-Base.getindex(A::ManyPencilArrayInplaceRFFT, ::Val{i}) where {i} =
+Base.getindex(A::ManyPencilArrayRFFT!, ::Val{i}) where {i} =
     _getindex(Val(i), A.arrays...)
-@inline Base.getindex(A::ManyPencilArrayInplaceRFFT, i) = A[Val(i)]
+@inline Base.getindex(A::ManyPencilArrayRFFT!, i) = A[Val(i)]
 
 @inline function _getindex(::Val{i}, a, t::Vararg) where {i}
     i :: Integer

From 8029a10487b77be6883207f357ace16988e8a868 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Bignonnet?=
 <francois.bignonnet@univ-nantes.fr>
Date: Wed, 24 May 2023 13:25:02 +0200
Subject: [PATCH 08/22] separated scaling and bacward transform

---
 src/operations.jl | 47 +++++++++++++++++++++++++++++------------------
 1 file changed, 29 insertions(+), 18 deletions(-)

diff --git a/src/operations.jl b/src/operations.jl
index 155d9ac9..39407e53 100644
--- a/src/operations.jl
+++ b/src/operations.jl
@@ -2,9 +2,9 @@ const RealOrComplex{T} = Union{T, Complex{T}} where T <: FFTReal
 const PlanArrayPair{P,A} = Pair{P,A} where {P <: PencilPlan1D, A <: PencilArray}
 
 # Types of array over which a PencilFFTPlan can operate.
-# PencilArray, ManyPencilArray and ManyPencilArrayInplaceRFFT are respectively for out-of-place, in-place and in-place rfft
+# PencilArray, ManyPencilArray and ManyPencilArrayRFFT! are respectively for out-of-place, in-place and in-place rfft
 # transforms.
-const FFTArray{T,N} = Union{PencilArray{T,N}, ManyPencilArray{T,N}, ManyPencilArrayInplaceRFFT{T,N}} where {T,N}
+const FFTArray{T,N} = Union{PencilArray{T,N}, ManyPencilArray{T,N}, ManyPencilArrayRFFT!{T,N}} where {T,N}
 
 # Collections of FFTArray (e.g. for vector components), for broadcasting plans
 # to each array. These types are basically those returned by `allocate_input`
@@ -12,7 +12,7 @@ const FFTArray{T,N} = Union{PencilArray{T,N}, ManyPencilArray{T,N}, ManyPencilAr
 const FFTArrayCollection =
     Union{Tuple{Vararg{A}}, AbstractArray{A}} where {A <: FFTArray}
 
-const PencilMultiarray{T,N} = Union{ManyPencilArray{T,N}, ManyPencilArrayInplaceRFFT{T,N}} where {T,N}
+const PencilMultiarray{T,N} = Union{ManyPencilArray{T,N}, ManyPencilArrayRFFT!{T,N}} where {T,N}
 
 # This allows to treat plans as scalars when broadcasting.
 # This means that, if u = (u1, u2, u3) is a tuple of PencilArrays
@@ -30,13 +30,24 @@ function LinearAlgebra.mul!(
     end
 end
 
-# Backward transforms
+# Backward transforms (unscaled)
+function bmul!(
+        dst::FFTArray{T,N}, p::PencilFFTPlan{T,N}, src::FFTArray{Ti,N},
+    ) where {T, N, Ti <: RealOrComplex}
+    @timeit_debug p.timer "PencilFFTs bmul!" begin
+        _check_arrays(p, dst, src)
+        _apply_plans!(Val(FFTW.BACKWARD), p, dst, src)
+    end
+end
+
+# Inverse transforms (scaled)
 function LinearAlgebra.ldiv!(
         dst::FFTArray{T,N}, p::PencilFFTPlan{T,N}, src::FFTArray{Ti,N},
     ) where {T, N, Ti <: RealOrComplex}
     @timeit_debug p.timer "PencilFFTs ldiv!" begin
         _check_arrays(p, dst, src)
         _apply_plans!(Val(FFTW.BACKWARD), p, dst, src)
+        _scale!(dst, inv(scale_factor(p)))
     end
 end
 
@@ -50,6 +61,14 @@ function Base.:\(p::PencilFFTPlan, src::FFTArray)
     ldiv!(dst, p, src)
 end
 
+function _scale!(dst::PencilArray{<:RealOrComplex{T},N}, inv_scale::T) where {T,N}
+    parent(dst) .*= inv_scale
+end
+
+function _scale!(dst::PencilMultiarray{<:RealOrComplex{T},N}, inv_scale::T) where {T,N}
+    parent(first(dst)) .*= inv_scale
+end
+
 # Out-of-place version
 _maybe_allocate(allocator::Function, p::PencilFFTPlan{T,N,false} where {T,N},
                 ::PencilArray) = allocator(p)
@@ -123,6 +142,10 @@ for f in (:mul!, :ldiv!)
         (check_compatible(dst, src); $f.(dst, p, src))
 end
 
+bmul!(dst::FFTArrayCollection, p::PencilFFTPlan,
+                           src::FFTArrayCollection) =
+        (check_compatible(dst, src); bmul!.(dst, p, src))
+
 for f in (:*, :\)
     @eval Base.$f(p::PencilFFTPlan, src::FFTArrayCollection) =
         $f.(p, src)
@@ -145,11 +168,6 @@ function _apply_plans!(
 
     _apply_plans_out_of_place!(dir, full_plan, y, x, plans...)
 
-    if dir === Val(FFTW.BACKWARD)
-        # Scale transform.
-        y ./= scale_factor(full_plan)
-    end
-
     y
 end
 
@@ -165,18 +183,13 @@ function _apply_plans!(
 
     _apply_plans_in_place!(dir, full_plan, nothing, pp...)
 
-    if dir === Val(FFTW.BACKWARD)
-        # Scale transform.
-        first(A) ./= scale_factor(full_plan)
-    end
-
     A
 end
 
 # In-place RFFT version
 function _apply_plans!(
     dir::Val, full_plan::PencilFFTPlan{T,N,true},
-    A::ManyPencilArrayInplaceRFFT{T,N}, A_again::ManyPencilArrayInplaceRFFT{T,N}) where {T<:FFTReal,N}
+    A::ManyPencilArrayRFFT!{T,N}, A_again::ManyPencilArrayRFFT!{T,N}) where {T<:FFTReal,N}
     @assert A === A_again
     # pairs for 1D FFT! plans, RFFT! plan is treated separately
     pairs = _make_pairs(full_plan.plans[2:end], A.arrays[3:end])
@@ -205,8 +218,6 @@ function _apply_plans!(
         _apply_rfft_plan_in_place!(dir, full_plan, A.arrays[1], first(full_plan.plans), A.arrays[2])
 
         _wait_mpi_operations!(t, full_plan.timer)
-        # Scale transform.
-        first(A) ./= scale_factor(full_plan)
     end
 
     A
@@ -284,7 +295,7 @@ _apply_plans_in_place!(::Val, ::PencilFFTPlan, u_prev::PencilArray) = u_prev
 function _apply_rfft_plan_in_place!(dir::Val, full_plan::PencilFFTPlan, A_out ::PencilArray{To,N}, p::PencilPlan1D{ti,to,Pi,Po,Tr}, A_in ::PencilArray{Ti,N}) where
     {Ti<:RealOrComplex{T},To<:RealOrComplex{T},ti<:RealOrComplex{T},to<:RealOrComplex{T},Pi,Po,N,Tr<:Union{Transforms.RFFT!,Transforms.BRFFT!}} where T<:FFTReal
    fft_plan = dir === Val(FFTW.FORWARD) ? p.fft_plan : p.bfft_plan
-   @timeit_debug full_plan.timer "FFT!" mul!(parent(A_out), fft_plan, parent(A_in)) #fft_plan * parent(A_in) # A_in_padded
+   @timeit_debug full_plan.timer "FFT!" mul!(parent(A_out), fft_plan, parent(A_in)) 
 end
 
 _split_first(a, b...) = (a, b)  # (x, y, z, w) -> (x, (y, z, w))

From 658bd5e1391bc72e427ed1b03b29376ce5b4eab5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Bignonnet?=
 <francois.bignonnet@univ-nantes.fr>
Date: Wed, 24 May 2023 13:25:32 +0200
Subject: [PATCH 09/22] fixed rfft! and brff! plan creation

---
 src/Transforms/r2c.jl | 37 ++++++++++++-------------------------
 1 file changed, 12 insertions(+), 25 deletions(-)

diff --git a/src/Transforms/r2c.jl b/src/Transforms/r2c.jl
index 1f32bc6e..a5fa4203 100644
--- a/src/Transforms/r2c.jl
+++ b/src/Transforms/r2c.jl
@@ -1,5 +1,5 @@
 ## Real-to-complex and complex-to-real transforms.
-
+import FFTW: rFFTWPlan, FORWARD, BACKWARD, ESTIMATE, NO_TIMELIMIT, FakeArray, colmajorstrides, brfft_output_size
 """
     RFFT()
 
@@ -128,12 +128,14 @@ function plan_rfft!(X::StridedArray{T,N}, region;
     sz = size(X) # physical input size (real)
     osize = FFTW.rfft_output_size(sz, region) # output size (complex)
     isize = ntuple(i -> i == first(region) ? 2osize[i] : osize[i], Val(N)) # padded input size (real)
-    X_padded = flags&FFTW.ESTIMATE != 0 ? # is time measurement not required ?
-        FFTW.FakeArray{T,N}(sz, FFTW.colmajorstrides(isize)) : # fake allocation, only size and strides matter
-        view(Array{T}(undef, isize), Base.OneTo.(sz)...) # allocation
-    Y = flags&FFTW.ESTIMATE != 0 ? 
-        FFTW.FakeArray{Complex{T}}(osize) : # fake allocation, only size and strides matter
-        Array{Complex{T}}(undef, osize) # allocation
+    if flags&FFTW.ESTIMATE != 0 # time measurement not required
+        X_padded = FFTW.FakeArray{T,N}(sz, FFTW.colmajorstrides(isize))  # fake allocation, only pointer, size and strides matter
+        Y = FFTW.FakeArray{Complex{T}}(osize) 
+    else # need to allocate new array since size of X is too small...
+        data = Array{T}(undef, prod(isize))
+        X_padded = view(reshape(data, isize), Base.OneTo.(sz)...) # allocation
+        Y = reshape(reinterpret(Complex{T}, data), osize)
+    end
     return FFTW.rFFTWPlan{T,FFTW.FORWARD,true,N}(X_padded, Y, region, flags, timelimit)
 end
 
@@ -143,22 +145,7 @@ function plan_brfft!(X::StridedArray{Complex{T},N}, d, region;
     isize = size(X) # input size (complex)
     osize = ntuple(i -> i == first(region) ? 2isize[i] : isize[i], Val(N)) # padded output size (real)
     sz = FFTW.brfft_output_size(X, d, region) # physical output size (real)
-    # Y is padded
-    Y = flags&FFTW.ESTIMATE != 0 ?  # is time measurement not required ?
-        FFTW.FakeArray{T,N}(sz, FFTW.colmajorstrides(osize)) : # fake allocation, only size and strides matter
-        view(Array{T}(undef, osize), Base.OneTo.(sz)...) # allocation
+    Yflat = reinterpret(T, reshape(X, prod(isize)))
+    Y = view(reshape(Yflat, osize), Base.OneTo.(sz)...) # Y is padded
     return FFTW.rFFTWPlan{Complex{T},FFTW.BACKWARD,true,N}(X, Y, region, flags, timelimit)
-end
-#= function Base.:*(p::FFTW.rFFTWPlan{T,FFTW.FORWARD,true,N}, X::StridedArray{T,N}) where {T<:FFTW.fftwReal,N}
-    ptr_complex = convert(Ptr{Complex{T}}, pointer(X)) 
-    Y = unsafe_wrap(Array, ptr_complex, p.osz) # reinterpretation of X as a complex Array
-    mul!(Y, p, X) # assesses size, strides and memory alignment
-end
-
-function Base.:*(p::FFTW.rFFTWPlan{Complex{T},FFTW.BACKWARD,true,N}, X::StridedArray{Complex{T},N}) where {T<:FFTW.fftwReal,N}
-    ptr_real = convert(Ptr{T}, pointer(X))
-    osize = ntuple(i -> i == first(p.region) ? 2p.sz[i] : p.sz[i], Val(N)) # padded real size
-    Y_parent = unsafe_wrap(Array, ptr_real, osize) # reinterpretation of X as a real Array
-    Y = view(Y_parent, Base.OneTo.(p.osz)...) # skip padded values, view to physical real size only
-    mul!(Y, p, X) # assesses size, strides and memory alignment
-end =#
\ No newline at end of file
+end
\ No newline at end of file

From 0d886128655b4943ddc86f57ab1065794aeb3d5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Bignonnet?=
 <francois.bignonnet@univ-nantes.fr>
Date: Wed, 24 May 2023 13:26:01 +0200
Subject: [PATCH 10/22] further rfft! tests

---
 test/rfft.jl | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/test/rfft.jl b/test/rfft.jl
index cec24ba9..fb5e25c1 100644
--- a/test/rfft.jl
+++ b/test/rfft.jl
@@ -191,7 +191,7 @@ function test_rfft(size_in; benchmark=true)
     MPI.Barrier(comm)
 end
 
-function test_rfft!(size_in)
+function test_rfft!(size_in; flags = FFTW.ESTIMATE, benchmark=true)
     comm = MPI.COMM_WORLD
     rank = MPI.Comm_rank(comm)
 
@@ -200,8 +200,8 @@ function test_rfft!(size_in)
     # Test creating Pencil and creating plan.
     pen = Pencil(size_in, comm)
 
-    inplace_plan = @inferred PencilFFTPlan(pen, Transforms.RFFT!())
-    outofplace_place = @inferred PencilFFTPlan(pen, Transforms.RFFT())
+    inplace_plan = @inferred PencilFFTPlan(pen, Transforms.RFFT!(), fftw_flags=flags)
+    outofplace_place = @inferred PencilFFTPlan(pen, Transforms.RFFT(), fftw_flags=flags)
 
     # Allocate and initialise scalar fields 
     u = allocate_input(inplace_plan) #, Val(3))
@@ -221,9 +221,26 @@ function test_rfft!(size_in)
         ldiv!(u, inplace_plan, u)
         ldiv!(v, outofplace_place, v̂)
         @test all(isapprox.(x, v, atol=1e-8))
-        @test all(isapprox.(x[1:3], [1.0, 2.0, 0.0], atol = 1e-8))
-    end
+        rank == 0 && @test all(isapprox.(x[1:3], [1.0, 2.0, 0.0], atol = 1e-8))
+
+        rng = MersenneTwister(42)
+        init_random_field!(x̂, rng)
+        copyto!(parent(v̂), parent(x̂))
 
+        PencilFFTs.bmul!(u, inplace_plan, u)
+        PencilFFTs.bmul!(v, outofplace_place, v̂)
+        @test all(isapprox.(x, v, atol=1e-8))
+
+        mul!(u, inplace_plan, u)
+        mul!(v̂, outofplace_place, v)
+        @test all(isapprox.(x̂, v̂, atol=1e-8))
+    end
+    if benchmark
+        println("micro-benchmarks: ")
+        println("- rfft!...\t") ; @time mul!(u, inplace_plan, u)  ; @time mul!(u, inplace_plan, u)
+        println("- rfft...\t") ; @time mul!(v̂, outofplace_place, v)  ; @time mul!(v̂, outofplace_place, v)
+        println("done ")
+    end
     MPI.Barrier(comm)
 end
 
@@ -273,9 +290,9 @@ println()
 test_rfft(DATA_DIMS_ODD, benchmark=false)
 
 test_1D_rfft!(first(DATA_DIMS_ODD))
+test_1D_rfft!(first(DATA_DIMS_EVEN), flags = FFTW.MEASURE)
 test_1D_rfft!(first(DATA_DIMS_EVEN))
-# test_1D_rfft!(first(DATA_DIMS_EVEN), flags = FFTW.MEASURE) # test fails for this specific dimension, possibly due to a FFTW bug
 
-test_rfft!(DATA_DIMS_ODD)
-# test_rfft!(DATA_DIMS_EVEN) # test fails for this specific dimension, possibly due to a FFTW bug
-test_rfft!(DATA_DIMS_EVEN .+ 2)
+test_rfft!(DATA_DIMS_ODD, benchmark=false)
+test_rfft!(DATA_DIMS_EVEN, benchmark=false)
+# test_rfft!((256,256,256)) # similar execution times for large rfft and rfft!
\ No newline at end of file

From 936ecff42f44b55f3a3cb48056b963c44b4a0cc4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Bignonnet?=
 <francois.bignonnet@univ-nantes.fr>
Date: Wed, 24 May 2023 13:29:30 +0200
Subject: [PATCH 11/22] further rfft! tests

---
 test/rfft.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/rfft.jl b/test/rfft.jl
index fb5e25c1..c8353159 100644
--- a/test/rfft.jl
+++ b/test/rfft.jl
@@ -204,7 +204,7 @@ function test_rfft!(size_in; flags = FFTW.ESTIMATE, benchmark=true)
     outofplace_place = @inferred PencilFFTPlan(pen, Transforms.RFFT(), fftw_flags=flags)
 
     # Allocate and initialise scalar fields 
-    u = allocate_input(inplace_plan) #, Val(3))
+    u = allocate_input(inplace_plan)
     x = first(u) ; x̂ = last(u) # Real and Complex views
 
     v = allocate_input(outofplace_place)
@@ -256,7 +256,7 @@ function test_1D_rfft!(size_in; flags=FFTW.ESTIMATE)
     â2 = zeros(Complex{Float64}, dims_fourier) ;
     a2 = zeros(Float64, dims) ;
     
-    p = Transforms.plan_rfft!(a, 1, flags=flags) ; #display(p)
+    p = Transforms.plan_rfft!(a, 1, flags=flags) ;
     p2 = FFTW.plan_rfft(a2, 1, flags=flags) ;
     bp = Transforms.plan_brfft!(â, dims[1], 1, flags=flags) ; 
     bp2 = FFTW.plan_brfft(â, dims[1], 1, flags=flags) ; 

From 00c6a10b5c8219f8be4bebe36ba5f3d3f521209c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Bignonnet?=
 <francois.bignonnet@univ-nantes.fr>
Date: Wed, 24 May 2023 14:41:54 +0200
Subject: [PATCH 12/22] fixed docstring

---
 src/allocate.jl | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/allocate.jl b/src/allocate.jl
index e1698dcf..24e19ec4 100644
--- a/src/allocate.jl
+++ b/src/allocate.jl
@@ -15,11 +15,9 @@ size `dims`, and a tuple of `N` `PencilArray`s.
     If `p` is an in-place real-to-real or complex-to-complex plan, a
     [`ManyPencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.ManyPencilArray)
     is allocated. If `p` is an in-place real-to-complex plan, a
-    []`ManyPencilArrayRFFT!`](@ref)
-    is allocated. 
+    [`ManyPencilArrayRFFT!`](@ref) is allocated. 
     
-    These
-    types hold `PencilArray` wrappers for the input and output transforms (as
+    These types hold `PencilArray` wrappers for the input and output transforms (as
     well as for intermediate transforms) which share the same space in memory.
     The input and output `PencilArray`s should be respectively accessed by
     calling [`first(::ManyPencilArray)`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#Base.first-Tuple{ManyPencilArray}) and

From 254a7683a3289ff85dbba084db4bebf816668c3f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Bignonnet?=
 <francois.bignonnet@univ-nantes.fr>
Date: Wed, 24 May 2023 16:25:35 +0200
Subject: [PATCH 13/22] fixed output of _scale!

---
 src/operations.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/operations.jl b/src/operations.jl
index 39407e53..32e4a0cb 100644
--- a/src/operations.jl
+++ b/src/operations.jl
@@ -61,12 +61,12 @@ function Base.:\(p::PencilFFTPlan, src::FFTArray)
     ldiv!(dst, p, src)
 end
 
-function _scale!(dst::PencilArray{<:RealOrComplex{T},N}, inv_scale::T) where {T,N}
-    parent(dst) .*= inv_scale
+function _scale!(dst::PencilArray{<:RealOrComplex{T},N}, inv_scale::Number) where {T,N}
+    dst .*= inv_scale
 end
 
-function _scale!(dst::PencilMultiarray{<:RealOrComplex{T},N}, inv_scale::T) where {T,N}
-    parent(first(dst)) .*= inv_scale
+function _scale!(dst::PencilMultiarray{<:RealOrComplex{T},N}, inv_scale::Number) where {T,N}
+    first(dst) .*= inv_scale
 end
 
 # Out-of-place version

From 14eb06dd6837f7d2a1984dd67ae614d3ed9d0918 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Bignonnet?=
 <francois.bignonnet@univ-nantes.fr>
Date: Wed, 24 May 2023 16:26:18 +0200
Subject: [PATCH 14/22] fixed triangular dispatch for expand_dims

---
 src/Transforms/r2c.jl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/Transforms/r2c.jl b/src/Transforms/r2c.jl
index a5fa4203..d9658ea7 100644
--- a/src/Transforms/r2c.jl
+++ b/src/Transforms/r2c.jl
@@ -117,8 +117,10 @@ expand_dims(tr::RFFT!, ::Val{N}) where {N} =
 
 expand_dims(tr::BRFFT, ::Val{N}) where {N} = (BFFT(), expand_dims(tr, Val(N - 1))...)
 expand_dims(tr::BRFFT!, ::Val{N}) where {N} = (BFFT!(), expand_dims(tr, Val(N - 1))...)
-expand_dims(tr::TransformC2R, ::Val{1}) = (tr, )
-expand_dims(tr::TransformC2R, ::Val{0}) = ()
+expand_dims(tr::BRFFT, ::Val{1}) = (tr, )
+expand_dims(tr::BRFFT, ::Val{0}) = ()
+expand_dims(tr::BRFFT!, ::Val{1}) = (tr, )
+expand_dims(tr::BRFFT!, ::Val{0}) = ()
 
 ## FFTW wrappers for inplace RFFT plans
 

From 1f823243120049ce353d927ee9afd0bc2672f7dd Mon Sep 17 00:00:00 2001
From: Juan Ignacio Polanco <juan-ignacio.polanco@cnrs.fr>
Date: Thu, 25 May 2023 09:31:05 +0200
Subject: [PATCH 15/22] Test: use rtol so that commented 256^3 tests pass

---
 test/rfft.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/rfft.jl b/test/rfft.jl
index c8353159..2e0ae895 100644
--- a/test/rfft.jl
+++ b/test/rfft.jl
@@ -233,7 +233,7 @@ function test_rfft!(size_in; flags = FFTW.ESTIMATE, benchmark=true)
 
         mul!(u, inplace_plan, u)
         mul!(v̂, outofplace_place, v)
-        @test all(isapprox.(x̂, v̂, atol=1e-8))
+        @test all(isapprox.(x̂, v̂, rtol=1e-8))
     end
     if benchmark
         println("micro-benchmarks: ")
@@ -295,4 +295,4 @@ test_1D_rfft!(first(DATA_DIMS_EVEN))
 
 test_rfft!(DATA_DIMS_ODD, benchmark=false)
 test_rfft!(DATA_DIMS_EVEN, benchmark=false)
-# test_rfft!((256,256,256)) # similar execution times for large rfft and rfft!
\ No newline at end of file
+# test_rfft!((256,256,256)) # similar execution times for large rfft and rfft!

From 6517407fae3fb955ce990cf97060336f10174e80 Mon Sep 17 00:00:00 2001
From: Juan Ignacio Polanco <juan-ignacio.polanco@cnrs.fr>
Date: Thu, 25 May 2023 09:44:50 +0200
Subject: [PATCH 16/22] Revert "Test: use rtol so that commented 256^3 tests
 pass"

This reverts commit 1f823243120049ce353d927ee9afd0bc2672f7dd.
---
 test/rfft.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/rfft.jl b/test/rfft.jl
index 2e0ae895..c8353159 100644
--- a/test/rfft.jl
+++ b/test/rfft.jl
@@ -233,7 +233,7 @@ function test_rfft!(size_in; flags = FFTW.ESTIMATE, benchmark=true)
 
         mul!(u, inplace_plan, u)
         mul!(v̂, outofplace_place, v)
-        @test all(isapprox.(x̂, v̂, rtol=1e-8))
+        @test all(isapprox.(x̂, v̂, atol=1e-8))
     end
     if benchmark
         println("micro-benchmarks: ")
@@ -295,4 +295,4 @@ test_1D_rfft!(first(DATA_DIMS_EVEN))
 
 test_rfft!(DATA_DIMS_ODD, benchmark=false)
 test_rfft!(DATA_DIMS_EVEN, benchmark=false)
-# test_rfft!((256,256,256)) # similar execution times for large rfft and rfft!
+# test_rfft!((256,256,256)) # similar execution times for large rfft and rfft!
\ No newline at end of file

From cf431908a38a95ad464482972bfe8d25b59fd264 Mon Sep 17 00:00:00 2001
From: Juan Ignacio Polanco <juan-ignacio.polanco@cnrs.fr>
Date: Thu, 25 May 2023 10:00:27 +0200
Subject: [PATCH 17/22] Formatting

---
 src/Transforms/r2c.jl |  5 ++--
 test/rfft.jl          | 63 ++++++++++++++++++++++++-------------------
 2 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/src/Transforms/r2c.jl b/src/Transforms/r2c.jl
index d9658ea7..4dd0cb30 100644
--- a/src/Transforms/r2c.jl
+++ b/src/Transforms/r2c.jl
@@ -1,5 +1,6 @@
 ## Real-to-complex and complex-to-real transforms.
-import FFTW: rFFTWPlan, FORWARD, BACKWARD, ESTIMATE, NO_TIMELIMIT, FakeArray, colmajorstrides, brfft_output_size
+using FFTW: FFTW
+
 """
     RFFT()
 
@@ -150,4 +151,4 @@ function plan_brfft!(X::StridedArray{Complex{T},N}, d, region;
     Yflat = reinterpret(T, reshape(X, prod(isize)))
     Y = view(reshape(Yflat, osize), Base.OneTo.(sz)...) # Y is padded
     return FFTW.rFFTWPlan{Complex{T},FFTW.BACKWARD,true,N}(X, Y, region, flags, timelimit)
-end
\ No newline at end of file
+end
diff --git a/test/rfft.jl b/test/rfft.jl
index c8353159..e1212c14 100644
--- a/test/rfft.jl
+++ b/test/rfft.jl
@@ -204,14 +204,18 @@ function test_rfft!(size_in; flags = FFTW.ESTIMATE, benchmark=true)
     outofplace_place = @inferred PencilFFTPlan(pen, Transforms.RFFT(), fftw_flags=flags)
 
     # Allocate and initialise scalar fields 
-    u = allocate_input(inplace_plan)
-    x = first(u) ; x̂ = last(u) # Real and Complex views
+    u = @inferred allocate_input(inplace_plan)
+    x = first(u); x̂ = last(u) # Real and Complex views
 
-    v = allocate_input(outofplace_place)
-    v̂ = allocate_output(outofplace_place)
+    v = @inferred allocate_input(outofplace_place)
+    v̂ = @inferred allocate_output(outofplace_place)
 
-    fill!(x, 0.0) ; rank == 0 && (x[1] = 1.0) ; rank == 0 && (x[2] = 2.0) ;
-    fill!(v, 0.0) ; rank == 0 && (v[1] = 1.0) ; rank == 0 && (v[2] = 2.0) ;
+    fill!(x, 0.0)
+    fill!(v, 0.0)
+    if rank == 0
+        x[1] = 1.0; x[2] = 2.0
+        v[1] = 1.0; v[2] = 2.0
+    end
 
     @testset "RFFT! vs RFFT" begin
         mul!(u, inplace_plan, u)
@@ -237,43 +241,48 @@ function test_rfft!(size_in; flags = FFTW.ESTIMATE, benchmark=true)
     end
     if benchmark
         println("micro-benchmarks: ")
-        println("- rfft!...\t") ; @time mul!(u, inplace_plan, u)  ; @time mul!(u, inplace_plan, u)
-        println("- rfft...\t") ; @time mul!(v̂, outofplace_place, v)  ; @time mul!(v̂, outofplace_place, v)
+        println("- rfft!...\t")
+        @time mul!(u, inplace_plan, u)
+        @time mul!(u, inplace_plan, u)
+        println("- rfft...\t")
+        @time mul!(v̂, outofplace_place, v)
+        @time mul!(v̂, outofplace_place, v)
         println("done ")
     end
     MPI.Barrier(comm)
 end
 
 function test_1D_rfft!(size_in; flags=FFTW.ESTIMATE)
-    dims = (size_in,) ; 
-    dims_padded = (2(dims[1] ÷ 2 + 1), dims[2:end]...) ; 
+    dims = (size_in,)
+    dims_padded = (2(dims[1] ÷ 2 + 1), dims[2:end]...)
     dims_fourier = ((dims[1] ÷ 2 + 1), dims[2:end]...)
 
-    A = zeros(Float64, dims_padded) ; 
-    a = view(A, Base.OneTo.(dims)...) ; 
+    A = zeros(Float64, dims_padded)
+    a = view(A, Base.OneTo.(dims)...)
     â = reinterpret(Complex{Float64}, A)
 
-    â2 = zeros(Complex{Float64}, dims_fourier) ;
-    a2 = zeros(Float64, dims) ;
-    
-    p = Transforms.plan_rfft!(a, 1, flags=flags) ;
-    p2 = FFTW.plan_rfft(a2, 1, flags=flags) ;
-    bp = Transforms.plan_brfft!(â, dims[1], 1, flags=flags) ; 
-    bp2 = FFTW.plan_brfft(â, dims[1], 1, flags=flags) ; 
+    â2 = zeros(Complex{Float64}, dims_fourier)
+    a2 = zeros(Float64, dims)
+
+    p = Transforms.plan_rfft!(a, 1, flags=flags)
+    p2 = FFTW.plan_rfft(a2, 1, flags=flags)
+    bp = Transforms.plan_brfft!(â, dims[1], 1, flags=flags)
+    bp2 = FFTW.plan_brfft(â, dims[1], 1, flags=flags)
 
-    fill!(a2, 0.0) ; a2[1] = 1 ; a2[2] = 2 ;
-    fill!(a, 0.0) ; a[1] = 1 ; a[2] = 2 ;
+    fill!(a2, 0.0); a2[1] = 1; a2[2] = 2;
+    fill!(a, 0.0); a[1] = 1; a[2] = 2;
     
     @testset "1D RFFT! vs RFFT" begin
-        mul!(â, p, a) ; 
-        mul!(â2, p2, a2) ;
+        mul!(â, p, a)
+        mul!(â2, p2, a2)
         @test all(isapprox.(â2, â, atol = 1e-8))
         
-        mul!(a, bp, â) ; 
-        mul!(a2, bp2, â2) ;
+        mul!(a, bp, â) 
+        mul!(a2, bp2, â2)
         @test all(isapprox.(a2, a, atol = 1e-8))
         
-        a /= size_in ; a2 /= size_in ;
+        a /= size_in
+        a2 /= size_in
         @test all(isapprox.(a[1:3], [1.0, 2.0, 0.0], atol = 1e-8))
     end
 
@@ -295,4 +304,4 @@ test_1D_rfft!(first(DATA_DIMS_EVEN))
 
 test_rfft!(DATA_DIMS_ODD, benchmark=false)
 test_rfft!(DATA_DIMS_EVEN, benchmark=false)
-# test_rfft!((256,256,256)) # similar execution times for large rfft and rfft!
\ No newline at end of file
+# test_rfft!((256,256,256)) # similar execution times for large rfft and rfft!

From 0ca7322fbf89eff4348fd848ffe494614b70326e Mon Sep 17 00:00:00 2001
From: Juan Ignacio Polanco <juan-ignacio.polanco@cnrs.fr>
Date: Thu, 25 May 2023 10:11:44 +0200
Subject: [PATCH 18/22] Simplify dispatch in allocate_input

---
 src/allocate.jl | 39 +++++++++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/src/allocate.jl b/src/allocate.jl
index 24e19ec4..ec0f18d2 100644
--- a/src/allocate.jl
+++ b/src/allocate.jl
@@ -41,17 +41,26 @@ size `dims`, and a tuple of `N` `PencilArray`s.
     # p * v_in  # not allowed!!
     ```
 """
-function allocate_input end
+function allocate_input(p::PencilFFTPlan)
+    inplace = is_inplace(p)
+    _allocate_input(Val(inplace), p)
+end
 
 # Out-of-place version
-function allocate_input(p::PencilFFTPlan{T,N,false} where {T,N})
+function _allocate_input(inplace::Val{false}, p::PencilFFTPlan)
     T = eltype_input(p)
     pen = pencil_input(p)
     PencilArray{T}(undef, pen, p.extra_dims...)
 end
 
 # In-place version
-function allocate_input(p::PencilFFTPlan{T,N,true} where {T,N})
+function _allocate_input(inplace::Val{true}, p::PencilFFTPlan)
+    (; transforms,) = p.global_params
+    _allocate_input(inplace, p, transforms...)
+end
+
+# In-place: generic case
+function _allocate_input(inplace::Val{true}, p::PencilFFTPlan, transforms...)
     pencils = map(pp -> pp.pencil_in, p.plans)
 
     # Note that for each 1D plan, the input and output pencils are the same.
@@ -63,15 +72,14 @@ function allocate_input(p::PencilFFTPlan{T,N,true} where {T,N})
     ManyPencilArray{T}(undef, pencils...; extra_dims=p.extra_dims)
 end
 
-# In-place version for real-to-complex RFFT!
-function allocate_input(plan::PencilFFTPlan{T,N,true, 
-    Nt,  # number of transformed dimensions
-    Nd,  # number of decomposed dimensions
-    Ne,  # number of extra dimensions
-    G }) where {N,Nd,Ne,G<:PencilFFTs.GlobalFFTParams{T,Nt,true,TR}} where {T<:FFTReal,Nt,TR<:Tuple{Transforms.RFFT!,Vararg{Transforms.FFT!}}}
-    plans = plan.plans
+# In-place: specific case of RFFT!
+function _allocate_input(
+        inplace::Val{true}, p::PencilFFTPlan{T},
+        ::Transforms.RFFT!, ::Vararg{Transforms.FFT!},
+    ) where {T}
+    plans = p.plans
     pencils = (first(plans).pencil_in, first(plans).pencil_out,  map(pp -> pp.pencil_in, plans[2:end])...)
-    ManyPencilArrayRFFT!{T}(undef, pencils...; extra_dims=plan.extra_dims)
+    ManyPencilArrayRFFT!{T}(undef, pencils...; extra_dims=p.extra_dims)
 end
 
 allocate_input(p::PencilFFTPlan, dims...) =
@@ -89,17 +97,20 @@ If `p` is an in-place plan, a [`ManyPencilArray`](https://jipolanco.github.io/Pe
 
 See [`allocate_input`](@ref) for details.
 """
-function allocate_output end
+function allocate_output(p::PencilFFTPlan)
+    inplace = is_inplace(p)
+    _allocate_output(Val(inplace), p)
+end
 
 # Out-of-place version.
-function allocate_output(p::PencilFFTPlan{T,N,false} where {T,N})
+function _allocate_output(inplace::Val{false}, p::PencilFFTPlan)
     T = eltype_output(p)
     pen = pencil_output(p)
     PencilArray{T}(undef, pen, p.extra_dims...)
 end
 
 # For in-place plans, the output and input are the same ManyPencilArray.
-allocate_output(p::PencilFFTPlan{T,N,true} where {T,N}) = allocate_input(p)
+_allocate_output(inplace::Val{true}, p::PencilFFTPlan) = _allocate_input(inplace, p)
 
 allocate_output(p::PencilFFTPlan, dims...) =
     _allocate_many(allocate_output, p, dims...)

From 3dee0a62d0f62b9b86b19e281c1364851856b621 Mon Sep 17 00:00:00 2001
From: Juan Ignacio Polanco <juan-ignacio.polanco@cnrs.fr>
Date: Thu, 25 May 2023 12:20:47 +0200
Subject: [PATCH 19/22] Use PencilArrays.AbstractManyPencilArray

---
 Project.toml           |  2 +-
 src/multiarrays_r2c.jl | 80 +++---------------------------------------
 src/operations.jl      |  3 +-
 3 files changed, 8 insertions(+), 77 deletions(-)

diff --git a/Project.toml b/Project.toml
index 92f7c81f..f6abdf62 100644
--- a/Project.toml
+++ b/Project.toml
@@ -16,7 +16,7 @@ TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 AbstractFFTs = "1"
 FFTW = "1.6"
 MPI = "0.19, 0.20"
-PencilArrays = "0.17"
+PencilArrays = "0.18"
 Reexport = "1"
 TimerOutputs = "0.5"
 julia = "1.7"
diff --git a/src/multiarrays_r2c.jl b/src/multiarrays_r2c.jl
index 2a04de3c..ad02c15d 100644
--- a/src/multiarrays_r2c.jl
+++ b/src/multiarrays_r2c.jl
@@ -1,7 +1,8 @@
 # copied and modified from https://github.com/jipolanco/PencilArrays.jl/blob/master/src/multiarrays.jl
-import PencilArrays:ManyPencilArray, _make_arrays
+import PencilArrays: AbstractManyPencilArray, _make_arrays
+
 """
-    ManyPencilArrayRFFT!{T,N,M}
+    ManyPencilArrayRFFT!{T,N,M} <: AbstractManyPencilArray{N,M}
 
 Container holding `M` different [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray) views to the same
 underlying data buffer. All views share the same and dimensionality `N`.
@@ -22,14 +23,14 @@ The optional `extra_dims` argument is the same as for [`PencilArray`](https://ji
 
 See also [`ManyPencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.ManyPencilArray)
 """
-struct ManyPencilArrayRFFT!{ ## TODO: redefine PencilArrays.ManyPencilArray instead or creating new struct, with eltype RealOrComplex{T}
+struct ManyPencilArrayRFFT!{
         T,  # element type of real array
         N,  # number of dimensions of each array (including extra_dims)
         M,  # number of arrays
         Arrays <: Tuple{Vararg{PencilArray,M}},
         DataVector <: AbstractVector{T},
         DataVectorComplex <: AbstractVector{Complex{T}},
-    }
+    } <: AbstractManyPencilArray{N, M}
     data   :: DataVector
     data_complex   :: DataVectorComplex
     arrays :: Arrays
@@ -76,74 +77,3 @@ function _make_real_array(data, extra_dims, p)
     arr = view(parent_arr, axes_local...)
     PencilArray(p, arr)
 end
-
-# all methods redefinitions below could be avoided by using a 
-# ManyPencilArray{ComplexOrReal{T}} instead of a ManyPencilArrayRFFT!{T}
-# or defining a supertype in PencilArrays/src/multiarrays.jl
-Base.eltype(::ManyPencilArrayRFFT!{T}) where {T} = T
-Base.ndims(::ManyPencilArrayRFFT!{T,N}) where {T,N} = N
-
-"""
-    Tuple(A::ManyPencilArrayRFFT!) -> (u1, u2, …)
-
-Returns the [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray)s wrapped by `A`.
-
-This can be useful for iterating over all the wrapped arrays.
-"""
-@inline Base.Tuple(A::ManyPencilArrayRFFT!) = A.arrays
-
-"""
-    length(A::ManyPencilArrayRFFT!)
-
-Returns the number of [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray)s wrapped by `A`.
-"""
-Base.length(::ManyPencilArrayRFFT!{T,N,M}) where {T,N,M} = M
-
-"""
-    first(A::ManyPencilArrayRFFT!)
-
-Returns the first [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray) wrapped by `A`.
-"""
-Base.first(A::ManyPencilArrayRFFT!) = first(A.arrays)
-
-"""
-    last(A::ManyPencilArrayRFFT!)
-
-Returns the last [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray) wrapped by `A`.
-"""
-Base.last(A::ManyPencilArrayRFFT!) = last(A.arrays)
-
-"""
-    getindex(A::ManyPencilArrayRFFT!, ::Val{i})
-    getindex(A::ManyPencilArrayRFFT!, i::Integer)
-
-Returns the i-th [`PencilArray`](https://jipolanco.github.io/PencilArrays.jl/dev/PencilArrays/#PencilArrays.PencilArray) wrapped by `A`.
-
-If possible, the `Val{i}` form should be preferred, as it ensures that the full
-type of the returned `PencilArray` is known by the compiler.
-
-See also [`first(::ManyPencilArrayRFFT!)`](@ref), [`last(::ManyPencilArrayRFFT!)`](@ref).
-
-# Example
-
-```julia
-A = ManyPencilArrayRFFT!(pencil1, pencil2, pencil3)
-
-# Get the PencilArray associated to `pencil2`.
-u2 = A[2]
-u2 = A[Val(2)]
-```
-"""
-Base.getindex(A::ManyPencilArrayRFFT!, ::Val{i}) where {i} =
-    _getindex(Val(i), A.arrays...)
-@inline Base.getindex(A::ManyPencilArrayRFFT!, i) = A[Val(i)]
-
-@inline function _getindex(::Val{i}, a, t::Vararg) where {i}
-    i :: Integer
-    i <= 0 && throw(BoundsError("index must be >= 1"))
-    i == 1 && return a
-    _getindex(Val(i - 1), t...)
-end
-
-# This will happen if the index `i` intially passed is too large.
-@inline _getindex(::Val) = throw(BoundsError("invalid index"))
\ No newline at end of file
diff --git a/src/operations.jl b/src/operations.jl
index 32e4a0cb..8caab361 100644
--- a/src/operations.jl
+++ b/src/operations.jl
@@ -191,6 +191,7 @@ function _apply_plans!(
     dir::Val, full_plan::PencilFFTPlan{T,N,true},
     A::ManyPencilArrayRFFT!{T,N}, A_again::ManyPencilArrayRFFT!{T,N}) where {T<:FFTReal,N}
     @assert A === A_again
+
     # pairs for 1D FFT! plans, RFFT! plan is treated separately
     pairs = _make_pairs(full_plan.plans[2:end], A.arrays[3:end])
 
@@ -206,7 +207,7 @@ function _apply_plans!(
         # apply recursively all transforms but last (BFFT!)
         _apply_plans_in_place!(dir, full_plan, nothing, pp...)
         # transpose before last transform
-        t = if pp ==()
+        t = if pp == ()
                 nothing
             else
                 @assert Base.mightalias(A.arrays[3], A.arrays[2]) # they're aliased!

From 7ee3f883bbd0f4df0e3adf92a19e04429f7cea38 Mon Sep 17 00:00:00 2001
From: Juan Ignacio Polanco <juan-ignacio.polanco@cnrs.fr>
Date: Thu, 25 May 2023 13:54:59 +0200
Subject: [PATCH 20/22] Update version

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index f6abdf62..95eae0c8 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "PencilFFTs"
 uuid = "4a48f351-57a6-4416-9ec4-c37015456aae"
 authors = ["Juan Ignacio Polanco <jipolanc@gmail.com>"]
-version = "0.14.4"
+version = "0.15.0"
 
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"

From 3e1991b13a508b10acde12d462b276b75bb43dea Mon Sep 17 00:00:00 2001
From: Juan Ignacio Polanco <juan-ignacio.polanco@cnrs.fr>
Date: Thu, 25 May 2023 13:55:07 +0200
Subject: [PATCH 21/22] Don't test on Julia 1.8

---
 .github/workflows/ci.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index daac645e..d2677881 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -21,8 +21,7 @@ jobs:
         experimental: [false]
         version:
           - '1.7'
-          - '1.8'
-          - '~1.9.0-0'
+          - '1.9'
         os:
           - ubuntu-latest
         arch:

From c22f1df1d765510e3e599a4e08718c635e8e6eb9 Mon Sep 17 00:00:00 2001
From: Juan Ignacio Polanco <juan-ignacio.polanco@cnrs.fr>
Date: Thu, 25 May 2023 16:56:46 +0200
Subject: [PATCH 22/22] Update docs

---
 docs/src/PencilFFTs.md | 6 ++++++
 docs/src/Transforms.md | 2 ++
 src/multiarrays_r2c.jl | 2 +-
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/docs/src/PencilFFTs.md b/docs/src/PencilFFTs.md
index cb03891b..803824ea 100644
--- a/docs/src/PencilFFTs.md
+++ b/docs/src/PencilFFTs.md
@@ -28,3 +28,9 @@ scale_factor(::PencilFFTPlan)
 timer(::PencilFFTPlan)
 is_inplace(::PencilFFTPlan)
 ```
+
+## Internals
+
+```@docs
+ManyPencilArrayRFFT!
+```
diff --git a/docs/src/Transforms.md b/docs/src/Transforms.md
index 3c2e2c10..d4fe4a42 100644
--- a/docs/src/Transforms.md
+++ b/docs/src/Transforms.md
@@ -17,7 +17,9 @@ BFFT
 BFFT!
 
 RFFT
+RFFT!
 BRFFT
+BRFFT!
 
 R2R
 R2R!
diff --git a/src/multiarrays_r2c.jl b/src/multiarrays_r2c.jl
index ad02c15d..0faf8c5b 100644
--- a/src/multiarrays_r2c.jl
+++ b/src/multiarrays_r2c.jl
@@ -9,7 +9,7 @@ underlying data buffer. All views share the same and dimensionality `N`.
 The element type `T` of the first view is real, that of subsequent views is 
 `Complex{T}`. 
 
-This can be used to perform in-place real-to-complex plan, see also[`Tranforms.RFFT!`](@ref). 
+This can be used to perform in-place real-to-complex plan, see also[`Transforms.RFFT!`](@ref). 
 It is used internally for such transforms by [`allocate_input`](@ref) and should not be constructed directly.
 
 ---