diff --git a/test/Project.toml b/test/Project.toml index c72f6e82..43263ce9 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -6,6 +6,7 @@ DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e" GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" +JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb" JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195" OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" diff --git a/test/include/jlarray.jl b/test/include/jlarray.jl index 5eba7cf8..749f85fd 100644 --- a/test/include/jlarray.jl +++ b/test/include/jlarray.jl @@ -1,385 +1,20 @@ -# File adapted from GPUArrays.jl test suite: -# https://raw.githubusercontent.com/JuliaGPU/GPUArrays.jl/master/test/jlarray.jl -# -# The MIT License (MIT) -# Copyright © 2016 Simon Danisch -# Copyright © 2018 JuliaGPU developers -# -# Added functions for PencilArrays tests (these seem to be defined for CuArray -# in CUDA.jl): -# - resize!(::DenseJLVector, n) -# - unsafe_wrap(::Type{JLArray}, ...) -# - rand!(::AbstractRNG, ::JLArray, ...) +import JLArrays -# ============================================================================ # +# Define a few more functions needed for PencilArrays tests +# (these seem to be defined for CuArray in CUDA.jl) +# TODO define these in JLArrays.jl -# reference implementation on the CPU +using Random: Random, AbstractRNG +using JLArrays: DenseJLVector, JLArray -# note that most of the code in this file serves to define a functional array type, -# the actual implementation of GPUArrays-interfaces is much more limited. - -module JLArrays - -export JLArray, jl - -using GPUArrays - -using Adapt - - -# -# Device functionality -# - -const MAXTHREADS = 256 - - -## execution - -struct JLBackend <: AbstractGPUBackend end - -mutable struct JLKernelContext <: AbstractKernelContext - blockdim::Int - griddim::Int - blockidx::Int - threadidx::Int - - localmem_counter::Int - localmems::Vector{Vector{Array}} -end - -function JLKernelContext(threads::Int, blockdim::Int) - blockcount = prod(blockdim) - lmems = [Vector{Array}() for i in 1:blockcount] - JLKernelContext(threads, blockdim, 1, 1, 0, lmems) -end - -function JLKernelContext(ctx::JLKernelContext, threadidx::Int) - JLKernelContext( - ctx.blockdim, - ctx.griddim, - ctx.blockidx, - threadidx, - 0, - ctx.localmems - ) -end - -struct Adaptor end -jlconvert(arg) = adapt(Adaptor(), arg) - -# FIXME: add Ref to Adapt.jl (but make sure it doesn't cause ambiguities with CUDAnative's) -struct JlRefValue{T} <: Ref{T} - x::T -end -Base.getindex(r::JlRefValue) = r.x -Adapt.adapt_structure(to::Adaptor, r::Base.RefValue) = JlRefValue(adapt(to, r[])) - -function GPUArrays.gpu_call(::JLBackend, f, args, threads::Int, blocks::Int; - name::Union{String,Nothing}) - ctx = JLKernelContext(threads, blocks) - device_args = jlconvert.(args) - tasks = Array{Task}(undef, threads) - for blockidx in 1:blocks - ctx.blockidx = blockidx - for threadidx in 1:threads - thread_ctx = JLKernelContext(ctx, threadidx) - tasks[threadidx] = @async f(thread_ctx, device_args...) - # TODO: require 1.3 and use Base.Threads.@spawn for actual multithreading - # (this would require a different synchronization mechanism) - end - for t in tasks - fetch(t) - end - end - return -end - - -## executed on-device - -# array type - -struct JLDeviceArray{T, N} <: AbstractDeviceArray{T, N} - data::Array{T, N} - dims::Dims{N} - - function JLDeviceArray{T,N}(data::Array{T, N}, dims::Dims{N}) where {T,N} - new(data, dims) - end -end - -Base.size(x::JLDeviceArray) = x.dims - -@inline Base.getindex(A::JLDeviceArray, index::Integer) = getindex(A.data, index) -@inline Base.setindex!(A::JLDeviceArray, x, index::Integer) = setindex!(A.data, x, index) - -# indexing - -for f in (:blockidx, :blockdim, :threadidx, :griddim) - @eval GPUArrays.$f(ctx::JLKernelContext) = ctx.$f -end - -# memory - -function GPUArrays.LocalMemory(ctx::JLKernelContext, ::Type{T}, ::Val{dims}, ::Val{id}) where {T, dims, id} - ctx.localmem_counter += 1 - lmems = ctx.localmems[blockidx(ctx)] - - # first invocation in block - data = if length(lmems) < ctx.localmem_counter - lmem = fill(zero(T), dims) - push!(lmems, lmem) - lmem - else - lmems[ctx.localmem_counter] - end - - N = length(dims) - JLDeviceArray{T,N}(data, tuple(dims...)) -end - -# synchronization - -@inline function GPUArrays.synchronize_threads(::JLKernelContext) - # All threads are getting started asynchronously, so a yield will yield to the next - # execution of the same function, which should call yield at the exact same point in the - # program, leading to a chain of yields effectively syncing the tasks (threads). - yield() - return -end - - -# -# Host abstractions -# - -struct JLArray{T, N} <: AbstractGPUArray{T, N} - data::Array{T, N} - dims::Dims{N} - - function JLArray{T,N}(data::Array{T, N}, dims::Dims{N}) where {T,N} - @assert isbitstype(T) "JLArray only supports bits types" - new(data, dims) - end -end - - -## constructors - -# type and dimensionality specified, accepting dims as tuples of Ints -JLArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N} = - JLArray{T,N}(Array{T, N}(undef, dims), dims) - -# type and dimensionality specified, accepting dims as series of Ints -JLArray{T,N}(::UndefInitializer, dims::Integer...) where {T,N} = JLArray{T,N}(undef, dims) - -# type but not dimensionality specified -JLArray{T}(::UndefInitializer, dims::Dims{N}) where {T,N} = JLArray{T,N}(undef, dims) -JLArray{T}(::UndefInitializer, dims::Integer...) where {T} = - JLArray{T}(undef, convert(Tuple{Vararg{Int}}, dims)) - -# empty vector constructor -JLArray{T,1}() where {T} = JLArray{T,1}(undef, 0) - -Base.similar(a::JLArray{T,N}) where {T,N} = JLArray{T,N}(undef, size(a)) -Base.similar(a::JLArray{T}, dims::Base.Dims{N}) where {T,N} = JLArray{T,N}(undef, dims) -Base.similar(a::JLArray, ::Type{T}, dims::Base.Dims{N}) where {T,N} = JLArray{T,N}(undef, dims) - -Base.copy(a::JLArray{T,N}) where {T,N} = JLArray{T,N}(copy(a.data), size(a)) - - -## derived types - -export DenseJLArray, DenseJLVector, DenseJLMatrix, DenseJLVecOrMat, - StridedJLArray, StridedJLVector, StridedJLMatrix, StridedJLVecOrMat, - AnyJLArray, AnyJLVector, AnyJLMatrix, AnyJLVecOrMat - -ContiguousSubJLArray{T,N,A<:JLArray} = Base.FastContiguousSubArray{T,N,A} - -# dense arrays: stored contiguously in memory -DenseReinterpretJLArray{T,N,A<:Union{JLArray,ContiguousSubJLArray}} = - Base.ReinterpretArray{T,N,S,A} where S -DenseReshapedJLArray{T,N,A<:Union{JLArray,ContiguousSubJLArray,DenseReinterpretJLArray}} = - Base.ReshapedArray{T,N,A} -DenseSubJLArray{T,N,A<:Union{JLArray,DenseReshapedJLArray,DenseReinterpretJLArray}} = - Base.FastContiguousSubArray{T,N,A} -DenseJLArray{T,N} = Union{JLArray{T,N}, DenseSubJLArray{T,N}, DenseReshapedJLArray{T,N}, - DenseReinterpretJLArray{T,N}} -DenseJLVector{T} = DenseJLArray{T,1} -DenseJLMatrix{T} = DenseJLArray{T,2} -DenseJLVecOrMat{T} = Union{DenseJLVector{T}, DenseJLMatrix{T}} - -# strided arrays -StridedSubJLArray{T,N,A<:Union{JLArray,DenseReshapedJLArray,DenseReinterpretJLArray}, - I<:Tuple{Vararg{Union{Base.RangeIndex, Base.ReshapedUnitRange, - Base.AbstractCartesianIndex}}}} = SubArray{T,N,A,I} -StridedJLArray{T,N} = Union{JLArray{T,N}, StridedSubJLArray{T,N}, DenseReshapedJLArray{T,N}, - DenseReinterpretJLArray{T,N}} -StridedJLVector{T} = StridedJLArray{T,1} -StridedJLMatrix{T} = StridedJLArray{T,2} -StridedJLVecOrMat{T} = Union{StridedJLVector{T}, StridedJLMatrix{T}} - -# anything that's (secretly) backed by a JLArray -AnyJLArray{T,N} = Union{JLArray{T,N}, WrappedArray{T,N,JLArray,JLArray{T,N}}} -AnyJLVector{T} = AnyJLArray{T,1} -AnyJLMatrix{T} = AnyJLArray{T,2} -AnyJLVecOrMat{T} = Union{AnyJLVector{T}, AnyJLMatrix{T}} - - -## array interface - -Base.elsize(::Type{<:JLArray{T}}) where {T} = sizeof(T) - -Base.size(x::JLArray) = x.dims -Base.sizeof(x::JLArray) = Base.elsize(x) * length(x) - -Base.unsafe_convert(::Type{Ptr{T}}, x::JLArray{T}) where {T} = - Base.unsafe_convert(Ptr{T}, x.data) - - -## interop with Julia arrays - -JLArray{T,N}(x::AbstractArray{<:Any,N}) where {T,N} = - JLArray{T,N}(convert(Array{T}, x), size(x)) - -# underspecified constructors -JLArray{T}(xs::AbstractArray{S,N}) where {T,N,S} = JLArray{T,N}(xs) -(::Type{JLArray{T,N} where T})(x::AbstractArray{S,N}) where {S,N} = JLArray{S,N}(x) -JLArray(A::AbstractArray{T,N}) where {T,N} = JLArray{T,N}(A) - -# idempotency -JLArray{T,N}(xs::JLArray{T,N}) where {T,N} = xs - -# adapt for the GPU -jl(xs) = adapt(JLArray, xs) -## don't convert isbits types since they are already considered GPU-compatible -Adapt.adapt_storage(::Type{JLArray}, xs::AbstractArray) = - isbits(xs) ? xs : convert(JLArray, xs) -## if an element type is specified, convert to it -Adapt.adapt_storage(::Type{<:JLArray{T}}, xs::AbstractArray) where {T} = - isbits(xs) ? xs : convert(JLArray{T}, xs) - -# adapt back to the CPU -Adapt.adapt_storage(::Type{Array}, xs::JLArray) = convert(Array, xs) - - -## conversions - -Base.convert(::Type{T}, x::T) where T <: JLArray = x - - -## broadcast - -using Base.Broadcast: BroadcastStyle, Broadcasted - -struct JLArrayStyle{N} <: AbstractGPUArrayStyle{N} end -JLArrayStyle(::Val{N}) where N = JLArrayStyle{N}() -JLArrayStyle{M}(::Val{N}) where {N,M} = JLArrayStyle{N}() - -BroadcastStyle(::Type{JLArray{T,N}}) where {T,N} = JLArrayStyle{N}() - -# Allocating the output container -Base.similar(bc::Broadcasted{JLArrayStyle{N}}, ::Type{T}) where {N,T} = - similar(JLArray{T}, axes(bc)) -Base.similar(bc::Broadcasted{JLArrayStyle{N}}, ::Type{T}, dims) where {N,T} = - JLArray{T}(undef, dims) - - -## memory operations - -function Base.copyto!(dest::Array{T}, d_offset::Integer, - source::DenseJLArray{T}, s_offset::Integer, - amount::Integer) where T - amount==0 && return dest - @boundscheck checkbounds(dest, d_offset) - @boundscheck checkbounds(dest, d_offset+amount-1) - @boundscheck checkbounds(source, s_offset) - @boundscheck checkbounds(source, s_offset+amount-1) - GC.@preserve dest source Base.unsafe_copyto!(pointer(dest, d_offset), - pointer(source, s_offset), amount) - return dest -end - -Base.copyto!(dest::Array{T}, source::DenseJLArray{T}) where {T} = - copyto!(dest, 1, source, 1, length(source)) - -function Base.copyto!(dest::DenseJLArray{T}, d_offset::Integer, - source::Array{T}, s_offset::Integer, - amount::Integer) where T - amount==0 && return dest - @boundscheck checkbounds(dest, d_offset) - @boundscheck checkbounds(dest, d_offset+amount-1) - @boundscheck checkbounds(source, s_offset) - @boundscheck checkbounds(source, s_offset+amount-1) - GC.@preserve dest source Base.unsafe_copyto!(pointer(dest, d_offset), - pointer(source, s_offset), amount) - return dest -end - -Base.copyto!(dest::DenseJLArray{T}, source::Array{T}) where {T} = - copyto!(dest, 1, source, 1, length(source)) - -function Base.copyto!(dest::DenseJLArray{T}, d_offset::Integer, - source::DenseJLArray{T}, s_offset::Integer, - amount::Integer) where T - amount==0 && return dest - @boundscheck checkbounds(dest, d_offset) - @boundscheck checkbounds(dest, d_offset+amount-1) - @boundscheck checkbounds(source, s_offset) - @boundscheck checkbounds(source, s_offset+amount-1) - GC.@preserve dest source Base.unsafe_copyto!(pointer(dest, d_offset), - pointer(source, s_offset), amount) - return dest -end - -Base.copyto!(dest::DenseJLArray{T}, source::DenseJLArray{T}) where {T} = - copyto!(dest, 1, source, 1, length(source)) - -# Added for PencilArrays tests Base.resize!(u::DenseJLVector, n) = (resize!(u.data, n); u) -# Added for PencilArrays tests function Base.unsafe_wrap(::Type{JLArray}, p::Ptr, dims::Union{Integer, Dims}; kws...) data = unsafe_wrap(Array, p, dims; kws...) JLArray(data) end -## random number generation - -using Random - -const GLOBAL_RNG = Ref{Union{Nothing,GPUArrays.RNG}}(nothing) -function GPUArrays.default_rng(::Type{<:JLArray}) - if GLOBAL_RNG[] === nothing - N = MAXTHREADS - state = JLArray{NTuple{4, UInt32}}(undef, N) - rng = GPUArrays.RNG(state) - Random.seed!(rng) - GLOBAL_RNG[] = rng - end - GLOBAL_RNG[] -end - -# Added for PencilArrays tests function Random.rand!(rng::AbstractRNG, u::JLArray, ::Type{X}) where {X} rand!(rng, u.data, X) u end - -## GPUArrays interfaces - -GPUArrays.backend(::Type{<:JLArray}) = JLBackend() - -Adapt.adapt_storage(::Adaptor, x::JLArray{T,N}) where {T,N} = - JLDeviceArray{T,N}(x.data, x.dims) - -function GPUArrays.mapreducedim!(f, op, R::AnyJLArray, A::Union{AbstractArray,Broadcast.Broadcasted}; - init=nothing) - if init !== nothing - fill!(R, init) - end - @allowscalar Base.reducedim!(op, R.data, map(f, A)) -end - -end