WIP: Shared arrays and GPU arrays #19

Merged: 4 commits, Oct 31, 2018

7 changes: 7 additions & 0 deletions .editorconfig
@@ -0,0 +1,7 @@
[*]
end_of_line = lf
insert_final_newline = true
indent_style = space
indent_size = 4
trim_trailing_whitespace = true
max_line_length = 100
112 changes: 68 additions & 44 deletions examples/blockarray_backend.jl
@@ -1,6 +1,8 @@
-# using CUDAnative
-# device!(0)
-# using CuArrays
+using CUDAnative
+device!(0)
+using CuArrays
+using CuArrays.CUBLAS: libcublas_handle, cublasSetStream_v2, CuDefaultStream
+using CUDAnative.CUDAdrv: CuStream
using GPUArrays
using BlockArrays: _BlockArray, PseudoBlockArray, BlockArray, BlockMatrix, BlockVector,
nblocks, Block, cumulsizes, AbstractBlockVector
@@ -30,7 +32,7 @@ if @isdefined CuArray
end
###############

adapt(T::Type, b::BandedBlockBandedMatrix) =
_BandedBlockBandedMatrix(adapt(T, b.data), b.block_sizes)
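For reference (not part of the diff), a minimal sketch of how this adapt method is used; the sizes are illustrative and assume the CUDA packages loaded at the top of the file:

    # Illustrative: move a block-banded matrix to GPU storage, block structure intact.
    rows = cols = fill(3, 4)                  # 4×4 grid of 3×3 blocks
    A = BandedBlockBandedMatrix{Float64}(undef, (rows, cols), (1, 1), (1, 1))
    A.data .= rand.()                         # fill the underlying block storage
    Agpu = adapt(CuArray, A)                  # same block structure, CuArray blocks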


@@ -55,6 +57,44 @@ function LinearAlgebra.mul!(c::BlockVector{T},
x.blocks[j], one(T), c.blocks[i])
end

# mostly for symmetry with the streamed version;
# synchronizing could be left to the user
synchronize(c)
c
end

function streamed_mul!(c::BlockVector{T},
A::BandedBlockBandedMatrix{T, <: BlockMatrix},
x::BlockVector{T};
nstreams :: Union{Integer, Nothing} = nothing) where T
@assert nblocks(A, 1) == nblocks(c, 1)
@assert cumulsizes(A, 1) == cumulsizes(c, 1)
@assert nblocks(A, 2) == nblocks(x, 1)
@assert cumulsizes(A, 2) == cumulsizes(x, 1)

if nstreams isa Nothing
nₛ = nblocks(A, 1)
else
nₛ = min(nstreams, nblocks(A, 1))
end
streams = [CuStream() for u in 1:nₛ]
for (i, block) in enumerate(c.blocks)
cublasSetStream_v2(libcublas_handle[], streams[(i - 1) % nₛ + 1])
fill!(block, zero(eltype(block)))
end
l, u = blockbandwidths(A)
λ, μ = subblockbandwidths(A)
N,M = nblocks(A)

@inbounds for i = 1:N, j = max(1,i-l):min(M,i+u)
cublasSetStream_v2(libcublas_handle[], streams[(i - 1) % nₛ + 1])
BLAS.gbmv!('N', size(view(A, Block(i, j)), 1), λ, μ, one(T),
A.data.blocks[i - j + u + 1, j],
x.blocks[j], one(T), c.blocks[i])
end

synchronize(c)
cublasSetStream_v2(libcublas_handle[], CuDefaultStream())
c
end
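
A hedged usage sketch (not part of the diff): streamed_mul! is called like mul! but takes the stream count as a keyword, assigning block rows to CUDA streams round-robin so independent gbmv! calls can overlap. Assumes a working CUDA device; mirrors the call in testme further down.

    l = u = λ = μ = 1
    rows = cols = fill(16, 8)                 # 8×8 grid of 16×16 blocks
    A = BandedBlockBandedMatrix{Float64}(undef, (rows, cols), (l, u), (λ, μ))
    A.data .= rand.()
    x = PseudoBlockArray(rand(size(A, 2)), cols)
    c = PseudoBlockArray(zeros(size(A, 1)), rows)
    cgpu = streamed_mul!(cu(c), cu(A), cu(x); nstreams = 4)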

@@ -83,31 +123,6 @@ function banded_mul!(c::BlockVector{T},
c
end

function nofill_mul!(Cblock::BandedBlockBandedMatrix{T, <: BlockMatrix},
Ablock::BandedBlockBandedMatrix{T, <: BlockMatrix},
Xblock::BandedBlockBandedMatrix{T, <: BlockMatrix}) where T
@assert nblocks(Ablock, 1) == nblocks(Cblock, 1)
@assert cumulsizes(Ablock, 1) == cumulsizes(Cblock, 1)
@assert nblocks(Ablock, 2) == nblocks(Xblock, 1)
@assert cumulsizes(Ablock, 2) == cumulsizes(Xblock, 1)
@assert nblocks(Xblock, 2) == nblocks(Cblock, 2)
@assert cumulsizes(Xblock, 2) == cumulsizes(Xblock, 2)

lₐ, uₐ = blockbandwidths(Ablock)
lₓ, uₓ = blockbandwidths(xblock)
λ, μ = subblockbandwidths(Ablock)
N,M = nblocks(Ablock)
M, K = nblocks(Xblock)

@inbounds for i = 1:N, j = max(1,i-lₐ):min(M,i+uₐ), k = max(1, j - lₓ):min(j + uₓ, K)
BLAS.gbmv!('N', size(view(Ablock, Block(i, j)), 1), λ, μ, one(T),
Ablock.data.blocks[i - j + u + 1, j],
Xblock.blocks[j], one(T), Cblock.blocks[i])
end

Cblock
end

using Test

function testme()
@@ -118,7 +133,7 @@ function testme()
@test adapt(JLArray, bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
@test eltype(adapt(JLArray, bmat)) === Float64
if @isdefined CuArray
-        @test cu(bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
+        @test cu(bmat) isa BlockArray{T, 2, CuArray{T, 2}} where T
@test eltype(cu(bmat)) === Float32
end
end
@@ -129,8 +144,8 @@ function testme()
@test adapt(JLArray, bmat) isa PseudoBlockArray
if @isdefined CuArray
@test !(adapt(CuArray, bmat) isa PseudoBlockArray)
-        @test adapt(CuArray, bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
-        @test cu(bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
+        @test adapt(CuArray, bmat) isa BlockArray{T, 2, CuArray{T, 2}} where T
+        @test cu(bmat) isa BlockArray{T, 2, CuArray{T, 2}} where T
@test eltype(cu(bmat)) === Float32
end
end
@@ -143,7 +158,7 @@ function testme()
if @isdefined CuArray
@test adapt(CuArray, bmat).data isa BlockArray{T, 2, CuArray{T, 2}} where T
@test cu(bmat) isa BandedBlockBandedMatrix
-        @test cu(bmat).data isa BlockArray{T, 2, JLArray{T, 2}} where T
+        @test cu(bmat).data isa BlockArray{T, 2, CuArray{T, 2}} where T
@test eltype(cu(bmat)) === Float32
end
end
@@ -164,7 +179,10 @@ function testme()
@test LinearAlgebra.mul!(cblock, Ablock, xblock) ≈ A * x
cblock .= 0
@test banded_mul!(cblock, Ablock, xblock) ≈ A * x
cgpu = cu(cblock)
@test streamed_mul!(cgpu, cu(A), cu(x)) ≈ A * x
end

end
end

@@ -173,16 +191,14 @@ using Statistics

function benchmarks()
suite = BenchmarkGroup()
suite["viabm"] = BenchmarkGroup()
# suite["viabm"] = BenchmarkGroup()
suite["pseudo"] = BenchmarkGroup()
suite["block"] = BenchmarkGroup()
-    possibles = [5, 10, 100, 500, 1000]
-    for N in possibles #, n in possibles
-        n = N
-        suite["pseudo"]["N=$N n=$n"] = BenchmarkGroup()
-        suite["block"]["N=$N n=$n"] = BenchmarkGroup()
-        suite["viabm"]["N=$N n=$n"] = BenchmarkGroup()

+    if @isdefined CuArrays
+        suite["gpu"] = BenchmarkGroup()
+        suite["streams"] = BenchmarkGroup()
+    end
+    for N in [10, 100, 500, 1000], n = [N ÷ 5, N, 5N, 10N]
l, u, λ, μ = rand(0:2, 4)
M, m = N, n

@@ -201,9 +217,17 @@ function benchmarks()
LinearAlgebra.mul!($(adapt(BlockArray, c)), $(adapt(BlockArray, A)),
$(adapt(BlockArray, x)))
end
suite["viabm"]["N=$N n=$n"] = @benchmarkable begin
banded_mul!($(adapt(BlockArray, c)), $(adapt(BlockArray, A)),
$(adapt(BlockArray, x)))
+        if @isdefined CuArrays
+            gpus = Dict(:c => adapt(CuArray, c),
+                        :x => adapt(CuArray, x),
+                        :A => adapt(CuArray, A))
+
+            suite["gpu"]["N=$N n=$n"] = @benchmarkable begin
+                LinearAlgebra.mul!($(gpus[:c]), $(gpus[:A]), $(gpus[:x]))
+            end
+            suite["streams"]["N=$N n=$n"] = @benchmarkable begin
+                gpuc = streamed_mul!($(gpus[:c]), $(gpus[:A]), $(gpus[:x]))
+            end
end
end
suite
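Not part of the diff: a typical way to run a BenchmarkTools suite like the one above, assuming BenchmarkTools and Statistics are loaded.

    suite = benchmarks()
    tune!(suite)                          # pick evals/samples per benchmark
    results = run(suite, verbose = true)
    median(results["pseudo"])             # summarize one group
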
175 changes: 175 additions & 0 deletions examples/shared_array_backend.jl
@@ -0,0 +1,175 @@
using BlockArrays: _BlockArray, PseudoBlockArray, BlockArray, BlockMatrix, BlockVector,
nblocks, Block, cumulsizes, AbstractBlockVector
using BlockBandedMatrices: BandedBlockBandedMatrix, _BandedBlockBandedMatrix,
blockbandwidths, subblockbandwidths, blockbandwidth,
BandedBlockBandedSizes
using LinearAlgebra: BLAS
import LinearAlgebra
using BandedMatrices: _BandedMatrix, BandedMatrix
using SharedArrays
using LazyArrays
using Distributed: procs, remotecall_wait
import Distributed

import Adapt: adapt

adapt(T::Type, b::BandedBlockBandedMatrix) =
_BandedBlockBandedMatrix(adapt(T, b.data), b.block_sizes)
adapt(T::Type{<:AbstractArray}, b::PseudoBlockArray) =
PseudoBlockArray(T(b.blocks), b.block_sizes)


const SharedBandedBlockBandedMatrix =
BandedBlockBandedMatrix{T, PseudoBlockArray{T, 2, SharedArray{T, 2}}} where T

function SharedBandedBlockBandedMatrix{T}(::UndefInitializer,
bs::BandedBlockBandedSizes;
kwargs...) where T
Block = fieldtype(SharedBandedBlockBandedMatrix{T}, :data)
Shared = fieldtype(Block, :blocks)
kwargs = Dict(kwargs)
init = pop!(kwargs, :init, nothing)
shared = Shared(size(bs); kwargs...)
result = _BandedBlockBandedMatrix(Block(shared, bs.data_block_sizes), bs)
populate!(result, init)
result
end
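
Not part of the diff: a sketch of what the fieldtype introspection above resolves to at the REPL, given the type alias; exact printing may vary.

    julia> fieldtype(SharedBandedBlockBandedMatrix{Float64}, :data)
    PseudoBlockArray{Float64,2,SharedArray{Float64,2}}

    julia> fieldtype(PseudoBlockArray{Float64,2,SharedArray{Float64,2}}, :blocks)
    SharedArray{Float64,2}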

Distributed.procs(A::SharedBandedBlockBandedMatrix) = procs(A.data.blocks)

function populate!(A::SharedBandedBlockBandedMatrix, range, block_populate!::Function)
k = 1
for i in 1:nblocks(A, 1), j in max(i - A.u, 1):min(i + A.l, nblocks(A, 2))
if k in range
block_populate!(view(A, Block(i, j)), i, j)
end
k += 1
end
A
end


function populate!(A::SharedBandedBlockBandedMatrix, block_populate!::Function)
n = nnzero(nblocks(A)..., A.l, A.u)
m = length(procs(A))
@sync begin
for (i, proc) in enumerate(procs(A))
start = (n ÷ m) * (i - 1) + min((n % m), i - 1) + 1
stop = (n ÷ m) * i + min((n % m), i)
@async remotecall_wait(populate!, proc, A, start:stop, block_populate!)
end
end
A
end
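
The start/stop arithmetic above hands each worker a contiguous chunk of the n work items, with chunk sizes differing by at most one. A standalone sketch of the same formula (not part of the diff):

    # Worker i of m gets a contiguous range; the first n % m workers get one extra item.
    function chunk(n, m, i)
        start = (n ÷ m) * (i - 1) + min(n % m, i - 1) + 1
        stop  = (n ÷ m) * i + min(n % m, i)
        start:stop
    end
    @assert vcat((chunk(10, 3, i) for i in 1:3)...) == 1:10   # 4 + 3 + 3 items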

populate!(block_populate!::Function, A::SharedBandedBlockBandedMatrix) =
populate!(A, block_populate!)

SharedBandedBlockBandedMatrix{T}(init::Function,
bs::BandedBlockBandedSizes;
pids=Int[]) where T =
SharedBandedBlockBandedMatrix{T}(undef, bs; pids=pids, init=init)
function SharedBandedBlockBandedMatrix{T}(init::Function,
dims::NTuple{2, AbstractVector{Int}},
lu::NTuple{2, Int}, λμ::NTuple{2, Int};
pids=Int[]) where T
bs = BandedBlockBandedSizes(dims..., lu..., λμ...)
SharedBandedBlockBandedMatrix{T}(init, bs; pids=pids)
end

"""Number of non-zero elements in an banded matrix"""
function nnzero(n::Integer, m::Integer, l::Integer, u::Integer)
result = zero(n)
for i = 0:min(n, l)
result += min(n - i, m)
end
for i = 1:min(m, u)
result += min(m - i, n)
end
result
end
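
A quick brute-force cross-check of the closed form (illustrative; the testset below does the same with random sizes):

    # Count entries (i, j) inside the band of an n×m matrix with
    # l sub-diagonals and u super-diagonals, i.e. j - u ≤ i ≤ j + l.
    nnzero_bruteforce(n, m, l, u) = count(j - u <= i <= j + l for i in 1:n, j in 1:m)
    @assert nnzero(5, 7, 1, 2) == nnzero_bruteforce(5, 7, 1, 2) == 19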

function LinearAlgebra.mul!(c::AbstractBlockVector{T},
A::SharedBandedBlockBandedMatrix{T},
x::AbstractBlockVector{T}) where T
@assert nblocks(A, 1) == nblocks(c, 1)
@assert cumulsizes(A, 1) == cumulsizes(c, 1)
@assert nblocks(A, 2) == nblocks(x, 1)
@assert cumulsizes(A, 2) == cumulsizes(x, 1)

n = nblocks(A, 1)
m = length(procs(A))

@sync for (p, proc) in enumerate(procs(A))

p > n && continue
start = (n ÷ m) * (p - 1) + min((n % m), p - 1) + 1
stop = (n ÷ m) * p + min((n % m), p)

@async begin
remotecall_wait(proc, start:stop) do irange
@inbounds for i in irange
fill!(view(c, Block(i)), zero(eltype(c)))
for j = max(1, i - A.l):min(nblocks(A, 2), i + A.u)
c[Block(i)] .+= Mul(view(A, Block(i, j)), view(x, Block(j)))
end
end
end
end

end
c
end
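
A condensed usage sketch (not part of the diff; assumes worker processes were added with addprocs and that this file is loaded on all of them, e.g. via @everywhere):

    rows = cols = fill(4, 6)                  # 6×6 grid of 4×4 blocks
    A = SharedBandedBlockBandedMatrix{Float64}((rows, cols), (1, 1), (1, 1)) do block, i, j
        block .= rand.()                      # each worker fills its share of the blocks
    end
    x = adapt(SharedArray, PseudoBlockArray(rand(size(A, 2)), cols))
    c = adapt(SharedArray, PseudoBlockArray(zeros(size(A, 1)), rows))
    LinearAlgebra.mul!(c, A, x)               # block rows of c split across workers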


using Test

function testme()
SBBB = SharedBandedBlockBandedMatrix
@testset "shared array backend" begin

@testset "Initialization" begin
n, m = repeat([2], 4), repeat([3], 2)
A = SBBB{Int64}((n, m), (1, 1), (1, 0)) do block, i, j
block .= 0
if (i == 3) && (j == 2); block[2, 2] = 1 end
end
@test view(A, Block(3, 2))[2, 2] == 1
view(A, Block(3, 2))[2, 2] = 0
@test all(A .== 0)
end

@testset "count non-zero elements" begin
for i in 1:100
n, m = rand(1:10, 2)
l, u = rand(0:10, 2)
A = BandedMatrix{Int8}(undef, n, m, l, u)
A.data .= 1
@test sum(A) == nnzero(n, m, l, u)
end
end

@testset "Multiplication" begin
N, M = rand(1:3, 2)
l, u, λ, μ = rand(0:2, 4)
n, m = rand(max(l, u, λ, μ):20, N), rand(max(l, u, λ, μ):20, M)
A = BandedBlockBandedMatrix{Float64}(undef, (n, m), (l, u), (λ, μ))
A.data .= rand.()
x = PseudoBlockArray(Array{Float64, 1}(undef, size(A, 2)), m)
x .= rand.()

Ashared = adapt(SharedArray, A)
@test Ashared.data.blocks isa SharedArray
@test Ashared isa SharedBandedBlockBandedMatrix
@test length(procs(Ashared)) == max(1, length(procs()) - 1)
cshared = adapt(SharedArray,
PseudoBlockArray(Array{Float64, 1}(undef, size(A, 1)), n))
@test cshared.blocks isa SharedArray
cshared .= rand.()
xshared = adapt(SharedArray, x)

@test LinearAlgebra.mul!(cshared, Ashared, xshared) ≈ A * x
end

end
end
2 changes: 1 addition & 1 deletion src/linalg.jl
@@ -3,7 +3,7 @@ const Block1 = Block{1,Int}
const BlockRange1 = BlockRange{1,Tuple{UnitRange{Int}}}
const BlockIndexRange1 = BlockIndexRange{1,Tuple{UnitRange{Int64}}}
const SubBlockBandedMatrix{T,R1,R2} =
-    SubArray{T,2,BlockBandedMatrix{T},Tuple{BlockSlice{R1},BlockSlice{R2}}}
+    SubArray{T,2,<:BlockBandedMatrix{T},Tuple{BlockSlice{R1},BlockSlice{R2}}}

const SubBandedBlockBandedMatrix{T,R1,R2} =
SubArray{T,2,<:BandedBlockBandedMatrix{T},Tuple{BlockSlice{R1},BlockSlice{R2}}}