Commit ab96be6

mdavezac authored and dlfivefifty committed
WIP: Shared arrays and GPU arrays (#19)
* block array back end with example M*V
* Missing generalization over container types
* Shared array back end works, now to benchmark it
* Add editorconfig
1 parent 09f8364 commit ab96be6
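As context for the diffs below: both example files exercise a banded-block-banded matrix-vector product (the "M*V" of the commit message). A minimal sketch of that product, using only constructors that appear in this commit (the block sizes here are arbitrary):

using BlockArrays: PseudoBlockArray
using BlockBandedMatrices: BandedBlockBandedMatrix

l, u = 1, 1                     # block bandwidths
λ, μ = 1, 0                     # sub-block bandwidths
n = m = [3, 4, 5]               # row and column block sizes
A = BandedBlockBandedMatrix{Float64}(undef, (n, m), (l, u), (λ, μ))
A.data .= rand.()
x = PseudoBlockArray(Array{Float64, 1}(undef, size(A, 2)), m)
x .= rand.()
c = A * x                       # dense reference result the back ends are tested against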

File tree

4 files changed: +251 −45 lines changed


.editorconfig

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+[*]
+end_of_line = lf
+insert_final_newline = true
+indent_style = space
+indent_size = 4
+trim_trailing_whitespace = true
+max_line_length = 100

examples/blockarray_backend.jl

Lines changed: 68 additions & 44 deletions
@@ -1,6 +1,8 @@
-# using CUDAnative
-# device!(0)
-# using CuArrays
+using CUDAnative
+device!(0)
+using CuArrays
+using CuArrays.CUBLAS: libcublas_handle, cublasSetStream_v2, CuDefaultStream
+using CUDAnative.CUDAdrv: CuStream
 using GPUArrays
 using BlockArrays: _BlockArray, PseudoBlockArray, BlockArray, BlockMatrix, BlockVector,
                    nblocks, Block, cumulsizes, AbstractBlockVector
@@ -30,7 +32,7 @@ if @isdefined CuArray
 end
 ###############

-adapt(T::Type, b::BandedBlockBandedMatrix) =
+adapt(T::Type, b::BandedBlockBandedMatrix) =
     _BandedBlockBandedMatrix(adapt(T, b.data), b.block_sizes)

@@ -55,6 +57,44 @@ function LinearAlgebra.mul!(c::BlockVector{T},
                    x.blocks[j], one(T), c.blocks[i])
     end

+    # mostly for symmetry with streamed version
+    # synchronizing could be left to user
+    synchronize(c)
+    c
+end
+
+function streamed_mul!(c::BlockVector{T},
+                       A::BandedBlockBandedMatrix{T, <: BlockMatrix},
+                       x::BlockVector{T};
+                       nstreams::Union{Integer, Nothing} = nothing) where T
+    @assert nblocks(A, 1) == nblocks(c, 1)
+    @assert cumulsizes(A, 1) == cumulsizes(c, 1)
+    @assert nblocks(A, 2) == nblocks(x, 1)
+    @assert cumulsizes(A, 2) == cumulsizes(x, 1)
+
+    if nstreams isa Nothing
+        nₛ = nblocks(A, 1)
+    else
+        nₛ = min(nstreams, nblocks(A, 1))
+    end
+    streams = [CuStream() for u in 1:nₛ]
+    for (i, block) in enumerate(c.blocks)
+        cublasSetStream_v2(libcublas_handle[], streams[(i - 1) % nₛ + 1])
+        fill!(block, zero(eltype(block)))
+    end
+    l, u = blockbandwidths(A)
+    λ, μ = subblockbandwidths(A)
+    N, M = nblocks(A)
+
+    @inbounds for i = 1:N, j = max(1, i - l):min(M, i + u)
+        cublasSetStream_v2(libcublas_handle[], streams[(i - 1) % nₛ + 1])
+        BLAS.gbmv!('N', size(view(A, Block(i, j)), 1), λ, μ, one(T),
+                   A.data.blocks[i - j + u + 1, j],
+                   x.blocks[j], one(T), c.blocks[i])
+    end
+
+    synchronize(c)
+    cublasSetStream_v2(libcublas_handle[], CuDefaultStream())
     c
 end
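The `(i - 1) % nₛ + 1` index in `streamed_mul!` above assigns block rows to CUBLAS streams round-robin, so rows on different streams can overlap while each stream serializes its own `gbmv!` calls. A CPU-only sketch of the mapping (hypothetical sizes, no CUDA required):

nrows, nₛ = 7, 3                              # block rows, streams
[(i, (i - 1) % nₛ + 1) for i in 1:nrows]
# => [(1, 1), (2, 2), (3, 3), (4, 1), (5, 2), (6, 3), (7, 1)]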

@@ -83,31 +123,6 @@ function banded_mul!(c::BlockVector{T},
     c
 end

-function nofill_mul!(Cblock::BandedBlockBandedMatrix{T, <: BlockMatrix},
-                     Ablock::BandedBlockBandedMatrix{T, <: BlockMatrix},
-                     Xblock::BandedBlockBandedMatrix{T, <: BlockMatrix}) where T
-    @assert nblocks(Ablock, 1) == nblocks(Cblock, 1)
-    @assert cumulsizes(Ablock, 1) == cumulsizes(Cblock, 1)
-    @assert nblocks(Ablock, 2) == nblocks(Xblock, 1)
-    @assert cumulsizes(Ablock, 2) == cumulsizes(Xblock, 1)
-    @assert nblocks(Xblock, 2) == nblocks(Cblock, 2)
-    @assert cumulsizes(Xblock, 2) == cumulsizes(Xblock, 2)
-
-    lₐ, uₐ = blockbandwidths(Ablock)
-    lₓ, uₓ = blockbandwidths(xblock)
-    λ, μ = subblockbandwidths(Ablock)
-    N,M = nblocks(Ablock)
-    M, K = nblocks(Xblock)
-
-    @inbounds for i = 1:N, j = max(1,i-lₐ):min(M,i+uₐ), k = max(1, j - lₓ):min(j + uₓ, K)
-        BLAS.gbmv!('N', size(view(Ablock, Block(i, j)), 1), λ, μ, one(T),
-                   Ablock.data.blocks[i - j + u + 1, j],
-                   Xblock.blocks[j], one(T), Cblock.blocks[i])
-    end
-
-    Cblock
-end
-
 using Test

 function testme()
@@ -118,7 +133,7 @@ function testme()
         @test adapt(JLArray, bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
         @test eltype(adapt(JLArray, bmat)) === Float64
         if @isdefined CuArray
-            @test cu(bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
+            @test cu(bmat) isa BlockArray{T, 2, CuArray{T, 2}} where T
             @test eltype(cu(bmat)) === Float32
         end
     end
@@ -129,8 +144,8 @@ function testme()
         @test adapt(JLArray, bmat) isa PseudoBlockArray
         if @isdefined CuArray
             @test !(adapt(CuArray, bmat) isa PseudoBlockArray)
-            @test adapt(CuArray, bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
-            @test cu(bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
+            @test adapt(CuArray, bmat) isa BlockArray{T, 2, CuArray{T, 2}} where T
+            @test cu(bmat) isa BlockArray{T, 2, CuArray{T, 2}} where T
             @test eltype(cu(bmat)) === Float32
         end
     end
@@ -143,7 +158,7 @@ function testme()
         if @isdefined CuArray
             @test adapt(CuArray, bmat).data isa BlockArray{T, 2, CuArray{T, 2}} where T
             @test cu(bmat) isa BandedBlockBandedMatrix
-            @test cu(bmat).data isa BlockArray{T, 2, JLArray{T, 2}} where T
+            @test cu(bmat).data isa BlockArray{T, 2, CuArray{T, 2}} where T
             @test eltype(cu(bmat)) === Float32
         end
     end
@@ -164,7 +179,10 @@ function testme()
         @test LinearAlgebra.mul!(cblock, Ablock, xblock) ≈ A * x
         cblock .= 0
         @test banded_mul!(cblock, Ablock, xblock) ≈ A * x
+        cgpu = cu(cblock)
+        @test streamed_mul!(cgpu, cu(A), cu(x)) ≈ A * x
     end
+
 end
 end

@@ -173,16 +191,14 @@ using Statistics

 function benchmarks()
     suite = BenchmarkGroup()
-    suite["viabm"] = BenchmarkGroup()
+    # suite["viabm"] = BenchmarkGroup()
     suite["pseudo"] = BenchmarkGroup()
     suite["block"] = BenchmarkGroup()
-    possibles = [5, 10, 100, 500, 1000]
-    for N in possibles #, n in possibles
-        n = N
-        suite["pseudo"]["N=$N n=$n"] = BenchmarkGroup()
-        suite["block"]["N=$N n=$n"] = BenchmarkGroup()
-        suite["viabm"]["N=$N n=$n"] = BenchmarkGroup()
-
+    if @isdefined CuArrays
+        suite["gpu"] = BenchmarkGroup()
+        suite["streams"] = BenchmarkGroup()
+    end
+    for N in [10, 100, 500, 1000], n = [N ÷ 5, N, 5N, 10N]
         l, u, λ, μ = rand(0:2, 4)
         M, m = N, n

@@ -201,9 +217,17 @@ function benchmarks()
             LinearAlgebra.mul!($(adapt(BlockArray, c)), $(adapt(BlockArray, A)),
                                $(adapt(BlockArray, x)))
         end
-        suite["viabm"]["N=$N n=$n"] = @benchmarkable begin
-            banded_mul!($(adapt(BlockArray, c)), $(adapt(BlockArray, A)),
-                        $(adapt(BlockArray, x)))
+        if @isdefined CuArrays
+            gpus = Dict(:c => adapt(CuArray, c),
+                        :x => adapt(CuArray, x),
+                        :A => adapt(CuArray, A))
+
+            suite["gpu"]["N=$N n=$n"] = @benchmarkable begin
+                LinearAlgebra.mul!($(gpus[:c]), $(gpus[:A]), $(gpus[:x]))
+            end
+            suite["streams"]["N=$N n=$n"] = @benchmarkable begin
+                gpuc = streamed_mul!($(gpus[:c]), $(gpus[:A]), $(gpus[:x]))
+            end
         end
     end
     suite
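Running the suite follows the usual BenchmarkTools workflow; a minimal sketch, assuming the file's CUDA dependencies are available so the include succeeds:

using BenchmarkTools
include("examples/blockarray_backend.jl")

suite = benchmarks()
tune!(suite)                 # calibrate evaluation counts per benchmark
results = run(suite)         # returns a BenchmarkGroup of trials
median(results["block"])     # e.g. summarize the BlockArray back end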

examples/shared_array_backend.jl

Lines changed: 175 additions & 0 deletions
@@ -0,0 +1,175 @@
+using BlockArrays: _BlockArray, PseudoBlockArray, BlockArray, BlockMatrix, BlockVector,
+                   nblocks, Block, cumulsizes, AbstractBlockVector
+using BlockBandedMatrices: BandedBlockBandedMatrix, _BandedBlockBandedMatrix,
+                           blockbandwidths, subblockbandwidths, blockbandwidth,
+                           BandedBlockBandedSizes
+using LinearAlgebra: BLAS
+import LinearAlgebra
+using BandedMatrices: _BandedMatrix, BandedMatrix
+using SharedArrays
+using LazyArrays
+using Distributed: procs, remotecall_wait
+import Distributed
+
+import Adapt: adapt
+
+adapt(T::Type, b::BandedBlockBandedMatrix) =
+    _BandedBlockBandedMatrix(adapt(T, b.data), b.block_sizes)
+adapt(T::Type{<:AbstractArray}, b::PseudoBlockArray) =
+    PseudoBlockArray(T(b.blocks), b.block_sizes)
+
+
+const SharedBandedBlockBandedMatrix =
+    BandedBlockBandedMatrix{T, PseudoBlockArray{T, 2, SharedArray{T, 2}}} where T
+
+function SharedBandedBlockBandedMatrix{T}(::UndefInitializer,
+                                          bs::BandedBlockBandedSizes;
+                                          kwargs...) where T
+    Block = fieldtype(SharedBandedBlockBandedMatrix{T}, :data)
+    Shared = fieldtype(Block, :blocks)
+    kwargs = Dict(kwargs)
+    init = pop!(kwargs, :init, nothing)
+    shared = Shared(size(bs); kwargs...)
+    result = _BandedBlockBandedMatrix(Block(shared, bs.data_block_sizes), bs)
+    populate!(result, init)
+    result
+end
+
+Distributed.procs(A::SharedBandedBlockBandedMatrix) = procs(A.data.blocks)
+
+function populate!(A::SharedBandedBlockBandedMatrix, range, block_populate!::Function)
+    k = 1
+    for i in 1:nblocks(A, 1), j in max(i - A.u, 1):min(i + A.l, nblocks(A, 2))
+        if k in range
+            block_populate!(view(A, Block(i, j)), i, j)
+        end
+        k += 1
+    end
+    A
+end
+
+
+function populate!(A::SharedBandedBlockBandedMatrix, block_populate!::Function)
+    n = nnzero(nblocks(A)..., A.l, A.u)
+    m = length(procs(A))
+    @sync begin
+        for (i, proc) in enumerate(procs(A))
+            start = (n ÷ m) * (i - 1) + min((n % m), i - 1) + 1
+            stop = (n ÷ m) * i + min((n % m), i)
+            @async remotecall_wait(populate!, proc, A, start:stop, block_populate!)
+        end
+    end
+    A
+end
+
+populate!(block_populate!::Function, A::SharedBandedBlockBandedMatrix) =
+    populate!(A, block_populate!)
+
+SharedBandedBlockBandedMatrix{T}(init::Function,
+                                 bs::BandedBlockBandedSizes;
+                                 pids=Int[]) where T =
+    SharedBandedBlockBandedMatrix{T}(undef, bs; pids=pids, init=init)
+function SharedBandedBlockBandedMatrix{T}(init::Function,
+                                          dims::NTuple{2, AbstractVector{Int}},
+                                          lu::NTuple{2, Int}, λμ::NTuple{2, Int};
+                                          pids=Int[]) where T
+    bs = BandedBlockBandedSizes(dims..., lu..., λμ...)
+    SharedBandedBlockBandedMatrix{T}(init, bs; pids=pids)
+end
+
+"""Number of non-zero elements in a banded matrix"""
+function nnzero(n::Integer, m::Integer, l::Integer, u::Integer)
+    result = zero(n)
+    for i = 0:min(n, l)
+        result += min(n - i, m)
+    end
+    for i = 1:min(m, u)
+        result += min(m - i, n)
+    end
+    result
+end
+
+function LinearAlgebra.mul!(c::AbstractBlockVector{T},
+                            A::SharedBandedBlockBandedMatrix{T},
+                            x::AbstractBlockVector{T}) where T
+    @assert nblocks(A, 1) == nblocks(c, 1)
+    @assert cumulsizes(A, 1) == cumulsizes(c, 1)
+    @assert nblocks(A, 2) == nblocks(x, 1)
+    @assert cumulsizes(A, 2) == cumulsizes(x, 1)
+
+    n = nblocks(A, 1)
+    m = length(procs(A))
+
+    @sync for (p, proc) in enumerate(procs(A))
+
+        p > n && continue
+        start = (n ÷ m) * (p - 1) + min((n % m), p - 1) + 1
+        stop = (n ÷ m) * p + min((n % m), p)
+
+        @async begin
+            remotecall_wait(proc, start:stop) do irange
+                @inbounds for i in irange
+                    fill!(view(c, Block(i)), zero(eltype(c)))
+                    for j = max(1, i - A.l):min(nblocks(A, 2), i + A.u)
+                        c[Block(i)] .+= Mul(view(A, Block(i, j)), view(x, Block(j)))
+                    end
+                end
+            end
+        end

+    end
+    c
+end
+
+
+using Test
+
+function testme()
+    SBBB = SharedBandedBlockBandedMatrix
+    @testset "shared array backend" begin
+
+        @testset "Initialization" begin
+            n, m = repeat([2], 4), repeat([3], 2)
+            A = SBBB{Int64}((n, m), (1, 1), (1, 0)) do block, i, j
+                block .= 0
+                if (i == 3) && (j == 2); block[2, 2] = 1 end
+            end
+            @test view(A, Block(3, 2))[2, 2] == 1
+            view(A, Block(3, 2))[2, 2] = 0
+            @test all(A .== 0)
+        end
+
+        @testset "count non-zero elements" begin
+            for i in 1:100
+                n, m = rand(1:10, 2)
+                l, u = rand(0:10, 2)
+                A = BandedMatrix{Int8}(undef, n, m, l, u)
+                A.data .= 1
+                @test sum(A) == nnzero(n, m, l, u)
+            end
+        end
+
+        @testset "Multiplication" begin
+            N, M = rand(1:3, 2)
+            l, u, λ, μ = rand(0:2, 4)
+            n, m = rand(max(l, u, λ, μ):20, N), rand(max(l, u, λ, μ):20, M)
+            A = BandedBlockBandedMatrix{Float64}(undef, (n, m), (l, u), (λ, μ))
+            A.data .= rand.()
+            x = PseudoBlockArray(Array{Float64, 1}(undef, size(A, 2)), m)
+            x .= rand.()
+
+            Ashared = adapt(SharedArray, A)
+            @test Ashared.data.blocks isa SharedArray
+            @test Ashared isa SharedBandedBlockBandedMatrix
+            @test length(procs(Ashared)) == max(1, length(procs()) - 1)
+            cshared = adapt(SharedArray,
+                            PseudoBlockArray(Array{Float64, 1}(undef, size(A, 1)), n))
+            @test cshared.blocks isa SharedArray
+            cshared .= rand.()
+            xshared = adapt(SharedArray, x)
+
+            @test LinearAlgebra.mul!(cshared, Ashared, xshared) ≈ A * x
+        end
+
+    end
+end
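The shared back end always partitions work contiguously: of n tasks over m workers, worker i handles (n ÷ m) * (i - 1) + min(n % m, i - 1) + 1 through (n ÷ m) * i + min(n % m, i), which spreads the remainder n % m over the first workers. A small usage sketch of the partition and of the do-block constructor seen in testme() (assumes workers are added before the file is loaded everywhere):

using Distributed
addprocs(4)
@everywhere include("examples/shared_array_backend.jl")

# Partition of n = 10 block tasks over m = 4 workers: 3 + 3 + 2 + 2.
n, m = 10, 4
[(n ÷ m) * (i - 1) + min(n % m, i - 1) + 1 : (n ÷ m) * i + min(n % m, i) for i in 1:m]
# => [1:3, 4:6, 7:8, 9:10]

# Stored blocks are filled in parallel on the workers that own them.
A = SharedBandedBlockBandedMatrix{Float64}((repeat([2], 4), repeat([3], 2)),
                                           (1, 1), (1, 0)) do block, i, j
    block .= i + j
end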

src/linalg.jl

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ const Block1 = Block{1,Int}
 const BlockRange1 = BlockRange{1,Tuple{UnitRange{Int}}}
 const BlockIndexRange1 = BlockIndexRange{1,Tuple{UnitRange{Int64}}}
 const SubBlockBandedMatrix{T,R1,R2} =
-    SubArray{T,2,BlockBandedMatrix{T},Tuple{BlockSlice{R1},BlockSlice{R2}}}
+    SubArray{T,2,<:BlockBandedMatrix{T},Tuple{BlockSlice{R1},BlockSlice{R2}}}

 const SubBandedBlockBandedMatrix{T,R1,R2} =
     SubArray{T,2,<:BandedBlockBandedMatrix{T},Tuple{BlockSlice{R1},BlockSlice{R2}}}
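The added <: matters because Julia's type parameters are invariant: without it, the alias only matches a SubArray whose parent type is exactly the BlockBandedMatrix{T} UnionAll, not the concrete parent types that view actually produces. The general rule, shown with standard types:

Vector{Int} <: Vector{Integer}      # false: parameters are invariant
Vector{Int} <: Vector{<:Integer}    # true: <: makes the match covariant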
