
Commit 9f549b7

block array back end with example M*V
1 parent 09f8364 commit 9f549b7
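
The main addition in this diff is streamed_mul!, a banded block-banded matrix-vector product that issues CUBLAS gbmv! calls round-robin across several CUDA streams, one stream per output block by default. A minimal sketch of the intended call pattern, mirroring the test added below; A, x and c are assumed to be a BlockMatrix-backed BandedBlockBandedMatrix and matching BlockVectors, and a working CUDA device is assumed:

    # Assumes examples/blockarray_backend.jl has been included and CuArrays is usable.
    # A, x, c are placeholders for the operands built in testme() below.
    cgpu = cu(c)                                # move the output block vector to the GPU
    result = streamed_mul!(cgpu, cu(A), cu(x))  # round-robins gbmv! over CUDA streams
    result ≈ A * x                              # should agree with the dense product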

File tree

1 file changed: +68 −44 lines changed

examples/blockarray_backend.jl

Lines changed: 68 additions & 44 deletions
@@ -1,6 +1,8 @@
-# using CUDAnative
-# device!(0)
-# using CuArrays
+using CUDAnative
+device!(0)
+using CuArrays
+using CuArrays.CUBLAS: libcublas_handle, cublasSetStream_v2, CuDefaultStream
+using CUDAnative.CUDAdrv: CuStream
 using GPUArrays
 using BlockArrays: _BlockArray, PseudoBlockArray, BlockArray, BlockMatrix, BlockVector,
                    nblocks, Block, cumulsizes, AbstractBlockVector
@@ -30,7 +32,7 @@ if @isdefined CuArray
 end
 ###############

-adapt(T::Type, b::BandedBlockBandedMatrix) =
+adapt(T::Type, b::BandedBlockBandedMatrix) =
     _BandedBlockBandedMatrix(adapt(T, b.data), b.block_sizes)


@@ -55,6 +57,44 @@ function LinearAlgebra.mul!(c::BlockVector{T},
                    x.blocks[j], one(T), c.blocks[i])
     end

+    # mostly for symmetry with streamed version
+    # synchronizing could be left to user
+    synchronize(c)
+    c
+end
+
+function streamed_mul!(c::BlockVector{T},
+                       A::BandedBlockBandedMatrix{T, <: BlockMatrix},
+                       x::BlockVector{T};
+                       nstreams :: Union{Integer, Nothing} = nothing) where T
+    @assert nblocks(A, 1) == nblocks(c, 1)
+    @assert cumulsizes(A, 1) == cumulsizes(c, 1)
+    @assert nblocks(A, 2) == nblocks(x, 1)
+    @assert cumulsizes(A, 2) == cumulsizes(x, 1)
+
+    if nstreams isa Nothing
+        nₛ = nblocks(A, 1)
+    else
+        nₛ = min(nstreams, nblocks(A, 1))
+    end
+    streams = [CuStream() for u in 1:nₛ]
+    for (i, block) in enumerate(c.blocks)
+        cublasSetStream_v2(libcublas_handle[], streams[(i - 1) % nₛ + 1])
+        fill!(block, zero(eltype(block)))
+    end
+    l, u = blockbandwidths(A)
+    λ, μ = subblockbandwidths(A)
+    N,M = nblocks(A)
+
+    @inbounds for i = 1:N, j = max(1,i-l):min(M,i+u)
+        cublasSetStream_v2(libcublas_handle[], streams[(i - 1) % nₛ + 1])
+        BLAS.gbmv!('N', size(view(A, Block(i, j)), 1), λ, μ, one(T),
+                   A.data.blocks[i - j + u + 1, j],
+                   x.blocks[j], one(T), c.blocks[i])
+    end
+
+    synchronize(c)
+    cublasSetStream_v2(libcublas_handle[], CuDefaultStream())
     c
 end

@@ -83,31 +123,6 @@ function banded_mul!(c::BlockVector{T},
     c
 end

-function nofill_mul!(Cblock::BandedBlockBandedMatrix{T, <: BlockMatrix},
-                     Ablock::BandedBlockBandedMatrix{T, <: BlockMatrix},
-                     Xblock::BandedBlockBandedMatrix{T, <: BlockMatrix}) where T
-    @assert nblocks(Ablock, 1) == nblocks(Cblock, 1)
-    @assert cumulsizes(Ablock, 1) == cumulsizes(Cblock, 1)
-    @assert nblocks(Ablock, 2) == nblocks(Xblock, 1)
-    @assert cumulsizes(Ablock, 2) == cumulsizes(Xblock, 1)
-    @assert nblocks(Xblock, 2) == nblocks(Cblock, 2)
-    @assert cumulsizes(Xblock, 2) == cumulsizes(Xblock, 2)
-
-    lₐ, uₐ = blockbandwidths(Ablock)
-    lₓ, uₓ = blockbandwidths(xblock)
-    λ, μ = subblockbandwidths(Ablock)
-    N,M = nblocks(Ablock)
-    M, K = nblocks(Xblock)
-
-    @inbounds for i = 1:N, j = max(1,i-lₐ):min(M,i+uₐ), k = max(1, j - lₓ):min(j + uₓ, K)
-        BLAS.gbmv!('N', size(view(Ablock, Block(i, j)), 1), λ, μ, one(T),
-                   Ablock.data.blocks[i - j + u + 1, j],
-                   Xblock.blocks[j], one(T), Cblock.blocks[i])
-    end
-
-    Cblock
-end
-
 using Test

 function testme()
@@ -118,7 +133,7 @@ function testme()
        @test adapt(JLArray, bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
        @test eltype(adapt(JLArray, bmat)) === Float64
        if @isdefined CuArray
-           @test cu(bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
+           @test cu(bmat) isa BlockArray{T, 2, CuArray{T, 2}} where T
            @test eltype(cu(bmat)) === Float32
        end
    end
@@ -129,8 +144,8 @@ function testme()
        @test adapt(JLArray, bmat) isa PseudoBlockArray
        if @isdefined CuArray
            @test !(adapt(CuArray, bmat) isa PseudoBlockArray)
-           @test adapt(CuArray, bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
-           @test cu(bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
+           @test adapt(CuArray, bmat) isa BlockArray{T, 2, CuArray{T, 2}} where T
+           @test cu(bmat) isa BlockArray{T, 2, CuArray{T, 2}} where T
            @test eltype(cu(bmat)) === Float32
        end
    end
@@ -143,7 +158,7 @@ function testme()
        if @isdefined CuArray
            @test adapt(CuArray, bmat).data isa BlockArray{T, 2, CuArray{T, 2}} where T
            @test cu(bmat) isa BandedBlockBandedMatrix
-           @test cu(bmat).data isa BlockArray{T, 2, JLArray{T, 2}} where T
+           @test cu(bmat).data isa BlockArray{T, 2, CuArray{T, 2}} where T
            @test eltype(cu(bmat)) === Float32
        end
    end
@@ -164,7 +179,10 @@ function testme()
            @test LinearAlgebra.mul!(cblock, Ablock, xblock) ≈ A * x
            cblock .= 0
            @test banded_mul!(cblock, Ablock, xblock) ≈ A * x
+           cgpu = cu(cblock)
+           @test streamed_mul!(cgpu, cu(A), cu(x)) ≈ A * x
        end
+
    end
end

@@ -173,16 +191,14 @@ using Statistics

 function benchmarks()
     suite = BenchmarkGroup()
-    suite["viabm"] = BenchmarkGroup()
+    # suite["viabm"] = BenchmarkGroup()
     suite["pseudo"] = BenchmarkGroup()
     suite["block"] = BenchmarkGroup()
-    possibles = [5, 10, 100, 500, 1000]
-    for N in possibles #, n in possibles
-        n = N
-        suite["pseudo"]["N=$N n=$n"] = BenchmarkGroup()
-        suite["block"]["N=$N n=$n"] = BenchmarkGroup()
-        suite["viabm"]["N=$N n=$n"] = BenchmarkGroup()
-
+    if @isdefined CuArrays
+        suite["gpu"] = BenchmarkGroup()
+        suite["streams"] = BenchmarkGroup()
+    end
+    for N in [10, 100, 500, 1000], n = [N ÷ 5, N, 5N, 10N]
        l, u, λ, μ = rand(0:2, 4)
        M, m = N, n

@@ -201,9 +217,17 @@ function benchmarks()
            LinearAlgebra.mul!($(adapt(BlockArray, c)), $(adapt(BlockArray, A)),
                               $(adapt(BlockArray, x)))
        end
-       suite["viabm"]["N=$N n=$n"] = @benchmarkable begin
-           banded_mul!($(adapt(BlockArray, c)), $(adapt(BlockArray, A)),
-                       $(adapt(BlockArray, x)))
+       if @isdefined CuArrays
+           gpus = Dict(:c => adapt(CuArray, c),
+                       :x => adapt(CuArray, x),
+                       :A => adapt(CuArray, A))
+
+           suite["gpu"]["N=$N n=$n"] = @benchmarkable begin
+               LinearAlgebra.mul!($(gpus[:c]), $(gpus[:A]), $(gpus[:x]))
+           end
+           suite["streams"]["N=$N n=$n"] = @benchmarkable begin
+               gpuc = streamed_mul!($(gpus[:c]), $(gpus[:A]), $(gpus[:x]))
+           end
        end
    end
    suite
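
Not shown in the commit, but a plausible way to drive the updated example end to end; the file path, BenchmarkTools availability and a working CUDA device are assumptions, and the "gpu" and "streams" benchmark groups are only created when CuArrays is loaded:

    # Hypothetical driver script for the example file touched by this commit.
    include("examples/blockarray_backend.jl")

    testme()              # correctness checks, including the streamed GPU product

    using BenchmarkTools
    suite = benchmarks()  # "pseudo", "block", and (with CuArrays) "gpu", "streams"
    tune!(suite)
    results = run(suite; verbose = true)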
