@@ -1,6 +1,8 @@
- # using CUDAnative
- # device!(0)
- # using CuArrays
+ using CUDAnative
+ device!(0)
+ using CuArrays
+ using CuArrays.CUBLAS: libcublas_handle, cublasSetStream_v2, CuDefaultStream
+ using CUDAnative.CUDAdrv: CuStream
using GPUArrays
using BlockArrays: _BlockArray, PseudoBlockArray, BlockArray, BlockMatrix, BlockVector,
                   nblocks, Block, cumulsizes, AbstractBlockVector
@@ -30,7 +32,7 @@ if @isdefined CuArray
end
###############

- adapt(T::Type, b::BandedBlockBandedMatrix) =
+ adapt(T::Type, b::BandedBlockBandedMatrix) =
    _BandedBlockBandedMatrix(adapt(T, b.data), b.block_sizes)

@@ -55,6 +57,44 @@ function LinearAlgebra.mul!(c::BlockVector{T},
                   x.blocks[j], one(T), c.blocks[i])
    end

+     # mostly for symmetry with streamed version
+     # synchronizing could be left to user
+     synchronize(c)
+     c
+ end
+
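+ # Usage sketch (mirrors the call in `testme` below): move the operands to the GPU first,
+ # e.g. `streamed_mul!(cu(c), cu(A), cu(x))`; `nstreams` caps the number of CUDA streams.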
+ function streamed_mul!(c::BlockVector{T},
+                        A::BandedBlockBandedMatrix{T, <:BlockMatrix},
+                        x::BlockVector{T};
+                        nstreams::Union{Integer, Nothing} = nothing) where T
+     @assert nblocks(A, 1) == nblocks(c, 1)
+     @assert cumulsizes(A, 1) == cumulsizes(c, 1)
+     @assert nblocks(A, 2) == nblocks(x, 1)
+     @assert cumulsizes(A, 2) == cumulsizes(x, 1)
+
+     if nstreams isa Nothing
+         nₛ = nblocks(A, 1)
+     else
+         nₛ = min(nstreams, nblocks(A, 1))
+     end
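+     # one CuStream per block row (at most nstreams); each output block is zeroed
+     # on its row's stream before the banded products are accumulated into it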
+     streams = [CuStream() for u in 1:nₛ]
+     for (i, block) in enumerate(c.blocks)
+         cublasSetStream_v2(libcublas_handle[], streams[(i - 1) % nₛ + 1])
+         fill!(block, zero(eltype(block)))
+     end
+     l, u = blockbandwidths(A)
+     λ, μ = subblockbandwidths(A)
+     N, M = nblocks(A)
+
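+     # each block row's gbmv! calls are issued on that row's stream, so independent
+     # rows can execute concurrently; synchronize(c) below waits for all of them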
+     @inbounds for i = 1:N, j = max(1, i - l):min(M, i + u)
+         cublasSetStream_v2(libcublas_handle[], streams[(i - 1) % nₛ + 1])
+         BLAS.gbmv!('N', size(view(A, Block(i, j)), 1), λ, μ, one(T),
+                    A.data.blocks[i - j + u + 1, j],
+                    x.blocks[j], one(T), c.blocks[i])
+     end
+
+     synchronize(c)
+     cublasSetStream_v2(libcublas_handle[], CuDefaultStream())
    c
end
@@ -83,31 +123,6 @@ function banded_mul!(c::BlockVector{T},
    c
end

- function nofill_mul!(Cblock::BandedBlockBandedMatrix{T, <:BlockMatrix},
-                      Ablock::BandedBlockBandedMatrix{T, <:BlockMatrix},
-                      Xblock::BandedBlockBandedMatrix{T, <:BlockMatrix}) where T
-     @assert nblocks(Ablock, 1) == nblocks(Cblock, 1)
-     @assert cumulsizes(Ablock, 1) == cumulsizes(Cblock, 1)
-     @assert nblocks(Ablock, 2) == nblocks(Xblock, 1)
-     @assert cumulsizes(Ablock, 2) == cumulsizes(Xblock, 1)
-     @assert nblocks(Xblock, 2) == nblocks(Cblock, 2)
-     @assert cumulsizes(Xblock, 2) == cumulsizes(Xblock, 2)
-
-     lₐ, uₐ = blockbandwidths(Ablock)
-     lₓ, uₓ = blockbandwidths(xblock)
-     λ, μ = subblockbandwidths(Ablock)
-     N, M = nblocks(Ablock)
-     M, K = nblocks(Xblock)
-
-     @inbounds for i = 1:N, j = max(1, i - lₐ):min(M, i + uₐ), k = max(1, j - lₓ):min(j + uₓ, K)
-         BLAS.gbmv!('N', size(view(Ablock, Block(i, j)), 1), λ, μ, one(T),
-                    Ablock.data.blocks[i - j + u + 1, j],
-                    Xblock.blocks[j], one(T), Cblock.blocks[i])
-     end
-
-     Cblock
- end
-
using Test

function testme()
@@ -118,7 +133,7 @@ function testme()
        @test adapt(JLArray, bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
        @test eltype(adapt(JLArray, bmat)) === Float64
        if @isdefined CuArray
-             @test cu(bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
+             @test cu(bmat) isa BlockArray{T, 2, CuArray{T, 2}} where T
            @test eltype(cu(bmat)) === Float32
        end
    end
@@ -129,8 +144,8 @@ function testme()
        @test adapt(JLArray, bmat) isa PseudoBlockArray
        if @isdefined CuArray
            @test !(adapt(CuArray, bmat) isa PseudoBlockArray)
-             @test adapt(CuArray, bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
-             @test cu(bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
+             @test adapt(CuArray, bmat) isa BlockArray{T, 2, CuArray{T, 2}} where T
+             @test cu(bmat) isa BlockArray{T, 2, CuArray{T, 2}} where T
            @test eltype(cu(bmat)) === Float32
        end
    end
@@ -143,7 +158,7 @@ function testme()
        if @isdefined CuArray
            @test adapt(CuArray, bmat).data isa BlockArray{T, 2, CuArray{T, 2}} where T
            @test cu(bmat) isa BandedBlockBandedMatrix
-             @test cu(bmat).data isa BlockArray{T, 2, JLArray{T, 2}} where T
+             @test cu(bmat).data isa BlockArray{T, 2, CuArray{T, 2}} where T
            @test eltype(cu(bmat)) === Float32
        end
    end
@@ -164,7 +179,10 @@ function testme()
            @test LinearAlgebra.mul!(cblock, Ablock, xblock) ≈ A * x
            cblock .= 0
            @test banded_mul!(cblock, Ablock, xblock) ≈ A * x
+             cgpu = cu(cblock)
+             @test streamed_mul!(cgpu, cu(A), cu(x)) ≈ A * x
        end
+
    end
end
@@ -173,16 +191,14 @@ using Statistics

function benchmarks()
    suite = BenchmarkGroup()
-     suite["viabm"] = BenchmarkGroup()
+     # suite["viabm"] = BenchmarkGroup()
    suite["pseudo"] = BenchmarkGroup()
    suite["block"] = BenchmarkGroup()
-     possibles = [5, 10, 100, 500, 1000]
-     for N in possibles # , n in possibles
-         n = N
-         suite["pseudo"]["N=$N n=$n"] = BenchmarkGroup()
-         suite["block"]["N=$N n=$n"] = BenchmarkGroup()
-         suite["viabm"]["N=$N n=$n"] = BenchmarkGroup()
-
+     if @isdefined CuArrays
+         suite["gpu"] = BenchmarkGroup()
+         suite["streams"] = BenchmarkGroup()
+     end
+     for N in [10, 100, 500, 1000], n = [N ÷ 5, N, 5N, 10N]
        l, u, λ, μ = rand(0:2, 4)
        M, m = N, n
@@ -201,9 +217,17 @@ function benchmarks()
            LinearAlgebra.mul!($(adapt(BlockArray, c)), $(adapt(BlockArray, A)),
                               $(adapt(BlockArray, x)))
        end
-         suite["viabm"]["N=$N n=$n"] = @benchmarkable begin
-             banded_mul!($(adapt(BlockArray, c)), $(adapt(BlockArray, A)),
-                         $(adapt(BlockArray, x)))
+         if @isdefined CuArrays
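+             # adapt the operands to CuArray once here, so host-to-device copies
+             # are not included in the timed benchmark bodies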
+             gpus = Dict(:c => adapt(CuArray, c),
+                         :x => adapt(CuArray, x),
+                         :A => adapt(CuArray, A))
+
+             suite["gpu"]["N=$N n=$n"] = @benchmarkable begin
+                 LinearAlgebra.mul!($(gpus[:c]), $(gpus[:A]), $(gpus[:x]))
+             end
+             suite["streams"]["N=$N n=$n"] = @benchmarkable begin
+                 gpuc = streamed_mul!($(gpus[:c]), $(gpus[:A]), $(gpus[:x]))
+             end
        end
    end
    suite