Commit ab96be6

mdavezac authored and dlfivefifty committed
WIP: Shared arrays and GPU arrays (#19)
* block array back end with example M*V
* Missing generalization over container types
* Shared array back end works, now to benchmark it
* Add editorconfig
1 parent 09f8364 commit ab96be6
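As context for the diffs below: both example files exercise a banded-block-banded matrix-vector product (the "M*V" of the commit message). A minimal sketch of that product, using only constructors that appear in this commit (the block sizes here are arbitrary):

using BlockArrays: PseudoBlockArray
using BlockBandedMatrices: BandedBlockBandedMatrix

l, u = 1, 1                     # block bandwidths
λ, μ = 1, 0                     # sub-block bandwidths
n = m = [3, 4, 5]               # row and column block sizes
A = BandedBlockBandedMatrix{Float64}(undef, (n, m), (l, u), (λ, μ))
A.data .= rand.()
x = PseudoBlockArray(Array{Float64, 1}(undef, size(A, 2)), m)
x .= rand.()
c = A * x                       # dense reference result the back ends are tested against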

File tree

4 files changed: +251 −45 lines changed


.editorconfig

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+[*]
+end_of_line = lf
+insert_final_newline = true
+indent_style = space
+indent_size = 4
+trim_trailing_whitespace = true
+max_line_length = 100

examples/blockarray_backend.jl

Lines changed: 68 additions & 44 deletions
@@ -1,6 +1,8 @@
-# using CUDAnative
-# device!(0)
-# using CuArrays
+using CUDAnative
+device!(0)
+using CuArrays
+using CuArrays.CUBLAS: libcublas_handle, cublasSetStream_v2, CuDefaultStream
+using CUDAnative.CUDAdrv: CuStream
 using GPUArrays
 using BlockArrays: _BlockArray, PseudoBlockArray, BlockArray, BlockMatrix, BlockVector,
                    nblocks, Block, cumulsizes, AbstractBlockVector
@@ -30,7 +32,7 @@ if @isdefined CuArray
 end
 ###############

-adapt(T::Type, b::BandedBlockBandedMatrix) =
+adapt(T::Type, b::BandedBlockBandedMatrix) =
     _BandedBlockBandedMatrix(adapt(T, b.data), b.block_sizes)

@@ -55,6 +57,44 @@ function LinearAlgebra.mul!(c::BlockVector{T},
                    x.blocks[j], one(T), c.blocks[i])
     end

+    # mostly for symmetry with streamed version
+    # synchronizing could be left to user
+    synchronize(c)
+    c
+end
+
+function streamed_mul!(c::BlockVector{T},
+                       A::BandedBlockBandedMatrix{T, <: BlockMatrix},
+                       x::BlockVector{T};
+                       nstreams::Union{Integer, Nothing} = nothing) where T
+    @assert nblocks(A, 1) == nblocks(c, 1)
+    @assert cumulsizes(A, 1) == cumulsizes(c, 1)
+    @assert nblocks(A, 2) == nblocks(x, 1)
+    @assert cumulsizes(A, 2) == cumulsizes(x, 1)
+
+    if nstreams isa Nothing
+        nₛ = nblocks(A, 1)
+    else
+        nₛ = min(nstreams, nblocks(A, 1))
+    end
+    streams = [CuStream() for u in 1:nₛ]
+    for (i, block) in enumerate(c.blocks)
+        cublasSetStream_v2(libcublas_handle[], streams[(i - 1) % nₛ + 1])
+        fill!(block, zero(eltype(block)))
+    end
+    l, u = blockbandwidths(A)
+    λ, μ = subblockbandwidths(A)
+    N, M = nblocks(A)
+
+    @inbounds for i = 1:N, j = max(1, i - l):min(M, i + u)
+        cublasSetStream_v2(libcublas_handle[], streams[(i - 1) % nₛ + 1])
+        BLAS.gbmv!('N', size(view(A, Block(i, j)), 1), λ, μ, one(T),
+                   A.data.blocks[i - j + u + 1, j],
+                   x.blocks[j], one(T), c.blocks[i])
+    end
+
+    synchronize(c)
+    cublasSetStream_v2(libcublas_handle[], CuDefaultStream())
     c
 end
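The `(i - 1) % nₛ + 1` index in `streamed_mul!` above assigns block rows to CUBLAS streams round-robin, so rows on different streams can overlap while each stream serializes its own `gbmv!` calls. A CPU-only sketch of the mapping (hypothetical sizes, no CUDA required):

nrows, nₛ = 7, 3                              # block rows, streams
[(i, (i - 1) % nₛ + 1) for i in 1:nrows]
# => [(1, 1), (2, 2), (3, 3), (4, 1), (5, 2), (6, 3), (7, 1)]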

@@ -83,31 +123,6 @@ function banded_mul!(c::BlockVector{T},
     c
 end

-function nofill_mul!(Cblock::BandedBlockBandedMatrix{T, <: BlockMatrix},
-                     Ablock::BandedBlockBandedMatrix{T, <: BlockMatrix},
-                     Xblock::BandedBlockBandedMatrix{T, <: BlockMatrix}) where T
-    @assert nblocks(Ablock, 1) == nblocks(Cblock, 1)
-    @assert cumulsizes(Ablock, 1) == cumulsizes(Cblock, 1)
-    @assert nblocks(Ablock, 2) == nblocks(Xblock, 1)
-    @assert cumulsizes(Ablock, 2) == cumulsizes(Xblock, 1)
-    @assert nblocks(Xblock, 2) == nblocks(Cblock, 2)
-    @assert cumulsizes(Xblock, 2) == cumulsizes(Xblock, 2)
-
-    lₐ, uₐ = blockbandwidths(Ablock)
-    lₓ, uₓ = blockbandwidths(xblock)
-    λ, μ = subblockbandwidths(Ablock)
-    N,M = nblocks(Ablock)
-    M, K = nblocks(Xblock)
-
-    @inbounds for i = 1:N, j = max(1,i-lₐ):min(M,i+uₐ), k = max(1, j - lₓ):min(j + uₓ, K)
-        BLAS.gbmv!('N', size(view(Ablock, Block(i, j)), 1), λ, μ, one(T),
-                   Ablock.data.blocks[i - j + u + 1, j],
-                   Xblock.blocks[j], one(T), Cblock.blocks[i])
-    end
-
-    Cblock
-end
-
 using Test

 function testme()
@@ -118,7 +133,7 @@ function testme()
         @test adapt(JLArray, bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
         @test eltype(adapt(JLArray, bmat)) === Float64
         if @isdefined CuArray
-            @test cu(bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
+            @test cu(bmat) isa BlockArray{T, 2, CuArray{T, 2}} where T
             @test eltype(cu(bmat)) === Float32
         end
     end
@@ -129,8 +144,8 @@ function testme()
         @test adapt(JLArray, bmat) isa PseudoBlockArray
         if @isdefined CuArray
             @test !(adapt(CuArray, bmat) isa PseudoBlockArray)
-            @test adapt(CuArray, bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
-            @test cu(bmat) isa BlockArray{T, 2, JLArray{T, 2}} where T
+            @test adapt(CuArray, bmat) isa BlockArray{T, 2, CuArray{T, 2}} where T
+            @test cu(bmat) isa BlockArray{T, 2, CuArray{T, 2}} where T
             @test eltype(cu(bmat)) === Float32
         end
     end
@@ -143,7 +158,7 @@ function testme()
         if @isdefined CuArray
             @test adapt(CuArray, bmat).data isa BlockArray{T, 2, CuArray{T, 2}} where T
             @test cu(bmat) isa BandedBlockBandedMatrix
-            @test cu(bmat).data isa BlockArray{T, 2, JLArray{T, 2}} where T
+            @test cu(bmat).data isa BlockArray{T, 2, CuArray{T, 2}} where T
             @test eltype(cu(bmat)) === Float32
         end
     end
@@ -164,7 +179,10 @@ function testme()
         @test LinearAlgebra.mul!(cblock, Ablock, xblock) ≈ A * x
         cblock .= 0
         @test banded_mul!(cblock, Ablock, xblock) ≈ A * x
+        cgpu = cu(cblock)
+        @test streamed_mul!(cgpu, cu(A), cu(x)) ≈ A * x
     end
+
 end
 end

@@ -173,16 +191,14 @@ using Statistics

 function benchmarks()
     suite = BenchmarkGroup()
-    suite["viabm"] = BenchmarkGroup()
+    # suite["viabm"] = BenchmarkGroup()
     suite["pseudo"] = BenchmarkGroup()
     suite["block"] = BenchmarkGroup()
-    possibles = [5, 10, 100, 500, 1000]
-    for N in possibles #, n in possibles
-        n = N
-        suite["pseudo"]["N=$N n=$n"] = BenchmarkGroup()
-        suite["block"]["N=$N n=$n"] = BenchmarkGroup()
-        suite["viabm"]["N=$N n=$n"] = BenchmarkGroup()
-
+    if @isdefined CuArrays
+        suite["gpu"] = BenchmarkGroup()
+        suite["streams"] = BenchmarkGroup()
+    end
+    for N in [10, 100, 500, 1000], n = [N ÷ 5, N, 5N, 10N]
         l, u, λ, μ = rand(0:2, 4)
         M, m = N, n

@@ -201,9 +217,17 @@ function benchmarks()
             LinearAlgebra.mul!($(adapt(BlockArray, c)), $(adapt(BlockArray, A)),
                                $(adapt(BlockArray, x)))
         end
-        suite["viabm"]["N=$N n=$n"] = @benchmarkable begin
-            banded_mul!($(adapt(BlockArray, c)), $(adapt(BlockArray, A)),
-                        $(adapt(BlockArray, x)))
+        if @isdefined CuArrays
+            gpus = Dict(:c => adapt(CuArray, c),
+                        :x => adapt(CuArray, x),
+                        :A => adapt(CuArray, A))
+
+            suite["gpu"]["N=$N n=$n"] = @benchmarkable begin
+                LinearAlgebra.mul!($(gpus[:c]), $(gpus[:A]), $(gpus[:x]))
+            end
+            suite["streams"]["N=$N n=$n"] = @benchmarkable begin
+                gpuc = streamed_mul!($(gpus[:c]), $(gpus[:A]), $(gpus[:x]))
+            end
         end
     end
     suite
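Running the suite follows the usual BenchmarkTools workflow; a minimal sketch, assuming the file's CUDA dependencies are available so the include succeeds:

using BenchmarkTools
include("examples/blockarray_backend.jl")

suite = benchmarks()
tune!(suite)                 # calibrate evaluation counts per benchmark
results = run(suite)         # returns a BenchmarkGroup of trials
median(results["block"])     # e.g. summarize the BlockArray back end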

examples/shared_array_backend.jl

Lines changed: 175 additions & 0 deletions
@@ -0,0 +1,175 @@
+using BlockArrays: _BlockArray, PseudoBlockArray, BlockArray, BlockMatrix, BlockVector,
+                   nblocks, Block, cumulsizes, AbstractBlockVector
+using BlockBandedMatrices: BandedBlockBandedMatrix, _BandedBlockBandedMatrix,
+                           blockbandwidths, subblockbandwidths, blockbandwidth,
+                           BandedBlockBandedSizes
+using LinearAlgebra: BLAS
+import LinearAlgebra
+using BandedMatrices: _BandedMatrix, BandedMatrix
+using SharedArrays
+using LazyArrays
+using Distributed: procs, remotecall_wait
+import Distributed
+
+import Adapt: adapt
+
+adapt(T::Type, b::BandedBlockBandedMatrix) =
+    _BandedBlockBandedMatrix(adapt(T, b.data), b.block_sizes)
+adapt(T::Type{<:AbstractArray}, b::PseudoBlockArray) =
+    PseudoBlockArray(T(b.blocks), b.block_sizes)
+
+
+const SharedBandedBlockBandedMatrix =
+    BandedBlockBandedMatrix{T, PseudoBlockArray{T, 2, SharedArray{T, 2}}} where T
+
+function SharedBandedBlockBandedMatrix{T}(::UndefInitializer,
+                                          bs::BandedBlockBandedSizes;
+                                          kwargs...) where T
+    Block = fieldtype(SharedBandedBlockBandedMatrix{T}, :data)
+    Shared = fieldtype(Block, :blocks)
+    kwargs = Dict(kwargs)
+    init = pop!(kwargs, :init, nothing)
+    shared = Shared(size(bs); kwargs...)
+    result = _BandedBlockBandedMatrix(Block(shared, bs.data_block_sizes), bs)
+    populate!(result, init)
+    result
+end
+
+Distributed.procs(A::SharedBandedBlockBandedMatrix) = procs(A.data.blocks)
+
+function populate!(A::SharedBandedBlockBandedMatrix, range, block_populate!::Function)
+    k = 1
+    for i in 1:nblocks(A, 1), j in max(i - A.u, 1):min(i + A.l, nblocks(A, 2))
+        if k in range
+            block_populate!(view(A, Block(i, j)), i, j)
+        end
+        k += 1
+    end
+    A
+end
+
+
+function populate!(A::SharedBandedBlockBandedMatrix, block_populate!::Function)
+    n = nnzero(nblocks(A)..., A.l, A.u)
+    m = length(procs(A))
+    @sync begin
+        for (i, proc) in enumerate(procs(A))
+            start = (n ÷ m) * (i - 1) + min((n % m), i - 1) + 1
+            stop = (n ÷ m) * i + min((n % m), i)
+            @async remotecall_wait(populate!, proc, A, start:stop, block_populate!)
+        end
+    end
+    A
+end
+
+populate!(block_populate!::Function, A::SharedBandedBlockBandedMatrix) =
+    populate!(A, block_populate!)
+
+SharedBandedBlockBandedMatrix{T}(init::Function,
+                                 bs::BandedBlockBandedSizes;
+                                 pids=Int[]) where T =
+    SharedBandedBlockBandedMatrix{T}(undef, bs; pids=pids, init=init)
+function SharedBandedBlockBandedMatrix{T}(init::Function,
+                                          dims::NTuple{2, AbstractVector{Int}},
+                                          lu::NTuple{2, Int}, λμ::NTuple{2, Int};
+                                          pids=Int[]) where T
+    bs = BandedBlockBandedSizes(dims..., lu..., λμ...)
+    SharedBandedBlockBandedMatrix{T}(init, bs; pids=pids)
+end
+
+"""Number of non-zero elements in a banded matrix"""
+function nnzero(n::Integer, m::Integer, l::Integer, u::Integer)
+    result = zero(n)
+    for i = 0:min(n, l)
+        result += min(n - i, m)
+    end
+    for i = 1:min(m, u)
+        result += min(m - i, n)
+    end
+    result
+end
+
+function LinearAlgebra.mul!(c::AbstractBlockVector{T},
+                            A::SharedBandedBlockBandedMatrix{T},
+                            x::AbstractBlockVector{T}) where T
+    @assert nblocks(A, 1) == nblocks(c, 1)
+    @assert cumulsizes(A, 1) == cumulsizes(c, 1)
+    @assert nblocks(A, 2) == nblocks(x, 1)
+    @assert cumulsizes(A, 2) == cumulsizes(x, 1)
+
+    n = nblocks(A, 1)
+    m = length(procs(A))
+
+    @sync for (p, proc) in enumerate(procs(A))
+
+        p > n && continue
+        start = (n ÷ m) * (p - 1) + min((n % m), p - 1) + 1
+        stop = (n ÷ m) * p + min((n % m), p)
+
+        @async begin
+            remotecall_wait(proc, start:stop) do irange
+                @inbounds for i in irange
+                    fill!(view(c, Block(i)), zero(eltype(c)))
+                    for j = max(1, i - A.l):min(nblocks(A, 2), i + A.u)
+                        c[Block(i)] .+= Mul(view(A, Block(i, j)), view(x, Block(j)))
+                    end
+                end
+            end
+        end

+    end
+    c
+end
+
+
+using Test
+
+function testme()
+    SBBB = SharedBandedBlockBandedMatrix
+    @testset "shared array backend" begin
+
+        @testset "Initialization" begin
+            n, m = repeat([2], 4), repeat([3], 2)
+            A = SBBB{Int64}((n, m), (1, 1), (1, 0)) do block, i, j
+                block .= 0
+                if (i == 3) && (j == 2); block[2, 2] = 1 end
+            end
+            @test view(A, Block(3, 2))[2, 2] == 1
+            view(A, Block(3, 2))[2, 2] = 0
+            @test all(A .== 0)
+        end
+
+        @testset "count non-zero elements" begin
+            for i in 1:100
+                n, m = rand(1:10, 2)
+                l, u = rand(0:10, 2)
+                A = BandedMatrix{Int8}(undef, n, m, l, u)
+                A.data .= 1
+                @test sum(A) == nnzero(n, m, l, u)
+            end
+        end
+
+        @testset "Multiplication" begin
+            N, M = rand(1:3, 2)
+            l, u, λ, μ = rand(0:2, 4)
+            n, m = rand(max(l, u, λ, μ):20, N), rand(max(l, u, λ, μ):20, M)
+            A = BandedBlockBandedMatrix{Float64}(undef, (n, m), (l, u), (λ, μ))
+            A.data .= rand.()
+            x = PseudoBlockArray(Array{Float64, 1}(undef, size(A, 2)), m)
+            x .= rand.()
+
+            Ashared = adapt(SharedArray, A)
+            @test Ashared.data.blocks isa SharedArray
+            @test Ashared isa SharedBandedBlockBandedMatrix
+            @test length(procs(Ashared)) == max(1, length(procs()) - 1)
+            cshared = adapt(SharedArray,
+                            PseudoBlockArray(Array{Float64, 1}(undef, size(A, 1)), n))
+            @test cshared.blocks isa SharedArray
+            cshared .= rand.()
+            xshared = adapt(SharedArray, x)
+
+            @test LinearAlgebra.mul!(cshared, Ashared, xshared) ≈ A * x
+        end
+
+    end
+end
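The shared back end always partitions work contiguously: of n tasks over m workers, worker i handles (n ÷ m) * (i - 1) + min(n % m, i - 1) + 1 through (n ÷ m) * i + min(n % m, i), which spreads the remainder n % m over the first workers. A small usage sketch of the partition and of the do-block constructor seen in testme() (assumes workers are added before the file is loaded everywhere):

using Distributed
addprocs(4)
@everywhere include("examples/shared_array_backend.jl")

# Partition of n = 10 block tasks over m = 4 workers: 3 + 3 + 2 + 2.
n, m = 10, 4
[(n ÷ m) * (i - 1) + min(n % m, i - 1) + 1 : (n ÷ m) * i + min(n % m, i) for i in 1:m]
# => [1:3, 4:6, 7:8, 9:10]

# Stored blocks are filled in parallel on the workers that own them.
A = SharedBandedBlockBandedMatrix{Float64}((repeat([2], 4), repeat([3], 2)),
                                           (1, 1), (1, 0)) do block, i, j
    block .= i + j
end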

src/linalg.jl

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ const Block1 = Block{1,Int}
 const BlockRange1 = BlockRange{1,Tuple{UnitRange{Int}}}
 const BlockIndexRange1 = BlockIndexRange{1,Tuple{UnitRange{Int64}}}
 const SubBlockBandedMatrix{T,R1,R2} =
-    SubArray{T,2,BlockBandedMatrix{T},Tuple{BlockSlice{R1},BlockSlice{R2}}}
+    SubArray{T,2,<:BlockBandedMatrix{T},Tuple{BlockSlice{R1},BlockSlice{R2}}}

 const SubBandedBlockBandedMatrix{T,R1,R2} =
     SubArray{T,2,<:BandedBlockBandedMatrix{T},Tuple{BlockSlice{R1},BlockSlice{R2}}}
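The added <: matters because Julia's type parameters are invariant: without it, the alias only matches a SubArray whose parent type is exactly the BlockBandedMatrix{T} UnionAll, not the concrete parent types that view actually produces. The general rule, shown with standard types:

Vector{Int} <: Vector{Integer}      # false: parameters are invariant
Vector{Int} <: Vector{<:Integer}    # true: <: makes the match covariant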
