Fix broken precompile statements, add kwargs to matmul_params for loop lengths and vector width

chriselrod · chriselrod · commit 18bb7722c94d · 2021-10-08T09:10:15.000-04:00
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <elrodc@gmail.com>"]
-version = "0.12.83"
+version = "0.12.84"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
diff --git a/src/parse/memory_ops_common.jl b/src/parse/memory_ops_common.jl
@@ -93,7 +93,7 @@ function subset_vptr!(ls::LoopSet, vptr::Symbol, indnum::Int, ind, previndices,
   subsetvptr
 end
 
-function gesp_const_offset!(ls::LoopSet, vptrarray, ninds, indices, loopedindex, mlt::Integer, sym, D)
+function gesp_const_offset!(ls::LoopSet, vptrarray::Symbol, ninds::Int, indices::Vector{Symbol}, loopedindex::Vector{Bool}, mlt::Integer, sym, D::Int)
   if isone(mlt)
     subset_vptr!(ls, vptrarray, ninds, sym, indices, loopedindex, D)
   else        
@@ -102,7 +102,7 @@ function gesp_const_offset!(ls::LoopSet, vptrarray, ninds, indices, loopedindex,
     subset_vptr!(ls, vptrarray, ninds, mltsym, indices, loopedindex, D)
   end
 end
-function gesp_const_offsets!(ls::LoopSet, vptrarray, ninds, indices, loopedindex, mltsyms, D)
+function gesp_const_offsets!(ls::LoopSet, vptrarray::Symbol, ninds::Int, indices::Vector{Symbol}, loopedindex::Vector{Bool}, mltsyms::Vector{Tuple{Int,Symbol}}, D::Int)
   length(mltsyms) > 1 && sort!(mltsyms, by = last) # if multiple have same combination of syms, make sure they match even if order is different
   for (mlt,sym) ∈ mltsyms
     vptrarray = gesp_const_offset!(ls, vptrarray, ninds, indices, loopedindex, mlt, sym, D)
diff --git a/src/precompile.jl b/src/precompile.jl
@@ -15,7 +15,7 @@ function _precompile_()
     Base.precompile(Tuple{typeof(substitute_broadcast),Expr,Symbol,Bool,Int8,Int8,Int8,Int,Int})   # time: 0.02281322
     Base.precompile(Tuple{typeof(push!),LoopSet,Expr,Int,Int})   # time: 0.022659862
     Base.precompile(Tuple{typeof(add_compute!),LoopSet,Symbol,Expr,Int,Int,Nothing})   # time: 0.02167476
-    Base.precompile(Tuple{typeof(checkforoffset!),LoopSet,Symbol,Int,Vector{Operation},Vector{Symbol},Vector{Int8},Vector{Int8},Vector{Bool},Vector{Symbol},Vector{Symbol},Expr})   # time: 0.020454278
+    Base.precompile(Tuple{typeof(checkforoffset!),LoopSet,Symbol,Int,Vector{Operation},Vector{Symbol},Vector{Int8},Vector{Int8},Vector{Bool},Vector{Symbol},Vector{Symbol},Expr,Int})   # time: 0.020454278
     Base.precompile(Tuple{typeof(generate_call),LoopSet,Tuple{Bool, Int8, Int8, Int8},UInt,Bool})   # time: 0.020274462
     Base.precompile(Tuple{typeof(expandbyoffset!),Vector{Tuple{Int, Tuple{Int, Int32, Bool}}},Vector{Any},Vector{Int}})   # time: 0.019860294
     Base.precompile(Tuple{typeof(isscopedname),Symbol,Symbol,Symbol})   # time: 0.016642524
@@ -43,19 +43,19 @@ function _precompile_()
     Base.precompile(Tuple{typeof(gespf1),StridedPointer{Float32, 4, 1, 0, (1, 2, 3, 4), Tuple{StaticInt{4}, Int, Int, Int}, NTuple{4, StaticInt{1}}},Tuple{StaticInt{-1}, StaticInt{-1}, StaticInt{1}, StaticInt{1}}})   # time: 0.006164707
     Base.precompile(Tuple{typeof(add_ci_call!),Expr,Any,Vector{Any},Vector{Symbol},Int,Expr,Symbol})   # time: 0.006148137
     Base.precompile(Tuple{typeof(add_ci_call!),Expr,Any,Vector{Any},Vector{Symbol},Int})   # time: 0.006063301
-    Base.precompile(Tuple{typeof(mem_offset),Operation,UnrollArgs,Vector{Bool},Bool,LoopSet})   # time: 0.005945972
+    Base.precompile(Tuple{typeof(mem_offset),Operation,UnrollArgs,Vector{Bool},Bool,LoopSet,Bool})   # time: 0.005945972
     Base.precompile(Tuple{typeof(gespf1),StridedPointer{Float64, 3, 1, 0, (1, 2), Tuple{StaticInt{8}, StaticInt{16}, Int}, Tuple{StaticInt{1}, StaticInt{1}, StaticInt{1}}},Tuple{StaticInt{1}, StaticInt{1}, StaticInt{0}}})   # time: 0.005927015
     Base.precompile(Tuple{typeof(sizeofeltypes),Core.SimpleVector})   # time: 0.005828176
     Base.precompile(Tuple{typeof(cse_constant_offsets!),LoopSet,Vector{ArrayReferenceMeta},Int,Vector{Vector{Int}},Vector{Vector{Tuple{Int, Int, Int}}}})   # time: 0.005694307
     Base.precompile(Tuple{typeof(gespf1),StridedPointer{Float64, 4, 1, 0, (1, 2, 3, 4), Tuple{StaticInt{8}, StaticInt{16}, Int, Int}, NTuple{4, StaticInt{1}}},Tuple{StaticInt{1}, VectorizationBase.NullStep, StaticInt{2}, VectorizationBase.NullStep}})   # time: 0.005314204
     Base.precompile(Tuple{typeof(indices_loop!),LoopSet,Expr,Symbol})   # time: 0.005283243
     Base.precompile(Tuple{typeof(gespf1),StridedPointer{Float64, 5, 1, 0, (1, 2, 3, 4, 5), Tuple{StaticInt{8}, Int, Int, Int, Int}, NTuple{5, StaticInt{1}}},Tuple{VectorizationBase.CartesianVIndex{0, Tuple{}}, VectorizationBase.NullStep, VectorizationBase.CartesianVIndex{4, NTuple{4, StaticInt{1}}}}})   # time: 0.005256126
-    Base.precompile(Tuple{typeof(gesp_const_offsets!),LoopSet,Symbol,Int,Vector{Symbol},Vector{Bool},Vector{Tuple{Int, Symbol}}})   # time: 0.005168524
+    Base.precompile(Tuple{typeof(gesp_const_offsets!),LoopSet,Symbol,Int,Vector{Symbol},Vector{Bool},Vector{Tuple{Int, Symbol}},Int})   # time: 0.005168524
     Base.precompile(Tuple{typeof(gespf1),StridedPointer{Float64, 5, 1, 0, (1, 2, 3, 4, 5), Tuple{StaticInt{8}, Int, Int, Int, Int}, NTuple{5, StaticInt{1}}},Tuple{VectorizationBase.CartesianVIndex{4, NTuple{4, StaticInt{1}}}, VectorizationBase.NullStep, VectorizationBase.CartesianVIndex{0, Tuple{}}}})   # time: 0.005122315
     Base.precompile(Tuple{typeof(gespf1),StridedPointer{Float64, 5, 1, 0, (1, 2, 3, 4, 5), Tuple{StaticInt{8}, Int, Int, Int, Int}, NTuple{5, StaticInt{1}}},Tuple{VectorizationBase.CartesianVIndex{2, Tuple{StaticInt{1}, StaticInt{1}}}, VectorizationBase.NullStep, VectorizationBase.CartesianVIndex{2, Tuple{StaticInt{1}, StaticInt{1}}}}})   # time: 0.005078802
     Base.precompile(Tuple{typeof(gespf1),StridedPointer{Float32, 2, 1, 0, (1, 2), Tuple{StaticInt{4}, Int}, Tuple{StaticInt{1}, StaticInt{1}}},Tuple{StaticInt{0}, StaticInt{0}}})   # time: 0.005036135
     Base.precompile(Tuple{typeof(gespf1),StridedPointer{Float64, 4, 2, 0, (3, 1, 4, 2), Tuple{Int, StaticInt{8}, Int, Int}, NTuple{4, StaticInt{1}}},Tuple{VectorizationBase.CartesianVIndex{4, NTuple{4, StaticInt{1}}}}})   # time: 0.004968671
-    Base.precompile(Tuple{typeof(subset_vptr!),LoopSet,Symbol,Int,Symbol,Vector{Symbol},Vector{Bool},Bool})   # time: 0.004904486
+    Base.precompile(Tuple{typeof(subset_vptr!),LoopSet,Symbol,Int,Symbol,Vector{Symbol},Vector{Bool},Int})   # time: 0.004904486
     Base.precompile(Tuple{typeof(gespf1),StridedPointer{Float64, 5, 1, 0, (1, 2, 3, 4, 5), Tuple{StaticInt{8}, Int, Int, Int, Int}, NTuple{5, StaticInt{1}}},Tuple{VectorizationBase.CartesianVIndex{3, Tuple{StaticInt{1}, StaticInt{1}, StaticInt{1}}}, VectorizationBase.NullStep, VectorizationBase.CartesianVIndex{1, Tuple{StaticInt{1}}}}})   # time: 0.004722758
     Base.precompile(Tuple{typeof(gespf1),StridedPointer{Float64, 5, 1, 0, (1, 2, 3, 4, 5), Tuple{StaticInt{8}, Int, Int, Int, Int}, NTuple{5, StaticInt{1}}},Tuple{VectorizationBase.CartesianVIndex{0, Tuple{}}, Int, VectorizationBase.CartesianVIndex{4, NTuple{4, StaticInt{1}}}}})   # time: 0.004705647
     Base.precompile(Tuple{typeof(gespf1),StridedPointer{Float64, 5, 1, 0, (1, 2, 3, 4, 5), Tuple{StaticInt{8}, Int, Int, Int, Int}, NTuple{5, StaticInt{1}}},Tuple{VectorizationBase.CartesianVIndex{1, Tuple{StaticInt{1}}}, VectorizationBase.NullStep, VectorizationBase.CartesianVIndex{3, Tuple{StaticInt{1}, StaticInt{1}, StaticInt{1}}}}})   # time: 0.00464261
diff --git a/src/user_api_conveniences.jl b/src/user_api_conveniences.jl
@@ -12,10 +12,31 @@ const GEMMLOOPSET = loopset(
 );
 
 
-function matmul_params(rs::Int, rc::Int, cls::Int)
-    set_hw!(GEMMLOOPSET, rs, rc, cls, Int(cache_size(StaticInt(1))), Int(cache_size(StaticInt(2))), Int(cache_size(StaticInt(3))))
-    order = choose_order(GEMMLOOPSET)
-    order[5], last(order)
+# function matmul_params(rs::Int, rc::Int, cls::Int)
+#     set_hw!(GEMMLOOPSET, rs, rc, cls, Int(cache_size(StaticInt(1))), Int(cache_size(StaticInt(2))), Int(cache_size(StaticInt(3))))
+#     order = choose_order(GEMMLOOPSET)
+#     order[5], last(order)
+# end
+function matmul_params(rs::Int, rc::Int, cls::Int; M = nothing, K = nothing, N = nothing, W = 0) # optional M/K/N temporarily replace the m/k/n loops of GEMMLOOPSET; W is stored as its vector_width
+  set_hw!(GEMMLOOPSET, rs, rc, cls, Int(cache_size(StaticInt(1))), Int(cache_size(StaticInt(2))), Int(cache_size(StaticInt(3)))) # pass rs, rc, cls and the three cache_size values to the module-level GEMMLOOPSET
+  if N ≢ nothing
+    nloop = GEMMLOOPSET.loops[1] # remember the original :n loop so it can be put back after choose_order
+    GEMMLOOPSET.loops[1] = Loop(:n, MaybeKnown(1), MaybeKnown(N), MaybeKnown(1), nloop.rangesym, nloop.lensym) # presumably start = 1, stop = N, step = 1 -- TODO confirm against the Loop constructor
+  end
+  if M ≢ nothing
+    mloop = GEMMLOOPSET.loops[2] # remember the original :m loop
+    GEMMLOOPSET.loops[2] = Loop(:m, MaybeKnown(1), MaybeKnown(M), MaybeKnown(1), mloop.rangesym, mloop.lensym) # same replacement as for :n, with M as the third argument
+  end
+  if K ≢ nothing
+    kloop = GEMMLOOPSET.loops[3] # remember the original :k loop
+    GEMMLOOPSET.loops[3] = Loop(:k, MaybeKnown(1), MaybeKnown(K), MaybeKnown(1), kloop.rangesym, kloop.lensym) # same replacement as for :n, with K as the third argument
+  end
+  GEMMLOOPSET.vector_width = W # assigned on every call (default 0) and not reset afterwards
+  order = choose_order(GEMMLOOPSET) # NOTE(review): no try/finally, so if this throws the temporary loops above stay installed in GEMMLOOPSET
+  (N ≢ nothing) && (GEMMLOOPSET.loops[1] = nloop) # restore; nloop is visible here because `if` does not open a new scope in Julia
+  (M ≢ nothing) && (GEMMLOOPSET.loops[2] = mloop) # restore the original :m loop
+  (K ≢ nothing) && (GEMMLOOPSET.loops[3] = kloop) # restore the original :k loop
+  order[5], last(order) # return the fifth and the last entries of choose_order's result as a 2-tuple
 end
 @generated function matmul_params(::StaticInt{RS}, ::StaticInt{RC}, ::StaticInt{CLS}) where {RS,RC,CLS}
     mᵣ, nᵣ = matmul_params(RS, RC, CLS)
diff --git a/test/gemm.jl b/test/gemm.jl
@@ -9,6 +9,12 @@
     if LoopVectorization.register_count() != 8
         @test @inferred(LoopVectorization.matmul_params()) == (Unum, Tnum)
     end
+
+  @test LoopVectorization.matmul_params(64, 32, 64; M=8, K=100, N=100, W=8) == (1, 25)
+  @test LoopVectorization.matmul_params(64, 32, 64; M=8, K=100, N= 96, W=8) == (1, 24)
+  @test LoopVectorization.matmul_params(64, 32, 64; M=8, K=100, N= 92, W=8) == (1, 23)
+  @test LoopVectorization.matmul_params(64, 32, 64; M=8, K=100, N= 95, W=8) == (1, 10)
+
     AmulBtq1 = :(for m ∈ axes(A,1), n ∈ axes(B,2)
                  C[m,n] = zeroB
                  for k ∈ axes(A,2)