Speed up Cholesky Jacobi matrices (#169)

TSGut · dlfivefifty · web-flow · commit d3d03ef57dfd · 2024-02-07T19:44:36.000Z
* symmetric clenshaw getindex + better resizing

* adjustments to caching + doc improvements

* simplify

* minor change

* fix a bunch of tests

* bugfix

* another bugfix

* increase coverage

* Update Project.toml

* Update src/clenshaw.jl

* Update src/clenshaw.jl

* Update src/clenshaw.jl

---------

Co-authored-by: Sheehan Olver &lt;solver@mac.com&gt;
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "ClassicalOrthogonalPolynomials"
 uuid = "b30e2e7b-c4ee-47da-9d5f-2c5c27239acd"
 authors = ["Sheehan Olver <solver@mac.com>"]
-version = "0.12.3"
+version = "0.12.4"
 
 [deps]
 ArrayLayouts = "4c555306-a7a7-4459-81d9-ec55ddd5c99a"
diff --git a/src/ClassicalOrthogonalPolynomials.jl b/src/ClassicalOrthogonalPolynomials.jl
@@ -17,7 +17,7 @@ import LazyArrays: MemoryLayout, Applied, ApplyStyle, flatten, _flatten, adjoint
                 sub_materialize, arguments, sub_paddeddata, paddeddata, PaddedLayout, resizedata!, LazyVector, ApplyLayout, call,
                 _mul_arguments, CachedVector, CachedMatrix, LazyVector, LazyMatrix, axpy!, AbstractLazyLayout, BroadcastLayout,
                 AbstractCachedVector, AbstractCachedMatrix, paddeddata, cache_filldata!,
-                simplifiable, PaddedArray, converteltype
+                simplifiable, PaddedArray, converteltype, simplify
 import ArrayLayouts: MatMulVecAdd, materialize!, _fill_lmul!, sublayout, sub_materialize, lmul!, ldiv!, ldiv, transposelayout, triangulardata,
                         subdiagonaldata, diagonaldata, supdiagonaldata, mul, rowsupport, colsupport
 import LazyBandedMatrices: SymTridiagonal, Bidiagonal, Tridiagonal, unitblocks, BlockRange1, AbstractLazyBandedLayout
@@ -33,7 +33,7 @@ import QuasiArrays: cardinality, checkindex, QuasiAdjoint, QuasiTranspose, Inclu
                     AbstractQuasiFill, equals_layout, QuasiArrayLayout, PolynomialLayout, diff_layout
 
 import InfiniteArrays: OneToInf, InfAxes, Infinity, AbstractInfUnitRange, InfiniteCardinal, InfRanges
-import InfiniteLinearAlgebra: chop!, chop, pad, choplength, compatible_resize!
+import InfiniteLinearAlgebra: chop!, chop, pad, choplength, compatible_resize!, partialcholesky!
 import ContinuumArrays: Basis, Weight, basis_axes, @simplify, Identity, AbstractAffineQuasiVector, ProjectionFactorization,
     grid, plotgrid, plotgrid_layout, plotvalues_layout, grid_layout, transform_ldiv, TransformFactorization, QInfAxes, broadcastbasis, ExpansionLayout, basismap,
     AffineQuasiVector, AffineMap, AbstractWeightLayout, AbstractWeightedBasisLayout, WeightedBasisLayout, WeightedBasisLayouts, demap, AbstractBasisLayout, BasisLayout,
diff --git a/src/choleskyQR.jl b/src/choleskyQR.jl
@@ -37,9 +37,9 @@ orthogonalpolynomial(w::Function, P::AbstractQuasiMatrix) = orthogonalpolynomial
 cholesky_jacobimatrix(w, P)
 
 returns the Jacobi matrix `X` associated to a quasi-matrix of polynomials
-orthogonal with respect to `w(x) w_p(x)` where `w_p(x)` is the weight of the polynomials in `P` by computing a Cholesky decomposition of the weight modification.
+orthogonal with respect to `w(x)` by computing a Cholesky decomposition of the weight modification.
 
-The resulting polynomials are orthonormal on the same domain as `P`. The supplied `P` must be normalized. Accepted inputs are `w` as a function or `W` as an infinite matrix representing multiplication with the function `w` on the basis `P`.
+The resulting polynomials are orthonormal on the same domain as `P`. The supplied `P` must be normalized. Accepted inputs are `w` as a function or `W` as an infinite matrix representing the weight modifier multiplication by the function `w / w_P` on `P` where `w_P` is the orthogonality weight of `P`.
 """
 cholesky_jacobimatrix(w::Function, P) = cholesky_jacobimatrix(w.(axes(P,1)), P)
 
@@ -52,9 +52,7 @@ end
 function cholesky_jacobimatrix(W::AbstractMatrix, Q)
     isnormalized(Q) || error("Polynomials must be orthonormal")
     U = cholesky(W).U
-    X = jacobimatrix(Q)
-    UX = ApplyArray(*,U,X)
-    CJD = CholeskyJacobiData(U,UX)
+    CJD = CholeskyJacobiData(U,Q)
     return SymTridiagonal(view(CJD,:,1),view(CJD,:,2))
 end
 
@@ -63,24 +61,35 @@ mutable struct CholeskyJacobiData{T} <: LazyMatrix{T}
     dv::AbstractVector{T} # store diagonal band entries in adaptively sized vector
     ev::AbstractVector{T} # store off-diagonal band entries in adaptively sized vector
     U::UpperTriangular{T} # store upper triangular conversion matrix (needed to extend available entries)
-    UX::ApplyArray{T}     # store U*X, where X is the Jacobi matrix of the original P (needed to extend available entries)
+    X::AbstractMatrix{T}  # store Jacobi matrix of original polynomials
+    P                     # store original polynomials
     datasize::Int         # size of so-far computed block 
 end
 
 # Computes the initial data for the Jacobi operator bands
-function CholeskyJacobiData(U::AbstractMatrix{T}, UX) where T
+function CholeskyJacobiData(U::AbstractMatrix{T}, P) where T
     # compute a length 2 vector on first go and circumvent BigFloat issue
-    dv = zeros(T,2) 
+    dv = zeros(T,2)
     ev = zeros(T,2)
+    X = jacobimatrix(P)
+    partialcholesky!(U.data, 3)
+    UX = view(U,1:3,1:3)*X[1:3,1:3]
     dv[1] = UX[1,1]/U[1,1] # this is dot(view(UX,1,1), U[1,1] \ [one(T)])
     dv[2] = -U[1,2]*UX[2,1]/(U[1,1]*U[2,2])+UX[2,2]/U[2,2] # this is dot(view(UX,2,1:2), U[1:2,1:2] \ [zero(T); one(T)])
     ev[1] = -UX[1,1]*U[1,2]/(U[1,1]*U[2,2])+UX[1,2]/U[2,2] # this is dot(view(UX,1,1:2), U[1:2,1:2] \ [zero(T); one(T)])
     ev[2] = UX[2,1]/U[3,3]*(-U[1,3]/U[1,1]+U[1,2]*U[2,3]/(U[1,1]*U[2,2]))+UX[2,2]/U[3,3]*(-U[2,3]/U[2,2])+UX[2,3]/U[3,3] # this is dot(view(UX,2,1:3), U[1:3,1:3] \ [zeros(T,2); one(T)])[1:3,1:3] \ [zeros(T,2); one(T)])
-    return CholeskyJacobiData{T}(dv, ev, U, UX, 2)
+    return CholeskyJacobiData{T}(dv, ev, U, X, P, 2)
 end
 
 size(::CholeskyJacobiData) = (ℵ₀,2) # Stored as two infinite cached bands
 
+function getindex(C::SymTridiagonal{<:Any, <:SubArray{<:Any, 1, <:CholeskyJacobiData, <:Tuple, false}, <:SubArray{<:Any, 1, <:CholeskyJacobiData, <:Tuple, false}}, kr::UnitRange, jr::UnitRange)
+    m = 1+max(kr.stop,jr.stop)
+    resizedata!(C.dv.parent,m,1)
+    resizedata!(C.ev.parent,m,2)
+    return copy(view(C,kr,jr))
+end
+
 function getindex(K::CholeskyJacobiData, n::Integer, m::Integer)
     @assert (m==1) || (m==2)
     resizedata!(K,n,m)
@@ -90,7 +99,7 @@ end
 
 # Resize and filling functions for cached implementation
 function resizedata!(K::CholeskyJacobiData, n::Integer, m::Integer)
-    nm = max(n,m)
+    nm = 1+max(n,m)
     νμ = K.datasize
     if nm > νμ
         resize!(K.dv,nm)
@@ -102,12 +111,14 @@ function resizedata!(K::CholeskyJacobiData, n::Integer, m::Integer)
 end
 
 function _fillcholeskybanddata!(J::CholeskyJacobiData{T}, inds::UnitRange{Int}) where T
-    # pre-fill U and UX to prevent expensive step-by-step filling in of cached U and UX in the loop
-    resizedata!(J.U,inds[end]+1,inds[end]+1)
-    resizedata!(J.UX,inds[end]+1,inds[end]+1)
+    # pre-fill U to prevent expensive step-by-step filling in
+    m = inds[end]+1
+    partialcholesky!(J.U.data, m)
+    dv, ev, U, X = J.dv, J.ev, J.U, J.X
+
+    UX = view(U,1:m,1:m)*X[1:m,1:m]
 
-    dv, ev, UX, U = J.dv, J.ev, J.UX, J.U
-    @inbounds for k in inds
+    @inbounds Threads.@threads for k = inds  
         # this is dot(view(UX,k,k-1:k), U[k-1:k,k-1:k] \ ek)
         dv[k] = -U[k-1,k]*UX[k,k-1]/(U[k-1,k-1]*U[k,k])+UX[k,k]/U[k,k]
         ev[k] = UX[k,k-1]/U[k+1,k+1]*(-U[k-1,k+1]/U[k-1,k-1]+U[k-1,k]*U[k,k+1]/(U[k-1,k-1]*U[k,k]))+UX[k,k]/U[k+1,k+1]*(-U[k,k+1]/U[k,k])+UX[k,k+1]/U[k+1,k+1]  
@@ -116,20 +127,21 @@ end
 
 
 """
-qr_jacobimatrix(sqrtw, P)
+qr_jacobimatrix(w, P)
 
 returns the Jacobi matrix `X` associated to a quasi-matrix of polynomials
-orthogonal with respect to `w(x) w_p(x)` where `w_p(x)` is the weight of the polynomials in `P` by computing a QR decomposition of the square root weight modification.
+orthogonal with respect to `w(x)` by computing a QR decomposition of the square root weight modification.
 
-The resulting polynomials are orthonormal on the same domain as `P`. The supplied `P` must be normalized. Accepted inputs for `sqrtw` are the square root of the weight modification as a function or `sqrtW` as an infinite matrix representing multiplication with the function `sqrt(w)` on the basis `P`.
+The resulting polynomials are orthonormal on the same domain as `P`. The supplied `P` must be normalized. Accepted inputs for `w` are the target weight as a function or `sqrtW`, representing the multiplication operator of square root weight modification on the basis `P`.
 
 The underlying QR approach allows two methods, one which uses the Q matrix and one which uses the R matrix. To change between methods, an optional argument :Q or :R may be supplied. The default is to use the Q method.
 """
-function qr_jacobimatrix(sqrtw::Function, P, method = :Q)
+qr_jacobimatrix(w::Function, P, method = :Q) = qr_jacobimatrix(w.(axes(P,1)), P, Symbol(method))
+function qr_jacobimatrix(w::AbstractQuasiVector, P, method = :Q)
     Q = normalized(P)
-    x = axes(P,1)
-    sqrtW = (Q \ (sqrtw.(x) .* Q))  # Compute weight multiplication via Clenshaw
-    return qr_jacobimatrix(sqrtW, Q, method)
+    w_P = orthogonalityweight(P)
+    sqrtW = Symmetric(Q \ (sqrt.(w ./ w_P) .* Q)) # Compute weight multiplication via Clenshaw
+    return qr_jacobimatrix(sqrtW, Q, Symbol(method))
 end
 function qr_jacobimatrix(sqrtW::AbstractMatrix{T}, Q, method = :Q) where T
     isnormalized(Q) || error("Polynomials must be orthonormal")
@@ -143,7 +155,7 @@ mutable struct QRJacobiData{method,T} <: LazyMatrix{T}
     dv::AbstractVector{T} # store diagonal band entries in adaptively sized vector
     ev::AbstractVector{T} # store off-diagonal band entries in adaptively sized vector
     U                     # store conversion, Q method: stores QR object. R method: only stores R.
-    UX                    # Auxilliary matrix. Q method: stores in-progress incomplete modification. R method: stores U*X for efficiency.
+    UX                    # Auxilliary matrix. Q method: stores in-progress incomplete modification. R method: stores Jacobi matrix of original polynomials
     P                     # Remember original polynomials
     datasize::Int         # size of so-far computed block 
 end
@@ -179,15 +191,15 @@ function QRJacobiData{:R,T}(F, P) where T
     U = F.R
     U = ApplyArray(*,Diagonal(sign.(view(U,band(0)))),U)  # QR decomposition does not force positive diagonals on R by default
     X = jacobimatrix(P)
-    UX = ApplyArray(*,U,X)
+    UX = view(U,1:3,1:3)*X[1:3,1:3]
     # compute a length 2 vector on first go and circumvent BigFloat issue
     dv = zeros(T,2) 
     ev = zeros(T,2)
     dv[1] = UX[1,1]/U[1,1] # this is dot(view(UX,1,1), U[1,1] \ [one(T)])
     dv[2] = -U[1,2]*UX[2,1]/(U[1,1]*U[2,2])+UX[2,2]/U[2,2] # this is dot(view(UX,2,1:2), U[1:2,1:2] \ [zero(T); one(T)])
     ev[1] = -UX[1,1]*U[1,2]/(U[1,1]*U[2,2])+UX[1,2]/U[2,2] # this is dot(view(UX,1,1:2), U[1:2,1:2] \ [zero(T); one(T)])
     ev[2] = UX[2,1]/U[3,3]*(-U[1,3]/U[1,1]+U[1,2]*U[2,3]/(U[1,1]*U[2,2]))+UX[2,2]/U[3,3]*(-U[2,3]/U[2,2])+UX[2,3]/U[3,3] # this is dot(view(UX,2,1:3), U[1:3,1:3] \ [zeros(T,2); one(T)])
-    return QRJacobiData{:R,T}(dv, ev, U, UX, P, 2)
+    return QRJacobiData{:R,T}(dv, ev, U, X, P, 2)
 end
 
 
@@ -200,9 +212,16 @@ function getindex(K::QRJacobiData, n::Integer, m::Integer)
     m == 2 && return K.ev[n]
 end
 
+function getindex(C::SymTridiagonal{<:Any, <:SubArray{<:Any, 1, <:QRJacobiData, <:Tuple, false}, <:SubArray{<:Any, 1, <:QRJacobiData, <:Tuple, false}}, kr::UnitRange, jr::UnitRange)
+    m = maximum(max(kr,jr))+1
+    resizedata!(C.dv.parent,m,2)
+    resizedata!(C.ev.parent,m,2)
+    return copy(view(C,kr,jr))
+end
+
 # Resize and filling functions for cached implementation
 function resizedata!(K::QRJacobiData, n::Integer, m::Integer)
-    nm = max(n,m)
+    nm = 1+max(n,m)
     νμ = K.datasize
     if nm > νμ
         resize!(K.dv,nm)
@@ -246,10 +265,11 @@ function _fillqrbanddata!(J::QRJacobiData{:R,T}, inds::UnitRange{Int}) where T
     # pre-fill U and UX to prevent expensive step-by-step filling in of cached U and UX in the loop
     m = inds[end]+1
     resizedata!(J.U,m,m)
-    resizedata!(J.UX,m,m)
+    dv, ev, X, U = J.dv, J.ev, J.UX, J.U
+    
+    UX = view(U,1:m,1:m)*X[1:m,1:m]
 
-    dv, ev, UX, U = J.dv, J.ev, J.UX, J.U
-    @inbounds for k in inds
+    @inbounds Threads.@threads for k in inds
         dv[k] = -U[k-1,k]*UX[k,k-1]/(U[k-1,k-1]*U[k,k])+UX[k,k]./U[k,k] # this is dot(view(UX,k,k-1:k), U[k-1:k,k-1:k] \ ek)
         ev[k] = UX[k,k-1]/U[k+1,k+1]*(-U[k-1,k+1]/U[k-1,k-1]+U[k-1,k]*U[k,k+1]/(U[k-1,k-1]*U[k,k]))+UX[k,k]/U[k+1,k+1]*(-U[k,k+1]/U[k,k])+UX[k,k+1]/U[k+1,k+1]  # this is dot(view(UX,k,k-1:k+1), U[k-1:k+1,k-1:k+1] \ ek)
     end
diff --git a/src/clenshaw.jl b/src/clenshaw.jl
@@ -289,15 +289,15 @@ function _BandedMatrix(::ClenshawLayout, V::SubArray{<:Any,2})
     M = parent(V)
     kr,jr = parentindices(V)
     b = bandwidth(M,1)
-    jkr=max(1,min(first(jr),first(kr))-b÷2):max(last(jr),last(kr))+b÷2
+    jkr = max(1,min(first(jr),first(kr))-b÷2):max(last(jr),last(kr))+b÷2
     # relationship between jkr and kr, jr
     kr2,jr2 = kr.-first(jkr).+1,jr.-first(jkr).+1
     lmul!(M.p0, clenshaw(M.c, M.A, M.B, M.C, M.X[jkr, jkr])[kr2,jr2])
 end
 
 function getindex(M::Clenshaw{T}, kr::AbstractUnitRange, j::Integer) where T
     b = bandwidth(M,1)
-    jkr=max(1,min(j,first(kr))-b÷2):max(j,last(kr))+b÷2
+    jkr = max(1,min(j,first(kr))-b÷2):max(j,last(kr))+b÷2
     # relationship between jkr and kr, jr
     kr2,j2 = kr.-first(jkr).+1,j-first(jkr)+1
     f = [Zeros{T}(j2-1); one(T); Zeros{T}(length(jkr)-j2)]
@@ -306,6 +306,19 @@ end
 
 getindex(M::Clenshaw, k::Int, j::Int) = M[k:k,j][1]
 
+function getindex(S::Symmetric{T,<:Clenshaw}, k::Integer, jr::AbstractUnitRange) where T
+    m = max(jr.start,jr.stop,k)
+    return Symmetric(getindex(S.data,1:m,1:m),Symbol(S.uplo))[k,jr]
+end
+function getindex(S::Symmetric{T,<:Clenshaw}, kr::AbstractUnitRange, j::Integer) where T
+    m = max(kr.start,kr.stop,j)
+    return Symmetric(getindex(S.data,1:m,1:m),Symbol(S.uplo))[kr,j]
+end
+function getindex(S::Symmetric{T,<:Clenshaw}, kr::AbstractUnitRange, jr::AbstractUnitRange) where T
+    m = max(kr.start,jr.start,kr.stop,jr.stop)
+    return Symmetric(getindex(S.data,1:m,1:m),Symbol(S.uplo))[kr,jr]
+end
+
 transposelayout(M::ClenshawLayout) = LazyBandedMatrices.LazyBandedLayout()
 # TODO: generalise for layout, use Base.PermutedDimsArray
 Base.permutedims(M::Clenshaw{<:Number}) = transpose(M)
diff --git a/test/test_choleskyQR.jl b/test/test_choleskyQR.jl
diff --git a/test/test_normalized.jl b/test/test_normalized.jl