Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 74 additions & 38 deletions src/solver/linear/preconditioners/l1_gauss_seidel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,26 @@
## L1 Gauss Seidel Preconditioner ##
####################################

using TimerOutputs

# Global timer for L1GS preconditioner profiling
const L1GS_TIMER = TimerOutput()

"""
show_l1gs_timer()

Display timing and allocation information for L1GS preconditioner operations.
Call this after running your code to see detailed profiling results.
"""
show_l1gs_timer() = show(L1GS_TIMER)

"""
reset_l1gs_timer!()

Reset the L1GS timer to clear all accumulated timing data.
"""
reset_l1gs_timer!() = TimerOutputs.reset_timer!(L1GS_TIMER)

## Structs & Constructors ##
"""
BlockPartitioning{Ti<:Integer, Backend}
Expand Down Expand Up @@ -153,29 +173,37 @@ function _blockpartitioning(builder::L1GSPrecBuilder{<:AbstractGPUDevice}, A::Ab
end

function LinearSolve.ldiv!(y::VectorType, P::L1GSPreconditioner{BlockPartitioning{Ti,Backend}}, x::VectorType) where {VectorType<:AbstractVector,Ti<:Integer,Backend}
# x: residual
# y: preconditioned residual
y .= x #works either way, whether x is GpuVectorType (e.g. CuArray) or Vector
@unpack partitioning, D_Dl1 = P
@unpack backend = partitioning
# The following code is required because there is no assumption on the compatibality of x with the backend.
_y = adapt(backend, y)
_forward_sweep!(_y, P)
copyto!(y, _y)
@timeit L1GS_TIMER "ldiv! (generic)" begin
# x: residual
# y: preconditioned residual
# FIXME: big memory issue
@timeit L1GS_TIMER "y .= x" y .= x #works either way, whether x is GpuVectorType (e.g. CuArray) or Vector
@unpack partitioning, D_Dl1 = P
@unpack backend = partitioning
# The following code is required because there is no assumption on the compatibality of x with the backend.
# FIXME: big memory issue
@timeit L1GS_TIMER "adapt(backend, y)" _y = adapt(backend, y)
@timeit L1GS_TIMER "_forward_sweep!" _forward_sweep!(_y, P)
@timeit L1GS_TIMER "copyto!(y, _y)" copyto!(y, _y)
end
return nothing
end

function LinearSolve.ldiv!(y::Vector, P::L1GSPreconditioner{BlockPartitioning{Ti,CPU}}, x::Vector) where {Ti<:Integer}
# x: residual
# y: preconditioned residual
y .= x
_forward_sweep!(y, P)
@timeit L1GS_TIMER "ldiv! (CPU)" begin
# x: residual
# y: preconditioned residual
# FIXME: big memory issue
@timeit L1GS_TIMER "y .= x" y .= x
@timeit L1GS_TIMER "_forward_sweep!" _forward_sweep!(y, P)
end
return nothing
end

function (\)(P::L1GSPreconditioner{BlockPartitioning{Ti,Backend}}, x::VectorType) where {VectorType<:AbstractVector,Ti,Backend}
# P is a preconditioner
# x is a vector
# FIXME: big memory issue
y = similar(x)
LinearSolve.ldiv!(y, P, x)
return y
Expand Down Expand Up @@ -327,24 +355,30 @@ _pack_strict_lower!(::AbstractMatrixSymmetry, ::CSRFormat, SLbuffer, A, start_id


function _precompute_blocks(_A::AbstractSparseMatrix, partitioning::BlockPartitioning)
# No assumptions on A, i.e. A here might be in either backend compatible format or not.
# So we have to convert it to backend compatible format, if it is not already.
# `partsize` is the size of each partition, `nparts` is the total number of partitions.
# `nchunks` is the number of CPU cores or GPU blocks.
# `chunksize` is the number of threads per block in GPU backend.
@unpack partsize, nparts, nchunks, chunksize, backend = partitioning
A = adapt(backend, _A)
N = size(A, 1)
D_Dl1 = adapt(backend, zeros(eltype(A), N)) # D + Dˡ
last_partsize = N - (nparts - 1) * partsize # size of the last partition
SLbuffer_size = (partsize * (partsize - 1) * (nparts-1)) ÷ 2 + last_partsize * (last_partsize - 1) ÷ 2
SLbuffer = adapt(backend, zeros(eltype(A), SLbuffer_size)) # strictly lower triangular part of all diagonal blocks stored in a 1D array
symA = isapprox(A, A', rtol=1e-12) ? SymmetricMatrix() : NonSymmetricMatrix()
ndrange = nchunks * chunksize
kernel = _precompute_blocks_kernel!(backend, chunksize, ndrange)
kernel(D_Dl1, SLbuffer, A, symA, partsize, nparts, nchunks, chunksize; ndrange=ndrange)
synchronize(backend)
return D_Dl1, SLbuffer
@timeit L1GS_TIMER "_precompute_blocks" begin
# No assumptions on A, i.e. A here might be in either backend compatible format or not.
# So we have to convert it to backend compatible format, if it is not already.
# `partsize` is the size of each partition, `nparts` is the total number of partitions.
# `nchunks` is the number of CPU cores or GPU blocks.
# `chunksize` is the number of threads per block in GPU backend.
@unpack partsize, nparts, nchunks, chunksize, backend = partitioning
# FIXME: big memory issue
@timeit L1GS_TIMER "adapt(backend, _A)" A = adapt(backend, _A)
N = size(A, 1)
# FIXME: big memory issue
@timeit L1GS_TIMER "allocate D_Dl1" D_Dl1 = adapt(backend, zeros(eltype(A), N)) # D + Dˡ
last_partsize = N - (nparts - 1) * partsize # size of the last partition
SLbuffer_size = (partsize * (partsize - 1) * (nparts-1)) ÷ 2 + last_partsize * (last_partsize - 1) ÷ 2
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should store the lower/upper triangular parts in some sparse format. I do not have opinions about which one exactly.

# FIXME: big memory issue
@timeit L1GS_TIMER "allocate SLbuffer" SLbuffer = adapt(backend, zeros(eltype(A), SLbuffer_size)) # strictly lower triangular part of all diagonal blocks stored in a 1D array
# FIXME: big memory issue
@timeit L1GS_TIMER "isapprox(A, A')" symA = isapprox(A, A', rtol=1e-12) ? SymmetricMatrix() : NonSymmetricMatrix()
ndrange = nchunks * chunksize
@timeit L1GS_TIMER "kernel setup" kernel = _precompute_blocks_kernel!(backend, chunksize, ndrange)
@timeit L1GS_TIMER "kernel execution" kernel(D_Dl1, SLbuffer, A, symA, partsize, nparts, nchunks, chunksize; ndrange=ndrange)
@timeit L1GS_TIMER "synchronize" synchronize(backend)
return D_Dl1, SLbuffer
end
end


Expand All @@ -367,13 +401,15 @@ end
end

function _forward_sweep!(y, P)
@unpack partitioning, D_Dl1, SLbuffer = P
@unpack partsize, nparts, nchunks, chunksize, backend = partitioning
ndrange = nchunks * chunksize
kernel = _forward_sweep_kernel!(backend, chunksize, ndrange)
size_A = convert(typeof(nparts), length(y))
kernel(y, D_Dl1, SLbuffer, size_A, partsize, nparts, nchunks, chunksize; ndrange=ndrange)
synchronize(backend)
@timeit L1GS_TIMER "_forward_sweep! (internal)" begin
@timeit L1GS_TIMER "@unpack P" @unpack partitioning, D_Dl1, SLbuffer = P
@timeit L1GS_TIMER "@unpack partitioning" @unpack partsize, nparts, nchunks, chunksize, backend = partitioning
@timeit L1GS_TIMER "ndrange calc" ndrange = nchunks * chunksize
@timeit L1GS_TIMER "kernel construction" kernel = _forward_sweep_kernel!(backend, chunksize, ndrange)
@timeit L1GS_TIMER "convert size_A" size_A = convert(typeof(nparts), length(y))
@timeit L1GS_TIMER "kernel call" kernel(y, D_Dl1, SLbuffer, size_A, partsize, nparts, nchunks, chunksize; ndrange=ndrange)
@timeit L1GS_TIMER "synchronize" synchronize(backend)
end
return nothing
end

Expand Down
21 changes: 20 additions & 1 deletion test/test_preconditioners.jl
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we are missing tests for the symmetric case.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@testset "Symmetric A" begin
md = mdopen("HB/bcsstk15")
A = md.A # type here is Symmetric{Float64, SparseMatrixCSC{Float64, Int64}}
b = ones(size(A, 1))
test_l1gs_prec(A, b)
end
end

matrix here is already of type Symmetric
also for the test algorithmic logic I added this

function test_sym(testname, A, x, y_exp, D_Dl1_exp, SLbuffer_exp, partsize)
@testset "$testname Symmetric" begin
total_ncores = 8 # Assuming 8 cores for testing
for ncores in 1:total_ncores # testing for multiple cores to check that the answer is independent of the number of cores
builder = L1GSPrecBuilder(PolyesterDevice(ncores))
P = A isa Symmetric ? builder(A, partsize) : builder(A, partsize; isSymA=true)
@test P.D_Dl1 D_Dl1_exp
@test P.SLbuffer SLbuffer_exp
y = P \ x
@test y y_exp
end
end
end

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. Thanks!

However, we should use CG for the symmetric problems to ensure that the preconditioner actually conserves the symmetry.

Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,27 @@ function test_l1gs_prec(A, b)
@test isapprox(A * sol_unprec.u, b, rtol=1e-1, atol=1e-1)

# Test L1GS Preconditioner
# Reset timer before building preconditioner
Thunderbolt.Preconditioners.reset_l1gs_timer!()

P = L1GSPrecBuilder(PolyesterDevice(ncores))(A, partsize)

println("\n" * "="^60)
println("Preconditioner Construction Timings:")
println("="^60)
Thunderbolt.Preconditioners.show_l1gs_timer()

# Reset timer before testing ldiv!
Thunderbolt.Preconditioners.reset_l1gs_timer!()

sol_prec = solve(prob, KrylovJL_GMRES(P); Pl=P)

println("\n" * "="^60)
println("ldiv! Timings during solve:")
println("="^60)
Thunderbolt.Preconditioners.show_l1gs_timer()
println("="^60)

println("Unprec. no. iters: $(sol_unprec.iters), time: $(sol_unprec.stats.timer)")
println("Prec. no. iters: $(sol_prec.iters), time: $(sol_prec.stats.timer)")
@test isapprox(A * sol_prec.u, b, rtol=1e-1, atol=1e-1)
Expand Down Expand Up @@ -95,7 +114,7 @@ end
end

@testset "Symmetric A" begin
md = mdopen("HB/bcsstk15")
md = mdopen("HB/bcsstk15")
A = md.A
b = ones(size(A, 1))
test_l1gs_prec(A, b)
Expand Down
Loading