
Programming large-scale parallel systems¶
Jacobi method¶
Contents¶
In this notebook, we will learn
- How to parallelize the Jacobi method
- How the data partition can impact the performance of a distributed algorithm
- How to use latency hiding to improve parallel performance
using Printf
function answer_checker(answer,solution)
if answer == solution
"🥳 Well done! "
else
"It's not correct. Keep trying! 💪"
end |> println
end
gauss_seidel_1_check(answer) = answer_checker(answer,"c")
jacobi_1_check(answer) = answer_checker(answer, "d")
jacobi_2_check(answer) = answer_checker(answer, "b")
jacobi_3_check(answer) = answer_checker(answer, "c")
lh_check(answer) = answer_checker(answer, "c")
sndrcv_check(answer) = answer_checker(answer,"b")
function partition_1d_answer(bool)
bool || return
msg = """
- We update N^2/P items per iteration
- We need data from 2 neighbors (2 messages per iteration)
- We communicate N items per message
- Communication/computation ratio is 2N/(N^2/P) = 2P/N =O(P/N)
"""
println(msg)
end
function partition_2d_answer(bool)
bool || return
msg = """
- We update N^2/P items per iteration
- We need data from 4 neighbors (4 messages per iteration)
- We communicate N/sqrt(P) items per message
- Communication/computation ratio is (4N/sqrt(P))/(N^2/P) = 4sqrt(P)/N = O(sqrt(P)/N)
"""
println(msg)
end
function partition_cyclic_answer(bool)
bool || return
msg = """
- We update N^2/P items
- We need data from 4 neighbors (4 messages per iteration)
- We communicate N^2/P items per message (the full data owned by the neighbor)
- Communication/computation ratio is O(1)
"""
println(msg)
end
function sndrcv_fix_answer(bool)
bool || return
msg = """
One needs to carefully order the sends and the receives to avoid cyclic dependencies
that might result in deadlocks. The actual implementation is left as an exercise.
"""
println(msg)
end
jacobitest_check(answer) = answer_checker(answer,"a")
function jacobitest_why(bool)
bool || return
msg = """
The test will pass. The parallel implementation does exactly the same operations
in exactly the same order as the sequential one. Thus, the result should be
exactly the same. Note, however, that this is often not true for other parallel algorithms,
especially when one performs parallel reductions.
"""
println(msg)
end
gauss_seidel_2_check(answer) = answer_checker(answer,"d")
function gauss_seidel_2_why(bool)
bool || return
msg = """
All "red" cells can be updated in parallel as they only depend on the values of "black" cells.
In other words, we can update the "red" cells in any order without changing the result. They only
depend on values in the "black" cells, which will not change during the loop over "red" cells.
Similarly, all "black" cells can be updated in parallel as they only depend on "red" cells.
"""
println(msg)
end
The Jacobi method for the Laplace equation¶
The Jacobi method is a numerical tool to solve systems of linear algebraic equations. One of the main applications of the Jacobi method is to solve the equations resulting from boundary value problems (BVPs). I.e., given the values at the boundary (of a grid), we are interested in finding the interior values that fulfill a certain equation.
When solving a Laplace equation in 1D, the Jacobi method leads to the following iterative scheme: The entry $i$ of vector $u$ at iteration $t+1$ is computed as:
$u^{t+1}_i = \dfrac{u^t_{i-1}+u^t_{i+1}}{2}$
This algorithm is simple, yet it shares fundamental challenges with many other algorithms used in scientific computing. This is why we study it here.
Serial implementation¶
The following code implements the iterative scheme above for boundary conditions -1 and 1 on a grid with $n$ interior points.
function jacobi(n,niters)
u = zeros(n+2)
u[1] = -1
u[end] = 1
u_new = copy(u)
for t in 1:niters
for i in 2:(n+1)
u_new[i] = 0.5*(u[i-1]+u[i+1])
end
u, u_new = u_new, u
end
u
end
If you run it for zero iterations, you will see the initial condition.
jacobi(5,0)
If you run it for enough iterations, you will see the expected solution of the Laplace equation: values that vary linearly from -1 to 1.
jacobi(5,100)
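Since the exact solution is a straight line from -1 to 1, we can also quantify how close we are after 100 iterations. The following small check is added here for illustration (it is not part of the original notebook):
u = jacobi(5,100)
u_exact = collect(range(-1,1,length=7)) # exact solution: a straight line from -1 to 1 on the 5+2 grid points
maximum(abs.(u .- u_exact)) # maximum pointwise error after 100 iterations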
In our version of the Jacobi method, we return after a given number of iterations. Other stopping criteria are possible. For instance, iterate until the maximum difference between u and u_new (in absolute value) is below a tolerance.
function jacobi_with_tol(n,tol)
u = zeros(n+2)
u[1] = -1
u[end] = 1
u_new = copy(u)
while true
diff = 0.0
for i in 2:(n+1)
ui_new = 0.5*(u[i-1]+u[i+1])
u_new[i] = ui_new
diff_i = abs(ui_new-u[i])
diff = max(diff_i,diff)
end
if diff < tol
return u_new
end
u, u_new = u_new, u
end
end
n = 5
tol = 1e-10
jacobi_with_tol(n,tol)
However, we are not going to parallelize this more complex version in this notebook (it is left as an exercise). The simpler one is already challenging enough to start with.
Parallelization of the Jacobi method¶
Now, let us parallelize the Jacobi method.
Where can we exploit parallelism?¶
Look at the two nested loops in the sequential implementation:
for t in 1:niters
for i in 2:(n+1)
u_new[i] = 0.5*(u[i-1]+u[i+1])
end
u, u_new = u_new, u
end
- The outer loop over t cannot be parallelized. The value of u at step t+1 depends on the value at the previous step t.
- The inner loop is trivially parallel. The loop iterations are independent, so any order is possible (see the shared-memory sketch below).
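To illustrate this independence in isolation, here is a minimal shared-memory sketch (for illustration only; jacobi_threads is not part of the notebook and this is not the distributed strategy we use below) where the inner loop is multithreaded with Threads.@threads:
function jacobi_threads(n,niters)
    u = zeros(n+2)
    u[1] = -1
    u[end] = 1
    u_new = copy(u)
    for t in 1:niters
        # Each iteration only reads u and writes a distinct entry of u_new,
        # so the iterations can run in any order or concurrently.
        Threads.@threads for i in 2:(n+1)
            u_new[i] = 0.5*(u[i-1]+u[i+1])
        end
        u, u_new = u_new, u
    end
    u
end
jacobi_threads(5,100)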
Parallelization strategy¶
Remember that a sufficiently large grain size is needed to achieve performance in a distributed algorithm. For Jacobi, one could update each entry of vector u_new in a different process, but this would not be efficient. Instead, we use a parallelization strategy with a larger grain size, analogous to algorithm 3 we studied for the matrix-matrix multiplication:
- Data partition: each worker updates a consecutive section of the array u_new
The following figure displays the data distribution over 3 processes.
Note that an entry in the interior of the locally stored vector can be updated using local data only; no communication is needed for such entries. However, to update the entries on the boundary of the locally stored vector, we need entries stored on other processes.
Thus, in order to update the local entries in u_new, we also need some remote entries of vector u located in neighboring processes. The figure below shows the entries of u needed to update the local entries of u_new in a particular process (CPU 2).
Communication overhead¶
Now that we understand the data dependencies, we can do a theoretical performance analysis.
- We update $N/P$ entries in each process at each iteration, where $N$ is the total length of the vector and $P$ the number of processes
- Thus, computation complexity is $O(N/P)$
- We need to get remote entries from 2 neighbors (2 messages per iteration)
- We need to communicate 1 entry per message
- Thus, communication complexity is $O(1)$
- Communication/computation ratio is $O(P/N)$, making the algorithm potentially scalable if $P<<N$.
Ghost (aka halo) cells¶
This parallel strategy is efficient according to the theoretical analysis. But how can we implement it? A usual way of implementing the Jacobi method and related algorithms is with so-called ghost cells. Ghost cells represent the missing data dependencies in the data owned by each process. After importing the appropriate values from the neighboring processes, one can perform the usual sequential Jacobi update locally in each process. Cells with gray edges are ghost (or boundary) cells in the following figure. Note that we added one ghost cell at the front and one at the end of the local array.
Thus, the algorithm is usually implemented in two main phases at each Jacobi iteration:
- Fill the ghost entries with communications
- Do the Jacobi update sequentially at each process
We are going to implement this algorithm with MPI later in this notebook.
Extension to 2D¶
The Jacobi method studied so far was for a one-dimensional Laplace equation. In real-world applications, however, one solves equations in multiple dimensions, typically 2D and 3D. The 2D and 3D cases are conceptually equivalent, but we will discuss the 2D case here for simplicity.
Now, the goal is to find the interior points of a 2D grid given the values at the boundary.
For the Laplace equation in 2D, the interior values in the computational grid (represented by a matrix $u$) are computed with this iterative scheme. The entry $(i,j)$ of matrix $u$ at iteration $t+1$ is computed as:
$u^{t+1}_{(i,j)} = \dfrac{u^t_{(i-1,j)}+u^t_{(i+1,j)}+u^t_{(i,j-1)}+u^t_{(i,j+1)}}{4}$
Note that each entry is updated as the average of the four neighbors (top,bottom,left,right) of that entry in the previous iteration.
Serial implementation¶
The next code implements a simple example, where the boundary values are equal to 1.
function jacobi_2d(n,niters)
u = zeros(n+2,n+2)
u[1,:] = u[end,:] = u[:,1] = u[:,end] .= 1
u_new = copy(u)
for t in 1:niters
for j in 2:(n+1)
for i in 2:(n+1)
north = u[i,j+1]
south = u[i,j-1]
east = u[i+1,j]
west = u[i-1,j]
u_new[i,j] = 0.25*(north+south+east+west)
end
end
u, u_new = u_new, u
end
u
end
If you run the function for zero iterations you will see the initial condition.
n = 10
niters = 0
u = jacobi_2d(n,niters)
If you run the problem for enough iterations, you should converge to the exact solution. In this case, the exact solution is a matrix with all values equal to one.
n = 10
niters = 300
u = jacobi_2d(n,niters)
Where can we exploit parallelism?¶
for t in 1:niters
for j in 2:(n+1)
for i in 2:(n+1)
north = u[i,j+1]
south = u[i,j-1]
east = u[i+1,j]
west = u[i-1,j]
u_new[i,j] = 0.25*(north+south+east+west)
end
end
u, u_new = u_new, u
end
- The outer loop cannot be parallelized (as in the 1D case).
- The two inner loops are trivially parallel.
Parallelization strategies¶
In 2D, one has more flexibility in how to distribute the data over the processes. We consider these three alternatives:
- 1D block row partition (each worker handles a subset of consecutive rows and all columns)
- 2D block partition (each worker handles a subset of consecutive rows and columns)
- 2D cyclic partition (each worker handles a subset of alternating rows and columns)
The three partition types are depicted in the following figure for 4 processes.
Which of the three alternatives is more efficient? To answer this question, we need to quantify how much data is processed and communicated in each case. The following analysis assumes that the grid is of $N$ by $N$ cells and that the number of processes is $P$.
1D block partition¶
The following figure shows the portion of u_new that is updated at each iteration by a particular process (CPU 3, left picture), and which entries of u are needed to update this data (right picture). We use analogous figures for the other partitions below.
uncover = false # Change to true to see the answer
partition_1d_answer(uncover)
2D block partition¶
uncover = false
partition_2d_answer(uncover)
2D cyclic partition¶
uncover = false
partition_cyclic_answer(uncover)
Summary¶
| Partition | Messages per iteration | Communication per worker | Computation per worker | Communication/computation ratio |
|---|---|---|---|---|
| 1D block | 2 | O(N) | N²/P | O(P/N) |
| 2D block | 4 | O(N/√P) | N²/P | O(√P/N) |
| 2D cyclic | 4 | O(N²/P) | N²/P | O(1) |
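To get a feeling for these asymptotic ratios, the following small computation plugs in illustrative numbers (the values of N and P are arbitrary choices for this example, not part of the original notebook):
N = 1000 # hypothetical grid size (N by N)
P = 16   # hypothetical number of processes
ratio_1d_block  = 2*N / (N^2/P)           # O(P/N)
ratio_2d_block  = (4*N/sqrt(P)) / (N^2/P) # O(sqrt(P)/N)
ratio_2d_cyclic = (4*N^2/P) / (N^2/P)     # O(1)
@show ratio_1d_block ratio_2d_block ratio_2d_cyclic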
Which partition is the best one?¶
- Both 1D and 2D block partitions are potentially scalable if $P<<N$
- The 2D block partition has the lowest communication complexity
- The 1D block partition requires fewer messages per iteration (this can be useful if the fixed cost of sending a message is high)
- The best strategy for a given problem size will thus depend on the machine.
- Cyclic partitions are impractical for this application (but they are useful in others)
The Gauss-Seidel method¶
Now let us study a slightly more challenging method. The implementation of Jacobi used an auxiliary array u_new. The usage of u_new seems a bit unnecessary at first sight, right? If we remove it, we get another method called Gauss-Seidel.
The following cell contains the sequential implementation of Gauss-Seidel. It is obtained by taking the sequential implementation of Jacobi, removing u_new, and using only u. This method is more efficient in terms of memory consumption (only one vector required instead of two). However, it is harder to parallelize.
function gauss_seidel(n,niters)
u = zeros(n+2)
u[1] = -1
u[end] = 1
for t in 1:niters
for i in 2:(n+1)
u[i] = 0.5*(u[i-1]+u[i+1])
end
end
u
end
Note that the final solution is nearly the same as for Jacobi for a large enough number of iterations.
n = 5
niters = 1000
gauss_seidel(n,niters)
jacobi(n,niters)
Which of the two loops in the following code (taken from gauss_seidel) can be parallelized?
for t in 1:niters
for i in 2:(n+1)
u[i] = 0.5*(u[i-1]+u[i+1])
end
end
a) Both of them
b) The outer, but not the inner
c) None of them
d) The inner, but not the outer
answer = "x" # replace x with a, b, c or d
gauss_seidel_1_check(answer)
If you look into the algorithm closely, you will realize that the steps in the loop over i are not independent. To update u[i] we first need to update u[i-1], which requires updating u[i-2], etc. This happens for Gauss-Seidel but not for Jacobi. In Jacobi, the updates are written into the temporary array u_new instead of updating u directly. In other words, in Jacobi, updating u_new[i] does not require any updated entry of u_new; it only requires entries of vector u, which is not modified in the loop over i.
Backwards Gauss-Seidel¶
In addition, the result of the Gauss-Seidel method depends on the order of the steps in the loop over i. This is another symptom that this loop is hard (or impossible) to parallelize. For instance, if you do the iterations over i in reverse loop order, you get another method called backward Gauss-Seidel.
function backward_gauss_seidel(n,niters)
u = zeros(n+2)
u[1] = -1
u[end] = 1
for t in 1:niters
for i in (n+1):-1:2
u[i] = 0.5*(u[i-1]+u[i+1])
end
end
u
end
Jacobi, forward Gauss-Seidel, and backward Gauss-Seidel converge to the same result, but they lead to slightly different values during the iterations. Check it with the following cells (a small comparison of the results is added after the cell). First, run the methods with niters=1 and then with niters=100.
n = 5
niters = 1
u_forward = gauss_seidel(n,niters)
u_backward = backward_gauss_seidel(n,niters)
u_jacobi = jacobi(n,niters)
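The following lines (added here for illustration) quantify the differences via the maximum absolute difference between the vectors. With niters=1 the differences are clearly visible; rerun the cell above with niters=100 and they should become negligible.
@show maximum(abs.(u_forward .- u_backward))
@show maximum(abs.(u_forward .- u_jacobi))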
Red-black Gauss-Seidel¶
There is yet another version called red-black Gauss-Seidel. This one uses a very clever order for the steps in the loop over i: it performs the loop in two phases. First, one updates the entries with even index, and then the entries with odd index.
The implementation is given in the following cell. Note that we introduced an extra loop that represents each of the two phases.
function red_black_gauss_seidel(n,niters)
u = zeros(n+2)
u[1] = -1
u[end] = 1
for t in 1:niters
for color in (0,1)
for i in (n+1):-1:2
if color == mod(i,2)
u[i] = 0.5*(u[i-1]+u[i+1])
end
end
end
end
u
end
Run the method for several values of niters.
n = 5
niters = 1000
red_black_gauss_seidel(n,niters)
Which of the loops in the following code (taken from red_black_gauss_seidel) can be parallelized?
for t in 1:niters
for color in (0,1)
for i in (n+1):-1:2
if color == mod(i,2)
u[i] = 0.5*(u[i-1]+u[i+1])
end
end
end
end
a) All loops
b) Loop over t only
c) Loop over color only
d) Loop over i only
answer = "x" # replace x with a, b, c or d
gauss_seidel_2_check(answer)
uncover = false
gauss_seidel_2_why(uncover)
Changing an algorithm to make it parallel¶
Note that the original method (forward Gauss-Seidel) cannot be parallelized; we needed to modify it slightly, using the red-black ordering, to create a method that can be parallelized. However, the method we parallelized is not equivalent to the original one. This happens in many other applications: an algorithm might be impossible to parallelize, and one needs to modify it to exploit parallelism. When doing so, one needs to be careful not to destroy the algorithmic properties of the original. In this case, we succeeded: red-black Gauss-Seidel converges as fast as (if not faster than) the original forward Gauss-Seidel. This is not true in general. There is often a trade-off between the algorithmic properties and how parallelizable the algorithm is.
MPI implementation¶
In the last part of this notebook, we consider the implementation of the Jacobi method using MPI. We will consider the 1D version for simplicity.
We are going to implement the method in a function with the following signature
u = jacobi_mpi(n,niters,comm)
The signature will be very similar to the sequential function jacobi, but there are some important differences:
- The parallel function takes an MPI communicator object. This allows the user to decide which communicator to use: for instance, MPI.COMM_WORLD directly or a duplicate of it (which is the recommended approach).
- Function jacobi_mpi will be called on multiple MPI ranks. Thus, the values of n and niters should be the same in all ranks. The caller of this function has to make sure that this is true; otherwise, the parallel code will not work.
- The result u is NOT the same as the result of the sequential function jacobi. It will contain only the local portion of u stored at each rank. To recover the same vector as in the sequential case, one needs to gather these pieces in a single rank. We will do this later.
The implementation of the parallel function is given in the following code snippet:
code1 = quote
function jacobi_mpi(n,niters,comm)
# Initialization
u, u_new = init(n,comm)
for t in 1:niters
# Communication
ghost_exchange!(u,comm)
# Local computation
local_update!(u,u_new)
u, u_new = u_new, u
end
return u
end
end;
In the following cells, we discuss the implementation of the helper functions init, ghost_exchange!, and local_update!.
Initialization¶
Let us start with function init.
code2 = quote
function init(n,comm)
nranks = MPI.Comm_size(comm)
rank = MPI.Comm_rank(comm)
if mod(n,nranks) != 0
println("n must be a multiple of nranks")
MPI.Abort(comm,1)
end
load = div(n,nranks)
u = zeros(load+2)
if rank == 0
u[1] = -1
end
if rank == nranks-1
u[end] = 1
end
u_new = copy(u)
u, u_new
end
end;
This function creates and initializes the vector u and the auxiliary vector u_new and fills in the boundary values. Note that we are not creating the full arrays like in the sequential case; we only create the parts to be managed by the current rank. To this end, we start by computing the number of entries to be updated in this rank, i.e., variable load. We have assumed that n is a multiple of the number of ranks for simplicity. If this is not the case, we stop the computation with MPI.Abort (a more general partition is sketched below). Note that we are allocating two extra elements in u (and u_new) for the ghost cells and boundary conditions. The following figure displays the arrays created for n==9 and nranks==3 (thus load == 3). The first and last elements of the arrays are displayed with gray edges, denoting that they are the extra elements allocated for ghost cells or boundary conditions.
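As an aside, the restriction that n must be a multiple of nranks could be lifted by giving the first few ranks one extra entry. The following is only a sketch of one possible generalization (the helper local_load is hypothetical and not used in the rest of the notebook):
function local_load(n,nranks,rank)
    load = div(n,nranks)
    rest = mod(n,nranks)
    # ranks 0,...,rest-1 own one extra entry so that all loads sum to n
    rank < rest ? load+1 : load
end
# For example, n=10 entries over 3 ranks gives loads 4, 3, and 3:
[local_load(10,3,rank) for rank in 0:2]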
Communication¶
Once the working arrays have been created, we can start with the iterations of the Jacobi method. Remember that this is implemented in parallel by updating the ghost cells with data from neighboring processes and then performing the sequential Jacobi update. See figure:
The communication step happens in function ghost_exchange!. This function modifies the ghost cells of the input vector u by importing data using MPI point-to-point communication. See the following code:
code3 = quote
function ghost_exchange!(u,comm)
load = length(u)-2
rank = MPI.Comm_rank(comm)
nranks = MPI.Comm_size(comm)
if rank != 0
neig_rank = rank-1
u_snd = view(u,2:2)
u_rcv = view(u,1:1)
dest = neig_rank
source = neig_rank
MPI.Sendrecv!(u_snd,u_rcv,comm;dest,source)
end
if rank != (nranks-1)
neig_rank = rank+1
u_snd = view(u,(load+1):(load+1))
u_rcv = view(u,(load+2):(load+2))
dest = neig_rank
source = neig_rank
MPI.Sendrecv!(u_snd,u_rcv,comm;dest,source)
end
end
end;
Note that we have used MPI.Sendrecv! to send and receive values with a single call. What do you think about the following alternative implementation of ghost_exchange!, which uses separate MPI.Send and MPI.Recv! calls instead?
function ghost_exchange!(u,comm)
load = length(u)-2
rank = MPI.Comm_rank(comm)
nranks = MPI.Comm_size(comm)
if rank != 0
neig_rank = rank-1
u_snd = view(u,2:2)
u_rcv = view(u,1:1)
dest = neig_rank
source = neig_rank
MPI.Send(u_snd,comm;dest)
MPI.Recv!(u_rcv,comm;source)
end
if rank != (nranks-1)
neig_rank = rank+1
u_snd = view(u,(load+1):(load+1))
u_rcv = view(u,(load+2):(load+2))
dest = neig_rank
source = neig_rank
MPI.Send(u_snd,comm;dest)
MPI.Recv!(u_rcv,comm;source)
end
end
a) It is correct.
b) It is incorrect, but it might provide the right result depending on the MPI implementation.
c) It is incorrect, and it is guaranteed that it will result in a deadlock.
d) This implementation does not work when distributing over just a single MPI rank.
answer = "x" # replace x with a, b, c or d
sndrcv_check(answer)
Think about the answer. To uncover the solution, run the next cell.
uncover = false
sndrcv_fix_answer(uncover)
Local computation¶
Once the ghost cells have the right values, we can perform the Jacobi update locally at each process. This is done in function local_update!. Note that here we only update the data owned by the current MPI rank, i.e., we do not modify the ghost values. There is no need to modify them since they will be updated by another rank. In the code, this is reflected in the loop over i: we visit neither the first nor the last entry of u_new.
code4 = quote
function local_update!(u,u_new)
load = length(u)-2
for i in 2:(load+1)
u_new[i] = 0.5*(u[i-1]+u[i+1])
end
end
end;
Running the code¶
Let us put all pieces together and run the code. If not done yet, install MPI.
] add MPI
The following cell combines all previous code snippets into a final one. We eventually call function jacobi_mpi and show the result vector u. Run the following code for 1 MPI rank, then for 2 and 3 MPI ranks. Look into the values of u. Do they make sense?
using MPI
code = quote
using MPI
MPI.Init()
$code1
$code2
$code3
$code4
n = 9
niters = 200
comm = MPI.Comm_dup(MPI.COMM_WORLD)
u = jacobi_mpi(n,niters,comm)
@show u
end
run(`$(mpiexec()) -np 3 julia --project=. -e $code`);
Checking the result¶
Checking the result visually is not enough in general. To check the parallel implementation, we want to compare it against the sequential implementation. However, how can we compare the sequential and the parallel result? The parallel version gives a distributed vector, which we cannot compare directly with the result of the sequential function. A possible solution is to gather all the pieces of the parallel result in a single rank and compare there against the sequential result.
The following function gathers the distributed vector in rank 0.
code5 = quote
function gather_final_result(u,comm)
load = length(u)-2
rank = MPI.Comm_rank(comm)
if rank !=0
# If I am not rank 0
# I send my own data to rank 0
u_snd = view(u,2:(load+1))
MPI.Send(u_snd,comm,dest=0)
u_root = zeros(0) # This will never be used
else
# If I am rank 0
nranks = MPI.Comm_size(comm)
n = load*nranks
u_root = zeros(n+2)
# Set boundary
u_root[1] = -1
u_root[end] = 1
# Set data for rank 0
lb = 2
ub = load+1
u_root[lb:ub] = view(u,lb:ub)
# Receive and set data from other ranks
for other_rank in 1:(nranks-1)
lb += load
ub += load
u_rcv = view(u_root,lb:ub)
MPI.Recv!(u_rcv,comm;source=other_rank)
end
end
# NB only the root (rank 0) will
# contain meaningful data
return u_root
end
end;
Run the following cell to see the result. Is the result as expected?
code = quote
using MPI
MPI.Init()
$code1
$code2
$code3
$code4
$code5
n = 9
niters = 200
comm = MPI.Comm_dup(MPI.COMM_WORLD)
u = jacobi_mpi(n,niters,comm)
u_root = gather_final_result(u,comm)
@show u_root
end
run(`$(mpiexec()) -np 3 julia --project=. -e $code`);
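As a side note, the hand-written gather above could also be written with a collective operation. The following sketch assumes MPI.jl's MPI.Gather(sendbuf,comm;root), which concatenates the send buffers of all ranks on the root rank; it is an alternative to, not a replacement of, the code used in this notebook:
code5_alt = quote
    function gather_final_result_collective(u,comm)
        load = length(u)-2
        u_own = u[2:(load+1)] # entries owned by this rank (no ghost/boundary cells)
        u_all = MPI.Gather(u_own,comm;root=0) # concatenation of all owned entries (on rank 0)
        if MPI.Comm_rank(comm) == 0
            u_root = zeros(length(u_all)+2)
            u_root[1] = -1 # boundary values, as in init
            u_root[end] = 1
            u_root[2:end-1] = u_all
            return u_root
        else
            return zeros(0) # only rank 0 holds meaningful data
        end
    end
end;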
Now that we have collected the parallel vector into a single array, we can compare it against the sequential implementation. This is done in the following cells.
code6 = quote
function jacobi(n,niters)
u = zeros(n+2)
u[1] = -1
u[end] = 1
u_new = copy(u)
for t in 1:niters
for i in 2:(n+1)
u_new[i] = 0.5*(u[i-1]+u[i+1])
end
u, u_new = u_new, u
end
u
end
end;
code = quote
using MPI
MPI.Init()
$code1
$code2
$code3
$code4
$code5
$code6
n = 12
niters = 100
comm = MPI.Comm_dup(MPI.COMM_WORLD)
u = jacobi_mpi(n,niters,comm)
u_root = gather_final_result(u,comm)
rank = MPI.Comm_rank(comm)
if rank == 0
# Compare against serial
u_seq = jacobi(n,niters)
if isapprox(u_root,u_seq)
println("Test passed 🥳")
else
println("Test failed")
end
end
end
run(`$(mpiexec()) -np 3 julia --project=. -e $code`);
Latency hiding¶
We now have a correct parallel implementation, but can it be improved? Note that we only need communication to update the values at the boundary of the portion owned by each process. The other values (the ones in green in the figure below) can be updated without communication. This provides the opportunity of overlapping the computation of the interior values (green cells in the figure) with the communication of the ghost values. This technique is called latency hiding, since we hide the communication latency by overlapping it with computation that we need to do anyway. The actual implementation is left as an exercise (see Exercise 1).
Which MPI operations would you use to implement this overlap of communication and computation?
a) MPI_Send and MPI_Recv
b) MPI_Bsend and MPI_Recv
c) MPI_Isend and MPI_Irecv
d) MPI_Sendrecv
answer = "x" # replace x with a, b, c or d
lh_check(answer)
Conclusion¶
- We learned how to parallelize the Jacobi method efficiently.
- Using block partitions, the communication overhead is small if the problem size is much larger than the number of processors.
- In addition, one can use latency hiding to further reduce the overhead caused by communication.
- Neither of these holds if we use cyclic data partitions.
- Gauss-Seidel is a simple variation of Jacobi but is much more challenging to parallelize.
- One needs to consider a special order (red-black) when updating the values to parallelize the method.
- We learned how to implement a 1D Jacobi method with MPI.
- One needs to be careful when using blocking directives to avoid deadlocks.
Exercises¶
Exercise 1¶
The following code implements the 1D Jacobi method studied in this notebook, but using non-blocking sends and receives. Modify this code so that you overlap the communication of the ghost values with local computations, as explained above in the section "Latency hiding". You only need to modify function jacobi_mpi_latency_hiding. Copy the code below to a file called ex1.jl, modify the file (e.g. with VS Code), and run it from the Julia REPL using the run function as explained in the Getting Started tutorial.
# file ex1.jl (begin)
using MPI
MPI.Init()
function jacobi_mpi_latency_hiding(n,niters,comm)
u, u_new = init(n,comm)
load = length(u)-2
rank = MPI.Comm_rank(comm)
nranks = MPI.Comm_size(comm)
nreqs = 2*((rank != 0) + (rank != (nranks-1)))
reqs = MPI.MultiRequest(nreqs)
for t in 1:niters
ireq = 0
if rank != 0
neig_rank = rank-1
u_snd = view(u,2:2)
u_rcv = view(u,1:1)
dest = neig_rank
source = neig_rank
ireq += 1
MPI.Isend(u_snd,comm,reqs[ireq];dest)
ireq += 1
MPI.Irecv!(u_rcv,comm,reqs[ireq];source)
end
if rank != (nranks-1)
neig_rank = rank+1
u_snd = view(u,(load+1):(load+1))
u_rcv = view(u,(load+2):(load+2))
dest = neig_rank
source = neig_rank
ireq += 1
MPI.Isend(u_snd,comm,reqs[ireq];dest)
ireq += 1
MPI.Irecv!(u_rcv,comm,reqs[ireq];source)
end
MPI.Waitall(reqs)
for i in 2:load+1
u_new[i] = 0.5*(u[i-1]+u[i+1])
end
u, u_new = u_new, u
end
return u
end
function init(n,comm)
nranks = MPI.Comm_size(comm)
rank = MPI.Comm_rank(comm)
if mod(n,nranks) != 0
println("n must be a multiple of nranks")
MPI.Abort(comm,1)
end
load = div(n,nranks)
u = zeros(load+2)
if rank == 0
u[1] = -1
end
if rank == nranks-1
u[end] = 1
end
u_new = copy(u)
u, u_new
end
function gather_final_result(u,comm)
load = length(u)-2
rank = MPI.Comm_rank(comm)
if rank !=0
u_snd = view(u,2:(load+1))
MPI.Send(u_snd,comm,dest=0)
u_root = zeros(0)
else
nranks = MPI.Comm_size(comm)
n = load*nranks
u_root = zeros(n+2)
u_root[1] = -1
u_root[end] = 1
lb = 2
ub = load+1
u_root[lb:ub] = view(u,lb:ub)
for other_rank in 1:(nranks-1)
lb += load
ub += load
u_rcv = view(u_root,lb:ub)
MPI.Recv!(u_rcv,comm;source=other_rank)
end
end
return u_root
end
function jacobi(n,niters)
u = zeros(n+2)
u[1] = -1
u[end] = 1
u_new = copy(u)
for t in 1:niters
for i in 2:(n+1)
u_new[i] = 0.5*(u[i-1]+u[i+1])
end
u, u_new = u_new, u
end
u
end
n = 12
niters = 100
comm = MPI.Comm_dup(MPI.COMM_WORLD)
u = jacobi_mpi_latency_hiding(n,niters,comm)
u_root = gather_final_result(u,comm)
rank = MPI.Comm_rank(comm)
if rank == 0
u_seq = jacobi(n,niters)
if isapprox(u_root,u_seq)
println("Test passed 🥳")
else
println("Test failed 😢")
end
end
# file ex1.jl (end)
Exercise 2¶
In the parallel implementation of the Jacobi method, we assumed that the method runs for a given number of iterations. Function jacobi_with_tol at the beginning of the notebook shows how the Jacobi iterations can be stopped when the difference between iterations is small. Implement a parallel version of this function. Start with the code in Exercise 1 and add the stopping criterion implemented in jacobi_with_tol. Use a text editor and the Julia REPL; do not try to implement the code in a notebook. A hint on combining the per-rank stopping criterion is given below.
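Hint (one possible approach, added here as a suggestion): each rank can compute the maximum difference over its local entries and then combine these local values into a global maximum with a collective reduction, so that all ranks take the same stopping decision. A minimal, standalone sketch of just the reduction step, assuming MPI.jl's scalar MPI.Allreduce (the value of local_diff is a placeholder):
using MPI
MPI.Init()
comm = MPI.Comm_dup(MPI.COMM_WORLD)
tol = 1e-10
local_diff = 0.5*MPI.Comm_rank(comm) # placeholder for the local maximum of abs(u_new[i]-u[i])
global_diff = MPI.Allreduce(local_diff,MPI.MAX,comm) # same value on all ranks
stop = global_diff < tol # all ranks agree on whether to stop iterating
@show global_diff stop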
License¶
This notebook is part of the course Programming Large Scale Parallel Systems at Vrije Universiteit Amsterdam and may be used under a CC BY 4.0 license.