Solutions

Julia Basics

Exercise 1

function ex1(a)
    j = 1
    m = a[j]
    for (i,ai) in enumerate(a)
        if m < ai
            m = ai
            j = i
        end
    end
    (m,j)
end
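
A quick check of the solution (the expected output is shown as a comment):

ex1([3,1,4,1,5]) # returns (5, 5): the maximum is 5, at index 5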

Exercise 2

ex2(f,g) = x -> f(x) + g(x)
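
For example, ex2(sin,cos) builds the function x -> sin(x) + cos(x):

h = ex2(sin,cos)
h(0.0) # 1.0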

Exercise 3

using GLMakie
max_iters = 100
n = 1000
x = LinRange(-1.7,0.7,n)
y = LinRange(-1.2,1.2,n)
heatmap(x,y,(i,j)->mandel(i,j,max_iters))
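
The call above assumes the mandel function given in the exercise statement. For completeness, a version along those lines, returning the number of iterations before the escape threshold is crossed (capped at max_iters):

function mandel(x,y,max_iters)
    z = Complex(x,y)
    c = z
    threshold = 2
    for n in 1:max_iters
        if abs(z) > threshold
            return n-1
        end
        z = z^2 + c
    end
    max_iters
end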

Asynchronous programming in Julia

Distributed computing in Julia

Exercise 1
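
The solution below assumes that Distributed is loaded and that several worker processes are available, e.g.:

using Distributed
addprocs(3) # three workers, with ids 2, 3, and 4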

f = () -> Channel{Int}(1)
chnls = [ RemoteChannel(f,w) for w in workers() ]
@sync for (iw,w) in enumerate(workers())
    @spawnat w begin
        chnl_snd = chnls[iw]
        if w == 2
            chnl_rcv = chnls[end]
            msg = 2
            println("msg = $msg")
            put!(chnl_snd,msg)
            msg = take!(chnl_rcv)
            println("msg = $msg")
        else
           chnl_rcv = chnls[iw-1]
           msg = take!(chnl_rcv)
           msg += 1
           println("msg = $msg")
           put!(chnl_snd,msg)
        end
    end
end

This is another possible solution.

@everywhere function work(msg)
    println("msg = $msg")
    if myid() != nprocs()
        next = myid() + 1
        @fetchfrom next work(msg+1)
    else
        @fetchfrom 2 println("msg = $msg")
    end
end
msg = 2
@fetchfrom 2 work(msg)
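
Here the message travels along the chain of workers via nested remote calls instead of channels: each worker prints the value it receives and forwards msg+1 to the next process id, until the last worker reports the final value back to worker 2.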

Matrix-matrix multiplication

Exercise 1

function matmul_dist_3!(C,A,B)
    m = size(C,1)
    n = size(C,2)
    l = size(A,2)
    @assert size(A,1) == m
    @assert size(B,2) == n
    @assert size(B,1) == l
    @assert mod(m,nworkers()) == 0
    nrows_w = div(m,nworkers())
    @sync for (iw,w) in enumerate(workers())
        lb = 1 + (iw-1)*nrows_w
        ub = iw*nrows_w
        A_w = A[lb:ub,:]
        ftr = @spawnat w begin
         C_w = similar(A_w,size(A_w,1),size(B,2)) # the block of C is nrows_w by n
         matmul_seq!(C_w,A_w,B)
         C_w
        end
        @async C[lb:ub,:] = fetch(ftr)
    end
    C
end

@everywhere function matmul_seq!(C,A,B)
    m = size(C,1)
    n = size(C,2)
    l = size(A,2)
    @assert size(A,1) == m
    @assert size(B,2) == n
    @assert size(B,1) == l
    z = zero(eltype(C))
    for j in 1:n
        for i in 1:m
            Cij = z
            for k in 1:l
                @inbounds Cij = Cij + A[i,k]*B[k,j]
            end
            C[i,j] = Cij
        end
    end
    C
end
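
A quick sanity check for the two functions above (sizes are illustrative; recall that the number of rows of C must be a multiple of the number of workers):

m = 2*nworkers()
A = rand(m,5)
B = rand(5,3)
C = zeros(m,3)
matmul_dist_3!(C,A,B)
@assert C ≈ A*B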

MPI (Point-to-point)

Exercise 1

function matmul_mpi_3!(C,A,B)
    comm = MPI.COMM_WORLD
    rank = MPI.Comm_rank(comm)
    P = MPI.Comm_size(comm)
    if  rank == 0
        N = size(A,1)
        myB = B
        for dest in 1:(P-1)
            MPI.Send(B,comm;dest)
        end
    else
        source = 0
        status = MPI.Probe(comm,MPI.Status;source)
        count = MPI.Get_count(status,eltype(B))
        N = Int(sqrt(count))
        myB = zeros(N,N)
        MPI.Recv!(myB,comm;source)
    end
    L = div(N,P)
    myA = zeros(L,N)
    if  rank == 0
        lb = L*rank+1
        ub = L*(rank+1)
        myA[:,:] = view(A,lb:ub,:)
        for dest in 1:(P-1)
            lb = L*dest+1
            ub = L*(dest+1)
            MPI.Send(view(A,lb:ub,:),comm;dest)
        end
    else
        source = 0
        MPI.Recv!(myA,comm;source)
    end
    myC = myA*myB
    if rank == 0
        lb = L*rank+1
        ub = L*(rank+1)
        C[lb:ub,:] = myC
        for source in 1:(P-1)
            lb = L*source+1
            ub = L*(source+1)
            MPI.Recv!(view(C,lb:ub,:),comm;source)
        end
    else
        dest = 0
        MPI.Send(myC,comm;dest)
    end
    C
end
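
A minimal driver to test the function (file name and sizes are illustrative; the function above is assumed to be defined in the same file). Run it with something like mpiexec -np 3 julia driver.jl:

using MPI
MPI.Init()
comm = MPI.COMM_WORLD
rank = MPI.Comm_rank(comm)
P = MPI.Comm_size(comm)
N = 2*P # the number of rows must be a multiple of P
if rank == 0
    A = rand(N,N)
    B = rand(N,N)
    C = zeros(N,N)
else
    # Only the root rank needs the actual data
    A = zeros(0,0)
    B = zeros(0,0)
    C = zeros(0,0)
end
matmul_mpi_3!(C,A,B)
if rank == 0
    @assert C ≈ A*B
    println("Test passed")
end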

Exercise 2

using MPI
MPI.Init()
comm = MPI.COMM_WORLD
rank = MPI.Comm_rank(comm)
nranks = MPI.Comm_size(comm)
buffer = Ref(0)
if rank == 0
    msg = 2
    buffer[] = msg
    println("msg = $(buffer[])")
    MPI.Send(buffer,comm;dest=rank+1,tag=0)
    MPI.Recv!(buffer,comm;source=nranks-1,tag=0)
    println("msg = $(buffer[])")
else
    dest = if (rank != nranks-1)
        rank+1
    else
        0
    end
    MPI.Recv!(buffer,comm;source=rank-1,tag=0)
    buffer[] += 1
    println("msg = $(buffer[])")
    MPI.Send(buffer,comm;dest,tag=0)
end
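
Save the snippet in a file, say ex2.jl, and launch it with at least two ranks, e.g. mpiexec -np 3 julia ex2.jl. Each rank prints the message as it travels around the ring.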

MPI (collectives)

Exercise 1

function matmul_mpi_3!(C,A,B)
    comm = MPI.COMM_WORLD
    rank = MPI.Comm_rank(comm)
    P = MPI.Comm_size(comm)
    root = 0
    if  rank == root
        N = size(A,1)
        Nref = Ref(N)
    else
        Nref = Ref(0)
    end
    MPI.Bcast!(Nref,comm;root)
    N = Nref[]
    if  rank == root
        myB = B
    else
        myB = zeros(N,N)
    end
    MPI.Bcast!(myB,comm;root)
    L = div(N,P)
    # Tricky part: Julia stores matrices in column-major order,
    # but Scatter! splits the send buffer into contiguous chunks.
    # To distribute blocks of rows we scatter the transpose of A.
    myAt = zeros(N,L)
    At = collect(transpose(A))
    MPI.Scatter!(At,myAt,comm;root)
    myCt = transpose(myB)*myAt
    Ct = similar(C)
    MPI.Gather!(myCt,Ct,comm;root)
    C .= transpose(Ct)
    C
end

This other solution uses a column partition instead of a row partition. Column partitions are more natural to work with in Julia, when possible, since matrices are stored in column-major order. Note that the auxiliary transposes are no longer needed: a block of columns is already contiguous in memory.

function matmul_mpi_3!(C,A,B)
    comm = MPI.COMM_WORLD
    rank = MPI.Comm_rank(comm)
    P = MPI.Comm_size(comm)
    root = 0
    if  rank == root
        N = size(A,1)
        Nref = Ref(N)
    else
        Nref = Ref(0)
    end
    MPI.Bcast!(Nref,comm;root)
    N = Nref[]
    if  rank == root
        myA = A
    else
        myA = zeros(N,N)
    end
    MPI.Bcast!(myA,comm;root)
    L = div(N,P)
    myB = zeros(N,L)
    MPI.Scatter!(B,myB,comm;root)
    myC = myA*myB
    MPI.Gather!(myC,C,comm;root)
    C
end
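
Note that in both solutions the send buffer passed to MPI.Scatter! and the receive buffer passed to MPI.Gather! are only significant at the root rank, so the other ranks can pass their (possibly empty) local arrays there.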

Jacobi method

Exercise 1

function jacobi_mpi(n,niters,comm)
    u, u_new = init(n,comm)
    load = length(u)-2
    rank = MPI.Comm_rank(comm)
    nranks = MPI.Comm_size(comm)
    nreqs = 2*((rank != 0) + (rank != (nranks-1)))
    reqs = MPI.MultiRequest(nreqs)
    for t in 1:niters
        ireq = 0
        if rank != 0
            neig_rank = rank-1
            u_snd = view(u,2:2)
            u_rcv = view(u,1:1)
            dest = neig_rank
            source = neig_rank
            ireq += 1
            MPI.Isend(u_snd,comm,reqs[ireq];dest)
            ireq += 1
            MPI.Irecv!(u_rcv,comm,reqs[ireq];source)
        end
        if rank != (nranks-1)
            neig_rank = rank+1
            u_snd = view(u,(load+1):(load+1))
            u_rcv = view(u,(load+2):(load+2))
            dest = neig_rank
            source = neig_rank
            ireq += 1
            MPI.Isend(u_snd,comm,reqs[ireq];dest)
            ireq += 1
            MPI.Irecv!(u_rcv,comm,reqs[ireq];source)
        end
        # Update the interior cells, which do not depend on the ghost values
        for i in 3:load
            u_new[i] = 0.5*(u[i-1]+u[i+1])
        end
        # Wait for the communications to finish
        MPI.Waitall(reqs)
        # Update the local cells adjacent to the ghost entries
        for i in (2,load+1)
            u_new[i] = 0.5*(u[i-1]+u[i+1])
        end
        u, u_new = u_new, u
    end
    return u
end
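
The helper init is provided by the exercise setup. For reference, here is a sketch of a compatible implementation, assuming a 1D grid with boundary values -1 and 1 and n a multiple of the number of ranks:

function init(n,comm)
    rank = MPI.Comm_rank(comm)
    nranks = MPI.Comm_size(comm)
    load = div(n,nranks) # local cells per rank
    u = zeros(load+2) # local cells plus one ghost entry per side
    if rank == 0
        u[1] = -1 # left boundary value (assumed)
    end
    if rank == nranks-1
        u[end] = 1 # right boundary value (assumed)
    end
    u_new = copy(u)
    u, u_new
end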

Exercise 2

function jacobi_mpi(n,niters,tol,comm) # new tol arg
    u, u_new = init(n,comm)
    load = length(u)-2
    rank = MPI.Comm_rank(comm)
    nranks = MPI.Comm_size(comm)
    nreqs = 2*((rank != 0) + (rank != (nranks-1)))
    reqs = MPI.MultiRequest(nreqs)
    for t in 1:niters
        ireq = 0
        if rank != 0
            neig_rank = rank-1
            u_snd = view(u,2:2)
            u_rcv = view(u,1:1)
            dest = neig_rank
            source = neig_rank
            ireq += 1
            MPI.Isend(u_snd,comm,reqs[ireq];dest)
            ireq += 1
            MPI.Irecv!(u_rcv,comm,reqs[ireq];source)
        end
        if rank != (nranks-1)
            neig_rank = rank+1
            u_snd = view(u,(load+1):(load+1))
            u_rcv = view(u,(load+2):(load+2))
            dest = neig_rank
            source = neig_rank
            ireq += 1
            MPI.Isend(u_snd,comm,reqs[ireq];dest)
            ireq += 1
            MPI.Irecv!(u_rcv,comm,reqs[ireq];source)
        end
        MPI.Waitall(reqs)
        # Compute the max diff in the current
        # rank while doing the local update
        mydiff = 0.0
        for i in 2:load+1
            u_new[i] = 0.5*(u[i-1]+u[i+1])
            diff_i = abs(u_new[i] - u[i])
            mydiff = max(mydiff,diff_i)
        end
        # Now we need to find the global diff
        diff_ref = Ref(mydiff)
        MPI.Allreduce!(diff_ref,max,comm)
        diff = diff_ref[]
        # If global diff below tol, stop!
        if diff < tol
            return u_new
        end
        u, u_new = u_new, u
    end
    return u
end
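
Note that MPI.Allreduce! is used instead of MPI.Reduce! so that every rank obtains the global maximum: all ranks must take the same branch in the convergence test, otherwise the ranks that keep iterating would deadlock waiting for messages from the ones that stopped.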

All pairs of shortest paths

Exercise 1

function floyd_iterations!(myC,comm)
    L = size(myC,1)
    N = size(myC,2)
    rank = MPI.Comm_rank(comm)
    P = MPI.Comm_size(comm)
    lb = L*rank+1
    ub = L*(rank+1)
    C_k = similar(myC,N)
    for k in 1:N
        if (lb<=k) && (k<=ub)
            # If I have the row, fill in the buffer
            myk = (k-lb)+1
            C_k[:] = view(myC,myk,:)
        end
        # We need to find out the owner of row k.
        # Easy since N is a multiple of P
        root = div(k-1,L)
        MPI.Bcast!(C_k,comm;root)
        # Now, we have the data dependencies and
        # we can do the updates locally
        for j in 1:N
            for i in 1:L
                myC[i,j] = min(myC[i,j],myC[i,k]+C_k[j])
            end
        end
    end
    myC
end
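
For example, with N = 8 and P = 4 each rank owns L = 2 rows: rank 2 owns rows 5:6, and indeed div(5-1,2) == div(6-1,2) == 2.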