This routine does redistributes using entirely non-blocking comms.
This routine may use more memory than blocking approach --> Can perform slower than blocking routine when redistribute is particularly large, else typically gives better performance
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
type(redist_type), | intent(inout) | :: | r | |||
complex, | intent(in), | dimension ( r%from_low(1):, r%from_low(2): ) | :: | from_here | ||
complex, | intent(inout), | dimension ( r%to_low(1):, r%to_low(2): ) | :: | to_here |
subroutine c_redist_22_mpi_copy_nonblock(r, from_here, to_here)
use mp, only: iproc, send, receive
use mp, only: barrier,waitall,nbsend,nbrecv
implicit none
type (redist_type), intent (in out) :: r
complex, dimension ( &
r%from_low(1):, &
r%from_low(2): &
), intent (in) :: from_here
complex, dimension ( &
r%to_low(1):, &
r%to_low(2): &
), intent (in out) :: to_here
complex, dimension(:, :), allocatable :: r_buff, s_buff
integer, dimension(:), allocatable :: recvhand, sendhand
integer :: i, ip, nn, nrecv, nsend, j
!Work out how big recv buffer has to be and allocate it
nrecv = r%nrecv
if ( nrecv > 0) then
allocate( r_buff(size(r%complex_buff), nrecv))
allocate( recvhand(nrecv))
end if
!Now work out how big send buffer has to be, allocate it and fill it
nsend = r%nsend
if (nsend > 0) then
allocate( s_buff(size(r%complex_buff),nsend))
allocate( sendhand(nsend))
!$OMP PARALLEL DEFAULT(none) &
!$OMP PRIVATE(i, ip, nn, j) &
!$OMP SHARED(nsend, r, s_buff, from_here, sendhand, iproc)
do i = 1, nsend
ip = r%recv_inv_ip(i)
nn = r%from(ip)%nn
!Any benefit in making this a pointer assignment instead?
!Avoids copying data here but could lead to questionable
!memory access during the actual send?
!$OMP DO &
!$OMP SCHEDULE(static)
do j = 1, nn
s_buff(j, i) = from_here( &
r%from(ip)%k(j), &
r%from(ip)%l(j) &
)
end do
!$OMP END DO
!$OMP MASTER
call nbsend(s_buff(1:nn, i), nn, ip, iproc, sendhand(i))
!$OMP END MASTER
end do
!$OMP END PARALLEL
end if
!Post receives
!Probably not much advantage to doing these ASAP as not going to do anything
!until data is on its way --> Could post sends first so chance of data being
!available as soon as receive posted.
if (nrecv > 0) then
! Loops of this form require us to be able to call MPI routines
! from any thread. Given there is little work in this loop
! we may consider removing OpenMP pragmas if it allows us to
! drop to MPI_THREAD_FUNNELED
!$OMP PARALLEL DO DEFAULT(none) &
!$OMP PRIVATE(i, ip, nn) &
!$OMP SHARED(nrecv, r, r_buff, recvhand) &
!$OMP SCHEDULE(static)
do i = 1, nrecv
ip = r%recv_ip(i)
nn = r%to(ip)%nn
call nbrecv(r_buff(1:nn, i), nn, ip, ip, recvhand(i))
end do
!$OMP END PARALLEL DO
end if
if (nrecv > 0) then
!Now we don't have anything to do until all the data has arrived (so should we post sends first?)
!We could do a wait all but may be better to keep on checking each message seperately so that when
!it arrives we can copy out whilst waiting for others to arrive.
!Start with a wait all
call waitall(nrecv, recvhand)
!Now unpack data.
! We choose to distribute the inner loop, unpacking data, rather
! than the outer loop (distributing messages) as we expect
! the message size nn to be rather variable.
!$OMP PARALLEL DEFAULT(none) &
!$OMP PRIVATE(i, ip, nn, j) &
!$OMP SHARED(nrecv, to_here, r, r_buff)
do i = 1, nrecv
!Which processor is this data from? Would we be better looping over all procs and just checking
!if that's one to unpack? Removes memory lookup of array but requires branching and incrementing
!a counter to work out the equivalent of i here (i.e. which receive we're looking at)
ip = r%recv_ip(i)
nn = r%to(ip)%nn
!Could this be vectorised and is it useful to do so?
!Could we make r_buff a pointer array so that MPI put data directly into
!to_here array? Would this be beneficial? Would need to be done before RECVs posted
!$OMP DO &
!$OMP SCHEDULE(static)
do j = 1, nn
to_here( &
r%to(ip)%k(j), &
r%to(ip)%l(j) &
) = r_buff(j, i)
enddo
!$OMP END DO NOWAIT
enddo
!$OMP END PARALLEL
deallocate(r_buff)
deallocate(recvhand)
endif
if (nsend > 0) then
call waitall(nsend, sendhand)
deallocate(s_buff)
deallocate(sendhand)
endif
end subroutine c_redist_22_mpi_copy_nonblock