This routine does redistributes using entirely non-blocking comms.
This routine may use more memory than blocking approach --> Can perform slower than blocking routine when redistribute is particularly large, else typically gives better performance
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
type(redist_type), | intent(inout) | :: | r | |||
complex, | intent(in), | dimension ( r%to_low(1):, r%to_low(2):, r%to_low(3): ) | :: | from_here | ||
complex, | intent(inout), | dimension ( r%from_low(1):, r%from_low(2):, r%from_low(3): ) | :: | to_here |
subroutine c_redist_33_inv_mpi_copy_nonblock(r, from_here, to_here)
use mp, only: iproc, send, receive
use mp, only: barrier,waitall,nbsend,nbrecv
implicit none
type (redist_type), intent (in out) :: r
complex, dimension ( &
r%from_low(1):, &
r%from_low(2):, &
r%from_low(3): &
), intent (in out) :: to_here
complex, dimension (&
r%to_low(1):, &
r%to_low(2):, &
r%to_low(3): &
), intent (in) :: from_here
complex, dimension(:, :), allocatable :: r_buff, s_buff
integer, dimension(:), allocatable :: recvhand, sendhand
integer :: i, ip, nn, nrecv, nsend, j
!Work out how big recv buffer has to be and allocate it
! Note that we use redistribute type's nsend (not nrecv) because we are
! doing the inverse transform.
nrecv = r%nsend
if (nrecv > 0) then
allocate( r_buff(size(r%complex_buff), nrecv))
allocate( recvhand(nrecv))
end if
!Now work out how big send buffer has to be, allocate it and fill it
! Note that we use redistribute type's nrecv (not nsend) because we are
! doing the inverse transform.
nsend = r%nrecv
if (nsend > 0) then
allocate( s_buff(size(r%complex_buff), nsend))
allocate( sendhand(nsend))
!$OMP PARALLEL DEFAULT(none) &
!$OMP PRIVATE(i, ip, nn, j) &
!$OMP SHARED(nsend, s_buff, r, from_here, iproc, sendhand)
do i = 1, nsend
ip = r%recv_ip(i)
nn = r%to(ip)%nn
!Any benefit in making this a pointer assignment instead?
!Avoids copying data here but could lead to questionable
!memory access during the actual send?
!$OMP DO &
!$OMP SCHEDULE(static)
do j = 1, nn
s_buff(j, i) = from_here( &
r%to(ip)%k(j), &
r%to(ip)%l(j), &
r%to(ip)%m(j) &
)
end do
!$OMP END DO
!$OMP MASTER
call nbsend(s_buff(1:nn, i), nn, ip, iproc, sendhand(i))
!$OMP END MASTER
end do
!$OMP END PARALLEL
end if
!Post receives
if (nrecv > 0) then
!$OMP PARALLEL DO DEFAULT(none) &
!$OMP PRIVATE(i, ip, nn) &
!$OMP SHARED(nrecv, r_buff, r, recvhand) &
!$OMP SCHEDULE(static)
do i = 1, nrecv
ip = r%recv_inv_ip(i)
nn = r%from(ip)%nn
call nbrecv(r_buff(1:nn, i), nn, ip, ip, recvhand(i))
end do
!$OMP END PARALLEL DO
end if
if (nrecv > 0) then
!Now we don't have anything to do until all the data has arrived (so should we post sends first?)
!We could do a wait all but may be better to keep on checking each message seperately so that when
!it arrives we can copy out whilst waiting for others to arrive.
!Start with a wait all
call waitall(nrecv, recvhand)
!Now unpack data
!$OMP PARALLEL DEFAULT(none) &
!$OMP PRIVATE(i, ip, nn, j) &
!$OMP SHARED(nrecv, r, to_here, r_buff)
do i = 1, nrecv
!Which processor is this data from
ip = r%recv_inv_ip(i)
nn = r%from(ip)%nn
!$OMP DO &
!$OMP SCHEDULE(static)
do j = 1, nn
to_here( &
r%from(ip)%k(j), &
r%from(ip)%l(j), &
r%from(ip)%m(j) &
) = r_buff(j, i)
enddo
!$OMP END DO NOWAIT
enddo
!$OMP END PARALLEL
deallocate(r_buff)
deallocate(recvhand)
endif
if (nsend > 0) then
call waitall(nsend, sendhand)
deallocate(s_buff)
deallocate(sendhand)
endif
end subroutine c_redist_33_inv_mpi_copy_nonblock