c_redist_22_mpi_copy_nonblock Subroutine

private subroutine c_redist_22_mpi_copy_nonblock(r, from_here, to_here)

Uses

    mp

This routine performs the redistribution using entirely non-blocking communications.
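
Since the whole exchange is built from non-blocking point-to-point calls, the pattern is: pack one send buffer per destination, post the non-blocking sends and receives, wait for completion, then unpack. The following is a minimal, standalone sketch of that same pattern written against raw MPI rather than the mp-module wrappers (nbsend/nbrecv/waitall) used below; every name in the sketch is illustrative and not part of this module.

    ! Illustration only: the pack -> Isend/Irecv -> Waitall -> unpack pattern
    ! used by this routine, reduced to a simple ring exchange with raw MPI.
    program nonblock_exchange_sketch
      use mpi
      implicit none
      integer, parameter :: nn = 4
      complex :: s_buff(nn), r_buff(nn)
      integer :: requests(2), ierr, iproc, nproc, next, prev, j

      call MPI_Init(ierr)
      call MPI_Comm_rank(MPI_COMM_WORLD, iproc, ierr)
      call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr)
      next = mod(iproc + 1, nproc)           ! rank we send to
      prev = mod(iproc - 1 + nproc, nproc)   ! rank we receive from

      ! Pack: the real routine gathers from_here(k(j), l(j)) per destination.
      do j = 1, nn
         s_buff(j) = cmplx(iproc, j)
      end do

      ! Post the non-blocking receive and send; as in the routine, each
      ! message is tagged with the sending rank.
      call MPI_Irecv(r_buff, nn, MPI_COMPLEX, prev, prev, MPI_COMM_WORLD, requests(1), ierr)
      call MPI_Isend(s_buff, nn, MPI_COMPLEX, next, iproc, MPI_COMM_WORLD, requests(2), ierr)

      ! Wait for both requests before unpacking or freeing the buffers.
      call MPI_Waitall(2, requests, MPI_STATUSES_IGNORE, ierr)

      ! Unpack: the real routine scatters r_buff(j) into to_here(k(j), l(j)).
      write (*, *) 'rank', iproc, 'received', r_buff

      call MPI_Finalize(ierr)
    end program nonblock_exchange_sketch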

Arguments

Type                Intent          Optional  Attributes                                      Name
type(redist_type)   intent(inout)                                                             :: r
complex             intent(in)                dimension ( r%from_low(1):, r%from_low(2): )    :: from_here
complex             intent(inout)             dimension ( r%to_low(1):, r%to_low(2): )        :: to_here
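
Both array arguments are assumed-shape with run-time lower bounds taken from the redist_type, so inside the routine from_here and to_here keep the caller's extents but are re-indexed to start at r%from_low and r%to_low respectively. A small, purely illustrative sketch of that Fortran behaviour (names here are made up):

    ! Illustration only: an assumed-shape dummy keeps the caller's extent but
    ! is re-indexed from the supplied lower bound, as from_here/to_here are.
    program bounds_sketch
      implicit none
      real :: x(10)
      call show(3, x)                         ! inside show, a is indexed 3..12
    contains
      subroutine show(lo, a)
        integer, intent(in) :: lo
        real, intent(in) :: a(lo:)
        print *, lbound(a, 1), ubound(a, 1)   ! prints 3 and 12
      end subroutine show
    end program bounds_sketch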


Source Code

  subroutine c_redist_22_mpi_copy_nonblock(r, from_here, to_here)
    use mp, only: iproc
    use mp, only: waitall, nbsend, nbrecv
    implicit none
    type (redist_type), intent (in out) :: r
    complex, dimension ( &
         r%from_low(1):, &
         r%from_low(2):  &
         ), intent (in) :: from_here
    complex, dimension ( &
         r%to_low(1):,   &
         r%to_low(2):    &
         ), intent (in out) :: to_here
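    !One packed message per column of r_buff/s_buff; recvhand/sendhand hold the
    !corresponding non-blocking request handles.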
    complex, dimension(:, :), allocatable :: r_buff, s_buff
    integer, dimension(:), allocatable :: recvhand, sendhand
    integer :: i, ip, nn, nrecv, nsend, j

    !Work out how big recv buffer has to be and allocate it
    nrecv = r%nrecv
    if ( nrecv > 0) then
       allocate( r_buff(size(r%complex_buff), nrecv))
       allocate( recvhand(nrecv))
    end if

    !Now work out how big send buffer has to be, allocate it and fill it
    nsend = r%nsend
    if (nsend > 0) then
       allocate( s_buff(size(r%complex_buff),nsend))
       allocate( sendhand(nsend))
       !$OMP PARALLEL DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn, j) &
       !$OMP SHARED(nsend, r, s_buff, from_here, sendhand, iproc)
       do i = 1, nsend
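          !ip is the destination rank of the i-th send; nn elements go to it.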
          ip = r%recv_inv_ip(i)
          nn = r%from(ip)%nn
          !Any benefit in making this a pointer assignment instead?
          !Avoids copying data here but could lead to questionable
          !memory access during the actual send?

          !$OMP DO &
          !$OMP SCHEDULE(static)
          do j = 1, nn
             s_buff(j, i) = from_here( &
                  r%from(ip)%k(j),     &
                  r%from(ip)%l(j)      &
                  )
          end do
          !$OMP END DO

          !$OMP MASTER
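          !Only the master thread posts the MPI send; the implicit barrier at
          !the END DO above guarantees the buffer is fully packed before this.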
          call nbsend(s_buff(1:nn, i), nn, ip, iproc, sendhand(i))
          !$OMP END MASTER
       end do
       !$OMP END PARALLEL
    end if

    !Post receives
    !Probably not much advantage to posting these any earlier as we do nothing
    !until data is on its way --> the sends are posted first (above) so there is
    !a chance data is already available as soon as each receive is posted.
    if (nrecv > 0) then
       ! Loops of this form require us to be able to call MPI routines
       ! from any thread. Given there is little work in this loop
       ! we may consider removing OpenMP pragmas if it allows us to
       ! drop to MPI_THREAD_FUNNELED

       !$OMP PARALLEL DO DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn) &
       !$OMP SHARED(nrecv, r, r_buff, recvhand) &
       !$OMP SCHEDULE(static)
       do i = 1, nrecv
          ip = r%recv_ip(i)
          nn = r%to(ip)%nn
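          !Post a non-blocking receive for message i from rank ip; the tag (ip)
          !matches the sender, which tags its message with its own rank.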
          call nbrecv(r_buff(1:nn, i), nn, ip, ip, recvhand(i))
       end do
       !$OMP END PARALLEL DO
    end if

    if (nrecv > 0) then
       !Now we don't have anything to do until all the data has arrived (which is why the sends were posted first).
       !We could wait on all the messages at once, but it may be better to keep checking each message separately so
       !that when one arrives we can copy it out whilst waiting for the others to arrive.
       !Start with a wait all
       call waitall(nrecv, recvhand)

       !Now unpack data.
       ! We choose to distribute the inner loop, unpacking data, rather
       ! than the outer loop (distributing messages) as we expect
       ! the message size nn to be rather variable.
       !$OMP PARALLEL DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn, j) &
       !$OMP SHARED(nrecv, to_here, r, r_buff)
       do i = 1, nrecv
          !Which processor is this data from? Would we be better off looping over all procs and just checking
          !whether each is one we need to unpack? That removes the array lookup but requires branching and
          !incrementing a counter to work out the equivalent of i here (i.e. which receive we're looking at).
          ip = r%recv_ip(i)
          nn = r%to(ip)%nn
          !Could this be vectorised and is it useful to do so?
          !Could we make r_buff a pointer array so that MPI put data directly into
          !to_here array? Would this be beneficial? Would need to be done before RECVs posted

          !$OMP DO &
          !$OMP SCHEDULE(static)
          do j = 1, nn
             to_here(            &
                  r%to(ip)%k(j), &
                  r%to(ip)%l(j)  &
                  ) = r_buff(j, i)
          enddo
          !$OMP END DO NOWAIT
       enddo
       !$OMP END PARALLEL

       deallocate(r_buff)
       deallocate(recvhand)
    endif

    if (nsend > 0) then
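       !Wait for all sends to complete so the send buffer can be safely freed.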
       call waitall(nsend, sendhand)
       deallocate(s_buff)
       deallocate(sendhand)
    endif

  end subroutine c_redist_22_mpi_copy_nonblock