!> Redistribute distributed (integer, real, complex or logical) !! (1, 2, 3, or 4) dimensional arrays into two dimensional arrays with !! first index on local processor, and vice versa. !! !! The first operation is called 'gather' and the second is called 'scatter.' !! !! One can also do a 'fill' operation. This consists of copying !! values from a (2, 3, or 4) dimensional array of !! (integer, real, complex, or logical ) values into !! another array with the same number of dimensions. !! !! One can also do a three index to four index redistribution for complex numbers. module redistribute ! Modifications for optimised local copy in c_redist_22 and c_redist_32 ! (and their inverse routines): ! (c) The Numerical Algorithms Group (NAG) Ltd, 2012 ! on behalf of EPSRC for the HECToR project implicit none public :: opt_redist_nbk, using_measure_scatter, opt_redist_persist, opt_redist_persist_overlap public :: write_connection_matrix, get_redist_times, reset_redist_times private logical :: opt_local_copy = .false. logical :: opt_redist_nbk = .true. !< Do we use non-blocking sends? logical :: opt_redist_persist = .false. !< Do we use persistent sends? logical :: opt_redist_persist_overlap = .false. !< Do we try to do a bit of overlapping with persistent sends? logical :: using_measure_scatter=.true. !< If true then don't time in gather/scatter directly integer :: naky = 0 integer :: ntgrid = 0 integer :: nlambda = 0 integer :: ntheta0 = 0 integer :: negrid = 0 integer :: nx = 0 integer :: xxf_lo_ulim_proc = 0 integer :: yxf_lo_ulim_proc = 0 integer :: g_lo_ulim = 0 character (len=5) :: layout = '' public :: index_list_type, delete_list public :: redist_type, delete_redist public :: report_map_property, measure_gather, measure_scatter public :: gather_count, scatter_count, time_redist public :: init_redist, gather, scatter public :: init_fill, fill public :: set_redist_character_type public :: set_xxf_optimised_variables public :: set_yxf_optimised_variables public :: set_optimised_choice public :: get_redistname interface gather module procedure c_redist_22, r_redist_22, i_redist_22, l_redist_22 module procedure c_redist_32, r_redist_32, i_redist_32, l_redist_32 module procedure c_redist_42, r_redist_42, i_redist_42, l_redist_42 module procedure c_redist_23 module procedure c_redist_34, c_redist_36 module procedure r_redist_62 ! MRH I have left this subroutine in place for now, although we may consider removing it since it is redundant (only r_redist_26 is required) module procedure r_redist_26 module procedure c_redist_33, r_redist_33 end interface interface scatter module procedure c_redist_12, r_redist_12, i_redist_12, l_redist_12 module procedure c_redist_22_inv, r_redist_22_inv, i_redist_22_inv, l_redist_22_inv module procedure c_redist_32_inv, r_redist_32_inv, i_redist_32_inv, l_redist_32_inv module procedure c_redist_42_inv, r_redist_42_inv, i_redist_42_inv, l_redist_42_inv module procedure c_redist_36_inv module procedure r_redist_62_inv ! MRH I have left this subroutine in place for now, although we may consider removing it since it is redundant (only r_redist_26_inv is required) module procedure r_redist_26_inv module procedure c_redist_33_inv, r_redist_33_inv end interface interface measure_gather module procedure measure_gather_32, measure_gather_33 module procedure measure_gather_22 end interface interface measure_scatter module procedure measure_scatter_23, measure_scatter_33 module procedure measure_scatter_22 end interface interface init_redist module procedure init_redist_gs2, init_redist_agk end interface integer :: gather_count=0, scatter_count=0 real :: time_redist(2)=0. real :: time_redist_mpi = 0. interface fill module procedure c_fill_2, c_fill_3, c_fill_4 module procedure r_fill_2, r_fill_3, r_fill_4 module procedure i_fill_2, i_fill_3, i_fill_4 module procedure l_fill_2, l_fill_3, l_fill_4 end interface !> FIXME : Add documentation type :: index_map integer :: nn integer, dimension (:), allocatable :: k integer, dimension (:), allocatable :: l integer, dimension (:), allocatable :: m integer, dimension (:), allocatable :: n integer, dimension (:), allocatable :: o integer, dimension (:), allocatable :: p end type index_map !> Small container type simply providing a 1D allocatable !> buffer. Used to allow array of buffers of different sizes. type :: redist_buffer complex, dimension(:), allocatable :: complex_buffer real, dimension(:), allocatable :: real_buffer end type redist_buffer !> FIXME : Add documentation type :: redist_type private integer, dimension (6) :: to_low, from_low, to_high, from_high type (index_map), dimension (:), allocatable :: to type (index_map), dimension (:), allocatable :: from complex, dimension (:), allocatable :: complex_buff real, dimension (:), allocatable :: real_buff integer, dimension (:), allocatable :: integer_buff logical, dimension (:), allocatable :: logical_buff integer :: nsend, nrecv !Note these swap meaning when doing inverse communication type(redist_buffer), dimension(:), allocatable :: buff_send, buff_recv integer, dimension(:), allocatable :: send_hand, recv_hand, send_inv_hand, recv_inv_hand integer, dimension(:), allocatable :: recv_ip, recv_inv_ip ! optimised choice and optimised_choice_inv set the algorithm used for ! gather/scatters. With default zero, different algorithms are timed ! during initialization. Putting these inside the redist object allows ! this behaviour to be overridden. This is useful, for example, when using ! "split domains" layouts, where one of the optimized algorithms cannot be ! used as it makes assumptions about the form of layouts. integer :: optimised_choice = 0 integer :: optimised_choice_inv = 0 character (len=3) :: redistname = "" character (len=1) :: char = 'X' end type redist_type !> FIXME : Add documentation type :: index_list_type integer, dimension (:), allocatable :: first integer, dimension (:), allocatable :: second integer, dimension (:), allocatable :: third integer, dimension (:), allocatable :: fourth integer, dimension (:), allocatable :: fifth integer, dimension (:), allocatable :: sixth end type index_list_type contains !> Returns current requested timer values subroutine get_redist_times(total_time, mpi_time, copy_time) implicit none real, intent(out), optional :: total_time, mpi_time, copy_time if (present(total_time)) total_time = time_redist(1) if (present(mpi_time)) mpi_time = time_redist_mpi if (present(copy_time)) copy_time = time_redist(1) - time_redist_mpi end subroutine get_redist_times !> Resets redist timers to zero subroutine reset_redist_times implicit none time_redist = 0. time_redist_mpi = 0. end subroutine reset_redist_times !> FIXME : Add documentation subroutine set_redist_character_type(r, chartype) implicit none type (redist_type), intent (inout) :: r character (3), intent (in) :: chartype r%redistname = chartype end subroutine set_redist_character_type !> FIXME : Add documentation subroutine set_yxf_optimised_variables(lyxf_lo_ulim_proc) implicit none integer, intent (in) :: lyxf_lo_ulim_proc yxf_lo_ulim_proc = lyxf_lo_ulim_proc end subroutine set_yxf_optimised_variables !> FIXME : Add documentation subroutine set_xxf_optimised_variables(lopt_local_copy, lnaky, lntgrid, lntheta0, lnlambda, lnegrid, lnx, & lxxf_lo_ulim_proc, lg_lo_ulim, llayout) implicit none logical, intent (in) :: lopt_local_copy integer, intent (in) :: lnaky, lntgrid, lxxf_lo_ulim_proc, lg_lo_ulim, lntheta0, lnlambda, lnegrid, lnx character (len=5), intent (in) :: llayout opt_local_copy = lopt_local_copy naky = lnaky ntgrid = lntgrid xxf_lo_ulim_proc = lxxf_lo_ulim_proc ntheta0 = lntheta0 nlambda = lnlambda negrid = lnegrid nx = lnx g_lo_ulim = lg_lo_ulim layout = llayout end subroutine set_xxf_optimised_variables !> Set choice of redist algorithm based on properties of g_lo subroutine set_optimised_choice(r,val) implicit none type (redist_type), intent (inout) :: r integer, intent (in) :: val r%optimised_choice = val r%optimised_choice_inv = val end subroutine set_optimised_choice !> Get redistname function get_redistname(r) implicit none type (redist_type), intent (inout) :: r character (len=3) :: get_redistname get_redistname = r%redistname end function get_redistname !> FIXME : Add documentation subroutine setup_persistent(r) use mp, only: iproc, nproc, send_init, recv_init, mp_request_null implicit none type (redist_type), intent (inout) :: r integer :: ip, count_r, count_s !Count messages r%nrecv = 0 do ip = 0, nproc - 1 if (r%to(ip)%nn > 0 .and. ip /= iproc) r%nrecv = r%nrecv + 1 end do r%nsend = 0 do ip = 0, nproc - 1 if (r%from(ip)%nn > 0 .and. ip /= iproc) r%nsend = r%nsend + 1 end do if(r%nrecv>0)then allocate(r%recv_ip(r%nrecv)) end if if(r%nsend>0)then allocate(r%recv_inv_ip(r%nsend)) end if !Setup processor lookups count_s = 0 ; count_r = 0 do ip = 0, nproc - 1 if (r%from(ip)%nn > 0 .and. ip /= iproc) then count_s = count_s + 1 r%recv_inv_ip(count_s) = ip endif if (r%to(ip)%nn > 0 .and. ip /= iproc) then count_r = count_r + 1 r%recv_ip(count_r) = ip end if end do !//Persistent comms -- Note could use these buffers in nonblocking case as well if (opt_redist_persist) then !Now allocate some buffers !Only implemented complex and real so far if (r%char == 'c' .or. r%char == 'r') then if (r%nrecv>0) then allocate(r%buff_recv(r%nrecv)) count_r = 0 do ip = 0, nproc - 1 if (r%to(ip)%nn > 0 .and. ip /= iproc) then count_r=count_r+1 if (r%char == 'c') then allocate(r%buff_recv(count_r)%complex_buffer(r%to(ip)%nn)) else if (r%char == 'r') then allocate(r%buff_recv(count_r)%real_buffer(r%to(ip)%nn)) end if end if end do allocate(r%recv_hand(r%nrecv)) r%recv_hand = mp_request_null allocate(r%send_inv_hand(r%nrecv)) r%send_inv_hand = mp_request_null end if if (r%nsend>0) then allocate(r%buff_send(r%nsend)) count_s = 0 do ip = 0, nproc - 1 if (r%from(ip)%nn > 0 .and. ip /= iproc) then count_s=count_s+1 if (r%char == 'c') then allocate(r%buff_send(count_s)%complex_buffer(r%from(ip)%nn)) else if (r%char == 'r') then allocate(r%buff_send(count_s)%real_buffer(r%from(ip)%nn)) end if end if end do allocate(r%send_hand(r%nsend)) r%send_hand = mp_request_null allocate(r%recv_inv_hand(r%nsend)) r%recv_inv_hand = mp_request_null end if !Setup communications count_s = 0 ; count_r = 0 do ip = 0, nproc - 1 if (r%from(ip)%nn > 0 .and. ip /= iproc) then count_s = count_s+1 if (r%char == 'c') then call send_init(r%buff_send(count_s)%complex_buffer, & ip, iproc, r%send_hand(count_s)) call recv_init(r%buff_send(count_s)%complex_buffer, & ip, ip, r%recv_inv_hand(count_s)) else if (r%char == 'r') then call send_init(r%buff_send(count_s)%real_buffer, & ip, iproc, r%send_hand(count_s)) call recv_init(r%buff_send(count_s)%real_buffer, & ip, ip, r%recv_inv_hand(count_s)) end if end if if (r%to(ip)%nn > 0 .and. ip /= iproc) then count_r = count_r+1 if (r%char == 'c') then call recv_init(r%buff_recv(count_r)%complex_buffer, & ip, ip, r%recv_hand(count_r)) call send_init(r%buff_recv(count_r)%complex_buffer, & ip, iproc, r%send_inv_hand(count_r)) else if (r%char == 'r') then call recv_init(r%buff_recv(count_r)%real_buffer, & ip, ip, r%recv_hand(count_r)) call send_init(r%buff_recv(count_r)%real_buffer, & ip, iproc, r%send_inv_hand(count_r)) end if end if end do end if else ! Disable opt_redist_persist if we don't support the type opt_redist_persist = .false. end if end subroutine setup_persistent !> FIXME : Add documentation subroutine init_redist_gs2 (r, char, to_low, to_high, to_list, & from_low, from_high, from_list, ierr) use mp, only: iproc, nproc, proc0, mp_abort, sum_allreduce implicit none type (redist_type), intent (inout) :: r character(1), intent (in) :: char type (index_list_type), dimension (0:nproc-1), intent (in) :: to_list, from_list integer, dimension(:), intent (in) :: to_low, from_low, to_high, from_high integer, optional, intent (out) :: ierr integer :: ip, n_to, n_from, buff_size integer :: ndim_to, ndim_from, idim logical, parameter :: debug = .false. integer, dimension(2) :: nn_total ! Check consistency of inputs if (size(to_low) /= size(to_high)) then call mp_abort("Inconisistent sizes of to_low/to_high in init_redist_gs2.", .true.) end if if (size(from_low) /= size(from_high)) then call mp_abort("Inconisistent sizes of from_low/from_high in init_redist_gs2.", .true.) end if ndim_to = size(to_low) ndim_from = size(from_low) if (ndim_to > 6) then call mp_abort("Too many to_low values passed in init_redist_gs2. Maximum of 6 supported.", .true.) end if if (ndim_from > 6) then call mp_abort("Too many from_low values passed in init_redist_gs2. Maximum of 6 supported.", .true.) end if allocate (r%to(0:nproc-1), r%from(0:nproc-1)) r%char=char if (present(ierr)) ierr = 0 buff_size = 0 ! Initialise limits r%from_low = 1 r%to_low = 1 r%from_high = 0 r%to_high = 0 ! Copy the passed in limits r%from_low(1:ndim_from) = from_low r%to_low(1:ndim_to) = to_low r%from_high(1:ndim_from) = from_high r%to_high(1:ndim_to) = to_high do ip = 0, nproc - 1 if (allocated(to_list(ip)%first)) then n_to = size(to_list(ip)%first) r%to(ip)%nn = n_to if (ip /= iproc) buff_size = max(buff_size, n_to) idim = 0 ! Copy indexing arrays call copy_indices_and_sanity_check(r%to(ip)%k, to_list(ip)%first, & idim, r%to_low, r%to_high) call copy_indices_and_sanity_check(r%to(ip)%l, to_list(ip)%second, & idim, r%to_low, r%to_high) call copy_indices_and_sanity_check(r%to(ip)%m, to_list(ip)%third, & idim, r%to_low, r%to_high) call copy_indices_and_sanity_check(r%to(ip)%n, to_list(ip)%fourth, & idim, r%to_low, r%to_high) call copy_indices_and_sanity_check(r%to(ip)%o, to_list(ip)%fifth, & idim, r%to_low, r%to_high) call copy_indices_and_sanity_check(r%to(ip)%p, to_list(ip)%sixth, & idim, r%to_low, r%to_high) if (idim /= ndim_to) then call mp_abort("Passed to_list has different number of& & allocated members than passed to_low has& & dimensions.", .true.) end if else r%to(ip)%nn = 0 endif enddo do ip = 0, nproc - 1 if (allocated(from_list(ip)%first)) then n_from = size(from_list(ip)%first) if (ip /= iproc) buff_size = max(buff_size, n_from) r%from(ip)%nn = n_from idim = 0 ! Copy indexing arrays call copy_indices_and_sanity_check(r%from(ip)%k, from_list(ip)%first, & idim, r%from_low, r%from_high) call copy_indices_and_sanity_check(r%from(ip)%l, from_list(ip)%second, & idim, r%from_low, r%from_high) call copy_indices_and_sanity_check(r%from(ip)%m, from_list(ip)%third, & idim, r%from_low, r%from_high) call copy_indices_and_sanity_check(r%from(ip)%n, from_list(ip)%fourth, & idim, r%from_low, r%from_high) call copy_indices_and_sanity_check(r%from(ip)%o, from_list(ip)%fifth, & idim, r%from_low, r%from_high) call copy_indices_and_sanity_check(r%from(ip)%p, from_list(ip)%sixth, & idim, r%from_low, r%from_high) if (idim /= ndim_from) then call mp_abort("Passed from_list has different number of& & allocated members than passed from_low has& & dimensions.", .true.) end if else r%from(ip)%nn = 0 endif enddo ! Minor sanity check for the local copy part of the redistribute if (r%to(iproc)%nn /= r%from(iproc)%nn) then call mp_abort('Inconsistent amount of data to copy locally.', .true.) end if ! Additional checks to aid with debugging if (debug) then ! Check total amount of data to be sent matches total amount to receive nn_total = 0 nn_total(1) = sum(r%to%nn) nn_total(2) = sum(r%from%nn) call sum_allreduce(nn_total) if (nn_total(1) /= nn_total(2)) then call mp_abort('Total amount of data to be sent and received in disagreement.', .true.) end if end if if (buff_size > 0) then select case (r%char) case ('c') allocate (r%complex_buff(buff_size)) case ('r') allocate (r%real_buff(buff_size)) case ('i') allocate (r%integer_buff(buff_size)) case ('l') allocate (r%logical_buff(buff_size)) case default if (proc0) then write(*,*) 'Type to be redistributed invalid. Must stop.' write(*,*) r%char endif call mp_abort('Type to be redistributed invalid. Must stop.') end select end if !Setup buffers etc. call setup_persistent(r) contains subroutine copy_indices_and_sanity_check(destination, source, dim_counter, llims, ulims) implicit none integer, dimension(:), allocatable, intent(in out) :: destination integer, dimension(:), allocatable, intent(in) :: source integer, intent(in out) :: dim_counter integer, dimension(:), intent(in) :: llims, ulims if (.not. allocated(source)) return ! Update the dimension counter dim_counter = dim_counter + 1 ! Copy into destination destination = source ! Sanity check values -- ensure indices are not outside the lower/upper ! limits for the array(s) we will be indexing if (any(destination < llims(dim_counter) .or. destination > ulims(dim_counter))) then call mp_abort("Passed indices out of bounds in init_redist_gs2", .true.) end if end subroutine copy_indices_and_sanity_check end subroutine init_redist_gs2 !> FIXME : Add documentation subroutine init_redist_agk (r, char, to_low, to_high, to_list, & from_low, from_high, from_list, ierr) use mp, only: nproc implicit none type (redist_type), intent (inout) :: r character(1), intent (in) :: char type (index_list_type), dimension (0:nproc-1), intent (in) :: to_list, from_list integer, intent (in) :: to_low integer, dimension(:), intent (in) :: from_low, to_high, from_high integer, optional, intent (out) :: ierr integer, dimension(:), allocatable :: full_to_low allocate(full_to_low, mold = to_high) full_to_low = 1 full_to_low(size(to_high)) = to_low call init_redist(r, char, full_to_low, to_high, to_list, & from_low, from_high, from_list, ierr) end subroutine init_redist_agk !> FIXME : Add documentation subroutine init_fill (f, char, to_low, to_high, to_list, & from_low, from_high, from_list, ierr) use mp, only: nproc implicit none type (redist_type), intent (out) :: f character(1), intent (in) :: char type (index_list_type), dimension (0:nproc-1), intent (in) :: to_list, from_list integer, dimension(:), intent (in) :: to_low, from_low, to_high, from_high integer, optional, intent (out) :: ierr call init_redist(f, char, to_low, to_high, to_list, & from_low, from_high, from_list, ierr) end subroutine init_fill !> FIXME : Add documentation subroutine delete_redist(r) use mp, only: free_request implicit none type (redist_type), intent (in out) :: r if (allocated(r%to)) deallocate (r%to) if (allocated(r%from)) deallocate (r%from) if (allocated (r%complex_buff)) deallocate (r%complex_buff) if (allocated (r%real_buff)) deallocate (r%real_buff) if (allocated (r%integer_buff)) deallocate (r%integer_buff) if (allocated (r%logical_buff)) deallocate (r%logical_buff) !Persistent related if(allocated(r%recv_hand))then call free_request(r%recv_hand) deallocate(r%recv_hand) endif if(allocated(r%send_hand))then call free_request(r%send_hand) deallocate(r%send_hand) endif if(allocated(r%recv_inv_hand))then call free_request(r%recv_inv_hand) deallocate(r%recv_inv_hand) endif if(allocated(r%send_inv_hand))then call free_request(r%send_inv_hand) deallocate(r%send_inv_hand) endif if(allocated(r%recv_ip)) deallocate(r%recv_ip) if(allocated(r%recv_inv_ip)) deallocate(r%recv_inv_ip) if(allocated(r%buff_send)) deallocate(r%buff_send) if(allocated(r%buff_recv)) deallocate(r%buff_recv) end subroutine delete_redist !> FIXME : Add documentation subroutine delete_list(list) use mp, only: nproc implicit none type (index_list_type), dimension(0:nproc-1), intent (inout) :: list integer :: ip do ip = 0, nproc-1 if(allocated(list(ip)%first)) deallocate(list(ip)%first) if(allocated(list(ip)%second)) deallocate(list(ip)%second) if(allocated(list(ip)%third)) deallocate(list(ip)%third) if(allocated(list(ip)%fourth)) deallocate(list(ip)%fourth) if(allocated(list(ip)%fifth)) deallocate(list(ip)%fifth) if(allocated(list(ip)%sixth)) deallocate(list(ip)%sixth) enddo end subroutine delete_list !> FIXME : Add documentation subroutine c_redist_12 (r, from_here, to_here) use job_manage, only: time_message use mp, only: iproc, nproc, send, receive, get_mp_times implicit none type (redist_type), intent (in out) :: r complex, dimension (r%from_low(1):), intent (in) :: from_here complex, dimension (r%to_low(1):,r%to_low(2):), intent (in out) :: to_here real :: mp_total, mp_total_after integer :: i, idp, ipto, ipfrom, iadp if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total) end if ! redistribute from local processor to local processor do i = 1, r%from(iproc)%nn to_here(r%to(iproc)%k(i), r%to(iproc)%l(i)) & = from_here(r%from(iproc)%k(i)) end do ! redistribute to idpth next processor from idpth preceding processor ! or redistribute from idpth preceding processor to idpth next processor ! to avoid deadlocks do idp = 1, nproc-1 ipto = mod(iproc + idp, nproc) ipfrom = mod(iproc + nproc - idp, nproc) iadp = min(idp, nproc - idp) ! avoid deadlock AND ensure mostly parallel resolution if (mod(iproc/iadp,2) == 0) then ! send to idpth next processor if (r%from(ipto)%nn > 0) then do i = 1, r%from(ipto)%nn r%complex_buff(i) = from_here(r%from(ipto)%k(i)) end do call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp) end if ! receive from idpth preceding processor if (r%to(ipfrom)%nn > 0) then call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp) do i = 1, r%to(ipfrom)%nn to_here(r%to(ipfrom)%k(i), & r%to(ipfrom)%l(i)) & = r%complex_buff(i) end do end if else ! receive from idpth preceding processor if (r%to(ipfrom)%nn > 0) then call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp) do i = 1, r%to(ipfrom)%nn to_here(r%to(ipfrom)%k(i), & r%to(ipfrom)%l(i)) & = r%complex_buff(i) end do end if ! send to idpth next processor if (r%from(ipto)%nn > 0) then do i = 1, r%from(ipto)%nn r%complex_buff(i) = from_here(r%from(ipto)%k(i)) end do call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp) end if end if end do if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total_after) time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total) end if end subroutine c_redist_12 !> FIXME : Add documentation subroutine c_redist_22 (r, from_here, to_here) use job_manage, only: time_message use mp, only: get_mp_times implicit none type (redist_type), intent (in out) :: r complex, dimension (r%from_low(1):, & r%from_low(2):), intent (in) :: from_here complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in out) :: to_here real :: mp_total, mp_total_after if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total) end if !If overlapping then start comms now if(opt_redist_persist_overlap) call c_redist_22_mpi_copy_persist_start(r, from_here) ! redistribute from local processor to local processor ! The flag opt_local_copy is set by the user in the input ! file to specify whether optimised local copy routines are used. ! These c_redist_22_new_copy routine is the new local copy ! functionality where indirect addressing has largely been removed. ! c_redist_22_old_copy is the original local copy code. if(opt_local_copy .and. (r%redistname .eq. 'x2y')) then ! c_redist_22_new_copy is the new local copy functionality where ! indirect addressing has largely been removed call c_redist_22_new_copy(r, from_here, to_here) else ! c_redist_22_old_copy is the original local copy functionality call c_redist_22_old_copy(r, from_here, to_here) end if ! c_redist_22_mpi_copy contains all the remote to local ! copy functionality !<DD> Pick from non-blocking and blocking approach if(opt_redist_nbk)then if(opt_redist_persist)then if(.not.opt_redist_persist_overlap) call c_redist_22_mpi_copy_persist_start(r, from_here) call c_redist_22_mpi_copy_persist_end(r, to_here) else call c_redist_22_mpi_copy_nonblock(r, from_here, to_here) endif else call c_redist_22_mpi_copy(r, from_here, to_here) endif if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total_after) time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total) end if end subroutine c_redist_22 !> FIXME : Add documentation subroutine c_redist_22_old_copy (r, from_here, to_here) use mp, only: iproc implicit none type (redist_type), intent (in out) :: r complex, dimension (r%from_low(1):, & r%from_low(2):), intent (in) :: from_here complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in out) :: to_here integer :: i, nn !CMR ! In the GS2 standard FFT situation this routine maps ! xxf(it,ixxf) to yxf(ik,iyxf) data type ! where it is kx (or x) index, ik is ky (or y) index, ! ixxf is (y,ig,isgn,"les") and iyxf is (x,ig,isgn,"les") ! nn = r%from(iproc)%nn !$OMP PARALLEL DO DEFAULT(none) & !$OMP PRIVATE(i) & !$OMP SHARED(nn, to_here, from_here, iproc, r) & !$OMP SCHEDULE(static) do i = 1, nn ! ! redistribute from local processor to local processor ! NB r%from(iproc)%nn is #elements sent by THIS processor to THIS processor ! In this situation the data at (r%to(iproc)%k(i),r%to(iproc)%l(i)) ! should come from (r%from(iproc)%k(i), r%from(iproc)%l(i)). ! ! This do loop, in GS2 standard FFT situation, corresponds to: ! to_here(ik,iyxf)=from_here(it,ixxf) ! to_here(r%to(iproc)%k(i), & r%to(iproc)%l(i)) & = from_here(r%from(iproc)%k(i), & r%from(iproc)%l(i)) end do !$OMP END PARALLEL DO end subroutine c_redist_22_old_copy !> FIXME : Add documentation subroutine c_redist_22_new_copy (r, from_here, to_here) !===================================================================== !AJ, June 2011: New code from DCSE project on GS2 Indirect Addressing !===================================================================== ! ! AJ, June 2011: ! Modified LOCAL COPY part of c_redist_22 routine, as used by GS2 to ! transform xxf data type to yxf data type. ! Here we REDUCE indirect addressing and cache load by EXPLOITING ! understanding of xxf and yxf data types in GS2. ! use mp, only: iproc implicit none type (redist_type), intent (in out) :: r complex, dimension (r%from_low(1):, & r%from_low(2):), intent (in) :: from_here complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in out) :: to_here integer :: i,ik,it,itmin,itmax,it_nlocal,ixxf,iyxf i = 1 !AJ Loop over all local copies from THIS proc (iproc) to THIS proc do while (i .le. r%from(iproc)%nn) !AJ Initialise the inner loop indices from the existing indirect addresses itmin = r%from(iproc)%k(i) ixxf = r%from(iproc)%l(i) ik = r%to(iproc)%k(i) iyxf = r%to(iproc)%l(i) !AJ it_nlocal is max #it-indices that can be accommodated on iproc it_nlocal = (yxf_lo_ulim_proc+1) - iyxf !AJ itmax selects either the it_nlocal computed above or the maximum !AJ yxf_lo%nx space available to this process. itmax = min((itmin-1)+it_nlocal,nx) do it = itmin,itmax to_here(ik,iyxf) = from_here(it,ixxf) iyxf = iyxf + 1 i = i + 1 end do end do end subroutine c_redist_22_new_copy !> FIXME : Add documentation subroutine c_redist_22_mpi_copy (r, from_here, to_here) use mp, only: iproc, nproc, send, receive implicit none type (redist_type), intent (in out) :: r complex, dimension (r%from_low(1):, & r%from_low(2):), intent (in) :: from_here complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in out) :: to_here integer :: i, idp, ipto, ipfrom, iadp ! redistribute to idpth next processor from idpth preceding processor ! or redistribute from idpth preceding processor to idpth next processor ! to avoid deadlocks do idp = 1, nproc-1 ipto = mod(iproc + idp, nproc) ipfrom = mod(iproc + nproc - idp, nproc) iadp = min(idp, nproc - idp) ! avoid deadlock AND ensure mostly parallel resolution if (mod(iproc/iadp,2) == 0) then ! send to idpth next processor if (r%from(ipto)%nn > 0) then do i = 1, r%from(ipto)%nn r%complex_buff(i) = from_here(r%from(ipto)%k(i), & r%from(ipto)%l(i)) end do call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp) end if ! receive from idpth preceding processor if (r%to(ipfrom)%nn > 0) then call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp) do i = 1, r%to(ipfrom)%nn to_here(r%to(ipfrom)%k(i), & r%to(ipfrom)%l(i)) & = r%complex_buff(i) end do end if else ! receive from idpth preceding processor if (r%to(ipfrom)%nn > 0) then call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp) do i = 1, r%to(ipfrom)%nn to_here(r%to(ipfrom)%k(i), & r%to(ipfrom)%l(i)) & = r%complex_buff(i) end do end if ! send to idpth next processor if (r%from(ipto)%nn > 0) then do i = 1, r%from(ipto)%nn r%complex_buff(i) = from_here(r%from(ipto)%k(i), & r%from(ipto)%l(i)) end do call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp) end if end if end do end subroutine c_redist_22_mpi_copy !> This routine does redistributes using entirely non-blocking !> comms. !> !> @note This routine may use more memory than blocking approach !> --> Can perform slower than blocking routine when redistribute !> is particularly large, else typically gives better performance subroutine c_redist_22_mpi_copy_nonblock(r, from_here, to_here) use mp, only: iproc, send, receive use mp, only: barrier,waitall,nbsend,nbrecv implicit none type (redist_type), intent (in out) :: r complex, dimension ( & r%from_low(1):, & r%from_low(2): & ), intent (in) :: from_here complex, dimension ( & r%to_low(1):, & r%to_low(2): & ), intent (in out) :: to_here complex, dimension(:, :), allocatable :: r_buff, s_buff integer, dimension(:), allocatable :: recvhand, sendhand integer :: i, ip, nn, nrecv, nsend, j !Work out how big recv buffer has to be and allocate it nrecv = r%nrecv if ( nrecv > 0) then allocate( r_buff(size(r%complex_buff), nrecv)) allocate( recvhand(nrecv)) end if !Now work out how big send buffer has to be, allocate it and fill it nsend = r%nsend if (nsend > 0) then allocate( s_buff(size(r%complex_buff),nsend)) allocate( sendhand(nsend)) !$OMP PARALLEL DEFAULT(none) & !$OMP PRIVATE(i, ip, nn, j) & !$OMP SHARED(nsend, r, s_buff, from_here, sendhand, iproc) do i = 1, nsend ip = r%recv_inv_ip(i) nn = r%from(ip)%nn !Any benefit in making this a pointer assignment instead? !Avoids copying data here but could lead to questionable !memory access during the actual send? !$OMP DO & !$OMP SCHEDULE(static) do j = 1, nn s_buff(j, i) = from_here( & r%from(ip)%k(j), & r%from(ip)%l(j) & ) end do !$OMP END DO !$OMP MASTER call nbsend(s_buff(1:nn, i), nn, ip, iproc, sendhand(i)) !$OMP END MASTER end do !$OMP END PARALLEL end if !Post receives !Probably not much advantage to doing these ASAP as not going to do anything !until data is on its way --> Could post sends first so chance of data being !available as soon as receive posted. if (nrecv > 0) then ! Loops of this form require us to be able to call MPI routines ! from any thread. Given there is little work in this loop ! we may consider removing OpenMP pragmas if it allows us to ! drop to MPI_THREAD_FUNNELED !$OMP PARALLEL DO DEFAULT(none) & !$OMP PRIVATE(i, ip, nn) & !$OMP SHARED(nrecv, r, r_buff, recvhand) & !$OMP SCHEDULE(static) do i = 1, nrecv ip = r%recv_ip(i) nn = r%to(ip)%nn call nbrecv(r_buff(1:nn, i), nn, ip, ip, recvhand(i)) end do !$OMP END PARALLEL DO end if if (nrecv > 0) then !Now we don't have anything to do until all the data has arrived (so should we post sends first?) !We could do a wait all but may be better to keep on checking each message seperately so that when !it arrives we can copy out whilst waiting for others to arrive. !Start with a wait all call waitall(nrecv, recvhand) !Now unpack data. ! We choose to distribute the inner loop, unpacking data, rather ! than the outer loop (distributing messages) as we expect ! the message size nn to be rather variable. !$OMP PARALLEL DEFAULT(none) & !$OMP PRIVATE(i, ip, nn, j) & !$OMP SHARED(nrecv, to_here, r, r_buff) do i = 1, nrecv !Which processor is this data from? Would we be better looping over all procs and just checking !if that's one to unpack? Removes memory lookup of array but requires branching and incrementing !a counter to work out the equivalent of i here (i.e. which receive we're looking at) ip = r%recv_ip(i) nn = r%to(ip)%nn !Could this be vectorised and is it useful to do so? !Could we make r_buff a pointer array so that MPI put data directly into !to_here array? Would this be beneficial? Would need to be done before RECVs posted !$OMP DO & !$OMP SCHEDULE(static) do j = 1, nn to_here( & r%to(ip)%k(j), & r%to(ip)%l(j) & ) = r_buff(j, i) enddo !$OMP END DO NOWAIT enddo !$OMP END PARALLEL deallocate(r_buff) deallocate(recvhand) endif if (nsend > 0) then call waitall(nsend, sendhand) deallocate(s_buff) deallocate(sendhand) endif end subroutine c_redist_22_mpi_copy_nonblock !> Launch persistent communications associated with redistribute. !> Packs send buffer. subroutine c_redist_22_mpi_copy_persist_start(r, from_here) use mp, only: start_comm implicit none type (redist_type), intent (in out) :: r complex, dimension ( & r%from_low(1):, & r%from_low(2): & ), intent (in) :: from_here integer :: i, ip, nn, nrecv, nsend, j !Start recv comms nrecv = r%nrecv if (nrecv > 0) then call start_comm(r%recv_hand) end if !Pack buffer and start communication nsend = r%nsend if (nsend > 0) then !$OMP PARALLEL DEFAULT(none) & !$OMP PRIVATE(i, ip, nn, j) & !$OMP SHARED(nsend, r, from_here) do i = 1, nsend ip = r%recv_inv_ip(i) nn = r%from(ip)%nn !Any benefit in making this a pointer assignment instead? !Avoids copying data here but could lead to questionable !memory access during the actual send? !$OMP DO & !$OMP SCHEDULE(static) do j = 1, nn r%buff_send(i)%complex_buffer(j) = from_here( & r%from(ip)%k(j), & r%from(ip)%l(j) & ) end do !$OMP END DO !$OMP MASTER call start_comm(r%send_hand(i)) !$OMP END MASTER end do !$OMP END PARALLEL end if end subroutine c_redist_22_mpi_copy_persist_start !> Finish persistent communications associated with redistribute. !> Unpacks receive buffer. subroutine c_redist_22_mpi_copy_persist_end(r, to_here) use mp, only: waitall implicit none type (redist_type), intent (in out) :: r complex, dimension ( & r%to_low(1):, & r%to_low(2): & ), intent (in out) :: to_here integer :: i, ip, nn, nrecv, nsend, j !Wait for recv communications to complete and unpack buffer nrecv = r%nrecv if (nrecv > 0) then !Now we don't have anything to do until all the data has arrived (so should we post sends first?) !We could do a wait all but may be better to keep on checking each message seperately so that when !it arrives we can copy out whilst waiting for others to arrive. !Start with a wait all call waitall(nrecv, r%recv_hand) !Now unpack data !$OMP PARALLEL DEFAULT(none) & !$OMP PRIVATE(i, ip, nn, j) & !$OMP SHARED(nrecv, to_here, r) do i = 1, nrecv !Which processor is this data from? ip = r%recv_ip(i) nn = r%to(ip)%nn !$OMP DO & !$OMP SCHEDULE(static) do j = 1, nn to_here( & r%to(ip)%k(j), & r%to(ip)%l(j) & ) = r%buff_recv(i)%complex_buffer(j) enddo !$OMP END DO NOWAIT enddo !$OMP END PARALLEL endif !Wait for all sends to complete nsend = r%nsend if (nsend > 0) then call waitall(nsend, r%send_hand) endif end subroutine c_redist_22_mpi_copy_persist_end !> FIXME : Add documentation subroutine c_redist_22_inv (r, from_here, to_here) use job_manage, only: time_message use mp, only: get_mp_times type (redist_type), intent (in out) :: r complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in) :: from_here complex, dimension (r%from_low(1):, & r%from_low(2):), intent (in out) :: to_here real :: mp_total, mp_total_after if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total) end if !If overlapping then start comms now if(opt_redist_persist_overlap) call c_redist_22_inv_mpi_copy_persist_start(r, from_here) ! redistribute from local processor to local processor if(opt_local_copy .and. (r%redistname .eq. 'x2y')) then ! c_redist_22_inv_new_copy is the new local copy functionality where ! indirect addressing has largely been removed call c_redist_22_inv_new_copy(r, from_here, to_here) else ! c_redist_22_inv_old_copy is the original local copy functionality call c_redist_22_inv_old_copy(r, from_here, to_here) end if ! c_redist_22_inv_mpi_copy contains all the remote to local ! copy functionality !<DD> if(opt_redist_nbk)then if(opt_redist_persist)then if(.not.opt_redist_persist_overlap) call c_redist_22_inv_mpi_copy_persist_start(r, from_here) call c_redist_22_inv_mpi_copy_persist_end(r, to_here) else call c_redist_22_inv_mpi_copy_nonblock(r, from_here, to_here) endif else call c_redist_22_inv_mpi_copy(r, from_here, to_here) endif if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total_after) time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total) end if end subroutine c_redist_22_inv !> FIXME : Add documentation subroutine c_redist_22_inv_old_copy (r, from_here, to_here) use mp, only: iproc implicit none type (redist_type), intent (in out) :: r complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in) :: from_here complex, dimension (r%from_low(1):, & r%from_low(2):), intent (in out) :: to_here integer :: i, nn !CMR ! In the GS2 standard FFT situation this routine maps ! yxf(ik,iyxf) to xxf(it,ixxf) data type ! where it is kx (or x) index, ik is ky (or y) index, ! ixxf is (y,ig,isgn,"les") and iyxf is (x,ig,isgn,"les") ! nn = r%to(iproc)%nn !$OMP PARALLEL DO DEFAULT(none) & !$OMP SHARED(nn, to_here, from_here, r, iproc) & !$OMP SCHEDULE(static) do i = 1, nn ! ! redistribute from local processor to local processor ! NB r%from(iproc)%nn is #elements sent by THIS processor to THIS processor ! In this situation the data at (r%from(iproc)%k(i),r%from(iproc)%l(i)) ! should come from (r%to(iproc)%k(i), r%to(iproc)%l(i)). ! ! This do loop, in GS2 standard FFT situation, corresponds to: ! to_here(it,ixxf)=from_here(ik,iyxf) ! to_here(r%from(iproc)%k(i), & r%from(iproc)%l(i)) & = from_here(r%to(iproc)%k(i), & r%to(iproc)%l(i)) end do !$OMP END PARALLEL DO end subroutine c_redist_22_inv_old_copy !> FIXME : Add documentation subroutine c_redist_22_inv_new_copy (r, from_here, to_here) !===================================================================== !AJ, June 2011: New code from DCSE project on GS2 Indirect Addressing !===================================================================== ! ! AJ, June 2011: ! Modified LOCAL COPY part of c_redist_22_inv routine, used by GS2 to ! transform yxf data type back to xxf data type. ! Here we REDUCE indirect addressing and cache load by EXPLOITING ! understanding of xxf and yxf data types in GS2. ! use mp, only: iproc implicit none type (redist_type), intent (in out) :: r complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in) :: from_here complex, dimension (r%from_low(1):, & r%from_low(2):), intent (in out) :: to_here integer :: i,ik,it,itmin,itmax,it_nlocal,ixxf,iyxf i = 1 !AJ Loop over all local copies from THIS proc (iproc) to THIS proc do while (i .le. r%to(iproc)%nn) !AJ Initialise the inner loop indices from the existing indirect addresses itmin = r%from(iproc)%k(i) ixxf = r%from(iproc)%l(i) ik = r%to(iproc)%k(i) iyxf = r%to(iproc)%l(i) !AJ it_nlocal is max #it-indices that can be accommodated on iproc it_nlocal = (yxf_lo_ulim_proc+1) - iyxf !AJ itmax selects either the it_nlocal computed above or the maximum !AJ yxf_lo%nx space available to this process. itmax = min((itmin-1)+it_nlocal,nx) do it = itmin,itmax to_here(it,ixxf) = from_here(ik,iyxf) iyxf = iyxf + 1 i = i + 1 end do end do end subroutine c_redist_22_inv_new_copy !> FIXME : Add documentation subroutine c_redist_22_inv_mpi_copy (r, from_here, to_here) use mp, only: iproc, nproc, send, receive implicit none type (redist_type), intent (in out) :: r complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in) :: from_here complex, dimension (r%from_low(1):, & r%from_low(2):), intent (in out) :: to_here integer :: i, idp, ipto, ipfrom, iadp ! redistribute to idpth next processor from idpth preceding processor ! or redistribute from idpth preceding processor to idpth next processor ! to avoid deadlocks do idp = 1, nproc-1 ipto = mod(iproc + idp, nproc) ipfrom = mod(iproc + nproc - idp, nproc) iadp = min(idp, nproc - idp) ! avoid deadlock AND ensure mostly parallel resolution if (mod(iproc/iadp,2) == 0) then ! send to idpth next processor if (r%to(ipto)%nn > 0) then do i = 1, r%to(ipto)%nn r%complex_buff(i) = from_here(r%to(ipto)%k(i), & r%to(ipto)%l(i)) end do call send (r%complex_buff(1:r%to(ipto)%nn), ipto, idp) end if ! receive from idpth preceding processor if (r%from(ipfrom)%nn > 0) then call receive (r%complex_buff(1:r%from(ipfrom)%nn), ipfrom, idp) do i = 1, r%from(ipfrom)%nn to_here(r%from(ipfrom)%k(i), & r%from(ipfrom)%l(i)) & = r%complex_buff(i) end do end if else ! receive from idpth preceding processor if (r%from(ipfrom)%nn > 0) then call receive (r%complex_buff(1:r%from(ipfrom)%nn), ipfrom, idp) do i = 1, r%from(ipfrom)%nn to_here(r%from(ipfrom)%k(i), & r%from(ipfrom)%l(i)) & = r%complex_buff(i) end do end if ! send to idpth next processor if (r%to(ipto)%nn > 0) then do i = 1, r%to(ipto)%nn r%complex_buff(i) = from_here(r%to(ipto)%k(i), & r%to(ipto)%l(i)) end do call send (r%complex_buff(1:r%to(ipto)%nn), ipto, idp) end if end if end do end subroutine c_redist_22_inv_mpi_copy !> This routine does redistributes using entirely non-blocking !> comms. !> !> @note This routine may use more memory than blocking approach !> --> Can perform slower than blocking routine when redistribute !> is particularly large, else typically gives better performance subroutine c_redist_22_inv_mpi_copy_nonblock(r, from_here, to_here) use mp, only: iproc, send, receive use mp, only: barrier,waitall,nbsend,nbrecv implicit none type (redist_type), intent (in out) :: r complex, dimension ( & r%from_low(1):, & r%from_low(2): & ), intent (in out) :: to_here complex, dimension (& r%to_low(1):, & r%to_low(2): & ), intent (in) :: from_here complex, dimension(:, :), allocatable :: r_buff, s_buff integer, dimension(:), allocatable :: recvhand, sendhand integer :: i, ip, nn, nrecv, nsend, j !Work out how big recv buffer has to be and allocate it ! Note that we use redistribute type's nsend (not nrecv) because we are ! doing the inverse transform. nrecv = r%nsend if (nrecv > 0) then allocate( r_buff(size(r%complex_buff), nrecv)) allocate( recvhand(nrecv)) end if !Now work out how big send buffer has to be, allocate it and fill it ! Note that we use redistribute type's nrecv (not nsend) because we are ! doing the inverse transform. nsend = r%nrecv if (nsend > 0) then allocate( s_buff(size(r%complex_buff), nsend)) allocate( sendhand(nsend)) !$OMP PARALLEL DEFAULT(none) & !$OMP PRIVATE(i, ip, nn, j) & !$OMP SHARED(nsend, s_buff, from_here, r, iproc, sendhand) do i = 1, nsend ip = r%recv_ip(i) nn = r%to(ip)%nn !Any benefit in making this a pointer assignment instead? !Avoids copying data here but could lead to questionable !memory access during the actual send? !$OMP DO & !$OMP SCHEDULE(static) do j = 1, nn s_buff(j, i) = from_here( & r%to(ip)%k(j), & r%to(ip)%l(j) & ) end do !$OMP END DO !$OMP MASTER call nbsend(s_buff(1:nn, i), nn, ip, iproc, sendhand(i)) !$OMP END MASTER end do !$OMP END PARALLEL end if !Post receives if (nrecv > 0) then !$OMP PARALLEL DO DEFAULT(none) & !$OMP PRIVATE(i, ip, nn) & !$OMP SHARED(nrecv, r, r_buff, recvhand) & !$OMP SCHEDULE(static) do i = 1, nrecv ip = r%recv_inv_ip(i) nn = r%from(ip)%nn call nbrecv(r_buff(1:nn, i), nn, ip, ip, recvhand(i)) end do !$OMP END PARALLEL DO end if if (nrecv > 0) then !Now we don't have anything to do until all the data has arrived (so should we post sends first?) !We could do a wait all but may be better to keep on checking each message seperately so that when !it arrives we can copy out whilst waiting for others to arrive. !Start with a wait all call waitall(nrecv, recvhand) !Now unpack data !$OMP PARALLEL DEFAULT(none) & !$OMP PRIVATE(i, ip, nn, j) & !$OMP SHARED(nrecv, r, to_here, r_buff) do i = 1, nrecv !Which processor is this data from ip = r%recv_inv_ip(i) nn = r%from(ip)%nn !$OMP DO & !$OMP SCHEDULE(static) do j = 1, nn to_here( & r%from(ip)%k(j), & r%from(ip)%l(j) & ) = r_buff(j, i) enddo !$OMP END DO NOWAIT enddo !$OMP END PARALLEL deallocate(r_buff) deallocate(recvhand) endif if (nsend > 0) then call waitall(nsend, sendhand) deallocate(s_buff) deallocate(sendhand) endif end subroutine c_redist_22_inv_mpi_copy_nonblock !> Launch persistent communications associated with redistribute. !> Packs send buffer. subroutine c_redist_22_inv_mpi_copy_persist_start(r, from_here) use mp, only: start_comm implicit none type (redist_type), intent (in out) :: r complex, dimension ( & r%to_low(1):, & r%to_low(2): & ), intent (in) :: from_here integer :: i, ip, nn, nrecv, nsend, j !For inverse transform SEND actually refers to received data and !RECV refers to data to be sent !Post receives nrecv = r%nsend if (nrecv > 0) then call start_comm(r%recv_inv_hand) endif !Pack send buffer and start sends nsend = r%nrecv if (nsend > 0) then !$OMP PARALLEL DEFAULT(none) & !$OMP PRIVATE(i, ip, nn, j) & !$OMP SHARED(nsend, r, from_here) do i = 1, nsend ip = r%recv_ip(i) nn = r%to(ip)%nn !Any benefit in making this a pointer assignment instead? !Avoids copying data here but could lead to questionable !memory access during the actual send? !$OMP DO & !$OMP SCHEDULE(static) do j = 1, nn r%buff_recv(i)%complex_buffer(j) = from_here( & r%to(ip)%k(j), & r%to(ip)%l(j) & ) end do !$OMP END DO !$OMP MASTER call start_comm(r%send_inv_hand(i)) !$OMP END MASTER end do !$OMP END PARALLEL end if end subroutine c_redist_22_inv_mpi_copy_persist_start !> Finish persistent communications associated with redistribute. !> Unpacks receive buffer. subroutine c_redist_22_inv_mpi_copy_persist_end(r, to_here) use mp, only: waitall implicit none type (redist_type), intent (in out) :: r complex, dimension ( & r%from_low(1):, & r%from_low(2): & ), intent (in out) :: to_here integer :: i, ip, nn, nrecv, nsend, j !For inverse transform SEND actually refers to received data and !RECV refers to data to be sent !Wait for receives to complete and unpack data nrecv = r%nsend if (nrecv > 0) then call waitall(nrecv, r%recv_inv_hand) !Now unpack data !$OMP PARALLEL DEFAULT(none) & !$OMP PRIVATE(i, ip, nn, j) & !$OMP SHARED(nrecv, to_here, r) do i = 1, nrecv !Which processor is this data from ip = r%recv_inv_ip(i) nn = r%from(ip)%nn !$OMP DO & !$OMP SCHEDULE(static) do j = 1, nn to_here( & r%from(ip)%k(j), & r%from(ip)%l(j) & ) = r%buff_send(i)%complex_buffer(j) end do !$OMP END DO NOWAIT end do !$OMP END PARALLEL end if !Wait for sends to complete nsend = r%nrecv if (nsend > 0) then call waitall(nsend, r%send_inv_hand) end if end subroutine c_redist_22_inv_mpi_copy_persist_end !> FIXME : Add documentation subroutine c_redist_32 (r, from_here, to_here) use job_manage, only: time_message use mp, only: get_mp_times implicit none type (redist_type), intent (in out) :: r complex, dimension (r%from_low(1):, & r%from_low(2):, & r%from_low(3):), intent (in) :: from_here complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in out) :: to_here real :: time_optimised_loop_1(2), time_optimised_loop_2(2) real :: mp_total, mp_total_after integer :: i if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total) end if !If overlapping then start comms now if(opt_redist_persist_overlap) call c_redist_32_mpi_copy_persist_start(r, from_here) !AJ redistribute from local processor to local processor !AJ The flag opt_local_copy is set by the user in the input !AJ file to specify whether optimised local copy routines are used. !AJ These c_redist_32_*_copy routines are the new local copy !AJ functionality where indirect addressing has largely been removed !AJ There are two different versions of the optimised local copy !AJ functionality which are selected at runtime (providing !AJ opt_local_copy is true) through a performance measurement process !AJ documented below. if(opt_local_copy .and. (r%redistname .eq. 'g2x')) then !AJ Because there are two different optimised local copy routines !AJ which provide benefits for different process counts and use cases !AJ we use an auto-tuning method to select which routine to use. This !AJ works by timing both of the optimised routines on the first run of !AJ this functionality and then using the quickest for the rests of the !AJ times this routine is called. The optimised_choice varible is used !AJ to record the choice of routine (it is a SAVE variable so will maintain !AJ a value between calls to the routine), it is initialise to 0 when the !AJ code first runs. We run each routine 4 times, only timing the last !AJ three runs to avoid any potential initialisation penalties. We time !AJ three rather than one to deal ensure that we collect enough timing !AJ data. if(r%optimised_choice .eq. 0) then time_optimised_loop_1 = 0. time_optimised_loop_2 = 0. call c_redist_32_new_copy(r, from_here, to_here) call time_message(.false.,time_optimised_loop_1,' Optimised Loop 1') do i= 1,3 call c_redist_32_new_copy(r, from_here, to_here) end do call time_message(.false.,time_optimised_loop_1,' Optimised Loop 1') call c_redist_32_new_opt_copy(r, from_here, to_here) call time_message(.false.,time_optimised_loop_2,' Optimised Loop 2') do i = 1,3 call c_redist_32_new_opt_copy(r, from_here, to_here) end do call time_message(.false.,time_optimised_loop_2,' Optimised Loop 2') if(time_optimised_loop_1(1) .gt. time_optimised_loop_2(1)) then r%optimised_choice = 2 else r%optimised_choice = 1 end if !AJ This else is encountered once the optimised auto-tuning choice has !AJ been calculated above. else if(r%optimised_choice .eq. 1) then call c_redist_32_new_copy(r, from_here, to_here) else call c_redist_32_new_opt_copy(r, from_here, to_here) end if end if !AJ This else is encountered when the optimised local copy functionality is !AJ not enabled. else !AJ c_redist_32_old_copy is the original local copy functionality call c_redist_32_old_copy(r, from_here, to_here) end if !AJ c_redist_32_mpi_copy contains all the remote to local !AJ copy functionality !<DD> if(opt_redist_nbk)then if(opt_redist_persist) then if(.not.opt_redist_persist_overlap) call c_redist_32_mpi_copy_persist_start(r, from_here) !DD call c_redist_32_mpi_copy_persist_end(r, to_here) !DD else call c_redist_32_mpi_copy_nonblock(r, from_here, to_here) !DD endif else call c_redist_32_mpi_copy(r, from_here, to_here) endif if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total_after) time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total) end if end subroutine c_redist_32 !> FIXME : Add documentation subroutine c_redist_32_old_copy(r, from_here, to_here) use mp, only: iproc implicit none type (redist_type), intent (in out) :: r complex, dimension (r%from_low(1):, & r%from_low(2):, & r%from_low(3):), intent (in) :: from_here complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in out) :: to_here integer :: i, nn !CMR ! In the GS2 standard FFT situation this routine maps ! g(ig, isgn, iglo) to xxf(it,ixxf) data type ! where it is kx (or x) index, ixxf is (y,ig,isgn,"les") ! and iglo is ("xyles") nn = r%from(iproc)%nn !$OMP PARALLEL DO DEFAULT(none) & !$OMP SHARED(nn, to_here, r, iproc, from_here) & !$OMP SCHEDULE(static) do i = 1, nn ! ! redistribute from local processor to local processor ! NB r%from(iproc)%nn is #elements sent by THIS processor to THIS processor ! In this situation the data at (r%to(iproc)%k(i),r%to(iproc)%l(i)) ! should come from (r%from(iproc)%k(i),r%from(iproc)%l(i),r%from(iproc)%m(i)). ! ! This do loop, in GS2 standard FFT situation, corresponds to: ! to_here(it,ixxf)=from_here(ig,isgn,iglo) ! to_here(r%to(iproc)%k(i),& r%to(iproc)%l(i)) & = from_here(r%from(iproc)%k(i), & r%from(iproc)%l(i), & r%from(iproc)%m(i)) end do !$OMP END PARALLEL DO end subroutine c_redist_32_old_copy !> FIXME : Add documentation subroutine c_redist_32_new_copy(r, from_here, to_here) !===================================================================== !AJ, June 2011: New code from DCSE project on GS2 Indirect Addressing !===================================================================== ! ! AJ, June 2011: ! Modified LOCAL COPY part of c_redist_32 routine, as used by GS2 to ! transform g data type to xxf data type. ! Here we REDUCE indirect addressing and cache load by EXPLOITING ! understanding of g and xxf data types in GS2. ! use mp, only: iproc implicit none type (redist_type), intent (in out) :: r complex, dimension (r%from_low(1):, & r%from_low(2):, & r%from_low(3):), intent (in) :: from_here complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in out) :: to_here integer :: i,k,t2,t1,f3,f2,f1,fhigh,thigh,f2max real :: nakyrecip i = 1 !AJ We want to be able to divide by naky with floating point !AJ arithmetic so we need to convert naky to a float (hence !AJ the need for nakyrecip). Also, to remove the necessity !AJ to have a division in the body of the loop below we are !AJ calculating the reciprocal of naky so that instead of !AJ dividing by naky we can multiply by 1/naky. nakyrecip = naky nakyrecip = 1/nakyrecip f2max = r%from_high(2) !AJ Loop over all local copies from THIS proc (iproc) to THIS proc do while(i .le. r%from(iproc)%nn) !AJ Initialise the inner loop indices. As the from_here array has !AJ 3 indices (c_redist_32 goes from 3 indices to 2 indices in the !AJ copy) we need two inner loops to be able to move through all !AJ the to_here and from_here indices. !AJ t1 (which equates to the it index) and f3 (which equates to iglo) !AJ do not change for the iterations of the inner loops. f2 (isgn) !AJ can only be 1 or 2 so the first inner loop is restricted to at !AJ most 2 iterations. f2 = r%from(iproc)%l(i) f3 = r%from(iproc)%m(i) t1 = r%to(iproc)%k(i) !AJ Ensure that isgn (f2) is either 1 or 2. do while (f2 .le. f2max) !AJ Get initial value of f1 (which equates to ig) and t2 (which equates !AJ to ixxf). f1 = r%from(iproc)%k(i) t2 = r%to(iproc)%l(i) !AJ Work out the maximum value ixxf (t2) can have by calculating the range !AJ of ixxf that this process owns. We step through t2 by naky each iteration !AJ so the line below calculates thigh as the number of ixxf steps for these !AJ inner loops of the operation owned by this process. thigh = ceiling(((xxf_lo_ulim_proc+1) - t2)*nakyrecip) !AJ from_here and to_here are calculated for c_redist_32 using ig, isgn, !AJ and iglo. The calculation of the maximum number if ixxf steps above can, !AJ therefore, be used to also calculate the maximum number of ig setsp. THis !AJ is what the following line does. thigh = thigh + (f1-1) !AJ Finally check the actual number of inner loop steps by ensuring that the !AJ computed maximum bound of ig is not beyond the actual allowed maximum value !AJ in r%from_high(1). fhigh = min(thigh,r%from_high(1)) do k = f1,fhigh to_here(t1,t2) = from_here(k,f2,f3) t2 = t2 + naky i = i + 1 end do !AJ If at the end of the inner loop we still have theoretical iterations !AJ left on ixxf (i.e. the calculated thigh is higher than the allowed !AJ maximum of ig) then move to the next isgn. If there aren't any !AJ iterations left then exit the inner loops (the f2 loop). if(thigh .gt. r%from_high(1)) then f2 = f2 + 1 else f2 = f2max + 1 end if end do end do end subroutine c_redist_32_new_copy !> FIXME : Add documentation subroutine c_redist_32_new_opt_copy(r, from_here, to_here) !===================================================================== !AJ, June 2011: New code from DCSE project on GS2 Indirect Addressing !===================================================================== ! ! AJ, June 2011: ! Modified LOCAL COPY part of c_redist_32 routine, as used by GS2 to ! transform g data type to xxf data type. ! Here we REDUCE indirect addressing and cache load by EXPLOITING ! understanding of g and xxf data types in GS2. ! This functionality is different to c_redist_32_new_copy because ! that routine does not always provide optimsed performance ! (due to the way memory reads and writes are performed, particularly ! the use of a strided write which triggers a write allocate cache ! miss). ! This new functionality moves through the loop indices in a different ! order to the other optimised local copy, so instead of following the ! original functionality by moving through the to_here and from_here loops ! using the i variable from i = 1 to from(iproc)%nn we start at i = 1 and ! then skip through i's for a certain range, then move back to i = 2 and ! follow the same process until all the i range has been processed. ! use mp, only: iproc implicit none type (redist_type), intent (in out) :: r complex, dimension (r%from_low(1):, & r%from_low(2):, & r%from_low(3):), intent (in) :: from_here complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in out) :: to_here integer :: i,t2,t1,f3,f2,f1 integer :: f3max,f3maxmultiple,f3incr,innermax,iincrem,iglomax,t1test integer :: innermaxmultiplier,outerf3limit,startf3 real :: innermaxrealvalue,tempnaky !AJ tempnaky is a real version (rather than integer version) of naky !AJ It is used to enable real arthimetic rather than integer arthimetic. tempnaky = naky !AJ innermaxmultiplier is the upper limit of the xxf data range owned by this process innermaxmultiplier = xxf_lo_ulim_proc+1 !AJ This optimised new local copy functionality is reliant on the particular !AJ layout used to work out how to increment through the f3 (iglo) indices. !AJ f3maxmultiple sets the upper bound of the f3 blocks we step through. !AJ f3incr sets the amount added to f3 at each step of the inner loops. select case (layout) case ('yxels') f3maxmultiple = naky*ntheta0 f3incr = naky case ('yxles') f3maxmultiple = naky*ntheta0 f3incr = naky case ('lexys') f3maxmultiple = nlambda*negrid*ntheta0 f3incr = nlambda*negrid case ('lxyes') f3maxmultiple = nlambda*ntheta0 f3incr = nlambda case ('lyxes') f3maxmultiple = nlambda*naky*ntheta0 f3incr = nlambda*naky case('xyles') f3maxmultiple = ntheta0 f3incr = 1 end select i = 1 !AJ iglomax is the maximum iglo (f3) owned by this process (actually one more than ! this as g_lo is zero indexed) iglomax = 1 + g_lo_ulim !AJ t1test is used handle the it (t1) index which has a !AJ range 1 -> (xxf_lo%ntheta0+1)/2) and then !AJ (it - xxf_lo%ntheta0 + xxf_lo%nx) -> xxf_lo%nx. t1test is used to !AJ identify when this change in range happens and enable the code to !AJ compensate for it. t1test = ((ntheta0+1)/2)+1 !AJ Loop over all local copies from THIS proc (iproc) to THIS proc do while(i .le. r%from(iproc)%nn) !AJ Initial look up of values needed to calculate innermax !AJ and setup the first iteration of the loop below (do while i .le. innermax) f1 = r%from(iproc)%k(i) f2 = r%from(iproc)%l(i) f3 = r%from(iproc)%m(i) t1 = r%to(iproc)%k(i) t2 = r%to(iproc)%l(i) !AJ outerf3limit calculates the number of f3 iterations that can be undertaken !AJ before reaching the next f3 point. outerf3limit = f3 + f3incr do while(f3 .lt. outerf3limit) !AJ iincrem counts how much i needs to be skipped at the end of the final inner loop !AJ The i loop below moves through the starting i's for these inner loop iterations !AJ but the innermost loop increments through further i's without actually moving i !AJ so once we've finished the loop below we need to move i forward to skip all the !AJ elements we have just processed. iincrem = i !AJ work out the size of the inner loops body !AJ innermax is the range of the t2 index (ixxf) this process !AJ owns. innermaxrealvalue = (innermaxmultiplier-t2)/tempnaky innermax = ceiling(innermaxrealvalue)-1 !AJ If the isgn (f2) index is at maximum (2) and the ig (f1) index !AJ will also be at maximum if we use the innermax value calculated above !AJ (ig maximum is r%from_high(1) which currently is ntgrid in the code) !AJ then limit innermax to the difference between ig and ntgrid (which is !AJ ig max). if(f2 .eq. 2 .and. (f1+innermax) .gt. r%from_high(1)) then innermax = i + (ntgrid - f1) !AJ If the isgn (f2) index is not at maximum, but the calculated innermax would !AJ take the indexes beyond whaty is allowed for ig (which is the maximum of ig !AJ multipled by each increment of ig) then restrict innermax to the maximum !AJ number of allowed ig increments. else if((f2 .eq. 1 .and. innermax .gt. (((2*ntgrid)+1)+(ntgrid-f1)))) then innermax = i + ((2*ntgrid)+1) + (ntgrid - f1) !AJ If we have reached this else then the calculated innermax does not breach !AJ any of the data limits it can be used for the number of iterations of the !AJ inner loop. else innermax = i + innermax end if do while(i .le. innermax) !AJ Get the initial address variables. There are potentially some extra data lookups !AJ here so there may be scope to optimise this, although this has not been checked. f1 = r%from(iproc)%k(i) f2 = r%from(iproc)%l(i) f3 = r%from(iproc)%m(i) t1 = r%to(iproc)%k(i) t2 = r%to(iproc)%l(i) !AJ Record the initial f3 variable value for the inner loops for use in the bounds checking !AJ at the end of the inner loops. startf3 = f3 !AJ Calculate the size of the innermost loop by working out the maximum that f3 (iglo) !AJ can go to for this process (using the f3maxmultiple as calculated previously that !AJ effectively defines the f3 blocksize of this process). f3max = ((f3/f3maxmultiple)+1)*f3maxmultiple !AJ Ensure that the calculate f3max is not larger than the total iglo space this process !AJ owns. f3max = min(f3max,iglomax) do while (f3 .lt. f3max) to_here(t1,t2) = from_here(f1,f2,f3) f3 = f3 + f3incr t1 = t1 + 1 !AJ Deal with the issues with t1 (it) being a split range variable as described in the !AJ comment for t1test. if(t1 .eq. t1test) then t1 = t1 - ntheta0 + nx end if iincrem = iincrem + 1 end do !AJ Increment the outermost inner loop (move forwards in the i loop one step) i = i + 1 end do !AJ Ensure that we have not finished the i loop altogether if(i .lt. r%from(iproc)%nn .and. iincrem .lt. r%from(iproc)%nn) then !AJ The f1, f2 and t2 lookups are required to ensure that !AJ the calculation of innermax the next time round this loop !AJ are correctly performed as we are now moving forward in the loop !AJ It may be that it is better to re-order the initialisation of values !AJ so this happens at the beginning of the loop rather than at this point. f1 = r%from(iproc)%k(i) f2 = r%from(iproc)%l(i) t2 = r%to(iproc)%l(i) !AJ Updated to ensure the f3 loop we are in is correctly setup as with the code !AJ above we are moving to a new i so we need to get the correct value of f3 for this i f3 = startf3 + 1 else !AJ If we have gone beyond the end of the i loop then force the exit of the inner loops. !AJ This should never happen as the loop functionality is designed to always perform the !AJ correct number of iterations, so this check could be removed from the code. f3 = outerf3limit end if end do !AJ Once all the inner loops have been iterated move the i variable on to the next point !AJ point to be considered. i = iincrem end do end subroutine c_redist_32_new_opt_copy !> FIXME : Add documentation subroutine c_redist_32_mpi_copy(r, from_here, to_here) use mp, only: iproc, nproc, send, receive implicit none type (redist_type), intent (in out) :: r complex, dimension (r%from_low(1):, & r%from_low(2):, & r%from_low(3):), intent (in) :: from_here complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in out) :: to_here integer :: i, idp, ipto, ipfrom, iadp ! redistribute to idpth next processor from idpth preceding processor ! or redistribute from idpth preceding processor to idpth next processor ! to avoid deadlocks do idp = 1, nproc-1 ipto = mod(iproc + idp, nproc) ipfrom = mod(iproc + nproc - idp, nproc) iadp = min(idp, nproc - idp) ! avoid deadlock AND ensure mostly parallel resolution if (mod(iproc/iadp,2) == 0) then ! send to idpth next processor if (r%from(ipto)%nn > 0) then do i = 1, r%from(ipto)%nn r%complex_buff(i) = from_here(r%from(ipto)%k(i), & r%from(ipto)%l(i), & r%from(ipto)%m(i)) end do call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp) end if ! receive from idpth preceding processor if (r%to(ipfrom)%nn > 0) then call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp) do i = 1, r%to(ipfrom)%nn to_here(r%to(ipfrom)%k(i), & r%to(ipfrom)%l(i)) & = r%complex_buff(i) end do end if else ! receive from idpth preceding processor if (r%to(ipfrom)%nn > 0) then call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp) do i = 1, r%to(ipfrom)%nn to_here(r%to(ipfrom)%k(i), & r%to(ipfrom)%l(i)) & = r%complex_buff(i) end do end if ! send to idpth next processor if (r%from(ipto)%nn > 0) then do i = 1, r%from(ipto)%nn r%complex_buff(i) = from_here(r%from(ipto)%k(i), & r%from(ipto)%l(i), & r%from(ipto)%m(i)) end do call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp) end if end if end do end subroutine c_redist_32_mpi_copy !> This routine does redistributes using entirely non-blocking !> comms. !> !> @note This routine may use more memory than blocking approach !> --> Can perform slower than blocking routine when redistribute !> is particularly large, else typically gives better performance subroutine c_redist_32_mpi_copy_nonblock(r, from_here, to_here) use mp, only: iproc, send, receive use mp, only: barrier,waitall,nbsend,nbrecv implicit none type (redist_type), intent (in out) :: r complex, dimension ( & r%from_low(1):, & r%from_low(2):, & r%from_low(3): & ), intent (in) :: from_here complex, dimension ( & r%to_low(1):, & r%to_low(2): & ), intent (in out) :: to_here complex, dimension(:, :), allocatable :: r_buff, s_buff integer, dimension(:), allocatable :: recvhand, sendhand integer :: i, ip, nn, nrecv, nsend, j !Work out how big recv buffer has to be and allocate it nrecv = r%nrecv if ( nrecv > 0) then allocate( r_buff(size(r%complex_buff), nrecv)) allocate( recvhand(nrecv)) end if !Now work out how big send buffer has to be, allocate it and fill it nsend = r%nsend if (nsend > 0) then allocate( s_buff(size(r%complex_buff),nsend)) allocate( sendhand(nsend)) !$OMP PARALLEL DEFAULT(none) & !$OMP PRIVATE(i, ip, nn, j) & !$OMP SHARED(nsend, s_buff, from_here, r, iproc, sendhand) do i = 1, nsend ip = r%recv_inv_ip(i) nn = r%from(ip)%nn !Any benefit in making this a pointer assignment instead? !Avoids copying data here but could lead to questionable !memory access during the actual send? !$OMP DO & !$OMP SCHEDULE(static) do j = 1, nn s_buff(j, i) = from_here( & r%from(ip)%k(j), & r%from(ip)%l(j), & r%from(ip)%m(j) & ) end do !$OMP END DO !$OMP MASTER call nbsend(s_buff(1:nn, i), nn, ip, iproc, sendhand(i)) !$OMP END MASTER end do !$OMP END PARALLEL end if !Post receives !Probably not much advantage to doing these ASAP as not going to do anything !until data is on its way --> Could post sends first so chance of data being !available as soon as receive posted. if (nrecv > 0) then !$OMP PARALLEL DO DEFAULT(none) & !$OMP PRIVATE(i, ip, nn) & !$OMP SHARED(nrecv, r, r_buff, recvhand) & !$OMP SCHEDULE(static) do i = 1, nrecv ip = r%recv_ip(i) nn = r%to(ip)%nn call nbrecv(r_buff(1:nn, i), nn, ip, ip, recvhand(i)) end do !$OMP END PARALLEL DO end if if (nrecv > 0) then !Now we don't have anything to do until all the data has arrived (so should we post sends first?) !We could do a wait all but may be better to keep on checking each message seperately so that when !it arrives we can copy out whilst waiting for others to arrive. !Start with a wait all call waitall(nrecv, recvhand) !Now unpack data !$OMP PARALLEL DEFAULT(none) & !$OMP PRIVATE(i, ip, nn, j) & !$OMP SHARED(nrecv, to_here, r, r_buff) do i = 1, nrecv !Which processor is this data from? Would we be better looping over all procs and just checking !if that's one to unpack? Removes memory lookup of array but requires branching and incrementing !a counter to work out the equivalent of i here (i.e. which receive we're looking at) ip = r%recv_ip(i) nn = r%to(ip)%nn !Could this be vectorised and is it useful to do so? !Could we make r_buff a pointer array so that MPI put data directly into !to_here array? Would this be beneficial? Would need to be done before RECVs posted !$OMP DO & !$OMP SCHEDULE(static) do j = 1, nn to_here( & r%to(ip)%k(j), & r%to(ip)%l(j) & ) = r_buff(j, i) enddo !$OMP END DO NOWAIT enddo !$OMP END PARALLEL deallocate(r_buff) deallocate(recvhand) endif if (nsend > 0) then call waitall(nsend, sendhand) deallocate(s_buff) deallocate(sendhand) endif end subroutine c_redist_32_mpi_copy_nonblock !> Launch persistent communications associated with redistribute. !> Packs send buffer. subroutine c_redist_32_mpi_copy_persist_start(r, from_here) use mp, only: start_comm implicit none type (redist_type), intent (in out) :: r complex, dimension ( & r%from_low(1):, & r%from_low(2):, & r%from_low(3): & ), intent (in) :: from_here integer :: i, ip, nn, nrecv, nsend, j !Start recv comms nrecv = r%nrecv if (nrecv > 0) then call start_comm(r%recv_hand) end if !Pack buffer and start communication nsend = r%nsend if (nsend > 0) then !$OMP PARALLEL DEFAULT(none) & !$OMP PRIVATE(i, ip, nn, j) & !$OMP SHARED(nsend, r, from_here) do i = 1, nsend ip = r%recv_inv_ip(i) nn = r%from(ip)%nn !Any benefit in making this a pointer assignment instead? !Avoids copying data here but could lead to questionable !memory access during the actual send? !$OMP DO & !$OMP SCHEDULE(static) do j = 1, nn r%buff_send(i)%complex_buffer(j) = from_here( & r%from(ip)%k(j), & r%from(ip)%l(j), & r%from(ip)%m(j) & ) end do !$OMP END DO !$OMP MASTER call start_comm(r%send_hand(i)) !$OMP END MASTER end do !$OMP END PARALLEL end if end subroutine c_redist_32_mpi_copy_persist_start !> Finish persistent communications associated with redistribute. !> Unpacks receive buffer. subroutine c_redist_32_mpi_copy_persist_end(r, to_here) use mp, only: waitall implicit none type (redist_type), intent (in out) :: r complex, dimension ( & r%to_low(1):, & r%to_low(2): & ), intent (in out) :: to_here integer :: i, ip, nn, nrecv, nsend, j !Wait for recv communications to complete and unpack buffer nrecv = r%nrecv if (nrecv > 0) then !Now we don't have anything to do until all the data has arrived (so should we post sends first?) !We could do a wait all but may be better to keep on checking each message seperately so that when !it arrives we can copy out whilst waiting for others to arrive. !Start with a wait all call waitall(nrecv, r%recv_hand) !Now unpack data !$OMP PARALLEL DEFAULT(none) & !$OMP PRIVATE(i, ip, nn, j) & !$OMP SHARED(nrecv, to_here, r) do i = 1, nrecv !Which processor is this data from? ip = r%recv_ip(i) nn = r%to(ip)%nn !$OMP DO & !$OMP SCHEDULE(static) do j = 1, nn to_here( & r%to(ip)%k(j), & r%to(ip)%l(j) & ) = r%buff_recv(i)%complex_buffer(j) enddo !$OMP END DO NOWAIT enddo !$OMP END PARALLEL endif !Wait for all sends to complete nsend = r%nsend if (nsend > 0) then call waitall(nsend, r%send_hand) endif end subroutine c_redist_32_mpi_copy_persist_end !> FIXME : Add documentation subroutine c_redist_32_inv (r, from_here, to_here) use job_manage, only: time_message use mp, only: get_mp_times implicit none type (redist_type), intent (in out) :: r complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in) :: from_here complex, dimension (r%from_low(1):, & r%from_low(2):, & r%from_low(3):), intent (in out) :: to_here real :: time_optimised_loop_1(2), time_optimised_loop_2(2) real :: mp_total, mp_total_after integer :: i if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total) end if !If overlapping then start comms now if(opt_redist_persist_overlap) call c_redist_32_inv_mpi_copy_persist_start(r, from_here) !AJ redistribute from local processor to local processor !AJ The flag opt_local_copy is set by the user in the input !AJ file to specify whether optimised local copy routines are used. !AJ These c_redist_32_inv_*_copy routines are the new local copy !AJ functionality where indirect addressing has largely been removed !AJ There are two different versions of the optimised local copy !AJ functionality which are selected at runtime (providing !AJ opt_local_copy is true) through a performance measurement process !AJ documented below. if(opt_local_copy .and. (r%redistname .eq. 'g2x')) then !AJ Because there are two different optimised local copy routines !AJ which provide benefits for different process counts and use cases !AJ we use an auto-tuning method to select which routine to use. This !AJ works by timing both of the optimised routines on the first run of !AJ this functionality and then using the quickest for the rests of the !AJ times this routine is called. The optimised_choice varible is used !AJ to record the choice of routine (it is a SAVE variable so will maintain !AJ a value between calls to the routine), it is initialise to 0 when the !AJ code first runs. We run each routine 4 times, only timing the last !AJ three runs to avoid any potential initialisation penalties. We time !AJ three rather than one to deal ensure that we collect enough timing !AJ data. if(r%optimised_choice_inv .eq. 0) then time_optimised_loop_1 = 0. time_optimised_loop_2 = 0. call c_redist_32_inv_new_copy(r, from_here, to_here) call time_message(.false.,time_optimised_loop_1,' Optimised Loop 1') do i= 1,3 call c_redist_32_inv_new_copy(r, from_here, to_here) end do call time_message(.false.,time_optimised_loop_1,' Optimised Loop 1') call c_redist_32_inv_new_opt_copy(r, from_here, to_here) call time_message(.false.,time_optimised_loop_2,' Optimised Loop 2') do i = 1,3 call c_redist_32_inv_new_opt_copy(r, from_here, to_here) end do call time_message(.false.,time_optimised_loop_2,' Optimised Loop 2') if(time_optimised_loop_1(1) .gt. time_optimised_loop_2(1)) then r%optimised_choice_inv = 2 else r%optimised_choice_inv = 1 end if !AJ This else is encountered once the optimised auto-tuning choice has !AJ been calculated above. else if(r%optimised_choice_inv .eq. 1) then !AJ c_redist_32_inv_new_copy is the new local copy functionality where !AJ indirect addressing has largely been removed call c_redist_32_inv_new_copy(r, from_here, to_here) else call c_redist_32_inv_new_opt_copy(r, from_here, to_here) end if end if !AJ This else is encountered when the optimised local copy functionality is !AJ not enabled. else !AJ c_redist_32_inv_old_copy is the original local copy functionality call c_redist_32_inv_old_copy(r, from_here, to_here) end if !AJ c_redist_32_inv_mpi_copy contains all the remote to local !AJ copy functionality !<DD> if(opt_redist_nbk)then if(opt_redist_persist)then if(.not.opt_redist_persist_overlap) call c_redist_32_inv_mpi_copy_persist_start(r,from_here) call c_redist_32_inv_mpi_copy_persist_end(r,to_here) else call c_redist_32_inv_mpi_copy_nonblock(r,from_here,to_here) endif else call c_redist_32_inv_mpi_copy(r, from_here, to_here) endif if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total_after) time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total) end if end subroutine c_redist_32_inv !> FIXME : Add documentation subroutine c_redist_32_inv_old_copy(r, from_here, to_here) use mp, only: iproc implicit none type (redist_type), intent (in out) :: r complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in) :: from_here complex, dimension (r%from_low(1):, & r%from_low(2):, & r%from_low(3):), intent (in out) :: to_here integer :: i, nn !CMR ! In the GS2 standard FFT situation this routine maps ! xxf(it,ixxf) to g(ig, isgn, iglo) data type ! where it is kx (or x) index, ixxf is (y,ig,isgn,"les") ! and iglo is ("xyles") nn = r%to(iproc)%nn !$OMP PARALLEL DO DEFAULT(none) & !$OMP SHARED(nn, to_here, r, iproc, from_here) & !$OMP SCHEDULE(static) do i = 1, nn ! ! redistribute from local processor to local processor ! NB r%to(iproc)%nn is #elements sent by THIS processor to THIS processor ! In this situation the data at (r%from(iproc)%k(i),r%from(iproc)%l(i),r%from(iproc)%m(i)) ! should come from (r%to(iproc)%k(i),r%to(iproc)%l(i)). ! ! This do loop, in GS2 standard FFT situation, corresponds to: ! to_here(ig,isgn,iglo)=from_here(it,ixxf) ! to_here(r%from(iproc)%k(i), & r%from(iproc)%l(i), & r%from(iproc)%m(i)) & = from_here(r%to(iproc)%k(i), & r%to(iproc)%l(i)) end do !$OMP END PARALLEL DO end subroutine c_redist_32_inv_old_copy !> FIXME : Add documentation subroutine c_redist_32_inv_new_copy(r, from_here, to_here) !===================================================================== !AJ, June 2011: New code from DCSE project on GS2 Indirect Addressing !===================================================================== ! ! AJ, June 2011: ! Modified LOCAL COPY part of c_redist_32_inv routine, as used by GS2 to ! transform xxf data type to g data type. ! Here we REDUCE indirect addressing and cache load by EXPLOITING ! understanding of g and xxf data types in GS2. ! These c_redist_32_inv_* routines use exactly the same code created for ! the c_redist_32_* functionality, the only difference is in the actual ! data copy the to_here_ and from_here indices are swapped. ! use mp, only: iproc implicit none type (redist_type), intent (in out) :: r complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in) :: from_here complex, dimension (r%from_low(1):, & r%from_low(2):, & r%from_low(3):), intent (in out) :: to_here integer :: i,k,t2,t1,f3,f2,f1,fhigh,thigh,f2max real :: nakyrecip i = 1 !AJ We want to be able to divide by naky with floating point !AJ arithmetic so we need to convert naky to a float (hence !AJ the need for nakyrecip). Also, to remove the necessity !AJ to have a division in the body of the loop below we are !AJ calculating the reciprocal of naky so that instead of !AJ dividing by naky we can multiply by 1/naky. nakyrecip = naky nakyrecip = 1/nakyrecip f2max = r%from_high(2) !AJ Loop over all local copies from THIS proc (iproc) to THIS proc do while(i .le. r%to(iproc)%nn) !AJ Initialise the inner loop indices. As the to_here array has !AJ 3 indices (c_redist_32_inv goes from 2 indices to 3 indices in the !AJ copy) we need two inner loops to be able to move through all !AJ the to_here and from_here indices. !AJ t1 (which equates to the it index) and f3 (which equates to iglo) !AJ do not change for the iterations of the inner loops. f2 (isgn) !AJ can only be 1 or 2 so the first inner loop is restricted to at !AJ most 2 iterations. f2 = r%from(iproc)%l(i) f3 = r%from(iproc)%m(i) t1 = r%to(iproc)%k(i) !AJ Ensure that isgn (f2) is either 1 or 2. do while (f2 .le. f2max) !AJ Get initial value of f1 (which equates to ig) and t2 (which equates !AJ to ixxf). f1 = r%from(iproc)%k(i) t2 = r%to(iproc)%l(i) !AJ Work out the maximum value ixxf (t2) can have by calculating the range !AJ of ixxf that this process owns. We step through t2 by naky each iteration !AJ so the line below calculates thigh as the number of ixxf steps for these !AJ inner loops of the operation owned by this process. thigh = ceiling(((xxf_lo_ulim_proc+1) - t2)*nakyrecip) !AJ from_here and to_here are calculated for c_redist_32 using ig, isgn, !AJ and iglo. The calculation of the maximum number if ixxf steps above can, !AJ therefore, be used to also calculate the maximum number of ig setsp. THis !AJ is what the following line does. thigh = thigh + (f1-1) !AJ Finally check the actual number of inner loop steps by ensuring that the !AJ computed maximum bound of ig is not beyond the actual allowed maximum value !AJ in r%from_high(1). fhigh = min(thigh,r%from_high(1)) do k = f1,fhigh i = i + 1 to_here(k,f2,f3) = from_here(t1,t2) t2 = t2 + naky end do !AJ If at the end of the inner loop we still have theoretical iterations !AJ left on ixxf (i.e. the calculated thigh is higher than the allowed !AJ maximum of ig) then move to the next isgn. If there aren't any !AJ iterations left then exit the inner loops (the f2 loop). if(thigh .gt. r%from_high(1)) then f2 = f2 + 1 else f2 = f2max + 1 end if end do end do end subroutine c_redist_32_inv_new_copy !> FIXME : Add documentation subroutine c_redist_32_inv_new_opt_copy(r, from_here, to_here) !===================================================================== !AJ, June 2011: New code from DCSE project on GS2 Indirect Addressing !===================================================================== ! ! AJ, June 2011: ! Modified LOCAL COPY part of c_redist_32_inv routine, as used by GS2 to ! transform xxf data type to g data type. ! Here we REDUCE indirect addressing and cache load by EXPLOITING ! understanding of g and xxf data types in GS2. ! This functionality is different to c_redist_32_inv_new_copy because ! that routine does not always provide optimsed performance ! (due to the way memory reads and writes are performed, particularly ! the use of a strided write which triggers a write allocate cache ! miss). ! This new functionality moves through the loop indices in a different ! order to the other optimised local copy, so instead of following the ! original functionality by moving through the to_here and from_here loops ! using the i variable from i = 1 to to(iproc)%nn we start at i = 1 and ! then skip through i's for a certain range, then move back to i = 2 and ! follow the same process until all the i range has been processed. ! This routine uses the same functionality as teh c_redist_32_new_opt_copy ! routine, the only difference is in the actual copy the indices of the ! from_here and to_here loops are swapped. ! use mp, only: iproc implicit none type (redist_type), intent (in out) :: r complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in) :: from_here complex, dimension (r%from_low(1):, & r%from_low(2):, & r%from_low(3):), intent (in out) :: to_here integer :: i,t2,t1,f3,f2,f1 integer :: f3max,f3maxmultiple,f3incr,innermax,iincrem,iglomax,t1test integer :: innermaxmultiplier,outerf3limit,startf3 real :: innermaxrealvalue,tempnaky !AJ tempnaky is a real version (rather than integer version) of naky !AJ It is used to enable real arthimetic rather than integer arthimetic. tempnaky = naky !AJ innermaxmultiplier is the upper limit of the xxf data range owned by this process innermaxmultiplier = xxf_lo_ulim_proc+1 !AJ This optimised new local copy functionality is reliant on the particular !AJ layout used to work out how to increment through the f3 (iglo) indices. !AJ f3maxmultiple sets the upper bound of the f3 blocks we step through. !AJ f3incr sets the amount added to f3 at each step of the inner loops. select case (layout) case ('yxels') f3maxmultiple = naky*ntheta0 f3incr = naky case ('yxles') f3maxmultiple = naky*ntheta0 f3incr = naky case ('lexys') f3maxmultiple = nlambda*negrid*ntheta0 f3incr = nlambda*negrid case ('lxyes') f3maxmultiple = nlambda*ntheta0 f3incr = nlambda case ('lyxes') f3maxmultiple = nlambda*naky*ntheta0 f3incr = nlambda*naky case('xyles') f3maxmultiple = ntheta0 f3incr = 1 end select i = 1 !AJ iglomax is the maximum iglo (f3) owned by this process (actually one more than ! this as g_lo is zero indexed) iglomax = 1 + g_lo_ulim !AJ t1test is used handle the it (t1) index which has a !AJ range 1 -> (xxf_lo%ntheta0+1)/2) and then !AJ (it - xxf_lo%ntheta0 + xxf_lo%nx) -> xxf_lo%nx. t1test is used to !AJ identify when this change in range happens and enable the code to !AJ compensate for it. t1test = ((ntheta0+1)/2)+1 !AJ Loop over all local copies from THIS proc (iproc) to THIS proc do while(i .le. r%to(iproc)%nn) !AJ Initial look up of values needed to calculate innermax !AJ and setup the first iteration of the loop below (do while i .le. innermax) f1 = r%from(iproc)%k(i) f2 = r%from(iproc)%l(i) f3 = r%from(iproc)%m(i) t1 = r%to(iproc)%k(i) t2 = r%to(iproc)%l(i) !AJ outerf3limit calculates the number of f3 iterations that can be undertaken !AJ before reaching the next f3 point. outerf3limit = f3 + f3incr do while(f3 .lt. outerf3limit) !AJ iincrem counts how much i needs to be skipped at the end of the final inner loop !AJ The i loop below moves through the starting i's for these inner loop iterations !AJ but the innermost loop increments through further i's without actually moving i !AJ so once we've finished the loop below we need to move i forward to skip all the !AJ elements we have just processed. iincrem = i !AJ work out the size of the inner loops body !AJ innermax is the range of the t2 index (ixxf_ this process !AJ owns. innermaxrealvalue = (innermaxmultiplier-t2)/tempnaky innermax = ceiling(innermaxrealvalue)-1 !AJ If the isgn (f2) index is at maximum (2) and the ig (f1) index !AJ will also be at maximum if we use the innermax value calculated above !AJ (ig maximum is r%from_high(1) which currently is ntgrid in the code) !AJ then limit innermax to the difference between ig and ntgrid (which is !AJ ig max). if(f2 .eq. 2 .and. (f1+innermax) .gt. r%from_high(1)) then innermax = i + (ntgrid - f1) !AJ If the isgn (f2) index is not at maximum, but the calculated innermax would !AJ take the indexes beyond whaty is allowed for ig (which is the maximum of ig !AJ multipled by each increment of ig) then restrict innermax to the maximum !AJ number of allowed ig increments. else if((f2 .eq. 1 .and. innermax .gt. (((2*ntgrid)+1)+(ntgrid-f1)))) then innermax = i + ((2*ntgrid)+1) + (ntgrid - f1) !AJ If we have reached this else then the calculated innermax does not breach !AJ any of the data limits it can be used for the number of iterations of the !AJ inner loop. else innermax = i + innermax end if do while(i .le. innermax) !AJ Get the initial address variables. There are potentially some extra data lookups !AJ here so there may be scope to optimise this, although this has not been checked. f1 = r%from(iproc)%k(i) f2 = r%from(iproc)%l(i) f3 = r%from(iproc)%m(i) t1 = r%to(iproc)%k(i) t2 = r%to(iproc)%l(i) !AJ Record the initial f3 variable value for the inner loops for use in the bounds checking !AJ at the end of the inner loops. startf3 = f3 !AJ Calculate the size of the innermost loop by working out the maximum that f3 (iglo) !AJ can go to for this process (using the f3maxmultiple as calculated previously that !AJ effectively defines the f3 blocksize of this process). f3max = ((f3/f3maxmultiple)+1)*f3maxmultiple !AJ Ensure that the calculate f3max is not larger than the total iglo space this process !AJ owns. f3max = min(f3max,iglomax) do while (f3 .lt. f3max) to_here(f1,f2,f3) = from_here(t1,t2) f3 = f3 + f3incr t1 = t1 + 1 !AJ Deal with the issues with t1 (it) being a split range variable as described in the !AJ comment for t1test. if(t1 .eq. t1test) then t1 = t1 - ntheta0 + nx end if iincrem = iincrem + 1 end do !AJ Increment the outermost inner loop (move forwards in the i loop one step) i = i + 1 end do !AJ Ensure that we have not finished the i loop altogether if(i .lt. r%from(iproc)%nn .and. iincrem .lt. r%from(iproc)%nn) then !AJ The f1, f2 and t2 lookups are required to ensure that !AJ the calculation of innermax the next time round this loop !AJ are correctly performed as we are now moving forward in the loop !AJ It may be that it is better to re-order the initialisation of values !AJ so this happens at the beginning of the loop rather than at this point. f1 = r%from(iproc)%k(i) f2 = r%from(iproc)%l(i) t2 = r%to(iproc)%l(i) !AJ Updated to ensure the f3 loop we are in is correctly setup as with the code !AJ above we are moving to a new i so we need to get the correct value of f3 for this i f3 = startf3 + 1 else !AJ If we have gone beyond the end of the i loop then force the exit of the inner loops. !AJ This should never happen as the loop functionality is designed to always perform the !AJ correct number of iterations, so this check could be removed from the code. f3 = outerf3limit end if end do !AJ Once all the inner loops have been iterated move the i variable on to the next point !AJ point to be considered. i = iincrem end do end subroutine c_redist_32_inv_new_opt_copy !> FIXME : Add documentation subroutine c_redist_32_inv_mpi_copy(r, from_here, to_here) use mp, only: iproc, nproc, send, receive implicit none type (redist_type), intent (in out) :: r complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in) :: from_here complex, dimension (r%from_low(1):, & r%from_low(2):, & r%from_low(3):), intent (in out) :: to_here integer :: i, idp, ipto, ipfrom, iadp ! redistribute to idpth next processor from idpth preceding processor ! or redistribute from idpth preceding processor to idpth next processor ! to avoid deadlocks do idp = 1, nproc-1 ipto = mod(iproc + idp, nproc) ipfrom = mod(iproc + nproc - idp, nproc) iadp = min(idp, nproc - idp) ! avoid deadlock AND ensure mostly parallel resolution if (mod(iproc/iadp,2) == 0) then ! send to idpth next processor if (r%to(ipto)%nn > 0) then do i = 1, r%to(ipto)%nn r%complex_buff(i) = from_here(r%to(ipto)%k(i), & r%to(ipto)%l(i)) end do call send (r%complex_buff(1:r%to(ipto)%nn), ipto, idp) end if ! receive from idpth preceding processor if (r%from(ipfrom)%nn > 0) then call receive (r%complex_buff(1:r%from(ipfrom)%nn), ipfrom, idp) do i = 1, r%from(ipfrom)%nn to_here(r%from(ipfrom)%k(i), & r%from(ipfrom)%l(i), & r%from(ipfrom)%m(i)) & = r%complex_buff(i) end do end if else ! receive from idpth preceding processor if (r%from(ipfrom)%nn > 0) then call receive (r%complex_buff(1:r%from(ipfrom)%nn), ipfrom, idp) do i = 1, r%from(ipfrom)%nn to_here(r%from(ipfrom)%k(i), & r%from(ipfrom)%l(i), & r%from(ipfrom)%m(i)) & = r%complex_buff(i) end do end if ! send to idpth next processor if (r%to(ipto)%nn > 0) then do i = 1, r%to(ipto)%nn r%complex_buff(i) = from_here(r%to(ipto)%k(i), & r%to(ipto)%l(i)) end do call send (r%complex_buff(1:r%to(ipto)%nn), ipto, idp) end if end if end do end subroutine c_redist_32_inv_mpi_copy !> This routine does redistributes using entirely non-blocking !> comms. !> !> @note This routine may use more memory than blocking approach !> --> Can perform slower than blocking routine when redistribute !> is particularly large, else typically gives better performance subroutine c_redist_32_inv_mpi_copy_nonblock(r, from_here, to_here) use mp, only: iproc, send, receive use mp, only: barrier,waitall,nbsend,nbrecv implicit none type (redist_type), intent (in out) :: r complex, dimension ( & r%from_low(1):, & r%from_low(2):, & r%from_low(3): & ), intent (in out) :: to_here complex, dimension (& r%to_low(1):, & r%to_low(2): & ), intent (in) :: from_here complex, dimension(:, :), allocatable :: r_buff, s_buff integer, dimension(:), allocatable :: recvhand, sendhand integer :: i, ip, nn, nrecv, nsend, j !Work out how big recv buffer has to be and allocate it ! Note that we use redistribute type's nsend (not nrecv) because we are ! doing the inverse transform. nrecv = r%nsend if (nrecv > 0) then allocate( r_buff(size(r%complex_buff), nrecv)) allocate( recvhand(nrecv)) end if !Now work out how big send buffer has to be, allocate it and fill it ! Note that we use redistribute type's nrecv (not nsend) because we are ! doing the inverse transform. nsend = r%nrecv if (nsend > 0) then allocate( s_buff(size(r%complex_buff), nsend)) allocate( sendhand(nsend)) !$OMP PARALLEL DEFAULT(none) & !$OMP PRIVATE(i, ip, nn, j) & !$OMP SHARED(nsend, r, s_buff, from_here, iproc, sendhand) do i = 1, nsend ip = r%recv_ip(i) nn = r%to(ip)%nn !Any benefit in making this a pointer assignment instead? !Avoids copying data here but could lead to questionable !memory access during the actual send? !$OMP DO & !$OMP SCHEDULE(static) do j = 1, nn s_buff(j, i) = from_here( & r%to(ip)%k(j), & r%to(ip)%l(j) & ) end do !$OMP END DO !$OMP MASTER call nbsend(s_buff(1:nn, i), nn, ip, iproc, sendhand(i)) !$OMP END MASTER end do !$OMP END PARALLEL end if !Post receives if (nrecv > 0) then !$OMP PARALLEL DO DEFAULT(none) & !$OMP PRIVATE(i, ip, nn) & !$OMP SHARED(nrecv, r, r_buff, recvhand) & !$OMP SCHEDULE(static) do i = 1, nrecv ip = r%recv_inv_ip(i) nn = r%from(ip)%nn call nbrecv(r_buff(1:nn, i), nn, ip, ip, recvhand(i)) end do !$OMP END PARALLEL DO end if if (nrecv > 0) then !Now we don't have anything to do until all the data has arrived (so should we post sends first?) !We could do a wait all but may be better to keep on checking each message seperately so that when !it arrives we can copy out whilst waiting for others to arrive. !Start with a wait all call waitall(nrecv, recvhand) !Now unpack data !$OMP PARALLEL DEFAULT(none) & !$OMP PRIVATE(i, ip, nn, j) & !$OMP SHARED(nrecv, r, to_here, r_buff) do i = 1, nrecv !Which processor is this data from ip = r%recv_inv_ip(i) nn = r%from(ip)%nn !$OMP DO & !$OMP SCHEDULE(static) do j = 1, nn to_here( & r%from(ip)%k(j), & r%from(ip)%l(j), & r%from(ip)%m(j) & ) = r_buff(j, i) enddo !$OMP END DO NOWAIT enddo !$OMP END PARALLEL deallocate(r_buff) deallocate(recvhand) endif if (nsend > 0) then call waitall(nsend, sendhand) deallocate(s_buff) deallocate(sendhand) endif end subroutine c_redist_32_inv_mpi_copy_nonblock !> Launch persistent communications associated with redistribute. !> Packs send buffer. subroutine c_redist_32_inv_mpi_copy_persist_start(r, from_here) use mp, only: start_comm implicit none type (redist_type), intent (in out) :: r complex, dimension ( & r%to_low(1):, & r%to_low(2): & ), intent (in) :: from_here integer :: i, ip, nn, nrecv, nsend, j !For inverse transform SEND actually refers to received data and !RECV refers to data to be sent !Post receives nrecv = r%nsend if (nrecv > 0) then call start_comm(r%recv_inv_hand) endif !Pack send buffer and start sends nsend = r%nrecv if (nsend > 0) then !$OMP PARALLEL DEFAULT(none) & !$OMP PRIVATE(i, ip, nn, j) & !$OMP SHARED(nsend, r, from_here) do i = 1, nsend ip = r%recv_ip(i) nn = r%to(ip)%nn !Any benefit in making this a pointer assignment instead? !Avoids copying data here but could lead to questionable !memory access during the actual send? !$OMP DO & !$OMP SCHEDULE(static) do j = 1, nn r%buff_recv(i)%complex_buffer(j) = from_here( & r%to(ip)%k(j), & r%to(ip)%l(j) & ) end do !$OMP END DO !$OMP MASTER call start_comm(r%send_inv_hand(i)) !$OMP END MASTER end do !$OMP END PARALLEL end if end subroutine c_redist_32_inv_mpi_copy_persist_start !> Finish persistent communications associated with redistribute. !> Unpacks receive buffer. subroutine c_redist_32_inv_mpi_copy_persist_end(r, to_here) use mp, only: waitall implicit none type (redist_type), intent (in out) :: r complex, dimension ( & r%from_low(1):, & r%from_low(2):, & r%from_low(3): & ), intent (in out) :: to_here integer :: i, ip, nn, nrecv, nsend, j !For inverse transform SEND actually refers to received data and !RECV refers to data to be sent !Wait for receives to complete and unpack data nrecv = r%nsend if (nrecv > 0) then call waitall(nrecv, r%recv_inv_hand) !Now unpack data !$OMP PARALLEL DEFAULT(none) & !$OMP PRIVATE(i, ip, nn, j) & !$OMP SHARED(nrecv, r, to_here) do i = 1, nrecv !Which processor is this data from ip = r%recv_inv_ip(i) nn = r%from(ip)%nn !$OMP DO & !$OMP SCHEDULE(static) do j = 1, nn to_here( & r%from(ip)%k(j), & r%from(ip)%l(j), & r%from(ip)%m(j) & ) = r%buff_send(i)%complex_buffer(j) end do !$OMP END DO NOWAIT end do !$OMP END PARALLEL end if !Wait for sends to complete nsend = r%nrecv if (nsend > 0) then call waitall(nsend, r%send_inv_hand) end if end subroutine c_redist_32_inv_mpi_copy_persist_end !> FIXME : Add documentation subroutine c_redist_42 (r, from_here, to_here) use job_manage, only: time_message use mp, only: iproc, nproc, send, receive, get_mp_times implicit none type (redist_type), intent (in out) :: r complex, dimension (r%from_low(1):, & r%from_low(2):, & r%from_low(3):, & r%from_low(4):), intent (in) :: from_here complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in out) :: to_here integer :: i, idp, ipto, ipfrom, iadp real :: mp_total, mp_total_after if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total) end if ! redistribute from local processor to local processor do i = 1, r%from(iproc)%nn to_here(r%to(iproc)%k(i),& r%to(iproc)%l(i)) & = from_here(r%from(iproc)%k(i), & r%from(iproc)%l(i), & r%from(iproc)%m(i), & r%from(iproc)%n(i)) end do ! redistribute to idpth next processor from idpth preceding processor ! or redistribute from idpth preceding processor to idpth next processor ! to avoid deadlocks do idp = 1, nproc-1 ipto = mod(iproc + idp, nproc) ipfrom = mod(iproc + nproc - idp, nproc) iadp = min(idp, nproc - idp) ! avoid deadlock AND ensure mostly parallel resolution if (mod(iproc/iadp,2) == 0) then ! send to idpth next processor if (r%from(ipto)%nn > 0) then do i = 1, r%from(ipto)%nn r%complex_buff(i) = from_here(r%from(ipto)%k(i), & r%from(ipto)%l(i), & r%from(ipto)%m(i), & r%from(ipto)%n(i)) end do call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp) end if ! receive from idpth preceding processor if (r%to(ipfrom)%nn > 0) then call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp) do i = 1, r%to(ipfrom)%nn to_here(r%to(ipfrom)%k(i), & r%to(ipfrom)%l(i)) & = r%complex_buff(i) end do end if else ! receive from idpth preceding processor if (r%to(ipfrom)%nn > 0) then call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp) do i = 1, r%to(ipfrom)%nn to_here(r%to(ipfrom)%k(i), & r%to(ipfrom)%l(i)) & = r%complex_buff(i) end do end if ! send to idpth next processor if (r%from(ipto)%nn > 0) then do i = 1, r%from(ipto)%nn r%complex_buff(i) = from_here(r%from(ipto)%k(i), & r%from(ipto)%l(i), & r%from(ipto)%m(i), & r%from(ipto)%n(i)) end do call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp) end if end if end do if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total_after) time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total) end if end subroutine c_redist_42 !> FIXME : Add documentation subroutine c_redist_42_inv (r, from_here, to_here) use job_manage, only: time_message use mp, only: iproc, nproc, send, receive, get_mp_times implicit none type (redist_type), intent (in out) :: r complex, dimension (r%to_low(1):, & r%to_low(2):), intent (in) :: from_here complex, dimension (r%from_low(1):, & r%from_low(2):, & r%from_low(3):, & r%from_low(4):), intent (in out) :: to_here integer :: i, idp, ipto, ipfrom, iadp real :: mp_total, mp_total_after if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total) end if ! redistribute from local processor to local processor do i = 1, r%to(iproc)%nn to_here(r%from(iproc)%k(i), & r%from(iproc)%l(i), & r%from(iproc)%m(i), & r%from(iproc)%n(i)) & = from_here(r%to(iproc)%k(i), & r%to(iproc)%l(i)) end do ! redistribute to idpth next processor from idpth preceding processor ! or redistribute from idpth preceding processor to idpth next processor ! to avoid deadlocks do idp = 1, nproc-1 ipto = mod(iproc + idp, nproc) ipfrom = mod(iproc + nproc - idp, nproc) iadp = min(idp, nproc - idp) ! avoid deadlock AND ensure mostly parallel resolution if (mod(iproc/iadp,2) == 0) then ! send to idpth next processor if (r%to(ipto)%nn > 0) then do i = 1, r%to(ipto)%nn r%complex_buff(i) = from_here(r%to(ipto)%k(i), & r%to(ipto)%l(i)) end do call send (r%complex_buff(1:r%to(ipto)%nn), ipto, idp) end if ! receive from idpth preceding processor if (r%from(ipfrom)%nn > 0) then call receive (r%complex_buff(1:r%from(ipfrom)%nn), ipfrom, idp) do i = 1, r%from(ipfrom)%nn to_here(r%from(ipfrom)%k(i), & r%from(ipfrom)%l(i), & r%from(ipfrom)%m(i), & r%from(ipfrom)%n(i)) & = r%complex_buff(i) end do end if else ! receive from idpth preceding processor if (r%from(ipfrom)%nn > 0) then call receive (r%complex_buff(1:r%from(ipfrom)%nn), ipfrom, idp) do i = 1, r%from(ipfrom)%nn to_here(r%from(ipfrom)%k(i), & r%from(ipfrom)%l(i), & r%from(ipfrom)%m(i), & r%from(ipfrom)%n(i)) & = r%complex_buff(i) end do end if ! send to idpth next processor if (r%to(ipto)%nn > 0) then do i = 1, r%to(ipto)%nn r%complex_buff(i) = from_here(r%to(ipto)%k(i), & r%to(ipto)%l(i)) end do call send (r%complex_buff(1:r%to(ipto)%nn), ipto, idp) end if end if end do if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total_after) time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total) end if end subroutine c_redist_42_inv !> FIXME : Add documentation subroutine c_redist_23 (r, from_here, to_here) use job_manage, only: time_message use mp, only: iproc, nproc, send, receive, get_mp_times implicit none type (redist_type), intent (in out) :: r complex, dimension (r%from_low(1):, & r%from_low(2):), intent (in) :: from_here complex, dimension (r%to_low(1):, & r%to_low(2):, & r%to_low(3):), intent (in out) :: to_here integer :: i, idp, ipto, ipfrom, iadp real :: mp_total, mp_total_after if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total) end if ! redistribute from local processor to local processor do i = 1, r%from(iproc)%nn to_here(r%to(iproc)%k(i),& r%to(iproc)%l(i),& r%to(iproc)%m(i)) & = from_here(r%from(iproc)%k(i), & r%from(iproc)%l(i)) end do ! redistribute to idpth next processor from idpth preceding processor ! or redistribute from idpth preceding processor to idpth next processor ! to avoid deadlocks do idp = 1, nproc-1 ipto = mod(iproc + idp, nproc) ipfrom = mod(iproc + nproc - idp, nproc) iadp = min(idp, nproc - idp) ! avoid deadlock AND ensure mostly parallel resolution if (mod(iproc/iadp,2) == 0) then ! send to idpth next processor if (r%from(ipto)%nn > 0) then do i = 1, r%from(ipto)%nn r%complex_buff(i) = from_here(r%from(ipto)%k(i), & r%from(ipto)%l(i)) end do call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp) end if ! receive from idpth preceding processor if (r%to(ipfrom)%nn > 0) then call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp) do i = 1, r%to(ipfrom)%nn to_here(r%to(ipfrom)%k(i), & r%to(ipfrom)%l(i), & r%to(ipfrom)%m(i)) & = r%complex_buff(i) end do end if else ! receive from idpth preceding processor if (r%to(ipfrom)%nn > 0) then call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp) do i = 1, r%to(ipfrom)%nn to_here(r%to(ipfrom)%k(i), & r%to(ipfrom)%l(i), & r%to(ipfrom)%m(i)) & = r%complex_buff(i) end do end if ! send to idpth next processor if (r%from(ipto)%nn > 0) then do i = 1, r%from(ipto)%nn r%complex_buff(i) = from_here(r%from(ipto)%k(i), & r%from(ipto)%l(i)) end do call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp) end if end if end do if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total_after) time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total) end if end subroutine c_redist_23 !> FIXME : Add documentation subroutine c_redist_34 (r, from_here, to_here) use job_manage, only: time_message use mp, only: iproc, nproc, send, receive, get_mp_times implicit none type (redist_type), intent (in out) :: r complex, dimension (r%from_low(1):, & r%from_low(2):, & r%from_low(3):), intent (in) :: from_here complex, dimension (r%to_low(1):, & r%to_low(2):, & r%to_low(3):, & r%to_low(4):), intent (in out) :: to_here integer :: i, idp, ipto, ipfrom, iadp real :: mp_total, mp_total_after if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total) end if ! redistribute from local processor to local processor do i = 1, r%from(iproc)%nn to_here(r%to(iproc)%k(i),& r%to(iproc)%l(i),& r%to(iproc)%m(i),& r%to(iproc)%n(i)) & = from_here(r%from(iproc)%k(i), & r%from(iproc)%l(i), & r%from(iproc)%m(i)) end do ! redistribute to idpth next processor from idpth preceding processor ! or redistribute from idpth preceding processor to idpth next processor ! to avoid deadlocks do idp = 1, nproc-1 ipto = mod(iproc + idp, nproc) ipfrom = mod(iproc + nproc - idp, nproc) iadp = min(idp, nproc - idp) ! avoid deadlock AND ensure mostly parallel resolution if (mod(iproc/iadp,2) == 0) then ! send to idpth next processor if (r%from(ipto)%nn > 0) then do i = 1, r%from(ipto)%nn r%complex_buff(i) = from_here(r%from(ipto)%k(i), & r%from(ipto)%l(i), & r%from(ipto)%m(i)) end do call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp) end if ! receive from idpth preceding processor if (r%to(ipfrom)%nn > 0) then call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp) do i = 1, r%to(ipfrom)%nn to_here(r%to(ipfrom)%k(i), & r%to(ipfrom)%l(i), & r%to(ipfrom)%m(i), & r%to(ipfrom)%n(i)) & = r%complex_buff(i) end do end if else ! receive from idpth preceding processor if (r%to(ipfrom)%nn > 0) then call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp) do i = 1, r%to(ipfrom)%nn to_here(r%to(ipfrom)%k(i), & r%to(ipfrom)%l(i), & r%to(ipfrom)%m(i), & r%to(ipfrom)%n(i)) & = r%complex_buff(i) end do end if ! send to idpth next processor if (r%from(ipto)%nn > 0) then do i = 1, r%from(ipto)%nn r%complex_buff(i) = from_here(r%from(ipto)%k(i), & r%from(ipto)%l(i), & r%from(ipto)%m(i)) end do call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp) end if end if end do if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total_after) time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total) end if end subroutine c_redist_34 !> FIXME : Add documentation subroutine r_redist_12 (r, from_here, to_here) use job_manage, only: time_message use mp, only: iproc, nproc, send, receive, get_mp_times implicit none type (redist_type), intent (in out) :: r real, dimension (r%from_low(1):), intent (in) :: from_here real, dimension (r%to_low(1):,r%to_low(2):), intent (in out) :: to_here integer :: i, idp, ipto, ipfrom, iadp real :: mp_total, mp_total_after if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total) end if ! redistribute from local processor to local processor do i = 1, r%from(iproc)%nn to_here(r%to(iproc)%k(i), r%to(iproc)%l(i)) & = from_here(r%from(iproc)%k(i)) end do ! redistribute to idpth next processor from idpth preceding processor ! or redistribute from idpth preceding processor to idpth next processor ! to avoid deadlocks do idp = 1, nproc-1 ipto = mod(iproc + idp, nproc) ipfrom = mod(iproc + nproc - idp, nproc) iadp = min(idp, nproc - idp) ! avoid deadlock AND ensure mostly parallel resolution if (mod(iproc/iadp,2) == 0) then ! send to idpth next processor if (r%from(ipto)%nn > 0) then do i = 1, r%from(ipto)%nn r%real_buff(i) = from_here(r%from(ipto)%k(i)) end do call send (r%real_buff(1:r%from(ipto)%nn), ipto, idp) end if ! receive from idpth preceding processor if (r%to(ipfrom)%nn > 0) then call receive (r%real_buff(1:r%to(ipfrom)%nn), ipfrom, idp) do i = 1, r%to(ipfrom)%nn to_here(r%to(ipfrom)%k(i), & r%to(ipfrom)%l(i)) & = r%real_buff(i) end do end if else ! receive from idpth preceding processor if (r%to(ipfrom)%nn > 0) then call receive (r%real_buff(1:r%to(ipfrom)%nn), ipfrom, idp) do i = 1, r%to(ipfrom)%nn to_here(r%to(ipfrom)%k(i), & r%to(ipfrom)%l(i)) & = r%real_buff(i) end do end if ! send to idpth next processor if (r%from(ipto)%nn > 0) then do i = 1, r%from(ipto)%nn r%real_buff(i) = from_here(r%from(ipto)%k(i)) end do call send (r%real_buff(1:r%from(ipto)%nn), ipto, idp) end if end if end do if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total_after) time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total) end if end subroutine r_redist_12 !> FIXME : Add documentation subroutine r_redist_22 (r, from_here, to_here) use job_manage, only: time_message use mp, only: iproc, nproc, send, receive, get_mp_times implicit none type (redist_type), intent (in out) :: r real, dimension (r%from_low(1):, & r%from_low(2):), intent (in) :: from_here real, dimension (r%to_low(1):, & r%to_low(2):), intent (in out) :: to_here integer :: i, idp, ipto, ipfrom, iadp real :: mp_total, mp_total_after if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total) end if ! redistribute from local processor to local processor do i = 1, r%from(iproc)%nn to_here(r%to(iproc)%k(i),& r%to(iproc)%l(i)) & = from_here(r%from(iproc)%k(i), & r%from(iproc)%l(i)) end do ! redistribute to idpth next processor from idpth preceding processor ! or redistribute from idpth preceding processor to idpth next processor ! to avoid deadlocks do idp = 1, nproc-1 ipto = mod(iproc + idp, nproc) ipfrom = mod(iproc + nproc - idp, nproc) iadp = min(idp, nproc - idp) ! avoid deadlock AND ensure mostly parallel resolution if (mod(iproc/iadp,2) == 0) then ! send to idpth next processor if (r%from(ipto)%nn > 0) then do i = 1, r%from(ipto)%nn r%real_buff(i) = from_here(r%from(ipto)%k(i), & r%from(ipto)%l(i)) end do call send (r%real_buff(1:r%from(ipto)%nn), ipto, idp) end if ! receive from idpth preceding processor if (r%to(ipfrom)%nn > 0) then call receive (r%real_buff(1:r%to(ipfrom)%nn), ipfrom, idp) do i = 1, r%to(ipfrom)%nn to_here(r%to(ipfrom)%k(i), & r%to(ipfrom)%l(i)) & = r%real_buff(i) end do end if else ! receive from idpth preceding processor if (r%to(ipfrom)%nn > 0) then call receive (r%real_buff(1:r%to(ipfrom)%nn), ipfrom, idp) do i = 1, r%to(ipfrom)%nn to_here(r%to(ipfrom)%k(i), & r%to(ipfrom)%l(i)) & = r%real_buff(i) end do end if ! send to idpth next processor if (r%from(ipto)%nn > 0) then do i = 1, r%from(ipto)%nn r%real_buff(i) = from_here(r%from(ipto)%k(i), & r%from(ipto)%l(i)) end do call send (r%real_buff(1:r%from(ipto)%nn), ipto, idp) end if end if end do if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total_after) time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total) end if end subroutine r_redist_22 !> FIXME : Add documentation subroutine r_redist_22_inv (r, from_here, to_here) use job_manage, only: time_message use mp, only: iproc, nproc, send, receive, get_mp_times implicit none type (redist_type), intent (in out) :: r real, dimension (r%to_low(1):, & r%to_low(2):), intent (in) :: from_here real, dimension (r%from_low(1):, & r%from_low(2):), intent (in out) :: to_here integer :: i, idp, ipto, ipfrom, iadp real :: mp_total, mp_total_after if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total) end if ! redistribute from local processor to local processor do i = 1, r%to(iproc)%nn to_here(r%from(iproc)%k(i), & r%from(iproc)%l(i)) & = from_here(r%to(iproc)%k(i), & r%to(iproc)%l(i)) end do ! redistribute to idpth next processor from idpth preceding processor ! or redistribute from idpth preceding processor to idpth next processor ! to avoid deadlocks do idp = 1, nproc-1 ipto = mod(iproc + idp, nproc) ipfrom = mod(iproc + nproc - idp, nproc) iadp = min(idp, nproc - idp) ! avoid deadlock AND ensure mostly parallel resolution if (mod(iproc/iadp,2) == 0) then ! send to idpth next processor if (r%to(ipto)%nn > 0) then do i = 1, r%to(ipto)%nn r%real_buff(i) = from_here(r%to(ipto)%k(i), & r%to(ipto)%l(i)) end do call send (r%real_buff(1:r%to(ipto)%nn), ipto, idp) end if ! receive from idpth preceding processor if (r%from(ipfrom)%nn > 0) then call receive (r%real_buff(1:r%from(ipfrom)%nn), ipfrom, idp) do i = 1, r%from(ipfrom)%nn to_here(r%from(ipfrom)%k(i), & r%from(ipfrom)%l(i)) & = r%real_buff(i) end do end if else ! receive from idpth preceding processor if (r%from(ipfrom)%nn > 0) then call receive (r%real_buff(1:r%from(ipfrom)%nn), ipfrom, idp) do i = 1, r%from(ipfrom)%nn to_here(r%from(ipfrom)%k(i), & r%from(ipfrom)%l(i)) & = r%real_buff(i) end do end if ! send to idpth next processor if (r%to(ipto)%nn > 0) then do i = 1, r%to(ipto)%nn r%real_buff(i) = from_here(r%to(ipto)%k(i), & r%to(ipto)%l(i)) end do call send (r%real_buff(1:r%to(ipto)%nn), ipto, idp) end if end if end do if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total_after) time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total) end if end subroutine r_redist_22_inv !> FIXME : Add documentation subroutine r_redist_32 (r, from_here, to_here) use job_manage, only: time_message use mp, only: iproc, nproc, send, receive, get_mp_times implicit none type (redist_type), intent (in out) :: r real, dimension (r%from_low(1):, & r%from_low(2):, & r%from_low(3):), intent (in) :: from_here real, dimension (r%to_low(1):, & r%to_low(2):), intent (in out) :: to_here integer :: i, idp, ipto, ipfrom, iadp real :: mp_total, mp_total_after if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total) end if ! redistribute from local processor to local processor do i = 1, r%from(iproc)%nn to_here(r%to(iproc)%k(i),& r%to(iproc)%l(i)) & = from_here(r%from(iproc)%k(i), & r%from(iproc)%l(i), & r%from(iproc)%m(i)) end do ! redistribute to idpth next processor from idpth preceding processor ! or redistribute from idpth preceding processor to idpth next processor ! to avoid deadlocks do idp = 1, nproc-1 ipto = mod(iproc + idp, nproc) ipfrom = mod(iproc + nproc - idp, nproc) iadp = min(idp, nproc - idp) ! avoid deadlock AND ensure mostly parallel resolution if (mod(iproc/iadp,2) == 0) then ! send to idpth next processor if (r%from(ipto)%nn > 0) then do i = 1, r%from(ipto)%nn r%real_buff(i) = from_here(r%from(ipto)%k(i), & r%from(ipto)%l(i), & r%from(ipto)%m(i)) end do call send (r%real_buff(1:r%from(ipto)%nn), ipto, idp) end if ! receive from idpth preceding processor if (r%to(ipfrom)%nn > 0) then call receive (r%real_buff(1:r%to(ipfrom)%nn), ipfrom, idp) do i = 1, r%to(ipfrom)%nn to_here(r%to(ipfrom)%k(i), & r%to(ipfrom)%l(i)) & = r%real_buff(i) end do end if else ! receive from idpth preceding processor if (r%to(ipfrom)%nn > 0) then call receive (r%real_buff(1:r%to(ipfrom)%nn), ipfrom, idp) do i = 1, r%to(ipfrom)%nn to_here(r%to(ipfrom)%k(i), & r%to(ipfrom)%l(i)) & = r%real_buff(i) end do end if ! send to idpth next processor if (r%from(ipto)%nn > 0) then do i = 1, r%from(ipto)%nn r%real_buff(i) = from_here(r%from(ipto)%k(i), & r%from(ipto)%l(i), & r%from(ipto)%m(i)) end do call send (r%real_buff(1:r%from(ipto)%nn), ipto, idp) end if end if end do if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total_after) time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total) end if end subroutine r_redist_32 !> FIXME : Add documentation subroutine r_redist_32_inv (r, from_here, to_here) use job_manage, only: time_message use mp, only: iproc, nproc, send, receive, get_mp_times implicit none type (redist_type), intent (in out) :: r real, dimension (r%to_low(1):, & r%to_low(2):), intent (in) :: from_here real, dimension (r%from_low(1):, & r%from_low(2):, & r%from_low(3):), intent (in out) :: to_here integer :: i, idp, ipto, ipfrom, iadp real :: mp_total, mp_total_after if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total) end if ! redistribute from local processor to local processor do i = 1, r%to(iproc)%nn to_here(r%from(iproc)%k(i), & r%from(iproc)%l(i), & r%from(iproc)%m(i)) & = from_here(r%to(iproc)%k(i), & r%to(iproc)%l(i)) end do ! redistribute to idpth next processor from idpth preceding processor ! or redistribute from idpth preceding processor to idpth next processor ! to avoid deadlocks do idp = 1, nproc-1 ipto = mod(iproc + idp, nproc) ipfrom = mod(iproc + nproc - idp, nproc) iadp = min(idp, nproc - idp) ! avoid deadlock AND ensure mostly parallel resolution if (mod(iproc/iadp,2) == 0) then ! send to idpth next processor if (r%to(ipto)%nn > 0) then do i = 1, r%to(ipto)%nn r%real_buff(i) = from_here(r%to(ipto)%k(i), & r%to(ipto)%l(i)) end do call send (r%real_buff(1:r%to(ipto)%nn), ipto, idp) end if ! receive from idpth preceding processor if (r%from(ipfrom)%nn > 0) then call receive (r%real_buff(1:r%from(ipfrom)%nn), ipfrom, idp) do i = 1, r%from(ipfrom)%nn to_here(r%from(ipfrom)%k(i), & r%from(ipfrom)%l(i), & r%from(ipfrom)%m(i)) & = r%real_buff(i) end do end if else ! receive from idpth preceding processor if (r%from(ipfrom)%nn > 0) then call receive (r%real_buff(1:r%from(ipfrom)%nn), ipfrom, idp) do i = 1, r%from(ipfrom)%nn to_here(r%from(ipfrom)%k(i), & r%from(ipfrom)%l(i), & r%from(ipfrom)%m(i)) & = r%real_buff(i) end do end if ! send to idpth next processor if (r%to(ipto)%nn > 0) then do i = 1, r%to(ipto)%nn r%real_buff(i) = from_here(r%to(ipto)%k(i), & r%to(ipto)%l(i)) end do call send (r%real_buff(1:r%to(ipto)%nn), ipto, idp) end if end if end do if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total_after) time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total) end if end subroutine r_redist_32_inv !> FIXME : Add documentation subroutine r_redist_42 (r, from_here, to_here) use job_manage, only: time_message use mp, only: iproc, nproc, send, receive, get_mp_times implicit none type (redist_type), intent (in out) :: r real, dimension (r%from_low(1):, & r%from_low(2):, & r%from_low(3):, & r%from_low(4):), intent (in) :: from_here real, dimension (r%to_low(1):, & r%to_low(2):), intent (in out) :: to_here integer :: i, idp, ipto, ipfrom, iadp real :: mp_total, mp_total_after if (.not. using_measure_scatter) then call time_message(.false.,time_redist,' Redistribute') call get_mp_times(total_time = mp_total) end if ! redistribute from local processor to local processor do i = 1, r%from(iproc)%nn to_here(r%to(iproc)%k(i),& r%to(iproc)%l(i)) & = from_here(r%from(iproc)%k(i), & r%from(iproc)%l(i), & r%from(iproc)%m(i), & r%from(iproc)%n(i)) end do ! redistribute to idpth next processor from idpth preceding processor ! or redistribute from idpth preceding processor to idpth next processor ! to avoid deadlocks do idp = 1, nproc-1 ipto = mod(iproc + idp, nproc) ipfrom = mod(iproc + nproc - idp, nproc) iadp = min(idp, nproc - idp)