Gather all of the fields to proc0 (or to all processors) for diagnostics etc.
| Type | Intent | Optional | Attributes | Name |
|---|---|---|---|---|
| class(fieldmat_type) | intent(inout) | | | self |
| complex | intent(inout) | | dimension(:,:,:) | ph |
| complex | intent(inout) | | dimension(:,:,:) | ap |
| complex | intent(inout) | | dimension(:,:,:) | bp |
| logical | intent(in) | optional | | to_all_in |
| logical | intent(in) | optional | | do_allreduce_in |
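For illustration only, a minimal sketch of a call. The instance name fm is an assumption, and the field arrays must be allocated as (-ntgrid:ntgrid, ntheta0, naky) to match the unpacking in the body below:

```fortran
! Hypothetical usage sketch: fm is an initialised fieldmat_type and
! ph, ap, bp are complex arrays shaped (-ntgrid:ntgrid, ntheta0, naky).
! Gather the fields and broadcast the result to every processor:
call fm_gather_fields(fm, ph, ap, bp, to_all_in=.true.)

! Or let every processor participate in an allreduce instead
! (this overrides to_all, as noted in the body below):
call fm_gather_fields(fm, ph, ap, bp, do_allreduce_in=.true.)
```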
```fortran
subroutine fm_gather_fields(self, ph, ap, bp, to_all_in, do_allreduce_in)
use theta_grid, only: ntgrid
use kt_grids, only: naky, ntheta0
use mp, only: broadcast, sum_reduce_sub, sum_allreduce, sum_allreduce_sub
use mp, only: rank_comm, comm_type, proc0
use run_parameters, only: has_phi, has_apar, has_bpar
use gs2_layouts, only: g_lo, intspec_sub
use array_utils, only: zero_array, copy
use optionals, only: get_option_with_default
implicit none
class(fieldmat_type), intent(inout) :: self
complex, dimension(:,:,:), intent(inout) :: ph,ap,bp
logical, optional, intent(in) :: to_all_in, do_allreduce_in
logical :: to_all, do_allreduce
complex, dimension(:,:,:,:), allocatable :: tmp, tmp_tmp
integer :: ifl, ik, is, ic, it, iex, bnd, ig
logical :: use_sub
type(comm_type), save :: les_comm
logical, save :: first=.true.
logical, save :: sub_has_proc0=.false.
logical :: xy_block_comm_is_available
logical :: x_is_not_split
!Set the to_all flag
to_all = get_option_with_default(to_all_in, .false.)
!Set the do_allreduce flag
do_allreduce = get_option_with_default(do_allreduce_in, .false.)
!Shouldn't do to_all if do_allreduce
if(do_allreduce) to_all=.false.
!Work out if we want to use sub communicators in reduction and
!if we are allowed to.
xy_block_comm_is_available = .not. (g_lo%x_local .and. g_lo%y_local) .and. intspec_sub
x_is_not_split = g_lo%x_local
use_sub=( &
xy_block_comm_is_available .and. x_is_not_split .and. & !Parallel decomposition constraints
do_allreduce .and. field_local_allreduce_sub) !Runtime choices
!If using sub-communicators, set up the communicator object and work
!out whether our sub-communicator has proc0 as a member
if(use_sub.and.first)then
!Setup type
les_comm%id=g_lo%lesblock_comm
call rank_comm(les_comm%id,les_comm%iproc)
!Decide who has proc0
ig=0
if(proc0) ig=1
call sum_allreduce_sub(ig,les_comm%id)
if(ig/=0) sub_has_proc0=.true.
endif
first=.false.
!Allocate the tmp array if we're part of the head subcom
!or if we're sending to all
if((self%fm_sub_headsc_p0%nproc>0).or.to_all.or.do_allreduce) then
if(use_sub)then
!Use a smaller array so we ignore zero entries. Really we would like to reuse some of
!the integrate species code to do the reduction and gather, but we have an extra dimension due to the fields
allocate(tmp_tmp(-ntgrid:ntgrid,g_lo%it_min:g_lo%it_max,g_lo%ik_min:g_lo%ik_max,self%nfield))
else
allocate(tmp_tmp(-ntgrid:ntgrid,ntheta0,naky,self%nfield))
endif
allocate(tmp(-ntgrid:ntgrid,ntheta0,naky,self%nfield))
call zero_array(tmp_tmp)
if(use_sub) call zero_array(tmp) !Only need to zero tmp if tmp_tmp is not the same size as tmp
endif
!Now loop over supercells, and store sections
!on procs which are part of the head subcom
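!The packed buffer tmp_sum holds the supercell solution as a flat list:
!the fields are stacked one after another and, within each field, the
!cells are laid out in order along the supercell. Every cell contributes
!its 2*ntgrid+1 theta points except that the final theta point is dropped
!for all but the last cell (the bnd offset below), as it coincides with
!the first theta point of the next cell.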
if(self%fm_sub_headsc_p0%nproc>0)then
!$OMP PARALLEL DO DEFAULT(none) &
!$OMP PRIVATE(ik, is, iex, ifl, ic, it, bnd, ig) &
!$OMP SHARED(self, ntgrid, tmp_tmp) &
!$OMP SCHEDULE(static)
do ik=1,self%naky
do is=1,self%kyb(ik)%nsupercell
!Skip if we're not the head of this supercell
if(.not.self%kyb(ik)%supercells(is)%is_head) cycle
iex=0
do ifl=1,self%nfield
do ic=1,self%kyb(ik)%supercells(is)%ncell
it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind
bnd=0
if(ic/=self%kyb(ik)%supercells(is)%ncell) bnd=1
do ig=-ntgrid,ntgrid-bnd
iex=iex+1
tmp_tmp(ig,it,ik,ifl)=self%kyb(ik)%supercells(is)%tmp_sum(iex)
enddo
enddo
enddo
!<DD>TAGGED:Worth looking at improving this bad memory pattern
!/Now fix boundary points: the last theta point of each interior cell was
!skipped above, so fill it from the first theta point of the next cell
if(self%kyb(ik)%supercells(is)%ncell>1) then
do ic=1,self%kyb(ik)%supercells(is)%ncell-1
tmp_tmp(ntgrid,self%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik,:)=&
tmp_tmp(-ntgrid,self%kyb(ik)%supercells(is)%cells(ic+1)%it_ind,ik,:)
enddo
endif
enddo
enddo
!$OMP END PARALLEL DO
!Now reduce
!Really we should be able to do some form of gather here
!as every proc knows how long the supercell is and where to put it
! call sum_allreduce_sub(tmp,self%fm_sub_headsc_p0%id)
if(.not.do_allreduce)call sum_reduce_sub(tmp_tmp,0,self%fm_sub_headsc_p0)
endif
!Should really be able to do this on the xyblock sub-communicator,
!but proc0 needs to know the full result for diagnostics, so we would need
!some way to tell proc0 about the other parts of the field (or to
!update the diagnostics to work in parallel). We have the lesblock comm,
!which should help with this, but we'd need to work out which procs
!are in the same comm as proc0.
if(do_allreduce) then
if(use_sub) then
!This reduction (and copy) ensures that every proc knows the field
!in its local x-y cells. It could be implemented as an allgatherv which
!would be slightly more efficient but is more complicated and may involve
!more local operations which outweigh the benefits.
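!A sketch of that allgatherv alternative (hypothetical: local_block, counts,
!displs and ierr are assumed helpers built from the layout, and the datatype
!must match the promoted complex kind):
! call MPI_Allgatherv(local_block, size(local_block), MPI_DOUBLE_COMPLEX, &
!      tmp_tmp, counts, displs, MPI_DOUBLE_COMPLEX, g_lo%xyblock_comm, ierr)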
call sum_allreduce_sub(tmp_tmp,g_lo%xyblock_comm)
else
call sum_allreduce(tmp_tmp)
endif
endif
!Now copy tmp_tmp into tmp if we have it
if((self%fm_sub_headsc_p0%nproc>0).or.to_all.or.do_allreduce) then
if(use_sub)then
tmp(:,g_lo%it_min:g_lo%it_max,g_lo%ik_min:g_lo%ik_max,:)=tmp_tmp
!Here we reduce to proc0 of the lesblock comm (which should include the
!global proc0). We only call this on procs whose sub-communicator
!contains proc0 (tracked by sub_has_proc0).
!NOTE: We don't need to know the result of this reduction until we
!reach the diagnostics. With non-blocking communications we may be able
!to do the required communication in the background whilst we proceed
!with the second call to timeadv. There is a slight complication in that
!we're communicating the change in the fields, so we have to remember to
!increment the field before the diagnostics are called.
if(sub_has_proc0) call sum_reduce_sub(tmp,0,les_comm)
else
call copy(tmp_tmp, tmp) !It would be nice to avoid this copy if possible
endif
endif
!////////////////////////
!// Distribute and unpack
!////////////////////////
!Note: Currently we use a broadcast from proc0 to distribute the data to all
!procs. It may be more efficient to use a scatter type approach so that we
!send the minimum amount of data, although this may end up being slower as a
!one-to-many scatter cannot exploit the optimised (tree-branching) broadcast
!which improves the effective bandwidth.
!We might expect a broadcast from the supercell head on sc_sub_all to be the
!most efficient, but it doesn't appear to be better than a global broadcast.
!As the supercell head knows the result quite early on in the
!field update algorithm, a non-blocking broadcast (MPI-3) on sc_sub_all
!may be useful and would allow the following broadcasts to be replaced
!with wait variants plus unpacking.
!Note: By doing the broadcast at the supercell level we would end up sending
!more data than required (if x is parallel), but to do it at the cell level
!would require a sub-communicator for each cell, which could easily exceed
!the maximum number allowed.
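!A sketch of that MPI-3 non-blocking variant (hypothetical: sc_head_rank,
!bcast_request and ierr are assumed locals, and the datatype must match the
!promoted complex kind):
! call MPI_Ibcast(tmp, size(tmp), MPI_DOUBLE_COMPLEX, sc_head_rank, &
!      self%fm_sub_all%id, bcast_request, ierr)
! !...the second call to timeadv could proceed here...
! call MPI_Wait(bcast_request, MPI_STATUS_IGNORE, ierr)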
!Fetch to all procs
if(to_all)then
call broadcast(tmp)
endif
!Finally, unpack tmp and deallocate the arrays.
!We use copy here for its OpenMP acceleration, although there is a worry
!that passing the array section tmp(:,:,:,ifl) could result in the creation
!of a temporary in some cases, leading to double copying.
if(self%fm_sub_all%proc0.or.to_all.or.do_allreduce)then
ifl=0
if(has_phi) then
ifl=ifl+1
call copy(tmp(:,:,:,ifl), ph)
endif
if(has_apar) then
ifl=ifl+1
call copy(tmp(:,:,:,ifl), ap)
endif
if(has_bpar) then
ifl=ifl+1
call copy(tmp(:,:,:,ifl), bp)
endif
endif
if(allocated(tmp)) deallocate(tmp)
if(allocated(tmp_tmp)) deallocate(tmp_tmp)
!/Fill the empties by broadcasting from proc0 if we
!didn't do it with the earlier broadcast. Note that here we
!have up to three broadcasts whilst with to_all we only have one;
!however, with to_all we have the additional cost of unpacking the data.
!It is possible that for some values of nfield to_all is faster than
!.not.to_all; this is also likely to depend on the machine.
if(.not.(to_all.or.do_allreduce)) then
if(has_phi) call broadcast(ph)
if(has_apar) call broadcast(ap)
if(has_bpar) call broadcast(bp)
endif
end subroutine fm_gather_fields
```