fm_gather_fields Subroutine

private subroutine fm_gather_fields(self, ph, ap, bp, to_all_in, do_allreduce_in)

Gather all the fields to proc0/all for diagnostics etc.

Type Bound

fieldmat_type

Arguments

Type                  Intent          Optional   Attributes          Name
class(fieldmat_type)  intent(inout)                                  self
complex               intent(inout)              dimension(:,:,:)    ph
complex               intent(inout)              dimension(:,:,:)    ap
complex               intent(inout)              dimension(:,:,:)    bp
logical               intent(in)      optional                       to_all_in
logical               intent(in)      optional                       do_allreduce_in
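
A minimal usage sketch follows, assuming a module-level fieldmat_type instance named fieldmat, field arrays dimensioned (-ntgrid:ntgrid, ntheta0, naky), and a type-bound name gather_fields for this procedure; these names are illustrative assumptions rather than part of the documented interface.

    ! Hypothetical call site (names are placeholders):
    ! gather the fields onto proc0 only; to_all_in and do_allreduce_in
    ! default to .false. when omitted.
    call fieldmat%gather_fields(phi_tot, apar_tot, bpar_tot)

    ! Alternatively, make every processor hold the full field arrays by
    ! passing do_allreduce_in (note this forces to_all to .false.).
    call fieldmat%gather_fields(phi_tot, apar_tot, bpar_tot, do_allreduce_in=.true.)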

Source Code

  subroutine fm_gather_fields(self,ph,ap,bp,to_all_in,do_allreduce_in)
    use theta_grid, only: ntgrid
    use kt_grids, only: naky, ntheta0
    use mp, only: broadcast, sum_reduce_sub, sum_allreduce, sum_allreduce_sub
    use mp, only: rank_comm, comm_type, proc0
    use run_parameters, only: has_phi, has_apar, has_bpar
    use gs2_layouts, only: g_lo, intspec_sub
    use array_utils, only: zero_array, copy
    use optionals, only: get_option_with_default
    implicit none
    class(fieldmat_type), intent(inout) :: self
    complex, dimension(:,:,:), intent(inout) :: ph,ap,bp
    logical, optional, intent(in) :: to_all_in, do_allreduce_in
    logical :: to_all, do_allreduce
    complex, dimension(:,:,:,:), allocatable :: tmp, tmp_tmp
    integer :: ifl, ik, is, ic, it, iex, bnd, ig
    logical :: use_sub
    type(comm_type), save :: les_comm
    logical, save :: first=.true.
    logical, save :: sub_has_proc0=.false.
    logical :: xy_block_comm_is_available
    logical :: x_is_not_split

    !Set the to_all flag
    to_all = get_option_with_default(to_all_in, .false.)

    !Set the do_allreduce flag
    do_allreduce = get_option_with_default(do_allreduce_in, .false.)
    
    !Shouldn't do to_all if do_allreduce
    if(do_allreduce) to_all=.false.

    !Work out if we want to use sub communicators in reduction and
    !if we are allowed to.
    xy_block_comm_is_available = .not. (g_lo%x_local .and. g_lo%y_local) .and. intspec_sub
    x_is_not_split = g_lo%x_local
    use_sub=( &
         xy_block_comm_is_available .and. x_is_not_split .and. & !Parallel decomposition constraints
         do_allreduce .and. field_local_allreduce_sub) !Runtime choices

    !If using sub-communicators set up object and work out which procs
    !have proc0 as a member
    if(use_sub.and.first)then
       !Setup type
       les_comm%id=g_lo%lesblock_comm
       call rank_comm(les_comm%id,les_comm%iproc)
       !Decide who has proc0
       ig=0
       if(proc0) ig=1
       call sum_allreduce_sub(ig,les_comm%id)
       if(ig/=0) sub_has_proc0=.true.
    endif
    first=.false.

    !Allocate the tmp array if we're part of the head subcom
    !or if we're sending to all
    if((self%fm_sub_headsc_p0%nproc>0).or.to_all.or.do_allreduce) then
       if(use_sub)then
          !Use a smaller array so we ignore zero entries, really would like to reuse some of the
          !integrate species code to do reduction and gather but have extra dimension due to fields
          allocate(tmp_tmp(-ntgrid:ntgrid,g_lo%it_min:g_lo%it_max,g_lo%ik_min:g_lo%ik_max,self%nfield))
       else
          allocate(tmp_tmp(-ntgrid:ntgrid,ntheta0,naky,self%nfield))
       endif
       allocate(tmp(-ntgrid:ntgrid,ntheta0,naky,self%nfield))
       call zero_array(tmp_tmp)
       if(use_sub) call zero_array(tmp) !Only need to zero if tmp_tmp is not same size as tmp
    endif

    !Now loop over supercells, and store sections
    !on procs which are part of the head subcom
    if(self%fm_sub_headsc_p0%nproc>0)then
       !$OMP PARALLEL DO DEFAULT(none) &
       !$OMP PRIVATE(ik, is, iex, ifl, ic, it, bnd, ig) &
       !$OMP SHARED(self, ntgrid, tmp_tmp) &
       !$OMP SCHEDULE(static)
       do ik=1,self%naky
          do is=1,self%kyb(ik)%nsupercell
             !Skip if we're not the head supercell
             if(.not.self%kyb(ik)%supercells(is)%is_head) cycle
  
             iex=0
             do ifl=1,self%nfield
                do ic=1,self%kyb(ik)%supercells(is)%ncell
                   it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind
                   bnd=0
                   if(ic/=self%kyb(ik)%supercells(is)%ncell) bnd=1
                   do ig=-ntgrid,ntgrid-bnd
                      iex=iex+1
                      tmp_tmp(ig,it,ik,ifl)=self%kyb(ik)%supercells(is)%tmp_sum(iex)
!                      if(ifl .eq. 1) write(*,*) self%kyb(ik)%supercells(is)%tmp_sum(iex)
                  enddo
                enddo
             enddo
!<DD>TAGGED:Worth looking at improving this bad memory pattern
             !/Now fix boundary points
             if(self%kyb(ik)%supercells(is)%ncell>1) then
                do ic=1,self%kyb(ik)%supercells(is)%ncell-1
                   tmp_tmp(ntgrid,self%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik,:)=&
                        tmp_tmp(-ntgrid,self%kyb(ik)%supercells(is)%cells(ic+1)%it_ind,ik,:)
                enddo
             endif
          enddo
       enddo
       !$OMP END PARALLEL DO

       !Now reduce
       !Really we should be able to do some form of gather here
       !as every proc knows how long the supercell is and where to put it
!       call sum_allreduce_sub(tmp,self%fm_sub_headsc_p0%id)
       if(.not.do_allreduce)call sum_reduce_sub(tmp_tmp,0,self%fm_sub_headsc_p0)
    endif

!    write(*,*) 'tmp_tmp',tmp_tmp(:,:,:,1)

    !Should really be able to do this on the xyblock sub communicator
    !but proc0 needs to know full result for diagnostics so would need
    !some way to tell proc0 about the other parts of the field (or
    !update diagnostics to work in parallel). We have the lesblock comm
    !which should help with this, but we'd need to work out which procs
    !are in the same comm as proc0. 
    if(do_allreduce) then
       if(use_sub) then
          !This reduction (and copy) ensures that every proc knows the field
          !in its local x-y cells. It could be implemented as an allgatherv which
          !would be slightly more efficient but is more complicated and may involve 
          !more local operations which outweigh the benefits. 
          call sum_allreduce_sub(tmp_tmp,g_lo%xyblock_comm)
       else
          call sum_allreduce(tmp_tmp)
       endif
    endif

    !Now copy tmp_tmp into tmp if we have it
    if((self%fm_sub_headsc_p0%nproc>0).or.to_all.or.do_allreduce) then
       if(use_sub)then
          tmp(:,g_lo%it_min:g_lo%it_max,g_lo%ik_min:g_lo%ik_max,:)=tmp_tmp
          
          !Here we reduce to proc0 of the lesblock comm (which should include the
          !global proc0). Should only really need to call this on procs which
          !have proc0 in their subcomm but no way to determine this currently
          !NOTE: We don't need to know the result of this reduction until we
          !reach the diagnostics. With non-blocking communications we may be able
          !to do this required communication in the background whilst we proceed
          !with the second call to timeadv. There is a slight complication in that
          !we're communicating the change in the fields so we have to remember to 
          !increment the field before diagnostics are called.
          if(sub_has_proc0) call sum_reduce_sub(tmp,0,les_comm) 
       else
          call copy(tmp_tmp, tmp) !It would be nice to avoid this copy if possible
       endif
    endif

    !////////////////////////
    !// Distribute and unpack
    !////////////////////////
    !Note: Currently use a broadcast from proc0 to distribute data to all procs
    !May be more efficient to use a scatter type approach so we send the minimum
    !amount of data. This may end up being slower as we don't take advantage of
    !optimised broadcast which improves bandwidth (branching), as we're doing 1 to many.
    !Might expect a broadcast from the supercell head on sc_sub_all to be most
    !efficient, but it doesn't appear to be better than global broadcast.
    !As the supercell head has knowledge of the result quite early on in the
    !field update algorithm a non-blocking broadcast (MPI-3) on sc_sub_all
    !may be useful and would allow the following broadcasts to be replaced
    !with wait variants plus unpacking.
    !Note: By doing the broadcast at supercell level we would end up sending
    !more data than required (if x is parallel) but to do it at cell level 
    !would require a subcommunicator for each cell, which can easily exceed
    !the maximum number allowed.

    !Fetch to all procs
    if(to_all)then
       call broadcast(tmp)
    endif

    !Finally, unpack tmp and deallocate arrays
    ! We could perhaps use copy here to add OpenMP acceleration
    ! but I worry this could result in the creation of a temporary
    ! in some cases, leading to double copying?
    if(self%fm_sub_all%proc0.or.to_all.or.do_allreduce)then
       ifl=0
       if(has_phi) then
          ifl=ifl+1
          call copy(tmp(:,:,:,ifl), ph)
       endif
       if(has_apar) then
          ifl=ifl+1
          call copy(tmp(:,:,:,ifl), ap)
       endif
       if(has_bpar) then
          ifl=ifl+1
          call copy(tmp(:,:,:,ifl), bp)
       endif
    endif

    if(allocated(tmp)) deallocate(tmp)
    if(allocated(tmp_tmp)) deallocate(tmp_tmp)

    !/Fill the empties by broadcasting from proc0 if we
    !didn't do it with the earlier broadcast. Note that here we
    !have up to three broadcasts whilst with to_all we only have one.
    !However with to_all we have the additional cost of unpacking data.
    !It is possible that for some values of nfield to_all is faster than
    !.not.to_all, this is also likely to depend on the machine
    if(.not.(to_all.or.do_allreduce)) then
       if(has_phi) call broadcast(ph)
       if(has_apar) call broadcast(ap)
       if(has_bpar) call broadcast(bp)
    endif
  end subroutine fm_gather_fields
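
The in-code notes above suggest that the final broadcasts could be replaced by an MPI-3 non-blocking broadcast started by the supercell head, with the wait deferred until the data is actually needed. A minimal standalone sketch of that overlap pattern, using the raw mpi module rather than the mp wrappers and with a placeholder buffer size and communicator, is:

      ! Illustration only: overlap a broadcast with other work using
      ! MPI-3 non-blocking collectives (MPI_Ibcast + MPI_Wait).
      program ibcast_overlap
        use mpi
        implicit none
        integer, parameter :: n = 1024
        complex, dimension(n) :: field_buf
        integer :: rank, request, ierr

        call MPI_Init(ierr)
        call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)

        ! The root fills the buffer once the result is known.
        if (rank == 0) field_buf = cmplx(1.0, 0.0)

        ! Start the broadcast early...
        call MPI_Ibcast(field_buf, n, MPI_COMPLEX, 0, MPI_COMM_WORLD, request, ierr)

        ! ...do unrelated work here (e.g. the second advance step)...

        ! ...and only wait where the broadcast data is actually needed.
        call MPI_Wait(request, MPI_STATUS_IGNORE, ierr)

        call MPI_Finalize(ierr)
      end program ibcast_overlap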