integrate_species_master Subroutine

private subroutine integrate_species_master(g, weights, total, nogath, gf_lo)

Integrate_species on subcommunicator with gather Falls back to original method if not using xyblock sub comm

Arguments

Type IntentOptional Attributes Name
complex, intent(in), dimension (-ntgrid:,:,g_lo%llim_proc:) :: g
real, intent(inout), dimension (:) :: weights
complex, intent(out), dimension (0:,:,:) :: total
logical, intent(in), optional :: nogath
logical, intent(in), optional :: gf_lo

Contents


Source Code

  subroutine integrate_species_master (g, weights, total, nogath, gf_lo)
    use theta_grid, only: ntgrid
    use kt_grids, only: ntheta0, naky
    use gs2_layouts, only: g_lo, intspec_sub
    use gs2_layouts, only: is_idx, ik_idx, it_idx, ie_idx, il_idx
    use mp, only: sum_allreduce_sub, allgatherv, nproc_comm,rank_comm,mp_abort
    use optionals, only: get_option_with_default
    use species, only: spec, tracer_species
    use array_utils, only: zero_array
    implicit none
    complex, dimension (-ntgrid:,:,g_lo%llim_proc:), intent (in) :: g
    real, dimension (:), intent (in out) :: weights
    complex, dimension (0:,:,:), intent (out) :: total
    logical, intent(in), optional :: nogath
    logical, intent(in), optional :: gf_lo
    complex, dimension (:), allocatable :: total_flat
    complex, dimension (:,:,:), allocatable :: total_transp
    integer :: nl,nr, ik, it, iglo, ip, ie,is,il, ig
    logical :: gf_lo_local, nogath_local

    gf_lo_local = get_option_with_default(gf_lo, .false.)
    if(gf_lo_local) then
       call integrate_species_gf(g,weights,total)
       return
    end if

    !If not using sub-communicators then just use original method
    !Note that if x and y are entirely local then we force intspec_sub=.false.
    if(.not.intspec_sub) then
       call integrate_species_original(g,weights,total)
       return
    endif

    !If we don't want to gather then use integrate_species_sub
    nogath_local = get_option_with_default(nogath, .false.)
    if(nogath_local)then
       call integrate_species_sub(g,weights,total)
       return
    endif

    !->First intialise gather vars
    !Note: We only do this on the first call !!May be better to move this to some init routine?
    if(.not.allocated(recvcnts_intspec)) then
       !Get subcomm size
       call nproc_comm(g_lo%lesblock_comm,sz_intspec)

       !Get local rank
       call rank_comm(g_lo%lesblock_comm,local_rank_intspec)

       !Create displacement and receive count arrays
       allocate(recvcnts_intspec(sz_intspec),displs_intspec(sz_intspec))

       do ip=0,sz_intspec-1
          displs_intspec(ip+1)=MIN(g_lo%les_kxky_range(1,ip)*(2*ntgrid+1),ntheta0*naky*(2*ntgrid+1)-1)
          recvcnts_intspec(ip+1)=MAX((g_lo%les_kxky_range(2,ip)-g_lo%les_kxky_range(1,ip)+1)*(2*ntgrid+1),0)
       enddo
    endif

    !Allocate array and ensure is zero
    allocate(total_flat(g_lo%les_kxky_range(1,local_rank_intspec)*&
         (2*ntgrid+1):(1+g_lo%les_kxky_range(2,local_rank_intspec))*(2*ntgrid+1)))
    call zero_array(total_flat)
    where (spec%type == tracer_species) weights = 0
    !Performed integral (weighted sum) over local velocity space and species
    if(g_lo%x_before_y) then
       !$OMP PARALLEL DO DEFAULT(none) &
       !$OMP PRIVATE(iglo, it, ik, ie, il, is, nl, nr) &
       !$OMP SHARED(g_lo, ntgrid, weights, w, wl, g, ntheta0) &
       !$OMP REDUCTION(+ : total_flat) &
       !$OMP SCHEDULE(static)
       do iglo = g_lo%llim_proc, g_lo%ulim_proc
          !Convert from iglo to the separate indices
          is = is_idx(g_lo,iglo)
          ik = ik_idx(g_lo,iglo)
          it = it_idx(g_lo,iglo)
          ie = ie_idx(g_lo,iglo)
          il = il_idx(g_lo,iglo)
          
          !Calculate extent
          nl=(2*ntgrid+1)*(it-1+ntheta0*(ik-1))
          nr=nl+(2*ntgrid)
          
          !Sum up weighted g
          total_flat(nl:nr) = total_flat(nl:nr) + &
               (weights(is)*w(ie,is))*wl(:,il)*(g(:,1,iglo)+g(:,2,iglo))
       end do
       !$OMP END PARALLEL DO
    else
       !$OMP PARALLEL DO DEFAULT(none) &
       !$OMP PRIVATE(iglo, it, ik, ie, il, is, nl, nr) &
       !$OMP SHARED(g_lo, ntgrid, weights, w, wl, g, naky) &
       !$OMP REDUCTION(+ : total_flat) &
       !$OMP SCHEDULE(static)
       do iglo = g_lo%llim_proc, g_lo%ulim_proc
          !Convert from iglo to the separate indices
          is = is_idx(g_lo,iglo)
          ik = ik_idx(g_lo,iglo)
          it = it_idx(g_lo,iglo)
          ie = ie_idx(g_lo,iglo)
          il = il_idx(g_lo,iglo)
          
          !Calculate extent
          nl=(2*ntgrid+1)*(ik-1+naky*(it-1))
          nr=nl+(2*ntgrid)

          !Sum up weighted g
          total_flat(nl:nr) = total_flat(nl:nr) + &
               (weights(is)*w(ie,is))*wl(:,il)*(g(:,1,iglo)+g(:,2,iglo))
       end do
       !$OMP END PARALLEL DO
    endif

    !Reduce sum across all procs in sub communicator to make integral over all velocity space and species
    call sum_allreduce_sub(total_flat,g_lo%xyblock_comm)

    !Now gather missing xy data from other procs (only talk to procs
    !with the same piece of les)

    if(g_lo%x_before_y)then
       call allgatherv(total_flat,recvcnts_intspec(local_rank_intspec+1),total,recvcnts_intspec,displs_intspec,g_lo%lesblock_comm)
    else
       allocate(total_transp(0:2*ntgrid,naky,ntheta0))
       call allgatherv(total_flat,recvcnts_intspec(local_rank_intspec+1),total_transp,recvcnts_intspec,displs_intspec,g_lo%lesblock_comm)
       do ig=0,2*ntgrid
          total(ig,:,:)=transpose(total_transp(ig,:,:))
       enddo
       !This is pretty bad for memory access so can do this all at once
       !using reshape with a specified order :
       !total=RESHAPE(total_transp,(/2*ntgrid+1,ntheta0,naky/),ORDER=(/1,3,2/))
       !BUT timings in a simple test code suggest loop+transpose can be faster.
       !In the case where ntgrid is large and ntheta0 is small reshape can win
       !whilst in the case where ntgrid is small and ntheta0 is large transpose wins.
       !When both are large reshape seems to win.
       !In conclusion it's not clear which method is better but if we assume we care most
       !about nonlinear simulations then small ntgrid with large ntheta0 is most likely so
       !pick transpose method
       deallocate(total_transp)
    endif

    deallocate(total_flat)
  end subroutine integrate_species_master