Integrate_species on subcommunicator with gather Falls back to original method if not using xyblock sub comm
Type | Intent | Optional | Attributes | Name | ||
---|---|---|---|---|---|---|
complex, | intent(in), | dimension (-ntgrid:,:,g_lo%llim_proc:) | :: | g | ||
real, | intent(inout), | dimension (:) | :: | weights | ||
complex, | intent(out), | dimension (0:,:,:) | :: | total | ||
logical, | intent(in), | optional | :: | nogath | ||
logical, | intent(in), | optional | :: | gf_lo |
subroutine integrate_species_master (g, weights, total, nogath, gf_lo)
use theta_grid, only: ntgrid
use kt_grids, only: ntheta0, naky
use gs2_layouts, only: g_lo, intspec_sub
use gs2_layouts, only: is_idx, ik_idx, it_idx, ie_idx, il_idx
use mp, only: sum_allreduce_sub, allgatherv, nproc_comm,rank_comm,mp_abort
use optionals, only: get_option_with_default
use species, only: spec, tracer_species
use array_utils, only: zero_array
implicit none
complex, dimension (-ntgrid:,:,g_lo%llim_proc:), intent (in) :: g
real, dimension (:), intent (in out) :: weights
complex, dimension (0:,:,:), intent (out) :: total
logical, intent(in), optional :: nogath
logical, intent(in), optional :: gf_lo
complex, dimension (:), allocatable :: total_flat
complex, dimension (:,:,:), allocatable :: total_transp
integer :: nl,nr, ik, it, iglo, ip, ie,is,il, ig
logical :: gf_lo_local, nogath_local
gf_lo_local = get_option_with_default(gf_lo, .false.)
if(gf_lo_local) then
call integrate_species_gf(g,weights,total)
return
end if
!If not using sub-communicators then just use original method
!Note that if x and y are entirely local then we force intspec_sub=.false.
if(.not.intspec_sub) then
call integrate_species_original(g,weights,total)
return
endif
!If we don't want to gather then use integrate_species_sub
nogath_local = get_option_with_default(nogath, .false.)
if(nogath_local)then
call integrate_species_sub(g,weights,total)
return
endif
!->First intialise gather vars
!Note: We only do this on the first call !!May be better to move this to some init routine?
if(.not.allocated(recvcnts_intspec)) then
!Get subcomm size
call nproc_comm(g_lo%lesblock_comm,sz_intspec)
!Get local rank
call rank_comm(g_lo%lesblock_comm,local_rank_intspec)
!Create displacement and receive count arrays
allocate(recvcnts_intspec(sz_intspec),displs_intspec(sz_intspec))
do ip=0,sz_intspec-1
displs_intspec(ip+1)=MIN(g_lo%les_kxky_range(1,ip)*(2*ntgrid+1),ntheta0*naky*(2*ntgrid+1)-1)
recvcnts_intspec(ip+1)=MAX((g_lo%les_kxky_range(2,ip)-g_lo%les_kxky_range(1,ip)+1)*(2*ntgrid+1),0)
enddo
endif
!Allocate array and ensure is zero
allocate(total_flat(g_lo%les_kxky_range(1,local_rank_intspec)*&
(2*ntgrid+1):(1+g_lo%les_kxky_range(2,local_rank_intspec))*(2*ntgrid+1)))
call zero_array(total_flat)
where (spec%type == tracer_species) weights = 0
!Performed integral (weighted sum) over local velocity space and species
if(g_lo%x_before_y) then
!$OMP PARALLEL DO DEFAULT(none) &
!$OMP PRIVATE(iglo, it, ik, ie, il, is, nl, nr) &
!$OMP SHARED(g_lo, ntgrid, weights, w, wl, g, ntheta0) &
!$OMP REDUCTION(+ : total_flat) &
!$OMP SCHEDULE(static)
do iglo = g_lo%llim_proc, g_lo%ulim_proc
!Convert from iglo to the separate indices
is = is_idx(g_lo,iglo)
ik = ik_idx(g_lo,iglo)
it = it_idx(g_lo,iglo)
ie = ie_idx(g_lo,iglo)
il = il_idx(g_lo,iglo)
!Calculate extent
nl=(2*ntgrid+1)*(it-1+ntheta0*(ik-1))
nr=nl+(2*ntgrid)
!Sum up weighted g
total_flat(nl:nr) = total_flat(nl:nr) + &
(weights(is)*w(ie,is))*wl(:,il)*(g(:,1,iglo)+g(:,2,iglo))
end do
!$OMP END PARALLEL DO
else
!$OMP PARALLEL DO DEFAULT(none) &
!$OMP PRIVATE(iglo, it, ik, ie, il, is, nl, nr) &
!$OMP SHARED(g_lo, ntgrid, weights, w, wl, g, naky) &
!$OMP REDUCTION(+ : total_flat) &
!$OMP SCHEDULE(static)
do iglo = g_lo%llim_proc, g_lo%ulim_proc
!Convert from iglo to the separate indices
is = is_idx(g_lo,iglo)
ik = ik_idx(g_lo,iglo)
it = it_idx(g_lo,iglo)
ie = ie_idx(g_lo,iglo)
il = il_idx(g_lo,iglo)
!Calculate extent
nl=(2*ntgrid+1)*(ik-1+naky*(it-1))
nr=nl+(2*ntgrid)
!Sum up weighted g
total_flat(nl:nr) = total_flat(nl:nr) + &
(weights(is)*w(ie,is))*wl(:,il)*(g(:,1,iglo)+g(:,2,iglo))
end do
!$OMP END PARALLEL DO
endif
!Reduce sum across all procs in sub communicator to make integral over all velocity space and species
call sum_allreduce_sub(total_flat,g_lo%xyblock_comm)
!Now gather missing xy data from other procs (only talk to procs
!with the same piece of les)
if(g_lo%x_before_y)then
call allgatherv(total_flat,recvcnts_intspec(local_rank_intspec+1),total,recvcnts_intspec,displs_intspec,g_lo%lesblock_comm)
else
allocate(total_transp(0:2*ntgrid,naky,ntheta0))
call allgatherv(total_flat,recvcnts_intspec(local_rank_intspec+1),total_transp,recvcnts_intspec,displs_intspec,g_lo%lesblock_comm)
do ig=0,2*ntgrid
total(ig,:,:)=transpose(total_transp(ig,:,:))
enddo
!This is pretty bad for memory access so can do this all at once
!using reshape with a specified order :
!total=RESHAPE(total_transp,(/2*ntgrid+1,ntheta0,naky/),ORDER=(/1,3,2/))
!BUT timings in a simple test code suggest loop+transpose can be faster.
!In the case where ntgrid is large and ntheta0 is small reshape can win
!whilst in the case where ntgrid is small and ntheta0 is large transpose wins.
!When both are large reshape seems to win.
!In conclusion it's not clear which method is better but if we assume we care most
!about nonlinear simulations then small ntgrid with large ntheta0 is most likely so
!pick transpose method
deallocate(total_transp)
endif
deallocate(total_flat)
end subroutine integrate_species_master