!> This module implements the same implicit algorithm as fields_implicit. !! However, it is an almost complete rewrite of the implementation !! which can give large gains in the speed of calculating the response !! matrix, as well as significant gains to the advance steps module fields_local use mp, only: comm_type implicit none private ! Module level routines public :: init_fields_local, init_allfields_local, finish_fields_local public :: advance_local, reset_fields_local, dump_response_to_file_local public :: fields_local_functional, read_response_from_file_local public :: do_smart_update, field_local_allreduce, field_local_allreduce_sub public :: field_local_tuneminnrow, field_local_nonblocking_collectives public :: minNRow, init_fields_matrixlocal public :: rowblock_type, cell_type, supercell_type, ky_type, fieldmat_type, pc_type ! User set parameters public :: dump_response, read_response ! Made public for testing public :: fieldmat !> Public type allowing tests to access otherwise-private routines type, public :: fields_local_testing contains procedure, nopass :: getfield_local end type fields_local_testing interface dump_response_to_file_local module procedure dump_response_to_file_local_passed module procedure dump_response_to_file_local_nopass end interface dump_response_to_file_local !////////////////////////////// !// CUSTOM TYPES !////////////////////////////// !> This is the lowest level of data object and will hold the actual data !! involved with the field matrix type :: rowblock_type complex, dimension(:,:), allocatable :: data !< This is the field data integer :: row_llim, row_ulim !< The row limits this block corresponds to integer :: col_llim, col_ulim !< The col limits this block corresponds to integer :: nrow, ncol !< The size of the row block complex, dimension(:),allocatable :: tmp_sum !< Work space for field update !/Row and parent properties. Mostly for debug printing integer :: ir_ind !< Property of rowblock integer :: it_ind, ik_ind !< Details of parents integer :: is_ind, ic_ind !< Details of parents so cell can find itself contains procedure :: deallocate => rb_deallocate procedure :: allocate => rb_allocate procedure :: init => rb_init procedure :: debug_print => rb_debug_print procedure :: mv_mult_fun => rb_mv_mult_fun procedure :: mv_mult => rb_mv_mult procedure :: irex_to_ir => rb_irex_to_irloc procedure :: reset => rb_reset procedure :: set_nrow => rb_set_nrow end type rowblock_type !> This is the next level up of data and represents the cell type. !! A cell has a unique it,ik pair and represents all the fields/field eq !! on the extended domain for the given it/ik type :: cell_type type(rowblock_type), dimension(:), allocatable :: rb !< These are the row blocks, currently one for each field equation integer :: nrb !< How many row blocks, equal to nfield !//NOTE: probably don't want these as this data should be stored by the row blocks only integer :: row_llim, row_ulim !< The row limits, always 1 to nfield*nextend integer, dimension(:), allocatable :: col_llim, col_ulim !< These are the column limits for each fdq !////// integer :: nrow, ncol !< The number of rows and columns for each field equation. integer :: ncol_tot !< The total number of columns (nfdq*ncol) logical :: is_local !< Does this cell have any data on this proc? logical :: is_empty !< Have we got any data for this cell on this proc? logical :: is_all_local !< Is all of this cells data on this proc? logical :: ignore_boundary !< Do we ignore the boundary point (column) in this cell? 
complex, dimension(:),allocatable :: tmp_sum !< Work space for field update type(comm_type) :: parent_sub !< Sub communicator involving all processors in parent !Cell and parent properties. Mostly for debug printing. integer :: ic_ind, it_ind !< Cell properties integer :: ik_ind, is_ind !< Parent properties contains procedure :: deallocate => c_deallocate procedure :: allocate => c_allocate procedure :: init => c_init procedure :: debug_print => c_debug_print procedure :: mv_mult_rb => c_mv_mult_rb procedure :: get_field_update => c_get_field_update procedure :: reset => c_reset procedure :: set_locality => c_set_locality procedure :: has_row => c_has_row end type cell_type !> This is the next level up of data and represents the supercell type. !! A supercell represents a collection of connected cells type :: supercell_type type(cell_type), dimension(:), allocatable :: cells !< These are the cells integer :: ncell integer :: nextend !< Length of the extended domain integer :: nrow, ncol !< The number of rows and columns. Equal to nextend*nfield integer :: it_leftmost !< It index of leftmost cell integer :: head_iproc_world !< The proc id (in comm world) of the head proc integer :: head_iproc !< The proc id (in sc_sub_all) of the head proc integer :: head_iproc_pd !< The proc id (in sc_sub_pd) of the head proc logical :: is_local !< Does this supercell have any data on this proc? logical :: is_empty !< Have we got any data for this supercell on this proc? logical :: is_all_local !< Is all of this supercells data on this proc? complex, dimension(:),allocatable :: tmp_sum !< Work space for field update type(comm_type) :: sc_sub_all !< Sub communicator involving all processors with this supercell type(comm_type) :: sc_sub_pd !< Sub communicator for all procs with some data but not all of it type(comm_type) :: parent_sub !< Sub communicator involving all processors in parent integer, dimension(:),allocatable :: nb_req_hand !< For non-blocking broadcast request handle storage logical :: initdone !< Have we finished initialising this block? logical, dimension(:), allocatable :: initialised !< Have we initialised each point? logical :: is_head = .false. !< Are we the head of this supercell? !Cell and parent properties. Mostly for debug printing. !> The is_ind value is the index of the supercell in the parent ky_type's !> supercell array. integer :: is_ind !> The is_label is the supercell_label of this supercell as determined !> by calculate_supercell_labels from elsewhere. integer :: is_label integer :: ik_ind !< Parent properties integer :: collective_request real :: condition_number = -1 !< Condition number of the associated response matrix. Only valid on the head. 
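     ! Illustrative sizes for this type (nothing is set here; see sc_init for the
     ! actual calculation): a supercell built from ncell connected cells spans
     ! nextend = 1 + 2*ntgrid*ncell extended theta points, and the response matrix
     ! it owns is square with nrow = ncol = nextend*nfield. For example, with
     ! ntgrid=16, ncell=3 and nfield=2 this gives nextend = 97 and a 194x194 matrix.
     ! The condition_number above is only meaningful on the head process (it stays
     ! at -1 elsewhere); values above condition_number_warning_limit =
     ! sqrt(1.0/epsilon(1.0)) (roughly 3e3 in single precision, 7e7 in double
     ! precision) trigger a warning.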
contains procedure :: deallocate => sc_deallocate procedure :: allocate => sc_allocate procedure :: init => sc_init procedure :: debug_print => sc_debug_print procedure :: get_field_update => sc_get_field_update procedure :: reduce_tmpsum => sc_reduce_tmpsum procedure :: iex_to_dims => sc_iex_to_dims procedure :: iex_to_ifl => sc_iex_to_ifl procedure :: iex_to_ic => sc_iex_to_ic procedure :: iex_to_ig => sc_iex_to_ig procedure :: iex_to_it => sc_iex_to_it procedure :: has_it => sc_has_it procedure :: reset => sc_reset procedure :: set_locality => sc_set_locality procedure :: get_left_it => sc_get_left_it procedure :: store_fq => sc_store_fq procedure :: pull_rows_to_arr => sc_pull_rows_to_arr procedure :: push_arr_to_rows => sc_push_arr_to_rows procedure :: prepare => sc_prepare procedure :: invert => sc_invert procedure :: invert_local => sc_invert_local procedure :: invert_mpi => sc_invert_mpi procedure :: dump => sc_dump procedure :: make_subcom_1 => sc_make_subcom_1 procedure :: make_subcom_2 => sc_make_subcom_2 procedure :: dump_to_file => sc_dump_to_file procedure :: read_from_file => sc_read_from_file end type supercell_type !> This is the next level up of data and represents the ky type. !! A ky block consists of a number of supercells type :: ky_type type(supercell_type), dimension(:), allocatable :: supercells !< These are the supercells integer :: nsupercell logical :: is_local !< Does this supercell have any data on this proc? logical :: is_empty !< Have we got any data for this supercell on this proc? logical :: is_all_local !< Is all of this supercells data on this proc? logical :: initdone !< Have we finished initialising this block? type(comm_type) :: ky_sub_all !< Sub communicator involving all processors with this ky type(comm_type) :: parent_sub !< Sub communicator involving all processors in parent !Cell and parent properties. Mostly for debug printing. integer :: ik_ind contains procedure :: deallocate => ky_deallocate procedure :: allocate => ky_allocate procedure :: init => ky_init procedure :: debug_print => ky_debug_print procedure :: get_field_update => ky_get_field_update procedure :: wait_for_nonblocking_collectives => ky_wait_for_nonblocking_collectives procedure :: reset => ky_reset procedure :: set_locality => ky_set_locality procedure :: is_from_it => ky_is_from_it procedure :: store_fq => ky_store_fq procedure :: prepare => ky_prepare procedure :: make_subcom_1 => ky_make_subcom_1 procedure :: make_subcom_2 => ky_make_subcom_2 end type ky_type !> This is the top level object, consisting of a collection of !! ky blocks. type :: fieldmat_type type(ky_type), dimension(:), allocatable :: kyb !< The ky blocks integer :: naky !< Number of ky blocks integer :: ntheta0 integer :: npts !< Total number of theta grid points on all extended domains integer :: nbound !< Number of ignored boundary points logical :: is_local !< Does this supercell have any data on this proc? logical :: is_empty !< Have we got any data for the fieldmat on this proc? logical :: is_all_local !< Is all of this supercells data on this proc? type(comm_type) :: fm_sub_all !< Sub communicator involving all processors with fieldmat type(comm_type) :: fm_sub_headsc_p0 !< Sub communicator involving the supercell heads and proc0 integer :: prepare_type=0 !< What sort of preparation do we do to the field matrix !Currently only support: !0 : Invert the field matrix !In the future may wish to do things like LU decomposition etc. !Could also do things like eigenvalue analysis etc. 
     integer, dimension(:,:), allocatable :: heads !< An array that holds the iproc (from comm world) of the supercell head for a given ik,it point
     logical :: no_populate=.false. !< Advanced usage only, if true then don't populate response
     logical :: no_prepare=.false. !< Advanced usage only, if true then don't prepare (invert) response
     integer :: nfield !< Number of fields we're working with
   contains
     procedure :: deallocate => fm_deallocate
     procedure :: allocate => fm_allocate
     procedure :: debug_print => fm_debug_print
     procedure :: get_field_update => fm_get_field_update
     procedure :: init => fm_init
     procedure :: reset => fm_reset
     procedure :: populate => fm_populate
     procedure :: init_next_field_points => fm_init_next_field_points
     procedure :: prepare => fm_prepare
     procedure :: make_subcom_1 => fm_make_subcom_1
     procedure :: make_subcom_2 => fm_make_subcom_2
     procedure :: calc_sc_heads => fm_calc_sc_heads
     procedure :: gather_fields => fm_gather_fields
     procedure :: unpack_to_field => fm_unpack_to_field
     procedure :: write_debug_data => fm_write_debug_data
     procedure :: set_is_local => fm_set_is_local
     procedure :: count_subcom => fm_count_subcom
     procedure :: getfieldeq_nogath => fm_getfieldeq_nogath
     procedure :: reduce_an => fm_reduce_an_head_send_broadcast
     procedure :: check_an => fm_check_an
     procedure :: getfieldeq1_nogath => fm_getfieldeq1_nogath
     procedure :: update_fields => fm_update_fields
     procedure :: update_fields_newstep => fm_update_fields_newstep
     procedure :: get_condition_numbers => fm_get_condition_numbers
  end type fieldmat_type

  !> This is the type which controls/organises the parallel
  !! data decomposition etc.
  type :: pc_type
     integer, dimension(:,:), allocatable :: is_local !< Is the given it,ik on this proc?
     !NOTE: We have is_local as integer rather than logical so we can reduce across procs
     !to count how many procs have a given cell
     type(comm_type), dimension(:,:), allocatable :: itik_subcom !< Cell level sub-communicators
     integer, dimension(:,:), allocatable :: nproc_per_cell !< How many procs have this cell
     integer, dimension(:,:), allocatable :: nresp_per_cell !< How many rows are we personally responsible for in each cell
     integer, dimension(:,:), allocatable :: navail_per_cell !< How many rows could we be responsible for in each cell
     integer :: nresp_tot, navail_tot !< Total number of rows available/potentially available
     logical :: force_local_invert=.false. !< If true then we force local inversion
     !NOTE: If we're doing force local invert really when we populate the supercell we should store all of the
     !data locally so we don't need to fetch it later.
     logical :: has_to_gather=.true. !< If true we have to gather when calling getfieldeq, determined by decomp routine
     integer :: decomp_type=3 !< This sets what type of decomposition is done
     !By defining your own decomposition routine and associating it with a
     !particular value of decomp_type you should be able to add in
     !new parallel decompositions easily without having to rewrite any other
     !routines (although I don't promise this!)
     !//List of currently supported decomp_types
     !0 : Force everything to be done locally and serially, i.e.
     !    every proc has all field mat data
     !1 : Force local/serial but only for the supercells we would have anyway
     !2 : Force local/serial but only for the cells we would have anyway
     !3 : Simple mpi, minimal attempt to load balance or avoid splitting small blocks
     !4 : As above but with attempts to avoid splitting small blocks
   contains
     procedure :: deallocate => pc_deallocate
     procedure :: allocate => pc_allocate
     procedure :: debug_print => pc_debug_print
     procedure :: current_nresp => pc_current_nresp
     procedure :: find_locality => pc_find_locality
     procedure :: count_avail => pc_count_avail
     procedure :: has_ik => pc_has_ik
     procedure :: has_it => pc_has_it
     procedure :: make_decomp => pc_make_decomp
     procedure :: init => pc_init
     procedure :: reset => pc_reset
     !//The different decomposition routines
     procedure :: decomp_all_serial_local => pc_decomp_all_serial_local
     procedure :: decomp_own_serial_local => pc_decomp_own_serial_local
     procedure :: decomp_owncells_serial_local => pc_decomp_owncells_serial_local
     procedure :: decomp_owncells_simplempi => pc_decomp_owncells_simplempi
  end type pc_type

  !//////////////////////////////
  !// MODULE VARIABLES
  !//////////////////////////////
  integer :: dlun=6 !< File unit for debug printing
  !Tuning this can improve advance/init time (usually one at the expense of the other)
  logical :: do_smart_update=.false. !< If true and x/y distributed then use a "smart" update which only updates
  !the local fields. This may impact diagnostics/parallel writing as it means only the processor's g_lo local section
  !of the fields is evolved on this processor.
  logical,parameter :: debug=.false. !< Do we do debug stuff?
  logical :: initialised=.false. !< Have we initialised yet?
  logical :: reinit=.false. !< Are we reinitialising?
  logical :: dump_response=.false. !< Do we dump the response matrix?
  logical :: read_response=.false. !< Do we read the response matrix from dump?
  logical :: field_local_allreduce = .false. !< If true use an allreduce to gather field else use reduce+broadcast
  logical :: field_local_allreduce_sub = .false. !< If true and field_local_allreduce true then do two sub comm all reduces rather than 1
  logical :: field_local_tuneminnrow = .false. !< Are we auto-tuning minnrow
  logical :: field_local_nonblocking_collectives = .false.
  integer :: minNRow = 16 !< Tuning this can influence both init and advance times
  type(pc_type),save :: pc !< This is the parallel control object
  type(fieldmat_type),save :: fieldmat !< This is the top level field matrix object
  !> Condition numbers above this limit will trigger a warning
  real, parameter :: condition_number_warning_limit = sqrt(1.0/epsilon(1.0))

contains

  !//////////////////////////////
  !// CUSTOM TYPES
  !//////////////////////////////
  !
  !================
  !ROWBLOCK
  !================
  !> Allocate storage space
  subroutine rb_allocate(self)
    implicit none
    class(rowblock_type), intent(inout) :: self
    allocate(self%data(self%ncol,self%nrow))
    allocate(self%tmp_sum(self%nrow))
    self%data = 0.
    self%tmp_sum = 0.
  end subroutine rb_allocate

  !> Deallocate storage space
  subroutine rb_deallocate(self)
    implicit none
    class(rowblock_type), intent(inout) :: self
    if(allocated(self%data)) deallocate(self%data)
    if(allocated(self%tmp_sum)) deallocate(self%tmp_sum)
  end subroutine rb_deallocate

  !> Initialise the members of the rowblock instance
  subroutine rb_init(self, ifq, ic, it, is, ik, ncol, nextend)
    use theta_grid, only: ntgrid
    implicit none
    class(rowblock_type), intent(in out) :: self
    integer, intent(in) :: ifq, ic, it, is, ik, ncol
    !> The length of the parent supercell's real space domain.
Represents the spacing between the start of different !> field/field equations as we store data as the full real space !> domain for each field, e.g. [fieldeq(:), fieldeqa(:), !> fieldeqp(:)], rather than say interleaved, e.g. [[fieldeq(1), !> fieldeqa(1), fieldeqp(1)], [fieldeq(2), fieldeqa(2), !> fieldeqp(2)],...] integer, intent(in) :: nextend ! Store rowblock index, essentially which fieldeq we represent self%ir_ind = ifq ! Store indices of parent types self%ic_ind = ic self%it_ind = it self%is_ind = is self%ik_ind = ik ! Store data limits self%ncol = ncol self%col_llim = 1 + (ic - 1) * (2 * ntgrid) + (ifq - 1) * nextend self%col_ulim = self%col_llim + ncol - 1 end subroutine rb_init !> Debug printing subroutine rb_debug_print(self) implicit none class(rowblock_type), intent(in) :: self write(dlun,'("Rowblock debug print. Index ir=",I0," nrow=",I0," ncol=",I0," rl,ru=",I0," ",I0, " and cl,cu=",I0," ",I0)') & self%ir_ind, self%nrow, self%ncol, & self%row_llim, self%row_ulim, & self%col_llim, self%col_ulim end subroutine rb_debug_print !> Matrix vector multiplication function rb_mv_mult_fun(self,vect) implicit none class(rowblock_type), intent(in) :: self complex,dimension(self%ncol), intent(in) :: vect complex,dimension(self%nrow) :: rb_mv_mult_fun ! integer :: ir !As a 1d vector in fortran is a "row vector" (as the !first index represents columns) matmul is a bit annoying !when trying to do a matrix-vector product rb_mv_mult_fun = matmul(vect,self%data) ! do ir=1,self%nrow ! rb_mv_mult_fun(ir)=sum(self%data(:,ir)*vect) ! enddo end function rb_mv_mult_fun !> Matrix vector multiplication stored in tmp_sum subroutine rb_mv_mult(self,vect) implicit none class(rowblock_type), intent(inout) :: self complex,dimension(self%ncol), intent(in) :: vect self%tmp_sum=self%mv_mult_fun(vect) end subroutine rb_mv_mult !> Convert extended row index to local row index function rb_irex_to_irloc(self,irex) implicit none class(rowblock_type), intent(in) :: self integer, intent(in) :: irex integer :: rb_irex_to_irloc rb_irex_to_irloc=1+(irex-self%row_llim) end function rb_irex_to_irloc !> Set the nrow variables subroutine rb_set_nrow(self) implicit none class(rowblock_type), intent(inout) :: self if(self%row_ulim<=0)then self%nrow=0 else self%nrow=1+self%row_ulim-self%row_llim endif end subroutine rb_set_nrow !> A routine to reset the object subroutine rb_reset(self) implicit none class(rowblock_type), intent(inout) :: self !deallocate call self%deallocate !Could zero out variables but no real need self%row_llim=0 self%row_ulim=0 end subroutine rb_reset !------------------------------------------------------------------------ !================ !CELL !================ !> Allocate storage space subroutine c_allocate(self) implicit none class(cell_type), intent(inout) :: self integer :: ir allocate(self%tmp_sum(self%nrow)) do ir=1,self%nrb call self%rb(ir)%allocate enddo end subroutine c_allocate !> Deallocate storage space subroutine c_deallocate(self) implicit none class(cell_type), intent(inout) :: self if(allocated(self%tmp_sum)) deallocate(self%tmp_sum) if(allocated(self%rb)) deallocate(self%rb) end subroutine c_deallocate !> Initialise the members of the cell instance subroutine c_init(self, ic, it, is, ik, nfield, nrow, nextend, ignore_boundary) use theta_grid, only: ntgrid implicit none class(cell_type), intent(in out) :: self integer, intent(in) :: ic, it, is, ik, nfield, nrow, nextend logical, intent(in) :: ignore_boundary integer :: ifq ! 
    ! Store basic cell properties
    self%nrb = nfield
    self%ic_ind = ic
    self%it_ind = it
    self%ignore_boundary = ignore_boundary

    ! Store parent properties
    self%is_ind = is
    self%ik_ind = ik

    ! Store related/derived properties
    self%row_llim = 1
    self%row_ulim = nrow
    self%nrow = nrow
    self%ncol = 2 * ntgrid
    if (.not. ignore_boundary) self%ncol = self%ncol + 1

    ! Calculate the column limits for each field equation; these must tile
    ! the extended domain in the same way as the row blocks (see rb_init).
    allocate (self%col_llim(nfield), self%col_ulim(nfield))
    do ifq = 1, nfield
       self%col_llim(ifq) = 1 + (ic - 1) * (2 * ntgrid) + (ifq - 1) * nextend
       self%col_ulim(ifq) = self%col_llim(ifq) + self%ncol - 1
    end do
    self%ncol_tot = self%ncol * nfield

    allocate (self%rb(self%nrb))
    !Now make row block objects
    do ifq = 1, nfield
       call self%rb(ifq)%init(ifq, ic, it, is, ik, self%ncol, nextend)
    end do
  end subroutine c_init

  !> Debug printing
  subroutine c_debug_print(self)
    implicit none
    class(cell_type), intent(in) :: self
    write(dlun,'("Cell debug print. Index ic=",I0," nrow=",I0," ncol=",I0)') &
         self%ic_ind, self%nrow, self%ncol
  end subroutine c_debug_print

  !> Do matrix vector multiplication at rowblock level
  subroutine c_mv_mult_rb(self,vect,ifq)
    implicit none
    class(cell_type), intent(inout) :: self
    complex, dimension(self%ncol), intent(in) :: vect
    integer, intent(in) :: ifq
    call self%rb(ifq)%mv_mult(vect)
  end subroutine c_mv_mult_rb

  !> Get the field update for this cell's data
  !! Note still need to reduce across other cells in this
  !! supercell.
  subroutine c_get_field_update(self,fq,fqa,fqp)
    use run_parameters, only: has_phi, has_apar, has_bpar
    implicit none
    class(cell_type), intent(inout) :: self
    complex, dimension(self%ncol), intent(in) :: fq,fqa,fqp
    integer :: ifq

    !If we don't have any data then exit
    if(self%is_empty) return

    !First do the multiplication at rowblock level
    ifq=0
    if(has_phi) then
       ifq=ifq+1
       call self%mv_mult_rb(fq,ifq)
    endif
    if(has_apar) then
       ifq=ifq+1
       call self%mv_mult_rb(fqa,ifq)
    endif
    if(has_bpar) then
       ifq=ifq+1
       call self%mv_mult_rb(fqp,ifq)
    endif

    !Now store at cell level
    self%tmp_sum=0
    do ifq=1,self%nrb
       self%tmp_sum(self%rb(ifq)%row_llim:self%rb(ifq)%row_ulim)=&
            self%tmp_sum(self%rb(ifq)%row_llim:self%rb(ifq)%row_ulim)&
            -self%rb(ifq)%tmp_sum
    enddo
  end subroutine c_get_field_update

  !> A routine to reset the object
  subroutine c_reset(self)
    implicit none
    class(cell_type), intent(inout) :: self
    integer :: ir

    !Call reset on all children
    do ir=1,self%nrb
       call self%rb(ir)%reset
    enddo

    !deallocate
    call self%deallocate

    !Could zero out variables but no real need
  end subroutine c_reset

  !> Set the locality of each object
  subroutine c_set_locality(self, pc)
    use kt_grids, only: aky, akx
    implicit none
    class(cell_type), intent(inout) :: self
    class(pc_type), intent(in) :: pc

    !Set our locality. Note here we assume all rb have same size
    if(size(self%rb)>0)then
       self%is_empty=self%rb(1)%nrow==0
       self%is_all_local=self%rb(1)%nrow==self%nrow
    else
       self%is_empty=.true.
       self%is_all_local=.false.
    endif

    !/Also ignore the ky=kx=0 mode
    self%is_empty=(self%is_empty.or.(abs(aky(self%ik_ind))<epsilon(0.0).and.abs(akx(self%it_ind))<epsilon(0.0)))

    self%is_local=pc%is_local(self%it_ind,self%ik_ind)==1
  end subroutine c_set_locality

  !> Test if a given row belongs to the current cell
  function c_has_row(self,irow)
    implicit none
    class(cell_type), intent(in) :: self
    integer, intent(in) :: irow
    logical :: c_has_row
    !NOTE: Here we assume all row blocks in the cell have the same
    !row limits
    if(size(self%rb)>0)then
       c_has_row=(irow>=self%rb(1)%row_llim.and.irow<=self%rb(1)%row_ulim)
    else
       c_has_row=.false.
endif end function c_has_row !------------------------------------------------------------------------ !================ !SUPERCELL !================ !> Allocate storage space subroutine sc_allocate(self) implicit none class(supercell_type), intent(inout) :: self integer :: ic allocate(self%tmp_sum(self%nrow)) do ic=1,self%ncell call self%cells(ic)%allocate enddo end subroutine sc_allocate !> Deallocate storage space subroutine sc_deallocate(self) implicit none class(supercell_type), intent(inout) :: self if(allocated(self%tmp_sum)) deallocate(self%tmp_sum) if(allocated(self%cells)) deallocate(self%cells) end subroutine sc_deallocate !> Initialise the supercell instance by setting and calculating !> some basic properties. Does not deal with allocating all !> storage etc. subroutine sc_init(self, is, itmin, ik, nfield, nbound) use theta_grid, only: ntgrid use mp, only: mp_request_null use dist_fn, only: itright, supercell_labels, n_cells implicit none class(supercell_type), intent(in out) :: self integer, intent(in) :: is, ik, itmin, nfield integer, intent(in out) :: nbound integer :: it, ic logical :: ignore_boundary self%is_ind = is self%it_leftmost = itmin self%ncell = n_cells(itmin, ik) self%ik_ind = ik self%is_label = supercell_labels(itmin, ik) self%nextend = 1 + 2 * ntgrid * self%ncell self%nrow = self%nextend * nfield self%ncol = self%nrow self%collective_request = mp_request_null allocate (self%initialised(self%nextend)) self%initialised = .false. ! Now setup cells allocate (self%cells(self%ncell)) it = itmin do ic = 1, self%ncell !We ignore the boundary for all cells except the rightmost ignore_boundary = ic < self%ncell if (ignore_boundary) nbound = nbound + 1 call self%cells(ic)%init( ic, it, is, ik, nfield, & self%nrow, self%nextend, ignore_boundary) !Update it if (ic < self%ncell) it = itright(it, ik) end do end subroutine sc_init !> Debug printing subroutine sc_debug_print(self) implicit none class(supercell_type), intent(in) :: self write(dlun,'("Supercell debug print. Index is=",I0," nrow=",I0," ncol=",I0)') & self%is_ind, self%nrow, self%ncol end subroutine sc_debug_print !> Get the field update subroutine sc_get_field_update(self,fq,fqa,fqp) implicit none class(supercell_type), intent(inout) :: self complex, dimension(:,:), intent(in) :: fq,fqa,fqp integer :: ic, it_ind, ulim !Exit if we don't have any of the data locally if(.not.self%is_local) return if(.not.self%is_empty)then ! Now loop over cells and trigger field update. We add OpenMP here ! to help distribute the work. Note that cells%get_field_update ends ! up calling matmul, so in the situation where matmul is replaced ! with a call to equivalent BLAS routines _and_ an OpenMP enabled ! BLAS is linked this _could_ lead to an attempt to use nested ! parallelism, which might be slow. !$OMP PARALLEL DO DEFAULT(none) & !$OMP PRIVATE(ic, it_ind, ulim) & !$OMP SHARED(self, fq, fqa, fqp) & !$OMP SCHEDULE(static) do ic=1,self%ncell !Get cell properties it_ind=self%cells(ic)%it_ind ulim=self%cells(ic)%ncol !Do update call self%cells(ic)%get_field_update(fq(:ulim,it_ind),fqa(:ulim,it_ind),fqp(:ulim,it_ind)) end do !$OMP END PARALLEL DO end if !<DD>TAGGED !/If we're going to reduce why not reduce once on the top level object --> If we do !local reduction and store in the top level ky_field_update then we can reduce the fields !in order to do everything. 
This may require a new class of subcommunicators !Now we need to reduce across cells !AJ If non-blocking collective communications are active then they are called here call self%reduce_tmpsum end subroutine sc_get_field_update !> Reduce the field update across cells to give the final answer subroutine sc_reduce_tmpsum(self) use mp, only: sum_allreduce_sub, sum_reduce_sub, nb_sum_reduce_sub implicit none class(supercell_type), intent(in out) :: self integer :: ic complex, dimension(self%nrow) :: local_tmp_sum ! Zero out the local tmp_sum local_tmp_sum = 0.0 !NOTE: Here we rely on the cells having not reduced/ !gathered their data yet so there are no repeated rows. ! Clearly cells%tmp_sum must be the same size as self%tmp_sum ! at this point. If cells only updated a subset of their tmp_sum ! entries we could perhaps save some work here by only summing over ! the set regions. Currently the work involved here does not scale ! at all with increasing processor count _beyond_ changes to cell ! locality (i.e. when is_empty triggers). !First do local sums !$OMP PARALLEL DO DEFAULT(none) & !$OMP PRIVATE(ic) & !$OMP SHARED(self) & !$OMP REDUCTION(+: local_tmp_sum) & !$OMP SCHEDULE(static) do ic = 1, self%ncell if (self%cells(ic)%is_empty) cycle local_tmp_sum = local_tmp_sum + self%cells(ic)%tmp_sum end do !$OMP END PARALLEL DO ! Copy local tmp_sum into super cell variant. Note this is an added cost ! due to OpenMP requiring us to use a non-member variable in reduction ! clauses. If this file was preprocessed then we could imagine offering ! an alternative code-path here for non-OpenMP builds. self%tmp_sum = local_tmp_sum !<DD>TAGGED: As we currently have to do fm_gather_fields on every time step we only need !to reduce to the head of the supercell, which may be an all_local proc in which case we don't !want to do any reduction. If it's a pd proc then really we should just do reduce_sub rather !than allreduce_sub as we're going to broadcast the result later anyway. !if(self%sc_sub_pd%nproc.gt.0) call sum_allreduce_sub(self%tmp_sum,self%sc_sub_pd%id) if (.not.(self%is_empty .or. self%is_all_local)) then !AJ If non-blocking collectives are enabled and available on the system then call them here if (field_local_nonblocking_collectives) then call nb_sum_reduce_sub(self%tmp_sum, 0, self%sc_sub_pd, self%collective_request) else call sum_reduce_sub(self%tmp_sum, 0, self%sc_sub_pd) end if end if !<DD> At this point the head of the supercell has the field update stored in self%tmp_sum !all other procs have partial/no data end subroutine sc_reduce_tmpsum !> Convert the extended domain index to ig, it and ifl !! !! @note The iex_to routines all need testing! 
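  !!
  !! A worked example of the mapping implemented below (illustrative numbers
  !! only): take ntgrid=2 (so 2*ntgrid=4), ncell=3 and hence
  !! nextend = 1 + 2*ntgrid*ncell = 13, with nfield=2. Then iex=6 lies in the
  !! first field (ifl = 1 + (6-1)/13 = 1), in the second cell
  !! (ic = min(3, 1 + (6-1)/4) = 2), so it = cells(2)%it_ind, at
  !! ig = -ntgrid + 6 - 0 - 1 - 2*ntgrid*(2-1) = -1. Similarly iex=14 is the
  !! first point of the second field: ifl=2, ic=1, ig=-ntgrid.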
subroutine sc_iex_to_dims(self,iex,ig,ic,it,ifl) use theta_grid, only: ntgrid implicit none class(supercell_type), intent(in) :: self integer, intent(in) :: iex integer, intent(out) :: ig,ic,it,ifl !Get field index ifl=1+int((iex-1)/self%nextend) !Get cell and it index ic=min(self%ncell,1+int((iex-(ifl-1)*self%nextend-1)/(2*ntgrid))) it=self%cells(ic)%it_ind !Get ig index ig=-ntgrid+iex-(ifl-1)*self%nextend-1-2*ntgrid*(ic-1) end subroutine sc_iex_to_dims !> Convert the extended domain index to ifl subroutine sc_iex_to_ifl(self,iex,ifl) implicit none class(supercell_type), intent(in) :: self integer, intent(in) :: iex integer, intent(out) :: ifl ifl=1+(iex-1)/self%nextend end subroutine sc_iex_to_ifl !> Convert the extended domain index to ic subroutine sc_iex_to_ic(self,iex,ic) use theta_grid, only: ntgrid implicit none class(supercell_type), intent(in) :: self integer, intent(in) :: iex integer, intent(out) :: ic integer :: ifl, iex_tmp !First work out how many fields there are call self%iex_to_ifl(iex,ifl) !Ensure we're looking between 1 and self%nextend iex_tmp=iex-(ifl-1)*self%nextend ic=MIN(int((iex_tmp-1)/(2*ntgrid))+1,self%ncell) end subroutine sc_iex_to_ic !> Convert the extended domain index to it subroutine sc_iex_to_it(self,iex,it) use theta_grid, only: ntgrid implicit none class(supercell_type), intent(in) :: self integer, intent(in) :: iex integer, intent(out) :: it integer :: ifl, iex_tmp, ic !First work out how many fields there are call self%iex_to_ifl(iex,ifl) !Ensure we're looking between 1 and self%nextend iex_tmp=iex-(ifl-1)*self%nextend ic=MIN(int((iex_tmp-1)/(2*ntgrid))+1,self%ncell) it=self%cells(ic)%it_ind end subroutine sc_iex_to_it !> Convert the extended domain index to ig subroutine sc_iex_to_ig(self,iex,ig) use theta_grid, only: ntgrid implicit none class(supercell_type), intent(in) :: self integer, intent(in) :: iex integer, intent(out) :: ig integer :: iex_tmp, ifl !First get ifl call self%iex_to_ifl(iex,ifl) !Limit to 1 and self%nextend iex_tmp=iex-(ifl-1)*self%nextend if(iex_tmp==self%nextend) then ig=ntgrid else ig=-ntgrid+mod(iex_tmp-1,2*ntgrid) endif end subroutine sc_iex_to_ig !> Is the passed it a member of this supercell function sc_has_it(self,it) implicit none class(supercell_type), intent(in) :: self integer, intent(in) :: it logical :: sc_has_it sc_has_it=any(self%cells%it_ind==it) end function sc_has_it !> A routine to reset the object subroutine sc_reset(self) use mp, only: free_comm implicit none class(supercell_type), intent(inout) :: self integer :: ic !Call reset on all children do ic=1,self%ncell call self%cells(ic)%reset enddo ! Free up communicators created for this supercell if(self%sc_sub_all%nproc>0 .and. 
self%sc_sub_all%id/=self%parent_sub%id) then call free_comm(self%sc_sub_all) end if if(self%sc_sub_pd%nproc>0) then call free_comm(self%sc_sub_pd) end if !deallocate call self%deallocate !Could zero out variables but no real need end subroutine sc_reset !> Set the locality of each object subroutine sc_set_locality(self, pc) implicit none class(supercell_type), intent(inout) :: self class(pc_type), intent(in) :: pc integer :: ic !Set the locality of children do ic=1,self%ncell call self%cells(ic)%set_locality(pc) enddo !Set our locality self%is_empty=all(self%cells%is_empty) self%is_local=any(self%cells%is_local) self%is_all_local=all(self%cells%is_all_local) end subroutine sc_set_locality !> Given an it value get the it of the left connected cell function sc_get_left_it(self,it) implicit none class(supercell_type),intent(in) :: self integer,intent(in)::it integer :: sc_get_left_it, ic, tmp !Default to no connection tmp=-1 !Look for early exit if((.not.self%has_it(it)).or.(self%ncell==1)) then tmp=-1 else !Now see which cell has this it do ic=1,self%ncell if(self%cells(ic)%it_ind==it) then tmp=ic-1 exit endif enddo !Now set appropriate value if(tmp<=0) then tmp=-1 else tmp=self%cells(tmp)%it_ind endif endif !Update result sc_get_left_it=tmp end function sc_get_left_it !> Debug routine to dump the current supercell subroutine sc_dump(self,prefix) use mp, only: proc0 use optionals, only: get_option_with_default implicit none class(supercell_type), intent(inout) :: self character(len=*), optional, intent(in) :: prefix character(len=80) :: fname complex, dimension(:,:), allocatable :: tmp integer :: lun !Make filename write(fname,'(A,"sc_is_",I0,"_ik_",I0,".dat")') & trim(get_option_with_default(prefix, '')), self%is_ind, self%ik_ind !Allocate array and fetch data allocate(tmp(self%ncol,self%nrow)) !Fetch data call self%pull_rows_to_arr(tmp) !Now we've fetched data we can write if(proc0) then !Write open(newunit=lun,file=fname,status='replace',form="unformatted") write(lun) tmp close(lun) endif !Free memory deallocate(tmp) end subroutine sc_dump !> Routine to write the response matrix for this supercell to netcdf file. !> !> @note Only the head processor actually writes to file. However, in current !> use we only call this routine on processors which have this supercell local !> and which don't consider it empty. This is usually ok, but for box runs the !> mode with ky=kx=0 (super)cell is forced to be considered empty which means !> that, as it stands, this supercell won't be dumped here (although it is for !> the fields_implicit approach). This could cause issues for restoring these !> response matrices as we end up recalculating these if we can't load any !> expected file. As such we have to be careful how we deal with this matrix. subroutine sc_dump_to_file(self, suffix) use gs2_save, only: gs2_save_response use fields_arrays, only: get_specific_response_file_name, time_dump_response use gs2_time, only: code_dt use job_manage, only: time_message implicit none !> The instance of the supercell class class(supercell_type), intent(in out) :: self !> If passed then use as part of file suffix character(len=*), optional, intent(in) :: suffix complex, dimension(:,:), allocatable :: tmp_arr ! 
    ! Exit early if we're not involved in this supercell
    if(.not.self%is_local) return

    call time_message(.false.,time_dump_response,' Field Dump')

    !Allocate array -- for big problems this might be an issue
    allocate(tmp_arr(self%nrow,self%ncol))

    !First have to pull all row level data
    call self%pull_rows_to_arr(tmp_arr)

    !Now save if on head proc -- could probably only allocate tmp_arr and call
    !pull_rows_to_arr on this head proc as well, at least for now.
    if(self%is_head)then
       call gs2_save_response(tmp_arr, get_specific_response_file_name(self%ik_ind, self%is_ind, code_dt, suffix), code_dt, self%condition_number)
    endif

    deallocate(tmp_arr)

    call time_message(.false.,time_dump_response,' Field Dump')
  end subroutine sc_dump_to_file

  !> Routine to read the response matrix for this supercell from netcdf file.
  subroutine sc_read_from_file(self, could_read, suffix)
    use gs2_save, only: gs2_restore_response
    use mp, only: broadcast_sub, proc0
    use fields_arrays, only: get_specific_response_file_name, time_read_response
    use gs2_time, only: code_dt
    use file_utils, only: error_unit
    use job_manage, only: time_message
    implicit none
    !> The instance of the supercell class
    class(supercell_type), intent(in out) :: self
    !> Flag to indicate if the file was successfully read
    logical, intent(out) :: could_read
    !> If passed then use as part of file suffix
    character(len=*), optional, intent(in) :: suffix
    complex, dimension(:,:), allocatable :: tmp_arr
    character(len = 256) :: file_name
    logical :: file_found
    real :: file_time_step

    ! Initialise reading state.
    could_read = .true.

    ! Construct the expected file name
    file_name = adjustl(get_specific_response_file_name(self%ik_ind, self%is_ind, code_dt, suffix))

    ! First check if the expected file exists, if not then exit
    inquire( file = trim(file_name), exist = file_found)
    if (.not. file_found) then
       if(proc0) write(error_unit(), &
            '("Could not find response file ",A,", reverting to calculation.")') trim(file_name)
       ! Set the flag indicating if we could read the file. Note if this processor
       ! doesn't require this data (i.e. it considers this supercell empty) then we ignore
       ! the presence of the file as we don't care. This is mostly important for
       ! the ky=kx=0 file which may not be present but also isn't required for any
       ! processor.
       could_read = .false. .or. self%is_empty
       return
    end if

    ! We start timing after an initial early exit
    call time_message(.false.,time_read_response,' Field Read')

    !Now allocate array -- for big problems this might be an issue
    allocate(tmp_arr(self%nrow,self%ncol))

    !Now if on head proc read
    if (self%is_head) then
       call gs2_restore_response(tmp_arr, file_name, file_time_step, self%condition_number)
       ! Should perhaps double check that file_time_step == code_dt?
    end if

    !Now we need to broadcast the data to everyone with this supercell.
    !NOTE: Really we would want a supercell bound method which will do
    !this so that this module level routine is insensitive to communicators etc.
    !Furthermore we are lazy and use a broadcast_sub to send data to everyone.
    !NOTE: The processors which consider this supercell to be empty (i.e. which
    !don't have any associated work) shouldn't need to take part in this communication.
    !If we can promise that only non-empty processors could be the head that should
    !work fine, provided we have a suitable communicator (sc_sub_pd?).
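    ! An illustrative sketch only (not currently enabled): the narrower broadcast
    ! mentioned above would mirror what sc_invert_local already does, i.e.
    ! something along the lines of
    !   if (.not. self%is_all_local) then
    !      if (self%sc_sub_pd%nproc > 0) call broadcast_sub(tmp_arr, &
    !           self%head_iproc_pd, self%sc_sub_pd%id)
    !   end if
    ! This assumes the head is always a member of sc_sub_pd (or already holds all
    ! of the data locally), which is exactly the promise discussed in the NOTE above.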
call broadcast_sub(tmp_arr,self%head_iproc,self%sc_sub_all%id) !Now push array to local storage call self%push_arr_to_rows(tmp_arr) deallocate(tmp_arr) call time_message(.false.,time_read_response,' Field Read') end subroutine sc_read_from_file !> Store the field equations at row level subroutine sc_store_fq(self, fq, fqa, fqp, ifl_in, it_in, ig_in) use theta_grid, only: ntgrid use run_parameters, only: has_phi, has_apar, has_bpar implicit none class(supercell_type), intent(inout) :: self complex, dimension(:, :), intent(in) :: fq, fqa, fqp integer, intent(in) :: ifl_in, it_in, ig_in integer :: ic, ic_in, irow, ifq, ulim, ir !If we don't have anything in this supercell then exit if(self%is_empty) return !Find out which cell has our it ic_in=0 do ic=1,self%ncell if(self%cells(ic)%it_ind==it_in) then ic_in=ic exit endif enddo !Work out the row we want to put data in irow=(ig_in+ntgrid+1)+(ic_in-1)*(2*ntgrid)+(ifl_in-1)*self%nextend ! Loop over cells do ic=1,self%ncell !If we don't have this cell cycle if(self%cells(ic)%is_empty) then cycle endif !Check if we have this row, if not cycle if(.not.self%cells(ic)%has_row(irow)) then cycle endif !Find upper limit of column ulim=self%cells(ic)%ncol ifq=0 if(has_phi)then !Increment counter ifq=ifq+1 !Convert extended irow to local ir=self%cells(ic)%rb(ifq)%irex_to_ir(irow) !Store data self%cells(ic)%rb(ifq)%data(:,ir) = fq(:ulim, self%cells(ic)%it_ind) end if if(has_apar)then !Increment counter ifq=ifq+1 !Convert extended irow to local ir=self%cells(ic)%rb(ifq)%irex_to_ir(irow) !Store data self%cells(ic)%rb(ifq)%data(:,ir) = fqa(:ulim, self%cells(ic)%it_ind) end if if(has_bpar)then !Increment counter ifq=ifq+1 !Convert extended irow to local ir=self%cells(ic)%rb(ifq)%irex_to_ir(irow) !Store data self%cells(ic)%rb(ifq)%data(:,ir) = fqp(:ulim, self%cells(ic)%it_ind) end if end do end subroutine sc_store_fq !> Prepare the field matrix for calculating field updates subroutine sc_prepare(self,prepare_type,pc) use mp, only: mp_abort implicit none class(supercell_type), intent(inout) :: self class(pc_type), intent(in) :: pc integer, intent(in) :: prepare_type character (len=40) :: errmesg !Exit early if we're empty if(self%is_empty) return if(.not.self%is_local) return !Need to decide what method we're using to prepare the !field matrix select case(prepare_type) case(0) !Invert call self%invert(pc) case default write(errmesg,'("ERROR: Invalid prepare_type : ",I0)') prepare_type call mp_abort(trim(errmesg)) end select ! Now dump the matrix if (dump_response) call self%dump_to_file end subroutine sc_prepare !> A routine to invert the field matrix subroutine sc_invert(self, pc) use job_manage, only: time_message use fields_arrays, only: time_field_invert, time_field_invert_mpi use mp, only: get_mp_times implicit none class(supercell_type), intent(inout) :: self class(pc_type), intent(in) :: pc real :: mp_total, mp_total_after if(self%is_empty) return if(.not.self%is_local) return call time_message(.false.,time_field_invert,' Field Matrix Invert') call get_mp_times(total_time = mp_total) !If we have all of the supercell local then use an explictly local !matrix inversion !Note could have a variable which forces local inversion if(self%is_all_local.or.pc%force_local_invert) then ! call self%dump(prefix="orig_") call self%invert_local ! 
call self%dump(prefix="inv_") else call self%invert_mpi endif call time_message(.false.,time_field_invert,' Field Matrix Invert') call get_mp_times(total_time = mp_total_after) time_field_invert_mpi = time_field_invert_mpi + (mp_total_after - mp_total) end subroutine sc_invert !> A routine to invert the field matrix locally !> !> @note Currently every processor which is responsible for a chunk !> of the response matrix at the solve stage will do this inversion. !> This is obviously duplicated work but allows us to skip any !> communication in the push_arr_to_rows method. If only one !> processor did the inversion then we would need to communicate the !> result back. We are unlikely to want to do this in a non-blocking !> manner as it would require that we hold onto the tmp_arr for each !> response matrix, only freeing this memory after the non-blocking !> communications have completed. As these matrices are large we !> don't want them hanging around longer than required. In the typical !> use case where we have large minnrow, we would expect most matrices !> to belong to just one or a few processors so this point may not be !> important in most cases. subroutine sc_invert_local(self) use mat_inv, only: invert_serial use mp, only: broadcast_sub implicit none class(supercell_type), intent(inout) :: self complex, dimension(:,:), allocatable :: tmp_arr !Exit early if possible if(self%is_empty) return if(.not.self%is_local) return !Allocate temporary array allocate(tmp_arr(self%ncol,self%nrow)) !First have to pull all row level data call self%pull_rows_to_arr(tmp_arr) !It may be better to hook into the GK_VERBOSITY level system !rather than relying on the debug flag. if (self%is_head) then self%condition_number = l2norm(tmp_arr) end if !Now invert in place if (self%is_head) then call invert_serial(tmp_arr, self%nrow) tmp_arr=transpose(tmp_arr) end if !It would be quite nice if we could do the following in a non-blocking manner !but this would require us to either keep the potentially large tmp_arr allocated !for all supercells or for us to be able to do this "in-place" (i.e. we use something !like non-blocking scatterv rather than broadcast). if (.not. self%is_all_local) then if (self%sc_sub_pd%nproc > 0) call broadcast_sub(tmp_arr, & self%head_iproc_pd, self%sc_sub_pd%id) end if if (self%is_head) then self%condition_number = self%condition_number * l2norm(tmp_arr) end if !Now push back to row level call self%push_arr_to_rows(tmp_arr) !Free memory deallocate(tmp_arr) end subroutine sc_invert_local !> A routine to invert the field matrix using mpi subroutine sc_invert_mpi(self) use mp, only: mp_abort implicit none class(supercell_type), intent(inout) :: self self%is_empty=.true. !Until implemented make everything be treated as empty. call mp_abort("ERROR: Invert with mpi not yet implemented.") end subroutine sc_invert_mpi !> A routine to collect all the row level data and store in passed array !! Gather the row blocks up for this cell to fill an array subroutine sc_pull_rows_to_arr(self,arr) use mp, only: sum_allreduce_sub use array_utils, only: zero_array implicit none class(supercell_type), intent(in) :: self complex, dimension(:,:), intent(out) :: arr !Note: Dimensions should be self%ncol, self%nrow integer :: ic, rl,ru, cl,cu, irb ! Zero out arr |--> Not needed if gathering ! This may look like it is only needed for the sum_allreduce_sub branch below, ! however without this line we would end up not setting arr in cases where we ! think of this supercell as empty. 
This is usually fine, but for the ky=kx=0 ! mode all processors consider the supercell to be empty, but we still need to ! initialise this array in some cases (e.g. when dumping to file). We could ! attempt to only do this zeroing in this special case to save some time, but ! we don't expect to be calling this routine much so opt for simpler approach. call zero_array(arr) !Now see if we want to exit if (self%is_empty) return if (.not.self%is_local) return !Place local piece in arr !$OMP PARALLEL DO DEFAULT(none) & !$OMP PRIVATE(ic, irb, rl, ru, cl, cu) & !$OMP SHARED(self, arr) & !$OMP SCHEDULE(static) do ic = 1, self%ncell if (self%cells(ic)%is_empty) cycle do irb = 1, self%cells(ic)%nrb rl=self%cells(ic)%rb(irb)%row_llim ru=self%cells(ic)%rb(irb)%row_ulim cl=self%cells(ic)%rb(irb)%col_llim cu=self%cells(ic)%rb(irb)%col_ulim arr(cl:cu,rl:ru)=self%cells(ic)%rb(irb)%data end do end do !$OMP END PARALLEL DO !If we're now all local then also need comms if (.not. self%is_all_local) then !Now use MPI to get the rest of the data from other procs !<DD>FOR NOW USE ALL_REDUCE AS EASIER, BUT SHOULD BE ABLE !TO DO THIS WITH ALL_GATHERV WHICH SHOULD BE FASTER ! We only actually need the result on the head of the supercell ! as this is the only process which operates on the result. ! We could therefore replace allreduce with reduce. if (self%sc_sub_pd%nproc > 0) call sum_allreduce_sub(arr, self%sc_sub_pd%id) end if end subroutine sc_pull_rows_to_arr !> A routine to distribute an array to appropriate row blocks subroutine sc_push_arr_to_rows(self,arr) implicit none class(supercell_type), intent(inout) :: self complex, dimension(self%nrow,self%ncol), intent(in) :: arr integer :: ic, ir integer :: rl,ru,cl,cu !If empty don't have anywhere to store data so exit if(self%is_empty) return !Store all local sections !$OMP PARALLEL DO DEFAULT(none) & !$OMP PRIVATE(ic, ir, rl, ru, cl, cu) & !$OMP SHARED(self, arr) & !$OMP SCHEDULE(static) do ic = 1, self%ncell if (self%cells(ic)%is_empty) cycle do ir = 1, self%cells(ic)%nrb !Get row/col limits rl = self%cells(ic)%rb(ir)%row_llim ru = self%cells(ic)%rb(ir)%row_ulim cl = self%cells(ic)%rb(ir)%col_llim cu = self%cells(ic)%rb(ir)%col_ulim !Store data self%cells(ic)%rb(ir)%data = arr(cl:cu, rl:ru) end do end do !$OMP END PARALLEL DO end subroutine sc_push_arr_to_rows !> Create primary (top level) sub communicators subroutine sc_make_subcom_1(self) use mp, only: comm_type, split, free_comm, mp_undefined, sum_allreduce_sub implicit none class(supercell_type), intent(inout) :: self integer :: ic, colour type(comm_type) :: tmp !/First make a subcommunicator involving any procs !which can see this supercell colour = 0 if(self%is_local) colour = 1 !Here we check if the new subcom has same number of members as !parent, if so we can carry on using the parent instead call sum_allreduce_sub(colour, self%parent_sub%id) if (colour == self%parent_sub%nproc) then self%sc_sub_all=self%parent_sub else colour = mp_undefined if(self%is_local) colour = 1 call split(colour,tmp,self%parent_sub%id) !Note we only store the subcom for those procs which will use it !this ensures an error will occur if we try to use the subcom in !the wrong place if(self%is_local)then self%sc_sub_all=tmp else !If we can't see any of this object then !return return endif endif !Now make the supercell sub communicators do ic=1,self%ncell !Set parent subcom self%cells(ic)%parent_sub=self%sc_sub_all enddo end subroutine sc_make_subcom_1 !> Create the secondary (intraobject) subcommunicators subroutine 
sc_make_subcom_2(self) use mp, only: comm_type, split, sum_allreduce_sub, proc0 use mp, only: mp_undefined, min_allreduce_sub, nproc, iproc implicit none class(supercell_type), intent(inout) :: self integer :: colour, cc integer, dimension(3) :: colours type(comm_type) :: tmp logical :: some_empty, some_pd, some_all, is_partial_data !Exit if we don't have this supercell if (.not. self%is_local) return ! Set local flags colours = 0 is_partial_data = .not. (self%is_empty .or. self%is_all_local) if (self%is_empty) colours(1) = 1 if (is_partial_data) colours(2) = 1 if (self%is_all_local) colours(3) = 1 ! Get global flags call sum_allreduce_sub(colours, self%sc_sub_all%id) !/Work out if we have any empty procs some_empty = colours(1) /= 0 !/Work out if we have any partial data procs some_pd = colours(2) /= 0 !/Work out if we have any all local procs some_all = colours(3) /= 0 !/Now make a subcommunicator involving all procs !which have part of the supercell data range (but not all of it) if (some_pd) then colour = mp_undefined if (is_partial_data) colour = 1 call split(colour, tmp, self%sc_sub_all%id) if (colour /= mp_undefined) then !/Store communicator self%sc_sub_pd = tmp end if end if !Decide if we're the head proc !Note: We also decide which proc is in charge of filling the empty !procs with data. By allowing this to be different from the head proc !we have a better chance of being able to overlap comms and computations !Note: We try to make proc0 the head proc if it is a member of the supercell !which has data as it means we don't need to send this supercell to anyone. if (some_all) then ! Here we try to determine which processor will be the head. ! If proc0 is included and has is_all_local we prefer that. If not we can ! identify the head as the first proc in sc_sub_all which has is_all_local. ! 1. We can determine if proc0 is present and has is_all_local ! by simply setting cc = (proc0 .and. self%is_all_local) ? 1 : 0 ! and then reducing that on sc_sub_all. ! 2. If proc0 is not present then the head will just be the proc with ! the lowest rank in sc_sub_all with is_all_local true. Therefore we ! instead take min_allreduce of cc = is_all_local ? sc_sub_all%iproc : nproc + 1 ! and then set self%is_head = sc_sub_all%iproc == cc ! Work out if proc0 is a member of sc_sub_all _and_ has is_all_local cc = 0 if (proc0 .and. self%is_all_local) cc = 1 call sum_allreduce_sub(cc, self%sc_sub_all%id) if (cc == 1) then ! If we have proc0 then that is head self%is_head = proc0 else if (self%is_all_local) then cc = self%sc_sub_all%iproc else cc = nproc + 1 end if ! We don't currently have an implementation for this. call min_allreduce_sub(cc, self%sc_sub_all%id) ! If we don't have proc0 then the head is the processor ! 
with the lowest rank in sc_sub_all which has is_all_local self%is_head = cc == self%sc_sub_all%iproc end if else if (self%sc_sub_pd%nproc > 0)then cc = 0 if (proc0) cc = 1 call sum_allreduce_sub(cc, self%sc_sub_pd%id) !Decide if we're the head, prefer proc0 to be head !if it's a member of our group if (cc /= 0) then self%is_head = proc0 else self%is_head = self%sc_sub_pd%proc0 end if end if end if !Store the head_iproc values colours = 0 if (self%is_head) then colours(1) = self%sc_sub_all%iproc colours(2) = self%sc_sub_pd%iproc colours(3) = iproc end if call sum_allreduce_sub(colours, self%sc_sub_all%id) self%head_iproc = colours(1) self%head_iproc_pd = colours(2) self%head_iproc_world = colours(3) end subroutine sc_make_subcom_2 !------------------------------------------------------------------------ !================ !KYBLOCK !================ !> Allocate storage space subroutine ky_allocate(self) implicit none class(ky_type), intent(inout) :: self integer :: is do is=1,self%nsupercell call self%supercells(is)%allocate enddo end subroutine ky_allocate !> Deallocate storage space subroutine ky_deallocate(self) implicit none class(ky_type), intent(inout) :: self if(allocated(self%supercells)) deallocate(self%supercells) end subroutine ky_deallocate !> Setup properties of ky_type instance subroutine ky_init(self, ik, itmins, nsupercell, nfield, nbound) implicit none class(ky_type), intent(in out) :: self integer, intent(in) :: ik, nsupercell, nfield integer, dimension(:), intent(in) :: itmins integer, intent(in out) :: nbound integer :: is self%ik_ind = ik self%nsupercell = nsupercell allocate (self%supercells(nsupercell)) do is = 1, nsupercell call self%supercells(is)%init(is, itmins(is), ik, nfield, nbound) end do end subroutine ky_init !> Debug printing subroutine ky_debug_print(self) implicit none class(ky_type), intent(in) :: self write(dlun,'("Ky block debug print. Index ik=",I0," nsupercell=",I0)') & self%ik_ind, self%nsupercell end subroutine ky_debug_print !> Get the field update for this ik subroutine ky_get_field_update(self,fq,fqa,fqp) implicit none class(ky_type), intent(inout) :: self complex, dimension(:,:), intent(in) :: fq,fqa,fqp integer :: is !First trigger calculation of field update. !Note: Empty supercells don't do anything do is=1,self%nsupercell !/Calculate the field update call self%supercells(is)%get_field_update(fq,fqa,fqp) enddo end subroutine ky_get_field_update !> Wait on the nonblocking collective communications if they have been used. subroutine ky_wait_for_nonblocking_collectives(self) use mp, only : wait implicit none class(ky_type), intent(inout) :: self integer :: is !AJ This could be done more efficiently if the requests !AJ are put into a single array and waitany is called on that !AJ array. 
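    ! A rough sketch of the idea described above (illustrative only; it assumes
    ! mp exposed a waitall-style interface taking an array of requests, which it
    ! may not):
    !   integer, dimension(self%nsupercell) :: requests
    !   requests = self%supercells(:)%collective_request
    !   call waitall(requests)   ! hypothetical routine, not part of mp as used here
    ! For now we simply wait on each request in turn below.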
do is=1,self%nsupercell call wait(self%supercells(is)%collective_request) enddo end subroutine ky_wait_for_nonblocking_collectives !> Store the field equation at row level subroutine ky_store_fq(self,fq,fqa,fqp,ifl_in,it_in,ig_in) implicit none class(ky_type), intent(in out) :: self complex, dimension(:, :), intent(in) :: fq, fqa, fqp integer, intent(in) :: ifl_in, it_in, ig_in integer :: is !If we don't have anything in this supercell then exit if (self%is_empty) return !Delegate work to supercell do is = 1,self%nsupercell if (.not.self%supercells(is)%has_it(it_in)) cycle call self%supercells(is)%store_fq(fq, fqa, fqp, ifl_in, it_in, ig_in) end do end subroutine ky_store_fq !> Given it say what the supercell id is function ky_is_from_it(self,it) implicit none class(ky_type), intent(inout) :: self integer, intent(in) :: it integer :: ky_is_from_it integer :: is ky_is_from_it=-1 do is=1,self%nsupercell if(self%supercells(is)%has_it(it))then ky_is_from_it=is exit endif enddo end function ky_is_from_it !> A routine to reset the object subroutine ky_reset(self) use mp, only: free_comm, mp_comm_null implicit none class(ky_type), intent(inout) :: self integer :: is !Call reset on all children do is=1,self%nsupercell call self%supercells(is)%reset enddo !AJ Free communicators associated with this ky block if(self%ky_sub_all%id /= self%parent_sub%id .and. self%ky_sub_all%nproc > 0 .and. self%ky_sub_all%id /= mp_comm_null) then call free_comm(self%ky_sub_all) end if !deallocate call self%deallocate !Could zero out variables but no real need end subroutine ky_reset !> Set the locality of each object subroutine ky_set_locality(self, pc) implicit none class(ky_type), intent(inout) :: self class(pc_type), intent(in) :: pc integer :: is !Set the locality of children do is=1,self%nsupercell call self%supercells(is)%set_locality(pc) enddo !Set our locality self%is_empty=all(self%supercells%is_empty) self%is_local=any(self%supercells%is_local) self%is_all_local=all(self%supercells%is_all_local) end subroutine ky_set_locality !> Prepare the field matrix for calculating field updates subroutine ky_prepare(self,prepare_type,pc) implicit none class(ky_type), intent(inout) :: self integer, intent(in) :: prepare_type class(pc_type), intent(in) :: pc integer :: is !Exit early if we're empty if(self%is_empty) return if(.not.self%is_local) return !Tell each supercell to prepare ! This might be a good place to add OpenMP pragmas ! however it's possible that this could increase peak ! memory consumption due to now having to hold ! multiple matrices in memory at once, thereby ! potentially negating some of the benefit of using OpenMP ! to get more memory per process. More testing would ! be required to explore this further. 
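    ! If it were enabled, the loop below could be decorated in the same style as
    ! the other OpenMP loops in this module, e.g. (illustrative sketch only):
    !   !$OMP PARALLEL DO DEFAULT(none) &
    !   !$OMP PRIVATE(is) &
    !   !$OMP SHARED(self, prepare_type, pc) &
    !   !$OMP SCHEDULE(dynamic)
    ! with a matching !$OMP END PARALLEL DO after the loop, subject to the memory
    ! caveat discussed above.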
do is=1,self%nsupercell call self%supercells(is)%prepare(prepare_type, pc) enddo end subroutine ky_prepare !> Create the primary subcommunicators subroutine ky_make_subcom_1(self) use mp, only: comm_type, split, free_comm, mp_undefined, sum_allreduce_sub implicit none class(ky_type), intent(inout) :: self integer :: is, colour type(comm_type) :: tmp !/First make a subcommunicator involving any procs !which can see this ky block colour = 0 if(self%is_local) colour = 1 call sum_allreduce_sub(colour, self%parent_sub%id) if ( colour == self%parent_sub%nproc) then self%ky_sub_all=self%parent_sub else colour = mp_undefined if(self%is_local) colour = 1 call split(colour,tmp,self%parent_sub%id) !Note we only store the subcom for those procs which will use it !this ensures an error will occur if we try to use the subcom in !the wrong place if(self%is_local)then self%ky_sub_all=tmp else return endif endif !Now make the supercell sub communicators do is=1,self%nsupercell !Set parent subcom self%supercells(is)%parent_sub=self%ky_sub_all !Now make child subcom call self%supercells(is)%make_subcom_1 enddo end subroutine ky_make_subcom_1 !> Create the secondary subcommunicators subroutine ky_make_subcom_2(self) implicit none class(ky_type), intent(inout) :: self integer :: is !Exit if we don't have this kyb if(.not.self%is_local)return !Now make the supercell sub communicators do is=1,self%nsupercell !Now make child subcom call self%supercells(is)%make_subcom_2 enddo end subroutine ky_make_subcom_2 !------------------------------------------------------------------------ !AJ Autotune MinNRow size to obtain optimal row size subroutine tuneMinNRow() use job_manage, only: timer_local use mp, only: nproc, max_reduce, min_reduce, sum_reduce, broadcast, proc0 use fields_arrays, only: phinew, aparnew, bparnew use gs2_layouts, only: g_lo use warning_helpers, only: exactly_equal implicit none integer :: i integer :: current_best_size real :: start_time, end_time, current_best, temp_best real :: min_best, max_best, av_best, current_av_best logical :: do_update, do_gather do_gather = .true. do_update=do_smart_update if(g_lo%x_local.and.g_lo%y_local) do_update=.false. minNRow = 2 current_best = -1 do i = 1,10 call init_fields_matrixlocal(fieldmat, pc, tuning_in=.true.) 
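    ! For orientation: each pass of this loop times a single field solve with
    ! the current minNRow. Since minNRow starts at 2 and doubles after each
    ! pass, the 10 passes sample minNRow = 2, 4, ..., 1024; the size giving the
    ! smallest maximum (over processes) time wins, with ties broken by the
    ! average time (see the proc0 bookkeeping below).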
start_time = timer_local() call getfield_local(phinew,aparnew,bparnew,do_gather,do_update) end_time = timer_local() call finish_fields_local() temp_best = end_time - start_time max_best = temp_best call max_reduce(max_best, 0) min_best = temp_best call min_reduce(min_best, 0) av_best = temp_best call sum_reduce(av_best, 0) if(proc0) then av_best = av_best/nproc if(current_best < 0) then current_best = max_best current_av_best = av_best current_best_size = minNRow else if(max_best < current_best) then current_best = max_best current_av_best = av_best current_best_size = minNRow else if(exactly_equal(max_best, current_best)) then if(av_best < current_av_best) then current_best = max_best current_av_best = av_best current_best_size = minNRow end if end if end if end if minNRow = minNRow*2 end do call broadcast(current_best_size) minNRow = current_best_size if(proc0) then write(*,*) 'Chosen rowsize for matrix-vector operations in fields update',minNRow,current_best end if end subroutine tuneMinNRow !================ !FIELDMAT !================ !> Allocate storage space subroutine fm_allocate(self) implicit none class(fieldmat_type), intent(inout) :: self integer :: ik allocate(self%heads(self%ntheta0, self%naky)) if(.not.self%is_local) return do ik=1,self%naky call self%kyb(ik)%allocate enddo end subroutine fm_allocate !> Deallocate storage space subroutine fm_deallocate(self) implicit none class(fieldmat_type), intent(inout) :: self if(allocated(self%kyb)) deallocate(self%kyb) if(allocated(self%heads)) deallocate(self%heads) end subroutine fm_deallocate !> Debug printing subroutine fm_debug_print(self) implicit none class(fieldmat_type), intent(in) :: self write(dlun,'("Field debug print. naky=",I0)') & self%naky end subroutine fm_debug_print !> Just work out the locality (also sets is_empty etc. but is not intended for this) subroutine fm_set_is_local(self, pc) implicit none class(fieldmat_type), intent(inout) :: self class(pc_type), intent(in) :: pc integer :: ik !Set the locality of children do ik=1,self%naky call self%kyb(ik)%set_locality(pc) enddo !Set our locality self%is_empty=all(self%kyb%is_empty) self%is_local=any(self%kyb%is_local) self%is_all_local=all(self%kyb%is_local) end subroutine fm_set_is_local !> Initialise the field objects subroutine fm_init(self) use kt_grids, only: naky, ntheta0 use dist_fn, only: get_leftmost_it, n_supercells use run_parameters, only: has_phi, has_apar, has_bpar use theta_grid, only: ntgrid implicit none class(fieldmat_type), intent(inout) :: self integer :: it, cur_id, it_start, is_count, ik integer, dimension(:), allocatable :: tmp_supercell_ids integer, dimension(:,:), allocatable :: supercell_ids integer, dimension(:,:), allocatable :: is_to_itmin !Count the fields self%nfield = count([has_phi, has_apar, has_bpar]) !Allocate temp arrays allocate(supercell_ids(ntheta0, naky)) ! Here the first dimension is allocated as ntheta0 ! but for each ky we only use the first nsupercell(ik) ! entries. allocate(is_to_itmin(ntheta0, naky)) !Initialise arrays is_to_itmin = 0 supercell_ids = -1 !Find the leftmost it for each supercell and use this to label the !supercell. Note that this is different from the supercell_label !from dist_fn. do ik = 1, naky do it = 1, ntheta0 supercell_ids(it, ik) = get_leftmost_it(it, ik) end do end do !Now work out how many supercells there are for each !ik and what their properties are. 
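    ! As an illustrative example (values invented for clarity): if for some ik
    ! get_leftmost_it returned (1, 1, 3, 3, 5) for it = 1..5, the loop below
    ! would identify three supercells for that ik and store the leftmost it
    ! values 1, 3 and 5 in is_to_itmin(1:3, ik).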
do ik = 1, naky !Store the leftmost it values for current ik tmp_supercell_ids = supercell_ids(:, ik) !Now loop over all it values and count how many !have which it as the leftmost one. it_start = 0 is_count = 0 do while (any(tmp_supercell_ids >= 0)) !Increase the it value it_start = it_start + 1 !Get the leftmost it for supercell with it_start in it cur_id = tmp_supercell_ids(it_start) !If we've seen this supercell then cycle if (cur_id == -1) cycle !Increase supercell counter is_count = is_count + 1 !Store the leftmost it value is_to_itmin(is_count, ik) = cur_id !Set all matching entries to -1 so we know to skip !this supercell again do it = it_start, ntheta0 if (tmp_supercell_ids(it) == cur_id) then tmp_supercell_ids(it) = -1 end if end do end do end do !Free memory deallocate(tmp_supercell_ids) !Now we want to allocate the basic field structure !Store the basic properties self%naky = naky self%ntheta0 = ntheta0 self%npts = naky * ntheta0 * (2 * ntgrid + 1) self%nbound = 0 !Calculated whilst we create objects allocate(self%kyb(naky)) !/Make the kyb objects do ik = 1, naky call self%kyb(ik)%init(ik, is_to_itmin(:, ik), n_supercells(ik), self%nfield, self%nbound) end do !Free memory deallocate(supercell_ids, is_to_itmin) end subroutine fm_init !> Find the response of g to delta-fn field perturbations !! and store at the row level subroutine fm_populate(self, pc) use run_parameters, only: has_phi, has_apar, has_bpar use fields_arrays, only: phi, apar, bpar, phinew, aparnew, bparnew use dist_fn_arrays, only: g use kt_grids, only: kwork_filter use array_utils, only: zero_array implicit none class(fieldmat_type), intent(inout) :: self class(pc_type), intent(in) :: pc integer :: ifl, pts_remain, ik, is if(self%no_populate) return !First initialise everything to 0 call zero_array(g) call zero_array(phi) ; call zero_array(phinew) call zero_array(apar) ; call zero_array(aparnew) call zero_array(bpar) ; call zero_array(bparnew) !Initialise field counter ifl=0 !Do phi if(has_phi) then !Set the number of points left pts_remain=self%npts-self%nbound !Increment the field counter ifl=ifl+1 !Reset the filter array kwork_filter=.false. !Reset the init state arrays self%kyb%initdone=.false. do ik=1,self%naky self%kyb(ik)%supercells%initdone=.false. do is=1,self%kyb(ik)%nsupercell self%kyb(ik)%supercells(is)%initialised=.false. enddo enddo !Now loop over all points and initialise do while(pts_remain>0) call self%init_next_field_points(phinew,pts_remain,kwork_filter,ifl,pc) enddo endif !Do apar if(has_apar) then !Set the number of points left pts_remain=self%npts-self%nbound !Increment the field counter ifl=ifl+1 !Reset the filter array kwork_filter=.false. !Reset the init state arrays self%kyb%initdone=.false. do ik=1,self%naky self%kyb(ik)%supercells%initdone=.false. do is=1,self%kyb(ik)%nsupercell self%kyb(ik)%supercells(is)%initialised=.false. enddo enddo !Now loop over all points and initialise do while(pts_remain>0) call self%init_next_field_points(aparnew,pts_remain,kwork_filter,ifl,pc) enddo endif !Do bpar if(has_bpar) then !Set the number of points left pts_remain=self%npts-self%nbound !Increment the field counter ifl=ifl+1 !Reset the filter array kwork_filter=.false. !Reset the init state arrays self%kyb%initdone=.false. do ik=1,self%naky self%kyb(ik)%supercells%initdone=.false. do is=1,self%kyb(ik)%nsupercell self%kyb(ik)%supercells(is)%initialised=.false. 
enddo enddo !Now loop over all points and initialise do while(pts_remain>0) call self%init_next_field_points(bparnew,pts_remain,kwork_filter,ifl,pc) enddo endif !Reset the filter array kwork_filter=.false. end subroutine fm_populate !> Initialise the next set of delta functions, find the response !! and store in the appropriate rows. subroutine fm_init_next_field_points(self,field,pts_remain,kwork_filter,ifl,pc) use kt_grids, only: naky, ntheta0 use theta_grid, only: ntgrid use dist_fn, only: getfieldeq, getfieldeq_nogath, timeadv use dist_fn_arrays, only: fieldeq, fieldeqa, fieldeqp use fields_arrays, only: phi, apar, bpar, phinew, aparnew, bparnew use array_utils, only: zero_array implicit none class(fieldmat_type), intent(inout) :: self complex, dimension(-ntgrid:ntgrid,ntheta0,naky), intent(inout) :: field integer, intent(inout) :: pts_remain integer, intent(in) :: ifl logical, dimension(ntheta0, naky), intent(inout) :: kwork_filter class(pc_type), intent(in) :: pc integer :: ik, is, iex, ic, iex_init, ig, it integer :: it_cur, ig_cur integer :: pts_set_here logical :: found_ig, found_it pts_set_here = 0 !Loop over all ik !$OMP PARALLEL DO DEFAULT(none) & !$OMP PRIVATE(ik, is, ic, iex_init, iex, ig, it) & !$OMP SHARED(self, kwork_filter, field, ntgrid) & !$OMP REDUCTION(+:pts_set_here) & !$OMP SCHEDULE(static) do ik = 1, self%naky !Technically should be able to skip ik values which are not local (though could be empty) !/would have to remember to decrease pts_remain !If we've finished this ky set the !filter and move on to next ik if (self%kyb(ik)%initdone) then kwork_filter(:, ik) = .true. cycle end if !Now look at all supercells with this ik do is = 1, self%kyb(ik)%nsupercell !If we've finished with this supercell !then set the filter and move on to next if (self%kyb(ik)%supercells(is)%initdone) then !Set filter for all it on supercell do ic = 1, self%kyb(ik)%supercells(is)%ncell kwork_filter(self%kyb(ik)%supercells(is)%cells(ic)%it_ind, ik) = .true. end do cycle end if !//If we get to this point then we know that we have at least !one grid point on supercell for which we've not made a delta-fn !Now look for the first point that we've not initialised iex_init = 0 do iex = 1, self%kyb(ik)%supercells(is)%nextend !If we've already done this point then cycle if (self%kyb(ik)%supercells(is)%initialised(iex)) cycle !Else this is the first point so store it and exit iex_init = iex exit end do !If this is the last point then mark supercell as done !but don't set filter yet, this will be automatically !applied on next loop if (iex_init == self%kyb(ik)%supercells(is)%nextend) then self%kyb(ik)%supercells(is)%initdone = .true. endif !Mark this point as initialised self%kyb(ik)%supercells(is)%initialised(iex_init) = .true. !Increment the number of points handled in this call pts_set_here = pts_set_here + 1 !Get the ig and it indices corresponding to this point call self%kyb(ik)%supercells(is)%iex_to_ig(iex_init,ig) call self%kyb(ik)%supercells(is)%iex_to_it(iex_init,it) !Initialise field field(ig, it, ik) = 1.0 !Set the duplicate boundary point if we're at the left !boundary of this call and we are not the leftmost cell in !the chain. if ((ig == -ntgrid) .and. & (it /= self%kyb(ik)%supercells(is)%it_leftmost)) then !Get the left connection it = self%kyb(ik)%supercells(is)%get_left_it(it) field(ntgrid, it, ik) = 1.0 end if end do !Ends loop over supercells !Decide if we've done this kyb self%kyb(ik)%initdone = all(self%kyb(ik)%supercells%initdone) end do !Ends loop over ik !$OMP END PARALLEL DO ! 
Update the number of points remaining pts_remain = pts_remain - pts_set_here !Now do the time advance call timeadv(phi, apar, bpar, phinew, aparnew, bparnew, 0) !Now get the field equations !In some(/most) situations the nogath version should !be ok, and will be better for scaling. if (pc%has_to_gather) then call getfieldeq(phinew, aparnew, bparnew, fieldeq, fieldeqa, fieldeqp) else ! call getfieldeq_nogath(phinew,aparnew,bparnew,fq,fqa,fqp) !<DD>Could improve performance by using a "smart" routine which only operates on local/not empty data call self%getfieldeq_nogath(phinew, aparnew, bparnew, fieldeq, fieldeqa, fieldeqp) end if !Now we need to store the field equations in the appropriate places !$OMP PARALLEL DO DEFAULT(none) & !$OMP PRIVATE(ik, ig_cur, it_cur, found_ig, found_it, it, is) & !$OMP SHARED(self, field, ntgrid, ntheta0, ifl, fieldeq, fieldeqa, fieldeqp) & !$OMP SCHEDULE(static) do ik = 1, self%naky !If no data then skip ik -- because we don't skip setting !the field in the first loop of this routine but we do skip !zeroing it out here on processors which don't have any interest !in this ky, we need to zero the field after this loop to !ensure it has been fully reset to 0 on all procs. We might !consider skipping the setting of the field in the first loop !instead, although this could mean the field is different on !different processors (which should be fine) and might complicate !how we track the number of points we need to visit etc. if (self%kyb(ik)%is_empty) cycle !Loop until we've done all supercells ! Note we dont use gs2_max_abs here as we are already inside ! an OpenMP region. do while (maxval(abs(field(: , : ,ik))) > 0) !Reset variables ig_cur = -ntgrid-1 it_cur = 0 found_ig = .false. found_it = .false. !Look for first entry which was set to a delta-fn theta0: do it = 1, ntheta0 ! Find the first theta grid point equal to 1.0 ig_cur = findloc(field(:, it, ik), cmplx(1.0, 0.0), dim = 1) found_ig = ig_cur > 0 ! If we didn't find a theta grid value move on to ! next it if (.not. found_ig) cycle theta0 ! Adjust ig_cur to correct index range (findloc ! assumes arrays are 1 indexed) ig_cur = ig_cur - (ntgrid + 1) if (found_ig) then it_cur = it found_it = .true. exit theta0 end if end do theta0 !Now check we've found a point, else cycle !NOTE: If this triggers we're probably in trouble !as nothing will be different on the next iteration !and hence we will be stuck in an infinite loop. !As such this should really trigger an abort. if (.not. found_it) cycle !Zero out the field at this point so we ignore !it on future iterations field(ig_cur, it_cur, ik)=0.0 !Find the appropriate supercell is = self%kyb(ik)%is_from_it(it_cur) !Also zero out the boundary point if needed if((ig_cur == -ntgrid) .and. & (it_cur /= self%kyb(ik)%supercells(is)%it_leftmost)) then it = self%kyb(ik)%supercells(is)%get_left_it(it_cur) field(ntgrid, it, ik) = 0.0 end if !Now store data call self%kyb(ik)%store_fq(fieldeq(:, :, ik), fieldeqa(:, :, ik), & fieldeqp(:, :, ik), ifl, it_cur, ig_cur) end do !Ends loop over points end do !Ends loop over ky !$OMP END PARALLEL DO ! 
    ! See comment in above loop for why this is needed
    call zero_array(field)
  end subroutine fm_init_next_field_points

  !> Prepare the field matrix for calculating field updates
  subroutine fm_prepare(self, pc)
    implicit none
    class(fieldmat_type), intent(inout) :: self
    class(pc_type), intent(in) :: pc
    integer :: ik
    !Exit early if we're empty
    if(self%is_empty) return
    if(.not.self%is_local) return
    if(self%no_prepare) return
    !Tell each ky to prepare
    !This might be a good place to add OpenMP as each prepare (inversion)
    !has the potential to be quite slow. However, as the cost per inversion
    !can be quite variable (due to very variable length supercell domains)
    !it may make more sense to OpenMP at the next level down (i.e. the kyb)
    !so we're more likely to be sharing work a bit more fairly. Also, at scale
    !one might expect most of the ky to be "empty" on a given processor.
    do ik=1,self%naky
       call self%kyb(ik)%prepare(self%prepare_type, pc)
    enddo
  end subroutine fm_prepare

  !> A routine to calculate the update to the fields
  subroutine fm_get_field_update(self, phi, apar, bpar, pc)
    use dist_fn, only: getfieldeq, getfieldeq_nogath
    use dist_fn_arrays, only: fieldeq, fieldeqa, fieldeqp
    implicit none
    class(fieldmat_type), intent(inout) :: self
    complex, dimension(:,:,:), intent(inout) :: phi,apar,bpar
    class(pc_type), intent(in) :: pc
    integer :: ik

    !First get the field eq
    if(pc%has_to_gather)then
       call getfieldeq(phi,apar,bpar,fieldeq,fieldeqa,fieldeqp)
    else
       !<DD>Could improve performance by using a "smart" routine which only operates on local/not empty data
       call self%getfieldeq_nogath(phi,apar,bpar,fieldeq,fieldeqa,fieldeqp)
    endif

    ! Now get the field update for each ik. Note that we don't OpenMP
    ! here as at scale the majority of kyb will not be local so it is
    ! expected to be better to OpenMP at the supercell level when
    ! iterating over cells.
    do ik=1,self%naky
       !Skip non-local ky
       if(.not.self%kyb(ik)%is_local) cycle
       !Trigger the supercell field updates
       !AJ Non-blocking can be done here
       call self%kyb(ik)%get_field_update(fieldeq(:,:,ik),fieldeqa(:,:,ik),fieldeqp(:,:,ik))
    enddo

    !AJ Wait for collective comms here (these are set up in get_field_update)
    if(field_local_nonblocking_collectives) then
       !$OMP PARALLEL DO DEFAULT(none) &
       !$OMP PRIVATE(ik) &
       !$OMP SHARED(self) &
       !$OMP SCHEDULE(static)
       do ik=1,self%naky
          !Skip non-local ky
          if(.not.self%kyb(ik)%is_local) cycle
          call self%kyb(ik)%wait_for_nonblocking_collectives()
       enddo
       !$OMP END PARALLEL DO
    end if
  end subroutine fm_get_field_update

  !> A routine to unpack the supercell tmp_sum vectors to !!
full field arrays subroutine fm_unpack_to_field(self,ph,ap,bp) use theta_grid, only: ntgrid use run_parameters, only: has_phi, has_apar, has_bpar use mp, only: proc0 implicit none class(fieldmat_type), intent(inout) :: self complex,dimension(-ntgrid:,:,:), intent(inout) :: ph,ap,bp integer :: ik,is,ic,iex,ig,it,bnd,ifl !We don't need to unpack if we're proc0 as we've done this already if(.not.proc0)then !$OMP PARALLEL DO DEFAULT(none) & !$OMP PRIVATE(ik, is, iex, ifl, ic, it, bnd, ig) & !$OMP SHARED(self, ph, ap, bp, has_phi, has_apar, has_bpar, ntgrid) & !$OMP SCHEDULE(static) do ik=1,self%naky if(.not.self%kyb(ik)%is_local) cycle do is=1,self%kyb(ik)%nsupercell if(.not.self%kyb(ik)%supercells(is)%is_local) cycle iex=0 ifl=0 if(has_phi)then ifl=ifl+1 do ic=1,self%kyb(ik)%supercells(is)%ncell it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind bnd=0 if(ic/=self%kyb(ik)%supercells(is)%ncell) bnd=1 do ig=-ntgrid,ntgrid-bnd iex=iex+1 ph(ig,it,ik)=self%kyb(ik)%supercells(is)%tmp_sum(iex) enddo enddo !/Fix boundary points if(self%kyb(ik)%supercells(is)%ncell>1) then do ic=1,self%kyb(ik)%supercells(is)%ncell-1 ph(ntgrid,self%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik)=& ph(-ntgrid,self%kyb(ik)%supercells(is)%cells(ic+1)%it_ind,ik) enddo endif endif if(has_apar)then ifl=ifl+1 do ic=1,self%kyb(ik)%supercells(is)%ncell it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind bnd=0 if(ic/=self%kyb(ik)%supercells(is)%ncell) bnd=1 do ig=-ntgrid,ntgrid-bnd iex=iex+1 ap(ig,it,ik)=self%kyb(ik)%supercells(is)%tmp_sum(iex) enddo enddo !/Fix boundary points if(self%kyb(ik)%supercells(is)%ncell>1) then do ic=1,self%kyb(ik)%supercells(is)%ncell-1 ap(ntgrid,self%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik)=& ap(-ntgrid,self%kyb(ik)%supercells(is)%cells(ic+1)%it_ind,ik) enddo endif endif if(has_bpar)then ifl=ifl+1 do ic=1,self%kyb(ik)%supercells(is)%ncell it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind bnd=0 if(ic/=self%kyb(ik)%supercells(is)%ncell) bnd=1 do ig=-ntgrid,ntgrid-bnd iex=iex+1 bp(ig,it,ik)=self%kyb(ik)%supercells(is)%tmp_sum(iex) enddo enddo !/Fix boundary points if(self%kyb(ik)%supercells(is)%ncell>1) then do ic=1,self%kyb(ik)%supercells(is)%ncell-1 bp(ntgrid,self%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik)=& bp(-ntgrid,self%kyb(ik)%supercells(is)%cells(ic+1)%it_ind,ik) enddo endif endif enddo enddo !$OMP END PARALLEL DO endif end subroutine fm_unpack_to_field !> A routine to reset the object subroutine fm_reset(self) use mp, only: free_comm implicit none class(fieldmat_type), intent(inout) :: self integer :: ik !Call reset on all children do ik=1,self%naky call self%kyb(ik)%reset enddo !deallocate call self%deallocate !AJ Free the communicators associated with this field matrix object. 
if(self%fm_sub_headsc_p0%nproc>0) then call free_comm(self%fm_sub_headsc_p0) end if !Could zero out variables but no real need end subroutine fm_reset !> Count how many subcommunicators will be created subroutine fm_count_subcom(self) use mp, only: iproc, barrier, nproc implicit none class(fieldmat_type), intent(inout) :: self integer :: ik, is, ip integer :: nsub_tot, nsub_proc !Initialise nsub_tot=2 !Subcom belonging to fieldmat object nsub_proc=nsub_tot !Now get children do ik=1,self%naky nsub_tot=nsub_tot+2 if(self%kyb(ik)%is_local) nsub_proc=nsub_proc+2 do is=1,self%kyb(ik)%nsupercell nsub_tot=nsub_tot+2 if(self%kyb(ik)%supercells(is)%is_local) nsub_proc=nsub_proc+2 enddo enddo !Now report if(iproc==0) write(dlun,'("Total number of subcommunicators is ",I0)') nsub_tot do ip=0,nproc-1 if(ip==iproc) write(dlun,'("Number of subcommunicators on ",I0," is ",I0)') iproc, nsub_proc call barrier enddo if(iproc==0)write(dlun,'()') end subroutine fm_count_subcom !> Create all the necessary subcommunicators subroutine fm_make_subcom_1(self) use mp, only: iproc, nproc, mp_comm, proc0 implicit none class(fieldmat_type), intent(inout) :: self integer :: ik !Set the all comm to be mp_comm self%fm_sub_all%id=mp_comm self%fm_sub_all%iproc=iproc self%fm_sub_all%nproc=nproc self%fm_sub_all%proc0=proc0 !NOTE: We may actually want to allow a communicator to be given as input !as this would allow us to split off a group of procs to do the field !calculations for separate fieldmat objects. This may be one way to look !at preparing response matrices for different time steps etc. !NOTE: We'd need to take this into account in the decomposition routines !as well where we split up the work. and also in any fieldmat routine which uses !iproc,nproc,proc0 etc. !NOTE: Best way to achieve this is to always reference self%fm_sub_all when wanting !iproc,nproc,proc0 etc. !Set the parents and make their subcommunicators do ik=1,self%naky !Set parent subcom self%kyb(ik)%parent_sub=self%fm_sub_all !Make individual subcom call self%kyb(ik)%make_subcom_1 enddo end subroutine fm_make_subcom_1 !> Create the secondary subcommunicators subroutine fm_make_subcom_2(self) use mp, only: split, free_comm, mp_undefined implicit none class(fieldmat_type), intent(inout) :: self integer :: ik, colour type(comm_type) :: tmp logical :: has_head_sc !Make child subcommunicators do ik=1,self%naky !Make individual subcom call self%kyb(ik)%make_subcom_2 enddo !Now make the supercell head communicator used for field gathers !/First work out if this proc is the head of any supercells has_head_sc=.false. do ik=1,self%naky has_head_sc=has_head_sc.or.any(self%kyb(ik)%supercells%is_head) enddo !/Now split to make a subcom colour = mp_undefined !NOTE: Here we assume that the supercell head will always !have the data we want *but* currently in ky_get_field we don't !store anything on the procs for which the supercell is local but !empty. This should really be changed as whilst empty procs don't !have any field data to calculate they will still need it when !advancing g etc. 
    if(has_head_sc.or.self%fm_sub_all%proc0) colour=1
    call split(colour,tmp,self%fm_sub_all%id)
    if(colour /= mp_undefined) then
       !/Store subcom
       self%fm_sub_headsc_p0=tmp
    endif
  end subroutine fm_make_subcom_2

  !AJ Record the process that is the supercell head associated with a given ik,it point
  !AJ Supercells have a single ik but may have many it points so a given supercell head
  !AJ may be responsible for a range of points
  subroutine fm_calc_sc_heads(self)
    use mp, only: iproc, sum_allreduce
    implicit none
    class(fieldmat_type), intent(inout) :: self
    integer :: ik, is, ic, it
    !AJ There may be a problem here if any supercell has no assigned head.
    !AJ This is the case (I think) for 1,1; but it shouldn't actually cause
    !AJ a problem as 1,1 shouldn't be communicated anyway as only one process
    !AJ is involved in it.
    self%heads = 0
    do ik=1,self%naky
       do is=1,self%kyb(ik)%nsupercell
          if(self%kyb(ik)%supercells(is)%is_head) then
             do ic=1,self%kyb(ik)%supercells(is)%ncell
                it = self%kyb(ik)%supercells(is)%cells(ic)%it_ind
                self%heads(it,ik) = iproc
             end do
          end if
       end do
    end do
    call sum_allreduce(self%heads)
  end subroutine fm_calc_sc_heads

  !> Gather all the fields to proc0/all for diagnostics etc.
  subroutine fm_gather_fields(self,ph,ap,bp,to_all_in,do_allreduce_in)
    use theta_grid, only: ntgrid
    use kt_grids, only: naky, ntheta0
    use mp, only: broadcast, sum_reduce_sub, sum_allreduce, sum_allreduce_sub
    use mp, only: rank_comm, comm_type, proc0
    use run_parameters, only: has_phi, has_apar, has_bpar
    use gs2_layouts, only: g_lo, intspec_sub
    use array_utils, only: zero_array, copy
    use optionals, only: get_option_with_default
    implicit none
    class(fieldmat_type), intent(inout) :: self
    complex, dimension(:,:,:), intent(inout) :: ph,ap,bp
    logical, optional, intent(in) :: to_all_in, do_allreduce_in
    logical :: to_all, do_allreduce
    complex, dimension(:,:,:,:), allocatable :: tmp, tmp_tmp
    integer :: ifl, ik, is, ic, it, iex, bnd, ig
    logical :: use_sub
    type(comm_type), save :: les_comm
    logical, save :: first=.true.
    logical, save :: sub_has_proc0=.false.
    logical :: xy_block_comm_is_available
    logical :: x_is_not_split

    !Set the to_all flag
    to_all = get_option_with_default(to_all_in, .false.)

    !Set the do_allreduce flag
    do_allreduce = get_option_with_default(do_allreduce_in, .false.)

    !Shouldn't do to_all if do_allreduce
    if(do_allreduce) to_all=.false.

    !Work out if we want to use sub communicators in reduction and
    !if we are allowed to.
    xy_block_comm_is_available = .not. (g_lo%x_local .and. g_lo%y_local) .and. intspec_sub
    x_is_not_split = g_lo%x_local
    use_sub=( &
         xy_block_comm_is_available .and. x_is_not_split .and. & !Parallel decomposition constraints
         do_allreduce .and. field_local_allreduce_sub) !Runtime choices

    !If using sub-communicators set up object and work out which procs
    !have proc0 as a member
    if(use_sub.and.first)then
       !Setup type
       les_comm%id=g_lo%lesblock_comm
       call rank_comm(les_comm%id,les_comm%iproc)
       !Decide who has proc0
       ig=0
       if(proc0) ig=1
       call sum_allreduce_sub(ig,les_comm%id)
       if(ig/=0) sub_has_proc0=.true.
    endif
    first=.false.
!Allocate the tmp array if we're part of the head subcom !or if we're sending to all if((self%fm_sub_headsc_p0%nproc>0).or.to_all.or.do_allreduce) then if(use_sub)then !Use a smaller array so we ignore zero entries, really would like to reuse some of the !integrate species code to do reduction and gather but have extra dimension due to fields allocate(tmp_tmp(-ntgrid:ntgrid,g_lo%it_min:g_lo%it_max,g_lo%ik_min:g_lo%ik_max,self%nfield)) else allocate(tmp_tmp(-ntgrid:ntgrid,ntheta0,naky,self%nfield)) endif allocate(tmp(-ntgrid:ntgrid,ntheta0,naky,self%nfield)) call zero_array(tmp_tmp) if(use_sub) call zero_array(tmp) !Only need to zero if tmp_tmp is not same size as tmp endif !Now loop over supercells, and store sections !on procs which are part of the head subcom if(self%fm_sub_headsc_p0%nproc>0)then !$OMP PARALLEL DO DEFAULT(none) & !$OMP PRIVATE(ik, is, iex, ifl, it, bnd, ig) & !$OMP SHARED(self, ntgrid, tmp_tmp) & !$OMP SCHEDULE(static) do ik=1,self%naky do is=1,self%kyb(ik)%nsupercell !Skip if we're not the head supercell if(.not.self%kyb(ik)%supercells(is)%is_head) cycle iex=0 do ifl=1,self%nfield do ic=1,self%kyb(ik)%supercells(is)%ncell it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind bnd=0 if(ic/=self%kyb(ik)%supercells(is)%ncell) bnd=1 do ig=-ntgrid,ntgrid-bnd iex=iex+1 tmp_tmp(ig,it,ik,ifl)=self%kyb(ik)%supercells(is)%tmp_sum(iex) ! if(ifl .eq. 1) write(*,*) self%kyb(ik)%supercells(is)%tmp_sum(iex) enddo enddo enddo !<DD>TAGGED:Worth looking at improving this bad memory pattern !/Now fix boundary points if(self%kyb(ik)%supercells(is)%ncell>1) then do ic=1,self%kyb(ik)%supercells(is)%ncell-1 tmp_tmp(ntgrid,self%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik,:)=& tmp_tmp(-ntgrid,self%kyb(ik)%supercells(is)%cells(ic+1)%it_ind,ik,:) enddo endif enddo enddo !$OMP END PARALLEL DO !Now reduce !Really we should be able to do some form of gather here !as every proc knows how long the supercell is and where to put it ! call sum_allreduce_sub(tmp,self%fm_sub_headsc_p0%id) if(.not.do_allreduce)call sum_reduce_sub(tmp_tmp,0,self%fm_sub_headsc_p0) endif ! write(*,*) 'tmp_tmp',tmp_tmp(:,:,:,1) !Should really be able to do this on the xyblock sub communicator !but proc0 needs to know full result for diagnostics so would need !some way to tell proc0 about the other parts of the field (or !update diagnostics to work in parallel). We have the lesblock comm !which should help with this, but we'd need to work out which procs !are in the same comm as proc0. if(do_allreduce) then if(use_sub) then !This reduction (and copy) ensures that every proc knows the field !in its local x-y cells. It could be implemented as an allgatherv which !would be slightly more efficient but is more complicated and may involve !more local operations which outweigh the benefits. call sum_allreduce_sub(tmp_tmp,g_lo%xyblock_comm) else call sum_allreduce(tmp_tmp) endif endif !Now copy tmp_tmp into tmp if we have it if((self%fm_sub_headsc_p0%nproc>0).or.to_all.or.do_allreduce) then if(use_sub)then tmp(:,g_lo%it_min:g_lo%it_max,g_lo%ik_min:g_lo%ik_max,:)=tmp_tmp !Here we reduce to proc0 of the lesblock comm (which should include the !global proc0). Should only really need to call this on procs which !have proc0 in their subcomm but no way to determine this currently !NOTE: We don't need to know the result of this reduction until we !reach the diagnostics. With non-blocking communications we may be able !to do this required communication in the background whilst we proceed !with the second call to timeadv. 
There is a slight complication in that !we're communicating the change in the fields so we have to remember to !increment the field before diagnostics are called. if(sub_has_proc0) call sum_reduce_sub(tmp,0,les_comm) else call copy(tmp_tmp, tmp) !It would be nice to avoid this copy if possible endif endif !//////////////////////// !// Distribute and unpack !//////////////////////// !Note: Currently use a broadcast from proc0 to distribute data to all procs !May be more efficient to use a scatter type approach so we send the minimum !amount of data. This may end up being slower as we don't take advantage of !optimised broadcast which improves bandwidth (branching), as we're doing 1 to many. !Might expect a broadcast from the supercell head on sc_sub_all to be most !efficient, but it doesn't appear to be better than global broadcast. !As the supercell head has knowledge of the result quite early on in the !field update algorithm a non-blocking broadcast (MPI-3) on sc_sub_all !may be useful and would allow the following broadcasts to be replaced !with wait variants plus unpacking. !Note: By doing the broadcast at supercell level we would end up sending !more data than required (if x is parallel) but to do it at cell level !would require a subcommunicator for each cell, which can easily exceed !the maximum number allowed. !Fetch to all procs if(to_all)then call broadcast(tmp) endif !Finally, unpack tmp and deallocate arrays ! We could perhaps use copy here to add OpenMP acceleration ! but I worry this could result in the creation of a temporary ! in some cases, leading to double copying? if(self%fm_sub_all%proc0.or.to_all.or.do_allreduce)then ifl=0 if(has_phi) then ifl=ifl+1 call copy(tmp(:,:,:,ifl), ph) endif if(has_apar) then ifl=ifl+1 call copy(tmp(:,:,:,ifl), ap) endif if(has_bpar) then ifl=ifl+1 call copy(tmp(:,:,:,ifl), bp) endif endif if(allocated(tmp)) deallocate(tmp) if(allocated(tmp_tmp)) deallocate(tmp_tmp) !/Fill the empties by broadcasting from proc0 if we !didn't do it with the earlier broadcast. Note that here we !have up to three broadcasts whilst with to_all we only have one. !However with to_all we have the additional cost of unpacking data. 
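    !For example, with phi, apar and bpar all evolved this branch issues three
    !separate full-field broadcasts below, whereas to_all issues a single
    !broadcast of tmp (holding nfield=3 field blocks) plus the unpack; with only
    !one field active both approaches come down to a single broadcast.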
!It is possible that for some values of nfield to_all is faster than !.not.to_all, this is also likely to depend on the machine if(.not.(to_all.or.do_allreduce)) then if(has_phi) call broadcast(ph) if(has_apar) call broadcast(ap) if(has_bpar) call broadcast(bp) endif end subroutine fm_gather_fields !> Update the fields using calculated update subroutine fm_update_fields(self,phi,apar,bpar) use fields_arrays, only: orig_phi=>phi, orig_apar=>apar, orig_bpar=>bpar use run_parameters, only: has_phi, has_apar, has_bpar use kt_grids, only: kwork_filter use mp, only: proc0 implicit none class(fieldmat_type), intent(in) :: self complex,dimension(:,:,:),intent(inout) :: phi,apar,bpar integer :: ik,it,is,ic !If we're proc0 then we need to do full array (for diagnostics) if(proc0) then if(has_phi) phi=phi+orig_phi if(has_apar) apar=apar+orig_apar if(has_bpar) bpar=bpar+orig_bpar return endif !Now loop over cells and calculate field equation as required !$OMP PARALLEL DO DEFAULT(none) & !$OMP PRIVATE(ik, is, ic, it) & !$OMP SHARED(self, kwork_filter, has_phi, has_apar, has_bpar, & !$OMP phi, apar, bpar, orig_phi, orig_apar, orig_bpar) & !$OMP SCHEDULE(static) do ik=1,self%naky !Skip not local cells if(.not.self%kyb(ik)%is_local) cycle do is=1,self%kyb(ik)%nsupercell if(.not.self%kyb(ik)%supercells(is)%is_local) cycle do ic=1,self%kyb(ik)%supercells(is)%ncell if(.not.self%kyb(ik)%supercells(is)%cells(ic)%is_local) cycle it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind if(kwork_filter(it,ik)) cycle !Increment fields if(has_phi) phi(:,it,ik)=phi(:,it,ik)+orig_phi(:,it,ik) if(has_apar) apar(:,it,ik)=apar(:,it,ik)+orig_apar(:,it,ik) if(has_bpar) bpar(:,it,ik)=bpar(:,it,ik)+orig_bpar(:,it,ik) enddo enddo enddo !$OMP END PARALLEL DO end subroutine fm_update_fields !> Update the fields using the new fields subroutine fm_update_fields_newstep(self) use fields_arrays, only: phi,apar,bpar,phinew,aparnew,bparnew use run_parameters, only: has_phi, has_apar, has_bpar use kt_grids, only: kwork_filter use mp, only: proc0 use array_utils, only: copy implicit none class(fieldmat_type), intent(in) :: self integer :: ik,it,is,ic !If we're proc0 then we need to do full array (for diagnostics) if(proc0) then if(has_phi) call copy(phinew, phi) if(has_apar) call copy(aparnew,apar) if(has_bpar) call copy(bparnew, bpar) return endif !Now loop over cells and calculate field equation as required !$OMP PARALLEL DO DEFAULT(none) & !$OMP PRIVATE(ik, is, ic, it) & !$OMP SHARED(self, kwork_filter, has_phi, has_apar, has_bpar, & !$OMP phi, apar, bpar, phinew, aparnew, bparnew) & !$OMP SCHEDULE(static) do ik=1,self%naky !Skip not local cells if(.not.self%kyb(ik)%is_local) cycle do is=1,self%kyb(ik)%nsupercell if(.not.self%kyb(ik)%supercells(is)%is_local) cycle do ic=1,self%kyb(ik)%supercells(is)%ncell if(.not.self%kyb(ik)%supercells(is)%cells(ic)%is_local) cycle it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind if(kwork_filter(it,ik)) cycle !Increment fields if(has_phi) phi(:,it,ik)=phinew(:,it,ik) if(has_apar) apar(:,it,ik)=aparnew(:,it,ik) if(has_bpar) bpar(:,it,ik)=bparnew(:,it,ik) enddo enddo enddo !OMP END PARALLEL DO end subroutine fm_update_fields_newstep !> Fetch all condition numbers to proc0 and report on !> worrying values as required. 
subroutine fm_get_condition_numbers(self) use file_utils, only: error_unit use gs2_time, only: code_dt use mp, only: proc0, sum_reduce use unit_tests, only: debug_message implicit none type ky_condition_type real, dimension(:), allocatable :: conditions end type ky_condition_type class(fieldmat_type), intent(in) :: self integer, parameter :: verb = 1 type(ky_condition_type), dimension(:), allocatable :: ky_conditions integer :: ik, is character(len = 100) :: condition_number_message real :: condition_number allocate(ky_conditions(self%naky)) ! Note that here we communciate over comm_world. It is likely that we ! could instead reduce individual `ky_conditions(:)%conditions` arrays ! amongst just the supercell heads of the ky block and then reduce ! across the ky heads to proc0. Currently we don't worry about this as ! we just do this once per initialisation, but this could become expensive ! when running at large scale? do ik = 1, self%naky allocate(ky_conditions(ik)%conditions(self%kyb(ik)%nsupercell)) ky_conditions(ik)%conditions = 0.0 do is = 1, self%kyb(ik)%nsupercell if (self%kyb(ik)%supercells(is)%is_head) then ky_conditions(ik)%conditions(is) = self%kyb(ik)%supercells(is)%condition_number end if end do call sum_reduce(ky_conditions(ik)%conditions, 0) end do if (proc0) then do ik = 1, self%naky do is = 1, self%kyb(ik)%nsupercell condition_number = ky_conditions(ik)%conditions(is) write(condition_number_message,'("Condition number for response with ik = ",I0,", is = ",I0," and dt = ",E14.6E3," is ",E14.6E3)') ik, is, code_dt, condition_number call debug_message(verb, "fields_local::fm_get_condition_numbers "//condition_number_message) if (condition_number > condition_number_warning_limit) then write(error_unit(), '("Warning: ",A)') trim(condition_number_message) end if ! In debug mode always display the condition number message to screen if (debug) then write(*,'(A)') trim(condition_number_message) end if end do end do end if do ik = 1, self%naky deallocate(ky_conditions(ik)%conditions) end do deallocate(ky_conditions) end subroutine fm_get_condition_numbers !> Write some debug data to file subroutine fm_write_debug_data(self) use file_utils, only: open_output_file, close_output_file use mp, only: iproc, nproc_comm, rank_comm implicit none class(fieldmat_type), intent(in) :: self character(len=20) :: ext integer :: unit integer :: ik,is,ic,np,ip !Make the file extension write(ext,'(".debug.",I0,".md")') iproc !Open output file call open_output_file(unit,ext) !Now write data write(unit,'("Field matrix debug data for proc : ",I0)') iproc write(unit,'(80("="))') write(unit,'()') !/Here we have details of what cells this proc can see and how !they're marked write(unit,'("Details of object locality and state")') write(unit,'(80("-"))') write(unit,'()') write(unit,'(7(" ",A13," "))') "ik", "is", "ic", "is_local", "is_empty", "is_all_local", "is_head" write(unit,'(7(" ",14("-")))') do ik=1,self%naky do is=1,self%kyb(ik)%nsupercell do ic=1,self%kyb(ik)%supercells(is)%ncell write(unit,'(3(I14," "),4(L14," "))') ik,is,ic,& self%kyb(ik)%supercells(is)%cells(ic)%is_local,& self%kyb(ik)%supercells(is)%cells(ic)%is_empty,& self%kyb(ik)%supercells(is)%cells(ic)%is_all_local,& self%kyb(ik)%supercells(is)%is_head enddo enddo enddo write(unit,'(" ",7(14("-")," "))') write(unit,'()') !/Here we see what limits the row blocks have for the cells !we actually have write(unit,'()') write(unit,'("Row limits")') write(unit,'(80("-"))') write(unit,'()') do ik=1,self%naky write(unit,'("* ik : ",I0)') ik ! 
if(.not.self%kyb(ik)%is_local) cycle do is=1,self%kyb(ik)%nsupercell write(unit,'(" * is : ",I0)') is ! if(.not.self%kyb(ik)%supercells(is)%is_local) cycle write(unit,'(" ","is_head - ",L1)') self%kyb(ik)%supercells(is)%is_head write(unit,'(" ","is_empty - ",L1)') self%kyb(ik)%supercells(is)%is_empty write(unit,'(" ","is_local - ",L1)') self%kyb(ik)%supercells(is)%is_local write(unit,'(" ","is_all_local - ",L1)') self%kyb(ik)%supercells(is)%is_all_local write(unit,'()') write(unit,'(" ",4(" ",A8," "))') "ic", "rllim", "rulim", "nrow" write(unit,'(" ",4(" ",9("-")))') do ic=1,self%kyb(ik)%supercells(is)%ncell if(size(self%kyb(ik)%supercells(is)%cells(ic)%rb)>0)then write(unit,'(" ",4(I9," "))') ic,& self%kyb(ik)%supercells(is)%cells(ic)%rb(1)%row_llim,& self%kyb(ik)%supercells(is)%cells(ic)%rb(1)%row_ulim,& self%kyb(ik)%supercells(is)%cells(ic)%rb(1)%nrow endif enddo write(unit,'(" ",4(" ",9("-")))') enddo enddo write(unit,'()') !/Subcommunicators write(unit,'()') write(unit,'("Subcommunicators")') write(unit,'(80("-"))') write(unit,'()') do ik=1,self%naky write(unit,'("* ik : ",I0)') ik do is=1,self%kyb(ik)%nsupercell write(unit,'(" * is : ",I0)') is write(unit,'()') write(unit,'(" ",6(" ",A7," "))') "name", "nproc", "iproc", "proc0", "REnproc", "REiproc" write(unit,'(" ",6(" ",8("-")))') if(self%kyb(ik)%supercells(is)%sc_sub_all%nproc>0)then call nproc_comm(self%kyb(ik)%supercells(is)%sc_sub_all%id,np) call rank_comm(self%kyb(ik)%supercells(is)%sc_sub_all%id,ip) else np=-1 ip=-1 endif write(unit,'(" ",(" ",A8),2(I8," "),1(L8," "),2(I8," "))') "all",& self%kyb(ik)%supercells(is)%sc_sub_all%nproc,& self%kyb(ik)%supercells(is)%sc_sub_all%iproc,& self%kyb(ik)%supercells(is)%sc_sub_all%proc0,& np,ip if(self%kyb(ik)%supercells(is)%sc_sub_pd%nproc>0)then call nproc_comm(self%kyb(ik)%supercells(is)%sc_sub_pd%id,np) call rank_comm(self%kyb(ik)%supercells(is)%sc_sub_pd%id,ip) else np=-1 ip=-1 endif write(unit,'(" ",(" ",A8),2(I8," "),1(L8," "),2(I8," "))') "pd",& self%kyb(ik)%supercells(is)%sc_sub_pd%nproc,& self%kyb(ik)%supercells(is)%sc_sub_pd%iproc,& self%kyb(ik)%supercells(is)%sc_sub_pd%proc0,& np,ip write(unit,'(" ",6(" ",8("-")))') write(unit,'()') enddo enddo write(unit,'()') !Now close output file call close_output_file(unit) end subroutine fm_write_debug_data subroutine fm_getfieldeq_nogath (self,phi, apar, bpar, fieldeq, fieldeqa, fieldeqp) use theta_grid, only: ntgrid use dist_fn, only: getan_nogath, gf_lo_integrate use dist_fn_arrays, only: antot, antota, antotp implicit none class(fieldmat_type), intent(in) :: self complex, dimension (-ntgrid:,:,:), intent (in) :: phi, apar, bpar complex, dimension (-ntgrid:,:,:), intent (in out) ::fieldeq,fieldeqa,fieldeqp call getan_nogath(antot, antota, antotp) !AJ If we are integrating in the gf_lo data distribution then the results of antot etc... from !AJ getan_nogath above will be in gf_lo distribution, perform a communication step to redistribute the !AJ result so the owners of the data in g_lo end up with the data. if(gf_lo_integrate) then call self%reduce_an(antot, antota, antotp) end if call self%getfieldeq1_nogath (phi, apar, bpar, antot, antota, antotp, & fieldeq, fieldeqa, fieldeqp) end subroutine fm_getfieldeq_nogath !AJ This routine takes in data that has been calculated in gf_lo layout (i.e. the antot etc... arrays !AJ have been partitioned in gf_lo format so that each naky,ntheta point is on a single process) and !AJ redistributes it to a g_lo layout (i.e. 
so the processes that own the naky,ntheta points in g_lo !AJ have the data at the end). subroutine fm_reduce_an_head_send_broadcast (self,antot, antota, antotp) use mp, only: broadcast, iproc, nbsend, nbrecv, initialise_requests, waitall, broadcast_sub use theta_grid, only: ntgrid use run_parameters, only: has_phi, has_apar, has_bpar use gs2_layouts, only: gf_lo, proc_id, idx, ik_idx, it_idx use kt_grids, only: kwork_filter implicit none class(fieldmat_type), intent(in) :: self complex, dimension (-ntgrid:,:,:), intent (in out) :: antot, antota, antotp integer :: igf, ik, it, ic, is, root, sendreqnum, recvreqnum, sender, receiver, tag, taglim integer, dimension (gf_lo%xypoints*3) :: send_handles !AJ This array is bigger than it needs to be but does mean we do not !AJ need to calculate how many supercells a give process is head for integer, dimension (gf_lo%naky*gf_lo%ntheta0*3) :: recv_handles integer :: tempit, tempis, tempismax complex, dimension(:,:,:,:,:), allocatable :: tempdata tempis = 0 tempismax = 0 do ik = 1,self%naky tempis = self%kyb(ik)%nsupercell if(tempis > tempismax) then tempismax = tempis end if end do allocate(tempdata(-ntgrid:ntgrid,gf_lo%ntheta0,self%nfield,gf_lo%naky,tempismax)) call initialise_requests(send_handles) call initialise_requests(recv_handles) recvreqnum = 1 taglim = self%naky*self%ntheta0+1 !AJ Iterate through the supercells and each supercell that I am a head for !AJ post a receive for data for a given element of the field array. do ik = 1,self%naky do is=1,self%kyb(ik)%nsupercell if(self%kyb(ik)%supercells(is)%is_head) then tempit = 1 do ic=1,self%kyb(ik)%supercells(is)%ncell it = self%kyb(ik)%supercells(is)%cells(ic)%it_ind if(kwork_filter(it,ik)) cycle !AJ Calculate which process owns this it,ik point in gf_lo space !AJ i.e. which process will be sending the data to this supercell head sender = proc_id(gf_lo,idx(gf_lo,ik,it)) if(sender /= iproc) then tag = ik*self%ntheta0 + it if(has_phi) then call nbrecv(tempdata(:,tempit,1,ik,is),sender,tag,recv_handles(recvreqnum)) recvreqnum = recvreqnum + 1 end if tag = tag + taglim if(has_apar) then call nbrecv(tempdata(:,tempit,2,ik,is),sender,tag,recv_handles(recvreqnum)) recvreqnum = recvreqnum + 1 end if tag = tag + taglim if(has_bpar) then call nbrecv(tempdata(:,tempit,3,ik,is),sender,tag,recv_handles(recvreqnum)) recvreqnum = recvreqnum + 1 end if else if(has_phi) then tempdata(:,tempit,1,ik,is) = antot(:,it,ik) end if if(has_apar) then tempdata(:,tempit,2,ik,is) = antota(:,it,ik) end if if(has_bpar) then tempdata(:,tempit,3,ik,is) = antotp(:,it,ik) end if end if tempit = tempit + 1 end do end if end do end do sendreqnum = 1 !AJ Iterate through all the points I own in gf_lo and send that part of the field !AJ arrays to the process that is the supercell head for the supercell that contains !AJ that it, ik point. 
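    ! A worked example of the tag scheme shared by this send loop and the
    ! receive loop above (numbers invented for illustration): with naky = 4 and
    ! ntheta0 = 8 we have taglim = 4*8 + 1 = 33, so the phi message for
    ! (it=3, ik=2) uses tag = 2*8 + 3 = 19, the matching apar message uses
    ! 19 + 33 = 52 and the bpar message 52 + 33 = 85, keeping every
    ! (field, it, ik) combination on a distinct tag.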
do igf = gf_lo%llim_proc, gf_lo%ulim_proc ik = ik_idx(gf_lo,igf) it = it_idx(gf_lo,igf) if(kwork_filter(it,ik)) cycle tag = ik*self%ntheta0 + it receiver = self%heads(it,ik) if(iproc /= receiver) then if(has_phi) then call nbsend(antot(:,it,ik),receiver,tag,send_handles(sendreqnum)) sendreqnum = sendreqnum + 1 end if tag = tag + taglim if(has_apar) then call nbsend(antota(:,it,ik),receiver,tag,send_handles(sendreqnum)) sendreqnum = sendreqnum + 1 end if tag = tag + taglim if(has_bpar) then call nbsend(antotp(:,it,ik),receiver,tag,send_handles(sendreqnum)) sendreqnum = sendreqnum + 1 end if end if end do call waitall(sendreqnum-1,send_handles) call waitall(recvreqnum-1,recv_handles) do ik = 1,self%naky if(.not.self%kyb(ik)%is_local) cycle do is=1,self%kyb(ik)%nsupercell if(.not. self%kyb(ik)%supercells(is)%is_local) cycle root = self%kyb(ik)%supercells(is)%head_iproc_pd if((self%kyb(ik)%supercells(is)%is_empty.or.self%kyb(ik)%supercells(is)%is_all_local)) cycle tempit = 1 do ic=1,self%kyb(ik)%supercells(is)%ncell it = self%kyb(ik)%supercells(is)%cells(ic)%it_ind if(kwork_filter(it,ik)) cycle tempit = tempit + 1 end do call broadcast_sub(tempdata(:,1:tempit,1:self%nfield,ik,is),root,self%kyb(ik)%supercells(is)%sc_sub_pd%id) if(has_phi) then tempit = 1 do ic=1,self%kyb(ik)%supercells(is)%ncell it = self%kyb(ik)%supercells(is)%cells(ic)%it_ind if(kwork_filter(it,ik)) cycle antot(:,it,ik) = tempdata(:,tempit,1,ik,is) tempit = tempit + 1 end do end if if(has_apar) then tempit = 1 do ic=1,self%kyb(ik)%supercells(is)%ncell it = self%kyb(ik)%supercells(is)%cells(ic)%it_ind if(kwork_filter(it,ik)) cycle antota(:,it,ik) = tempdata(:,tempit,2,ik,is) tempit = tempit + 1 end do end if if(has_bpar) then tempit = 1 do ic=1,self%kyb(ik)%supercells(is)%ncell it = self%kyb(ik)%supercells(is)%cells(ic)%it_ind if(kwork_filter(it,ik)) cycle antotp(:,it,ik) = tempdata(:,tempit,3,ik,is) tempit = tempit + 1 end do end if end do end do deallocate(tempdata) end subroutine fm_reduce_an_head_send_broadcast subroutine fm_check_an (self,antot, tempantot, antota, tempantota, antotp, tempantotp) use mp, only: broadcast_sub, iproc, sum_allreduce_sub, mp_abort use theta_grid, only: ntgrid use run_parameters, only: has_phi, has_apar, has_bpar use gs2_layouts, only: proc_id, idx use kt_grids, only: kwork_filter implicit none class(fieldmat_type), intent(in) :: self complex, dimension (-ntgrid:,:,:), intent (in) :: antot, antota, antotp, tempantot, tempantota, tempantotp complex, parameter :: tol = (1e-14,1e-14) integer :: ik, it, is, ic, ig loop1: do ik=1,self%naky !Skip empty cells, note this is slightly different to skipping !.not.is_local. Skipping empty is probably faster but may be more dangerous if(self%kyb(ik)%is_empty) cycle loop1 do is=1,self%kyb(ik)%nsupercell if(self%kyb(ik)%supercells(is)%is_empty) cycle do ic=1,self%kyb(ik)%supercells(is)%ncell if(self%kyb(ik)%supercells(is)%cells(ic)%is_empty) cycle it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind if(kwork_filter(it,ik)) cycle if(has_phi) then do ig=-ntgrid,ntgrid if(abs(aimag(antot(ig,it,ik)-tempantot(ig,it,ik)))>aimag(tol) .or. abs(real(antot(ig,it,ik)-tempantot(ig,it,ik)))>real(tol)) then write(*,*) iproc,'problem with antot',it,ik,antot(ig,it,ik),tempantot(ig,it,ik) call mp_abort('Problem with antot') exit loop1 end if end do endif if(has_apar) then do ig=-ntgrid,ntgrid if(abs(aimag(antota(ig,it,ik)-tempantota(ig,it,ik)))>aimag(tol) .or. 
abs(real(antota(ig,it,ik)-tempantota(ig,it,ik)))>real(tol)) then write(*,*) iproc,'problem with antota',it,ik,antota(ig,it,ik),tempantota(ig,it,ik) call mp_abort('Problem with antota') exit loop1 end if end do endif if(has_bpar)then do ig=-ntgrid,ntgrid if(abs(aimag(antotp(ig,it,ik)-tempantotp(ig,it,ik)))>aimag(tol) .or. abs(real(antotp(ig,it,ik)-tempantotp(ig,it,ik)))>real(tol)) then write(*,*) iproc,'problem with antotp',it,ik,antotp(ig,it,ik),tempantotp(ig,it,ik) call mp_abort('Problem with antotp') exit loop1 end if end do end if end do enddo enddo loop1 end subroutine fm_check_an subroutine fm_getfieldeq1_nogath (self,phi, apar, bpar, antot, antota, antotp, & fieldeq, fieldeqa, fieldeqp) use theta_grid, only: ntgrid, bmag use kt_grids, only: naky, ntheta0, kperp2, kwork_filter use run_parameters, only: has_phi, has_apar, has_bpar, beta use species, only: spec, has_electron_species use dist_fn, only: gamtot,gamtot1, gamtot2, fl_avg, apfac use dist_fn, only: adiabatic_option_switch, adiabatic_option_fieldlineavg, calculate_flux_surface_average implicit none class(fieldmat_type), intent(in) :: self complex, dimension (-ntgrid:,:,:), intent (in) :: phi, apar, bpar complex, dimension (-ntgrid:,:,:), intent (in) :: antot, antota, antotp complex, dimension (-ntgrid:,:,:), intent (in out) ::fieldeq,fieldeqa,fieldeqp integer :: ik, it, is, ic if (.not. allocated(fl_avg)) allocate (fl_avg(ntheta0, naky)) fl_avg = 0. if (.not. has_electron_species(spec)) then if (adiabatic_option_switch == adiabatic_option_fieldlineavg) then call calculate_flux_surface_average(fl_avg,antot) end if end if !Now loop over cells and calculate field equation as required. !Note we only populate the parts of the fieldeq arrays which !are local to this processor. !$OMP PARALLEL DO DEFAULT(none) & !$OMP PRIVATE(ik, it, is, ic) & !$OMP SHARED(self, kwork_filter, has_phi, antot, gamtot, fl_avg, & !$OMP fieldeq, bpar, gamtot1, has_apar, has_bpar, fieldeqa, antota, kperp2, apar, & !$OMP fieldeqp, gamtot2, beta, apfac, bmag, phi, antotp) & !$OMP SCHEDULE(static) do ik=1,self%naky !Skip empty cells, note this is slightly different to skipping !.not.is_local. 
Skipping empty is probably faster but may be more dangerous if(self%kyb(ik)%is_empty) cycle do is=1,self%kyb(ik)%nsupercell if(self%kyb(ik)%supercells(is)%is_empty) cycle do ic=1,self%kyb(ik)%supercells(is)%ncell if(self%kyb(ik)%supercells(is)%cells(ic)%is_empty) cycle it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind if(kwork_filter(it,ik)) cycle !Now fill in data if(has_phi) then fieldeq(:,it,ik) = antot(:,it,ik) - & gamtot(:,it,ik)*phi(:,it,ik) + & fl_avg(it, ik) if(has_bpar) fieldeq(:,it,ik)=fieldeq(:,it,ik)+bpar(:,it,ik)*gamtot1(:,it,ik) endif if(has_apar) then fieldeqa(:,it,ik) = antota(:,it,ik) - & kperp2(:,it,ik)*apar(:,it,ik) endif if(has_bpar)then fieldeqp(:,it,ik) = bpar(:,it,ik) + & (antotp(:,it,ik)+bpar(:,it,ik)*gamtot2(:,it,ik) + & 0.5*phi(:,it,ik)*gamtot1(:,it,ik))*(beta*apfac/bmag**2) endif enddo enddo enddo !$OMP END PARALLEL DO end subroutine fm_getfieldeq1_nogath !------------------------------------------------------------------------ !================ !PROC_DECOMP !================ !> Allocate storage space subroutine pc_allocate(self) use kt_grids, only: naky, ntheta0 implicit none class(pc_type), intent(inout) :: self allocate(self%is_local(ntheta0,naky)) allocate(self%itik_subcom(ntheta0,naky)) allocate(self%nproc_per_cell(ntheta0,naky)) allocate(self%nresp_per_cell(ntheta0,naky)) allocate(self%navail_per_cell(ntheta0,naky)) end subroutine pc_allocate !> Deallocate storage space subroutine pc_deallocate(self) implicit none class(pc_type), intent(inout) :: self if(allocated(self%is_local)) deallocate(self%is_local) if(allocated(self%itik_subcom)) deallocate(self%itik_subcom) if(allocated(self%nproc_per_cell)) deallocate(self%nproc_per_cell) if(allocated(self%nresp_per_cell)) deallocate(self%nresp_per_cell) if(allocated(self%navail_per_cell)) deallocate(self%navail_per_cell) end subroutine pc_deallocate !> Debug printing subroutine pc_debug_print(self) implicit none class(pc_type), intent(in) :: self write(dlun,'("Proc type debug print. Index nrow=",I0)') & self%current_nresp() end subroutine pc_debug_print !> Return the current number of rows we're responsible for function pc_current_nresp(self) implicit none class(pc_type), intent(in) :: self integer :: pc_current_nresp pc_current_nresp=sum(self%nresp_per_cell) end function pc_current_nresp !> Work out which cells are local subroutine pc_find_locality(self) use gs2_layouts, only: g_lo, ik_idx, it_idx use mp, only: sum_allreduce implicit none class(pc_type),intent(inout) :: self integer :: it,ik,iglo !Initialise self%is_local=0 !Fill in values, this may be fairly slow do iglo=1,g_lo%ikitrange ik=g_lo%local_ikit_points(iglo)%ik it=g_lo%local_ikit_points(iglo)%it self%is_local(it,ik)=1 end do ! do iglo=g_lo%llim_proc,g_lo%ulim_proc ! ik=ik_idx(g_lo,iglo) ! it=it_idx(g_lo,iglo) ! self%is_local(it,ik)=1 ! 
enddo !Now count how many procs have each cell self%nproc_per_cell=self%is_local call sum_allreduce(self%nproc_per_cell) end subroutine pc_find_locality !> Work out how many rows are available in each cell subroutine pc_count_avail(self, fieldmat) use kt_grids, only: ntheta0, naky implicit none class(pc_type),intent(inout) :: self class(fieldmat_type), intent(in) :: fieldmat integer :: it,ik,is !Initialise self%navail_per_cell=0 !Loop over cells do ik=1,naky do it=1,ntheta0 !If cell isn't local then cycle if(self%is_local(it,ik)/=1) cycle !Now find the supercell object with this it,ik do is=1,fieldmat%kyb(ik)%nsupercell !If this supercell has the it then store the !size of the supercell and exit loop if(fieldmat%kyb(ik)%supercells(is)%has_it(it)) then self%navail_per_cell(it,ik)=fieldmat%kyb(ik)%supercells(is)%nrow exit endif enddo enddo enddo !Set the total number of responsible rows self%navail_tot=sum(self%navail_per_cell) end subroutine pc_count_avail !> Determine if we have any cells with passed ik. !! !! @note This just says if we can see index !! not that we have any data there function pc_has_ik(self,ik) implicit none class(pc_type), intent(in) :: self integer, intent(in) :: ik logical :: pc_has_ik pc_has_ik=any(self%is_local(:,ik)==1) end function pc_has_ik !> Determine if we have any cells with passed it. !! !! @note This just says if we can see index !! not that we have any data there function pc_has_it(self,it) implicit none class(pc_type), intent(in) :: self integer, intent(in) :: it logical :: pc_has_it pc_has_it=any(self%is_local(it,:)==1) end function pc_has_it !> A wrapper routine to do the decomposition subroutine pc_make_decomp(self, fieldmat) use mp, only: mp_abort implicit none class(pc_type), intent(inout) :: self class(fieldmat_type), intent(in out) :: fieldmat character (len=40) :: errmesg write(errmesg,'("decomp_type ",I0," not yet implemented.")') self%decomp_type !Select the appropriate routine to use select case(self%decomp_type) case(0) !Entirely local and serial call self%decomp_all_serial_local(fieldmat) case(1) !All supercells we can see a bit of are done !serially/locally call self%decomp_own_serial_local(fieldmat) case(2) !Just do the cells that we can see call self%decomp_owncells_serial_local(fieldmat) case(3) !Simple mpi with no attempt at load balance !or avoiding splitting small blocks call self%decomp_owncells_simplempi(fieldmat) case(4) !Simple mpi with no attempt at load balance !but we attempt to avoid splitting small blocks call mp_abort(trim(errmesg)) case default !Invalid decomp_type write(errmesg,'("Invalid decomp_type : ",I0)') self%decomp_type call mp_abort(trim(errmesg)) end select end subroutine pc_make_decomp !> A routine to reset the pc object subroutine pc_reset(self) implicit none class(pc_type), intent(inout) :: self !deallocate call self%deallocate !Could zero out variables but no real need end subroutine pc_reset !A routine to initialise the object subroutine pc_init(self, fieldmat) implicit none class(pc_type), intent(inout) :: self class(fieldmat_type), intent(in) :: fieldmat !First allocate arrays call self%allocate !Now find cell locality call self%find_locality !Count avail data call self%count_avail(fieldmat) end subroutine pc_init !--------------------------- !Decomposition routines !--------------------------- !> All serial and local || Decomp_type=0 subroutine pc_decomp_all_serial_local(self, fieldmat) implicit none class(pc_type), intent(inout) :: self class(fieldmat_type), intent(in out) :: fieldmat integer :: ik, is, ic, 
    integer :: nrow_tmp

    !First of all override previously found is_local array
    !to force all cells to be considered on every processor
    self%is_local=1

    !Now we calculate how much data is available
    call self%count_avail(fieldmat)

    !Set the number of rows we are responsible for
    self%nresp_per_cell=self%navail_per_cell

    !Now loop over all cells, setting the row_llim of
    !their row blocks
    do ik=1,fieldmat%naky
       do is=1,fieldmat%kyb(ik)%nsupercell
          nrow_tmp=fieldmat%kyb(ik)%supercells(is)%nrow
          do ic=1,fieldmat%kyb(ik)%supercells(is)%ncell
             do ifq=1,fieldmat%kyb(ik)%supercells(is)%cells(ic)%nrb
                fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_llim=1
                fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_ulim=nrow_tmp
                call fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%set_nrow
             enddo
          enddo
       enddo
    enddo
  end subroutine pc_decomp_all_serial_local

  !> Serial and local, but only for the supercells
  !! for which we have some data || Decomp_type=1
  subroutine pc_decomp_own_serial_local(self,fieldmat)
    implicit none
    class(pc_type), intent(inout) :: self
    class(fieldmat_type), intent(in out) :: fieldmat
    integer :: ik, is, ic, ifq
    integer :: nrow_tmp
    logical :: have_some

    !First need to work out which supercells we have part of
    do ik=1,fieldmat%naky
       if(.not.any(self%is_local(:,ik)==1)) cycle
       do is=1,fieldmat%kyb(ik)%nsupercell
          have_some=.false.

          !/See if we have any parts of this supercell
          do ic=1,fieldmat%kyb(ik)%supercells(is)%ncell
             if(self%is_local(fieldmat%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik)==1) then
                have_some=.true.
             endif
          enddo

          !/If so then make sure we mark is_local
          if(have_some)then
             do ic=1,fieldmat%kyb(ik)%supercells(is)%ncell
                self%is_local(fieldmat%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik)=1
             enddo
          endif
       enddo
    enddo

    !Now we calculate how much data is available
    call self%count_avail(fieldmat)

    !Set the number of rows we are responsible for
    self%nresp_per_cell=self%navail_per_cell

    !Now loop over all cells, setting the row_llim of
    !their row blocks
    do ik=1,fieldmat%naky
       do is=1,fieldmat%kyb(ik)%nsupercell
          nrow_tmp=fieldmat%kyb(ik)%supercells(is)%nrow
          do ic=1,fieldmat%kyb(ik)%supercells(is)%ncell
             do ifq=1,fieldmat%kyb(ik)%supercells(is)%cells(ic)%nrb
                if(self%is_local(fieldmat%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik)==1) then
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_llim=1
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_ulim=nrow_tmp
                else
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_llim=0
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_ulim=0
                endif
                call fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%set_nrow
             enddo
          enddo
       enddo
    enddo
  end subroutine pc_decomp_own_serial_local

  !> Serial and local, but only for the cells
  !! for which we have some data || Decomp_type=2
  subroutine pc_decomp_owncells_serial_local(self, fieldmat)
    implicit none
    class(pc_type), intent(inout) :: self
    class(fieldmat_type), intent(in out) :: fieldmat
    integer :: ik, is, ic, ifq
    integer :: nrow_tmp

    !/Don't need to change pc%is_local as this is already correct

    !Make sure we do the invert locally as mpi based version not implemented!
    self%force_local_invert=.true.

    !Note that we can use the no-gather based getfieldeq
    self%has_to_gather=.false.
    !Now we calculate how much data is available
    call self%count_avail(fieldmat)

    !Set the number of rows we are responsible for
    self%nresp_per_cell=self%navail_per_cell

    !Now loop over all cells, setting the row_llim of
    !their row blocks
    do ik=1,fieldmat%naky
       do is=1,fieldmat%kyb(ik)%nsupercell
          nrow_tmp=fieldmat%kyb(ik)%supercells(is)%nrow
          do ic=1,fieldmat%kyb(ik)%supercells(is)%ncell
             do ifq=1,fieldmat%kyb(ik)%supercells(is)%cells(ic)%nrb
                if(self%is_local(fieldmat%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik)==1) then
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_llim=1
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_ulim=nrow_tmp
                else
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_llim=0
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_ulim=0
                endif
                call fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%set_nrow
             enddo
          enddo
       enddo
    enddo
  end subroutine pc_decomp_owncells_serial_local

  !> Cells are decomposed based on proc || Decomp_type=3
  subroutine pc_decomp_owncells_simplempi(self, fieldmat)
    use mp, only: comm_type, sum_allreduce_sub
    use sorting, only: quicksort
    implicit none
    class(pc_type), intent(inout) :: self
    class(fieldmat_type), intent(in out) :: fieldmat
    integer :: ik, is, ic, ifq, it, ip, llim, ulim
    integer :: nrow_tmp, nrow_loc, remainder, np
    integer, dimension(:), allocatable :: nwork, iproc_work
    type(comm_type) :: supercell_comm
    logical :: cell_is_local

    !/Don't need to change pc%is_local as this is already correct

    !Make sure we do the invert locally as mpi based version not implemented!
    self%force_local_invert=.true.

    !Note that we can use the no-gather based getfieldeq
    self%has_to_gather=.false.

    !Now we (re)calculate how much data is available
    call self%count_avail(fieldmat)

    !Initialise the number of rows we are responsible for
    self%nresp_per_cell=0

    !Now loop over all cells, setting the row_llim of
    !their row blocks
    do ik=1,fieldmat%naky
       do is=1,fieldmat%kyb(ik)%nsupercell
          nrow_tmp=fieldmat%kyb(ik)%supercells(is)%nrow
          do ic=1,fieldmat%kyb(ik)%supercells(is)%ncell
             ! If we have this supercell then determine which processors have
             ! this cell and decide which processors get which work. We attempt
             ! to distribute work to those processors with the least amount of
             ! work already allocated.
             if(fieldmat%kyb(ik)%supercells(is)%is_local)then
                ! Copy out a couple of values to make the code more readable
                supercell_comm = fieldmat%kyb(ik)%supercells(is)%cells(ic)%parent_sub
                cell_is_local = fieldmat%kyb(ik)%supercells(is)%cells(ic)%is_local

                ! First determine how many processors in the supercell communicator
                ! have this cell
                np = 0
                if (cell_is_local) np = 1
                call sum_allreduce_sub(np, supercell_comm%id)

                ! Now we can determine what the effective processor rank should be for
                ! distributing work. Essentially we just want to number the processors
                ! with this cell from 0 to np-1 in order of how little work they currently have.

                ! First record how much work each processor has, forcing those
                ! without this cell to report -ve work
                allocate(nwork(0:supercell_comm%nproc-1))
                nwork = 0
                if (cell_is_local) then
                   nwork(supercell_comm%iproc) = self%current_nresp()
                else
                   nwork(supercell_comm%iproc) = -1
                end if

                ! Now all processors in the supercell communicator know how much work each
                ! processor has, except those which aren't local and hence can't take part.
                call sum_allreduce_sub(nwork, supercell_comm%id)

                ! Now we have to label the processors in sorted order. To do this we first
                ! create an array of sequential processor ids and we then sort this based
                ! on the amount of work, as stored in nwork.
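                ! Illustrative example (hypothetical numbers, assuming quicksort orders nwork
                ! in ascending order and applies the same permutation to iproc_work): with
                ! supercell_comm%nproc = 4 and nwork = (/5, -1, 2, 0/) after the allreduce,
                ! rank 1 does not hold this cell while ranks 0, 2 and 3 already hold 5, 2
                ! and 0 rows of work. Sorting gives iproc_work = (/1, 3, 2, 0/), so after
                ! removing the offset from the single -1 entry the effective ranks ip become
                ! 0 for rank 3, 1 for rank 2 and 2 for rank 0, i.e. the least loaded
                ! processors receive the first blocks of rows.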
                allocate(iproc_work(0:supercell_comm%nproc-1))
                iproc_work(:) = [(ip, ip=0, supercell_comm%nproc-1)]

                ! Now sort iproc_work based on nwork
                call quicksort(supercell_comm%nproc, nwork, iproc_work)

                if (cell_is_local) then
                   ! With 2008 standard can just do:
                   ! ip_expect = findloc(iproc_work, supercell_comm%iproc)
                   ! instead of this loop

                   ! Search the processor list to find where the processor's id is found
                   do ip = 0, supercell_comm%nproc-1
                      if(iproc_work(ip) == supercell_comm%iproc) exit
                   end do

                   ! Here we have to remove any offset in ip arising
                   ! from the processors without this cell. As they
                   ! are assigned -1 work they come at the start of
                   ! the array.
                   ip = ip - count(nwork == -1)
                end if

                deallocate(nwork, iproc_work)
             endif

             !NOTE: By sorting the procs as done above we end up with lots of different procs
             !which are proc0 for at least one supercell. This means that later when we assign
             !the supercell heads we have lots of members, whilst if we try to minimise the number
             !of unique procs which are proc0 for 1 or more supercells then there will be fewer
             !members and the subsequent allreduce will be cheaper.

             !Store it index
             it=fieldmat%kyb(ik)%supercells(is)%cells(ic)%it_ind

             !If local work out limits, if not just set to 0.
             if(fieldmat%kyb(ik)%supercells(is)%cells(ic)%is_local)then
                !Calculate row limits based on cell size, nprocs and
                !proc rank
                !/Simple block size
                nrow_loc=max(ceiling(nrow_tmp*1.0/np),minNRow)
                !By enforcing a minimum value of nrow_loc we can ensure
                !that blocks don't get too small. Note it's easy to work
                !out the limits for any of the other procs *without* comms.
                !Hence should be quite cheap to check how many empty procs
                !we have, how small the smallest block is etc. So shouldn't
                !be too expensive to work out if we should do an extra row
                !or two if it will remove procs with little work.
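                ! Illustrative example (hypothetical numbers; the actual minNRow default may
                ! differ): with nrow_tmp = 90, np = 4 and minNRow = 16 we get
                ! nrow_loc = max(ceiling(90/4.), 16) = 23, so the effective ranks ip = 0..3
                ! are assigned rows 1-23, 24-46, 47-69 and 70-92, with the last block
                ! clipped to 70-90 by the bounds checks below.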
                remainder=nrow_tmp-nrow_loc*np !Note this should be non-positive due to use of ceiling

                !Now work out limits
                llim=1+ip*(nrow_loc)
                ulim=llim+nrow_loc-1

                !Now make sure that we don't overstep the cell limits
                if(llim>nrow_tmp) then
                   !If the lower limit is too big then this proc should
                   !be made empty
                   llim=0
                   ulim=0
                else if(ulim>nrow_tmp) then
                   !If just the upper limit is too big then
                   !lower it
                   ulim=nrow_tmp
                endif

                !Now loop over row blocks to set index
                do ifq=1,fieldmat%kyb(ik)%supercells(is)%cells(ic)%nrb
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_llim=llim
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_ulim=ulim
                   call fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%set_nrow
                enddo
             else
                do ifq=1,fieldmat%kyb(ik)%supercells(is)%cells(ic)%nrb
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_llim=0
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_ulim=0
                   call fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%set_nrow
                enddo
             endif

             !Record the number of responsible rows, note we assume all rb have the same size
             if(size(fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb)>0)then
                self%nresp_per_cell(it,ik)=fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(1)%nrow
             else
                self%nresp_per_cell(it,ik)=0
             endif
          enddo
       enddo
    enddo
  end subroutine pc_decomp_owncells_simplempi

  !------------------------------------------------------------------------

  !//////////////////////////////
  !// MODULE LEVEL ROUTINES
  !//////////////////////////////

  !> Routine to initialise
  subroutine init_fields_local
    use antenna, only: init_antenna
    use theta_grid, only: init_theta_grid
    use kt_grids, only: init_kt_grids
    use gs2_layouts, only: init_gs2_layouts, g_lo
    use mp, only: proc0, mp_abort
    use file_utils, only: error_unit
    implicit none

    !Early exit if possible
    if (initialised) return

    !Exit if local fields not supported (e.g. compiled with PGI)
    if(.not.fields_local_functional()) call mp_abort("field_option='local' not supported with this build")

    !Make sure any dependencies are already initialised and
    !initialise the field matrix object.
    if (debug.and.proc0) write(6,*) "init_fields_local: gs2_layouts"
    call init_gs2_layouts
    if (debug.and.proc0) write(6,*) "init_fields_local: theta_grid"
    call init_theta_grid
    if (debug.and.proc0) write(6,*) "init_fields_local: kt_grids"
    call init_kt_grids
    if (debug.and.proc0) write(6,*) "init_fields_local: init_fields_matrixlocal"
    if(field_local_tuneminnrow) then
       call tuneMinNRow()
    end if
    call init_fields_matrixlocal(fieldmat, pc)
    if (debug.and.proc0) write(6,*) "init_fields_local: antenna"
    call init_antenna

    !Print a warning message if the distribution isn't x/y local and we're using the
    !sub-communicator allreduce, as then only proc0 holds the full field data.
    if((.not.(g_lo%x_local.and.g_lo%y_local)).and.field_local_allreduce_sub) then
       if(proc0)write(error_unit(),'("Warning : In this run not all procs will hold the full field data (only proc0)")')
    endif

    !Set the initialised state
    initialised = .true.
    reinit = .false.
  end subroutine init_fields_local

  !> Routine used to initialise field matrix data
  subroutine init_fields_matrixlocal(fieldmat, pc, tuning_in)
    use mp, only: proc0, mp_abort, land_allreduce
    use file_utils, only: error_unit
    use optionals, only: get_option_with_default
    implicit none
    class(fieldmat_type), intent(in out) :: fieldmat
    class(pc_type), intent(in out) :: pc
    logical, intent(in), optional :: tuning_in
    logical :: tuning
    logical :: response_was_read
    real :: ts,te

    !Exit if we've already initialised
    if(initialised) return

    tuning = get_option_with_default(tuning_in, .false.)
    if(.not.reinit)then
       !Use the fieldmat init routine to create fieldmat
       !structure and fill some of the basic variables
       !if(debug)call barrier
       if(proc0.and.debug) then
          write(dlun,'("Initialising fieldmat.")')
          call cpu_time(ts)
       endif
       call fieldmat%init
       !if(debug)call barrier
       if(proc0.and.debug) then
          call cpu_time(te)
          write(dlun,'("--Done in ",F12.4," units")') te-ts
       endif

       !Now initialise the parallel decomposition object
       !Note: We don't actually decompose until we make some
       !of the primary subcom so we know how many procs are
       !available to each object etc.
       !if(debug)call barrier
       if(proc0.and.debug) then
          write(dlun,'("Initialising pc.")')
          call cpu_time(ts)
       endif
       call pc%init(fieldmat)
       !if(debug)call barrier
       if(proc0.and.debug) then
          call cpu_time(te)
          write(dlun,'("--Done in ",F12.4," units")') te-ts
       endif

       !Now use the pc object to set field object locality
       !and make the primary subcommunicators
       !if(debug)call barrier
       if(proc0.and.debug) then
          write(dlun,'("Setting initial locality and making primary subcommunicators.")')
          call cpu_time(ts)
       endif
       call fieldmat%set_is_local(pc)
       call fieldmat%make_subcom_1
       !if(debug)call barrier
       if(proc0.and.debug) then
          call cpu_time(te)
          write(dlun,'("--Done in ",F12.4," units")') te-ts
       endif

       !Now we can set up the decomposition, which basically
       !consists of setting the row limit of row block objects
       !if(debug)call barrier
       if(proc0.and.debug) then
          write(dlun,'("Initialising decomposition.")')
          call cpu_time(ts)
       endif
       call pc%make_decomp(fieldmat)
       !if(debug)call barrier
       if(proc0.and.debug) then
          call cpu_time(te)
          write(dlun,'("--Done in ",F12.4," units")') te-ts
       endif

       !Now set fieldmat locality and allocate data arrays
       !if(debug)call barrier
       if(proc0.and.debug) then
          write(dlun,'("Setting locality and allocating.")')
          call cpu_time(ts)
       endif
       call fieldmat%set_is_local(pc)
       call fieldmat%allocate
       !if(debug)call barrier
       if(proc0.and.debug) then
          call cpu_time(te)
          write(dlun,'("--Done in ",F12.4," units")') te-ts
       endif

       !Need to make cell/supercell level sub-communicators
       !at some point before we try to invert.
       !Can make them at any point after locality is defined.
       !/Note that to split a sub-comm everybody in the original
       !communicator needs to make the call. We can either split
       !all groups off from WORLD or we can take "nested subsets".
       !Not sure if there is any performance benefit in the actual
       !comms (I'd guess not) but it should be faster to split smaller
       !groups and can do it in parallel so the nested subset approach
       !may be better.
       !if(debug)call barrier
       if(proc0.and.debug)then
          write(dlun,'("Creating the secondary subcoms.")')
          call cpu_time(ts)
       endif
       call fieldmat%make_subcom_2
       call fieldmat%calc_sc_heads
       !if(debug)call barrier
       if(proc0.and.debug) then
          call cpu_time(te)
          write(dlun,'("--Done in ",F12.4," units")') te-ts
       endif
    endif

    !!!!!!!!!!
    !!NOTE: All of the above should be a one-off setup cost unless something
    !!fundamental changes with the parallel layout (a change in nproc/layout etc.)
    !!The remaining two tasks (populate+prepare) must be repeated each time the
    !!relevant physics and numerical parameters change. In practice this just means
    !!when the time step changes.
    !!!!!!!!!!

    if(.not. tuning) then
       response_was_read = .false.

       !Now fill the fieldmat with data
       if(read_response)then
          if(proc0.and.debug)then
             write(dlun,'("Populating from dump.")')
             call cpu_time(ts)
          endif
          call read_response_from_file_local(fieldmat, could_read = response_was_read)
          if(proc0.and.debug) then
             call cpu_time(te)
             write(dlun,'("--Done in ",F12.4," units")') te-ts
          endif

          ! Ensure we reduce response_was_read across all processors so if _any_
          ! processors couldn't read the matrices we recalculate on all processors.
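          ! (Assuming land_allreduce behaves as a logical AND reduction, response_was_read
          ! stays .true. only if every processor managed to read its files; a single
          ! failure triggers recalculation everywhere.)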
          call land_allreduce(response_was_read)

          if((.not. response_was_read) .and. proc0) write(error_unit(), &
               '("Could not find response file so reverting to calculation.")')
       end if

       !If we weren't able to read the response file, or didn't try to,
       !then we need to calculate the response matrix instead
       if (.not. response_was_read) then
          !if(debug)call barrier
          if(proc0.and.debug)then
             write(dlun,'("Populating.")')
             call cpu_time(ts)
          endif
          call fieldmat%populate(pc)
          !if(debug)call barrier
          if(proc0.and.debug) then
             call cpu_time(te)
             write(dlun,'("--Done in ",F12.4," units")') te-ts
          endif

          !Now prepare the response matrices
          !Note: Currently prepare just means invert
          !but at some point we may wish to LU decompose
          !or use other approaches etc.
          !if(debug)call barrier
          if(proc0.and.debug) then
             write(dlun,'("Preparing.")')
             call cpu_time(ts)
          endif
          call fieldmat%prepare(pc)
          !if(debug)call barrier
          if(proc0.and.debug) then
             call cpu_time(te)
             write(dlun,'("--Done in ",F12.4," units")') te-ts
          endif

          call fieldmat%get_condition_numbers
       endif

       !Dump response to file
       if(dump_response)then
          if(proc0.and.debug)then
             write(dlun,'("Dumping to file.")')
             call cpu_time(ts)
          endif
          call dump_response_to_file_local(fieldmat)
          if(proc0.and.debug) then
             call cpu_time(te)
             write(dlun,'("--Done in ",F12.4," units")') te-ts
          endif
       endif
    endif

    !Now write debug data
    ! call fieldmat%write_debug_data
  end subroutine init_fields_matrixlocal

  !> Reset the fields_local module
  subroutine reset_fields_local
    implicit none
    initialised=.false.
    reinit=.true.
  end subroutine reset_fields_local

  !> Finish the fields_local module
  subroutine finish_fields_local()
    implicit none
    call pc%reset
    call fieldmat%reset
    reinit = .false.
    initialised=.false.
  end subroutine finish_fields_local

  !> Calculate the update to the fields
  subroutine getfield_local(phi,apar,bpar,do_gather_in,do_update_in)
    use unit_tests, only: debug_message
    use optionals, only: get_option_with_default
    implicit none
    complex, dimension(:,:,:), intent(out) :: phi,apar,bpar !Note, these are actually phinew,... in typical usage
    logical, optional, intent(in) :: do_gather_in, do_update_in
    logical :: do_gather, do_update
    integer, parameter :: verb = 4

    !Set the gather flag; this currently always needs to be true for
    !correct operation.
    do_gather = get_option_with_default(do_gather_in, .false.)
    do_update = get_option_with_default(do_update_in, .false.)

    call debug_message(verb, &
         'fields_local::getfield_local calling get_field_update')

    !Use fieldmat routine to calculate the field update
    call fieldmat%get_field_update(phi,apar,bpar,pc)

    call debug_message(verb, &
         'fields_local::getfield_local calling gather_fields')

    !Gather to proc0 if requested
    !NOTE: We currently calculate omega at every time step so we
    !actually need to gather every time, which is a pain!
    !We also fill in the empties here.
    do_gather = .true.
    if(do_gather) call fieldmat%gather_fields(phi,apar,bpar,&
         to_all_in=.false.,do_allreduce_in=field_local_allreduce)

    call debug_message(verb, &
         'fields_local::getfield_local calling update_fields')

    !This routine updates the *new fields using the gathered update
    if(do_update) call fieldmat%update_fields(phi,apar,bpar)

    call debug_message(verb, &
         'fields_local::getfield_local finished')
  end subroutine getfield_local
  !> Initialise the fields from the initial g; just uses the
  !! fields_implicit routine.
  subroutine init_allfields_local
    use fields_implicit, only: init_allfields_implicit
    implicit none
    !EGH Note that this will fail if someone has set
    ! the parameter new_field_init in init_g to .false.
    ! Add a warning/check?
    call init_allfields_implicit
  end subroutine init_allfields_local

  !> This routine advances the solution by one full time step
  subroutine advance_local(istep, remove_zonal_flows_switch)
    use run_parameters, only: has_phi, has_apar, has_bpar, reset
    use fields_implicit, only: remove_zonal_flows
    use fields_arrays, only: phi, apar, bpar, phinew, time_field
    use fields_arrays, only: aparnew, bparnew, apar_ext, time_field_mpi
    use dist_fn, only: timeadv, exb_shear, g_exb, g_exbfac
    use dist_fn, only: collisions_advance
    use dist_fn_arrays, only: g, gnew
    use antenna, only: antenna_amplitudes, no_driver
    use gs2_layouts, only: g_lo
    use unit_tests, only: debug_message
    use mp, only: get_mp_times
    use job_manage, only: time_message
    use array_utils, only: copy
    implicit none
    integer, intent(in) :: istep
    logical, intent(in) :: remove_zonal_flows_switch
    integer, parameter :: diagnostics = 1
    logical, parameter :: do_gather=.true.
    logical :: do_update
    integer, parameter :: verb = 4
    real :: mp_total, mp_total_after

    !do_gather=.true. => fields are collected from all procs and distributed to everyone (required)
    !do_update=.true. => "Smart" update routines are used which only update the local parts of the
    !field arrays. This could help when xy are strongly parallelised but is likely to be slower
    !when all local, hence we should turn it off if xy are all local.
    do_update=do_smart_update
    if(g_lo%x_local.and.g_lo%y_local) do_update=.false.

    !GGH NOTE: apar_ext is initialized in this call
    if(.not.no_driver) call antenna_amplitudes (apar_ext)

    call debug_message(verb, &
         'fields_local::advance_local calling g_exb')

    !Apply flow shear if active
    if(abs(g_exb*g_exbfac)>epsilon(0.)) call exb_shear(gnew,phinew,aparnew,bparnew,istep,field_local_allreduce_sub)

    !Update g and fields
    call copy(gnew,g)
    if(do_update)then
       !Here we tie the "smart" update to do_update
       call fieldmat%update_fields_newstep
    else
       if (has_phi) call copy(phinew, phi)
       if (has_apar) call copy(aparnew, apar)
       if (has_bpar) call copy(bparnew, bpar)
    endif

    call debug_message(verb, &
         'fields_local::advance_local calling timeadv 1')

    !Find gnew given fields at time step midpoint
    call timeadv (phi, apar, bpar, phinew, &
         aparnew, bparnew, istep)
    if(reset) return !Return if resetting

    !Add in antenna driving if present
    !<DD>TAGGED: Should we only do this if fapar>0 as well?
    if(.not.no_driver) aparnew=aparnew+apar_ext

    call debug_message(verb, &
         'fields_local::advance_local calling getfield_local')

    !For timing
    !if(debug)call barrier !!/These barriers influence the reported time
    call time_message(.false.,time_field,' Field Solver')
    call get_mp_times(total_time = mp_total)

    !Calculate the fields at the next time point
    call getfield_local(phinew,aparnew,bparnew,do_gather,do_update)

    !If we do the update in getfield_local don't do it here
    if(.not.do_update)then
       if (has_phi) phinew = phinew + phi
       if (has_apar) aparnew = aparnew + apar
       if (has_bpar) bparnew = bparnew + bpar
    endif

    !For timing
    !Barrier usage ensures that proc0 measures the longest time taken on
    !any proc.
    !if(debug)call barrier !!/These barriers influence the reported time
    call time_message(.false.,time_field,' Field Solver')
    call get_mp_times(total_time = mp_total_after)
    time_field_mpi = time_field_mpi + (mp_total_after - mp_total)

    !Remove zonal component if requested
    if(remove_zonal_flows_switch) call remove_zonal_flows

    call debug_message(verb, &
         'fields_local::advance_local calling timeadv 2')

    !Evolve gnew to next half time point
    call timeadv (phi, apar, bpar, phinew, &
         aparnew, bparnew, istep, diagnostics)

    ! Advance collisions, if separate from timeadv
    call collisions_advance (phi, bpar, phinew, aparnew, bparnew, diagnostics)
  end subroutine advance_local

  subroutine dump_response_to_file_local_nopass(suffix)
    implicit none
    !> If passed then use as part of file suffix
    character(len=*), optional, intent(in) :: suffix
    call dump_response_to_file_local(fieldmat, suffix)
  end subroutine dump_response_to_file_local_nopass

  !> Routine to dump the current response matrix data
  !! to file. One file per connected domain. Each written
  !! by the head of the supercell.
  subroutine dump_response_to_file_local_passed(fieldmat, suffix)
    implicit none
    class(fieldmat_type), intent(in out) :: fieldmat
    !> If passed then use as part of file suffix
    character(len=*), optional, intent(in) :: suffix
    integer :: ik, is

    !First we have to pull data so that the head of each supercell has the full data.
    !Currently we are lazy and just ensure all procs in the supercell have the full data.
    do ik=1,fieldmat%naky
       !If ik isn't local then cycle
       if(.not.fieldmat%kyb(ik)%is_local) cycle

       !Loop over supercells
       do is=1,fieldmat%kyb(ik)%nsupercell
          !If is isn't local then cycle
          if(.not.fieldmat%kyb(ik)%supercells(is)%is_local) cycle

          !Call supercell bound method to dump response
          call fieldmat%kyb(ik)%supercells(is)%dump_to_file(suffix)
       enddo
    enddo
  end subroutine dump_response_to_file_local_passed

  !> Routine to read the current response matrix data
  !> from file dump. One file per connected domain. Each written
  !> by the head of the supercell.
  !> NOTE: Have to have set up communicators etc.
  subroutine read_response_from_file_local(fieldmat, could_read, suffix)
    implicit none
    class(fieldmat_type), intent(in out) :: fieldmat
    !> Flag to indicate if all files were successfully read
    logical, intent(out) :: could_read
    !> If passed then use as part of file suffix
    character(len=*), optional, intent(in) :: suffix
    integer :: ik, is

    could_read = .true.

    !First we have to pull data so that the head of each supercell has the full data.
    !Currently we are lazy and just ensure all procs in the supercell have the full data.
    do ik=1,fieldmat%naky
       !If ik isn't local then cycle
       if(.not.fieldmat%kyb(ik)%is_local) cycle

       !Loop over supercells
       do is=1,fieldmat%kyb(ik)%nsupercell
          !If is isn't local then cycle
          if (.not.fieldmat%kyb(ik)%supercells(is)%is_local) cycle

          call fieldmat%kyb(ik)%supercells(is)%read_from_file(could_read = could_read, suffix = suffix)

          ! If we failed to read this file then leave immediately
          if (.not. could_read) return
       end do
    end do
  end subroutine read_response_from_file_local

  !> Returns true if GS2 was built in such a way
  !! as to allow this module to work.
  !! Currently does not work with PGI compilers.
  !! See online discussions.
  function fields_local_functional()
    use runtime_tests, only: compiler_pgi
    logical :: fields_local_functional
    fields_local_functional = (.not. compiler_pgi())
  end function fields_local_functional
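  ! The function below accumulates the Frobenius norm
  !   sqrt( sum over i,j of |matrix(i,j)|**2 )
  ! element by element. Illustrative example (hypothetical values): for a 2x1 matrix
  ! with entries (3.,0.) and (0.,4.) the result is sqrt(9. + 16.) = 5.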
  !> Calculate the L2/Frobenius/Euclidean-norm of a 2D complex array
  real function l2norm(matrix)
    !> The matrix to calculate the norm of
    complex, dimension(:, :), intent(in) :: matrix
    !> The shape of the matrix
    integer, dimension(2) :: matrix_shape
    !> Indices for indexing the matrix
    integer :: i, j

    l2norm = 0.
    matrix_shape = shape(matrix)

    do j = 1, matrix_shape(2)
       do i = 1, matrix_shape(1)
          l2norm = l2norm + real(matrix(i,j) * conjg(matrix(i,j)))
       end do
    end do
    l2norm = sqrt(l2norm)

    ! We could write all of the above as
    !   l2norm = sqrt(sum(abs(matrix)**2))
    ! but this could run over the matrix more than once
    ! and require a matrix sized temporary. Our
    ! approach avoids this. We could have an intermediate
    ! approach of
    !   do j = 1, matrix_shape(2)
    !      l2norm = l2norm + real(sum(matrix(:,j) * conjg(matrix(:,j))))
    !   end do
    !   l2norm = sqrt(l2norm)
    ! which might offer a bit of a middle ground.
    ! Also note that a norm2 intrinsic was introduced in Fortran 2008
    ! but that only accepts real valued arguments.
  end function l2norm
end module fields_local