fields_gf_local.f90 Source File


Contents

Source Code


Source Code

#include "unused_dummy.inc"
!> This module is based on the fields_local module (which is in term a re-implementation of the 
!> fields implicit functionality).  fields_gf_local implements the implicit field calculation 
!> using a gf_lo based data decomposition to improve performance on large process counts.  It
!> utilises a smaller set of processes to reduce communication costs (at the expense of significant 
!> load imbalance in the fields calculations).  It assumes gf_lo velocity space integration and 
!> stores field data in gf_lo format (i.e. ik,it are decomposed over processes, everthing else is 
!> local).
!>
!> Implemented by Adrian Jackson, EPCC, The University of Edinburgh; 2014-2016.  Funded by an 
!> EPSRC ARCHER eCSE project.
module fields_gf_local
  use mp, only: comm_type
  use fields_local, only: rowblock_type, cell_type, supercell_type, ky_type, fieldmat_type, pc_type
  implicit none
  private

  !> Module level routines
  public :: init_fields_gf_local, init_allfields_gf_local, finish_fields_gf_local
  public :: advance_gf_local, reset_fields_gf_local, dump_response_to_file_gf_local
  public :: fields_gf_local_functional, field_local_allreduce_sub

  !> User set parameters
  public :: dump_response, read_response

  !> Made public for testing
  public :: fieldmat

  !//////////////////////////////
  !// CUSTOM TYPES
  !//////////////////////////////

  !> This is the top level object, consisting of a collection of
  !! ky blocks.
  type, extends(fieldmat_type) :: fieldmat_gf_type
   contains
     procedure :: gather_fields_gf => fm_gather_fields_gf
     procedure :: gather_init_fields => fm_gather_init_fields
     procedure :: getfieldeq_nogath => fm_getfieldeq_nogath
     procedure, nopass :: update_fields_gf => fm_update_fields_gf
     procedure :: update_fields_newstep => fm_update_fields_newstep
  end type fieldmat_gf_type

  !> This is the type which controls/organises the parallel
  !! data decomposition etc.
  type, extends(pc_type) :: pc_gf_type
   contains
     procedure :: find_locality => pc_find_locality
     procedure :: make_decomp => pc_make_decomp
     !//The different decomposition routines
     procedure :: decomp => pc_decomp
  end type pc_gf_type

  !//////////////////////////////
  !// MODULE VARIABLES
  !//////////////////////////////
  !Tuning this can improve advance/init time (usually one at the expense of other)
  logical,parameter :: debug=.false. !< Do we do debug stuff?
  logical :: initialised=.false. !< Have we initialised yet?
  logical :: reinit=.false. !< Are we reinitialising?
  logical :: dump_response=.false. !< Do we dump the response matrix?
  logical :: read_response=.false. !< Do we read the response matrix from dump?
  logical :: field_local_allreduce_sub = .false. !< If true and field_local_allreduce true then do two sub comm all reduces rather than 1
  type(pc_gf_type),save :: pc !< This is the parallel control object
  type(fieldmat_gf_type),save :: fieldmat !< This is the top level field matrix object

contains

!//////////////////////////////
!// CUSTOM TYPES
!//////////////////////////////
!
!================
!FIELDMAT
!================

  !> Redistribute initial fields from gf_lo to g_lo.  These are fields that have not been reduced to the 
  !! supercell heads, just calculated on each process that owns gf_lo points
  subroutine fm_gather_init_fields(self,gf_phnew,gf_apnew,gf_bpnew,gf_ph,gf_ap,gf_bp,phnew,apnew,bpnew,ph,ap,bp)
    use theta_grid, only: ntgrid
    use kt_grids, only: naky, ntheta0, kwork_filter
    use mp, only: nbsend, nbrecv, waitall, broadcast_sub, sum_reduce_sub, proc0
    use mp, only: rank_comm, comm_type, initialise_requests, iproc
    use run_parameters, only: has_phi, has_apar, has_bpar
    use gs2_layouts, only: gf_lo, g_lo, proc_id, idx, ik_idx, it_idx
    implicit none
    class(fieldmat_gf_type), intent(inout) :: self
    !AJ We have two sets of field arrays; ph,ap,bp store the data we hold in g_lo (i.e. 
    !AJ the field points we have in the g_lo decomposition), and gf_ph,gf_ap,gf_bp hold 
    !AJ the field points we have in the gf_lo decomposition.  This was done for safety (to 
    !AJ ensure that updating fields in gf_lo or g_lo did not interfere with each other), however
    !AJ I THINK it should be possible to do this in a single array.
    complex, dimension(-ntgrid:,:,:), intent(inout) :: ph,ap,bp,gf_ph,gf_ap,gf_bp
    complex, dimension(-ntgrid:,:,:), intent(inout) :: phnew,apnew,bpnew,gf_phnew,gf_apnew,gf_bpnew
    integer :: is, ic
    integer :: ik, it, ikit, sendreqnum, recvreqnum, sender, receiver, tag
    integer, dimension (:), allocatable :: recv_handles, send_handles
    integer :: tempfield, local_iproc
    complex, dimension(:,:,:,:), allocatable :: tempdata

    allocate(tempdata(-ntgrid:ntgrid,self%nfield,ntheta0,naky))
    !AJ At this point each process has the correct initialised fields in gf_lo (i.e. the points it owns in gf_lo).
    !AJ what we want to do in this routine is distribute them so that all those processes who are part of the supercell
    !AJ in gf_lo have the data, and all the processes that own the points in g_lo also have the data.

    allocate(recv_handles(naky*ntheta0))
    allocate(send_handles(naky*ntheta0))

    call initialise_requests(send_handles)
    call initialise_requests(recv_handles)

    !AJ The first thing we do is send the data from the owner in gf_lo to the supercell head in gf_lo.  This 
    !AJ will enable us to distribute it to the whole supercell in gf_lo, and then after that to the processes 
    !AJ that need it in g_lo

    tempdata = 0.0

    recvreqnum = 1
    sendreqnum = 1
    do ik=1,self%naky
       if(self%kyb(ik)%is_empty) cycle
       do is=1,self%kyb(ik)%nsupercell
          if(self%kyb(ik)%supercells(is)%is_empty) cycle
          receiver = self%kyb(ik)%supercells(is)%head_iproc_world
          do ic=1,self%kyb(ik)%supercells(is)%ncell                
             it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind
             sender = proc_id(gf_lo,idx(gf_lo,ik,it))
             tag = ik*self%ntheta0 + it
             if(sender == iproc .and. receiver /= iproc) then
                tempfield = 1
                if(has_phi) then
                   tempdata(:,tempfield,it,ik) = phnew(:,it,ik)
                   !AJ This copy ensures that the field data is also up to date in the gf_lo data structures 
                   !AJ (the implicit field initialise calculates it in the g_lo data structures.  This could be
                   !AJ fixed in fields_implicit init_allfields_implicit making this copy redundant)
                   gf_ph(:,it,ik) = phnew(:,it,ik)
                   gf_phnew(:,it,ik) = phnew(:,it,ik)
                   tempfield = tempfield + 1
                end if
                if(has_apar) then
                   tempdata(:,tempfield,it,ik) = apnew(:,it,ik)
                   !AJ This copy ensures that the field data is also up to date in the gf_lo data structures 
                   !AJ (the implicit field initialise calculates it in the g_lo data structures.  This could be
                   !AJ fixed in fields_implicit init_allfields_implicit making this copy redundant)
                   gf_ap(:,it,ik) = apnew(:,it,ik)
                   gf_apnew(:,it,ik) = apnew(:,it,ik)
                   tempfield = tempfield + 1
                end if
                if(has_bpar) then
                   tempdata(:,tempfield,it,ik) = bpnew(:,it,ik)
                   !AJ This copy ensures that the field data is also up to date in the gf_lo data structures 
                   !AJ (the implicit field initialise calculates it in the g_lo data structures.  This could be
                   !AJ fixed in fields_implicit init_allfields_implicit making this copy redundant)
                   gf_bp(:,it,ik) = bpnew(:,it,ik)
                   gf_bpnew(:,it,ik) = bpnew(:,it,ik)
                end if
                call nbsend(tempdata(:,:,it,ik),receiver,tag,send_handles(sendreqnum))
                sendreqnum = sendreqnum + 1
             else if(receiver == iproc .and. sender /= iproc) then
                call nbrecv(tempdata(:,:,it,ik),sender,tag,recv_handles(recvreqnum))                
                recvreqnum = recvreqnum + 1
                !AJ Both the sender and receiver are the same process, so this process 
                !AJ already has the correct gf_lo field data, but it is currently stored 
                !AJ in the g_lo arrays (phi,apar,bpar) instead of the gf_lo arrays 
                !AJ (gf_phi, gf_apar, gf_bpar).  Copy it across so it is in the correct array.
             else if(receiver == iproc .and. sender == iproc) then
                tempfield = 1
                if(has_phi) then
                   gf_phnew(:,it,ik) = phnew(:,it,ik)
                   gf_ph(:,it,ik) = phnew(:,it,ik)
                   !AJ This copy is necessary so the data is in the correct place for the next 
                   !AJ communication (the sending from supercell head to the first proc in the 
                   !AJ g_lo list).
                   tempdata(:,tempfield,it,ik) = phnew(:,it,ik)
                   tempfield = tempfield + 1
                end if
                if(has_apar) then
                   gf_apnew(:,it,ik) = apnew(:,it,ik)
                   gf_ap(:,it,ik) = apnew(:,it,ik)
                   !AJ This copy is necessary so the data is in the correct place for the next 
                   !AJ communication (the sending from supercell head to the first proc in the 
                   !AJ g_lo list).
                   tempdata(:,tempfield,it,ik) = apnew(:,it,ik)
                   tempfield = tempfield + 1
                end if
                if(has_bpar) then
                   gf_bpnew(:,it,ik) = bpnew(:,it,ik)
                   gf_bp(:,it,ik) = bpnew(:,it,ik)
                   !AJ This copy is necessary so the data is in the correct place for the next 
                   !AJ communication (the sending from supercell head to the first proc in the 
                   !AJ g_lo list).
                   tempdata(:,tempfield,it,ik) = bpnew(:,it,ik)
                end if
             end if
          end do
       end do
    end do


    call waitall(recvreqnum-1,recv_handles)
    
    do ik=1,self%naky
       if(self%kyb(ik)%is_empty) cycle
       do is=1,self%kyb(ik)%nsupercell
          if(self%kyb(ik)%supercells(is)%is_empty) cycle
          if(.not.self%kyb(ik)%supercells(is)%is_head) cycle
          do ic=1,self%kyb(ik)%supercells(is)%ncell                
             it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind
             sender = proc_id(gf_lo,idx(gf_lo,ik,it))
             if(sender /= iproc) then
                tempfield = 1
                if(has_phi) then
                   gf_phnew(:,it,ik) = tempdata(:,tempfield,it,ik)
                   gf_ph(:,it,ik) = gf_phnew(:,it,ik)
                   tempfield = tempfield + 1
                end if
                if(has_apar) then
                   gf_apnew(:,it,ik) = tempdata(:,tempfield,it,ik)
                   gf_ap(:,it,ik) = gf_apnew(:,it,ik)
                   tempfield = tempfield + 1
                end if
                if(has_bpar) then
                   gf_bpnew(:,it,ik) = tempdata(:,tempfield,it,ik)
                   gf_bp(:,it,ik) = gf_bpnew(:,it,ik)
                end if
             end if
          end do
       end do
    end do
    
    call waitall(sendreqnum-1,send_handles)


    !AJ Now we collect all the data from the supercell heads to proc0 for diagnostics.  If
    !AJ we implemented distributed diagnostics then this would not be necessary.    
    if(self%fm_sub_headsc_p0%nproc>0)then
       call sum_reduce_sub(tempdata,0,self%fm_sub_headsc_p0)
       
       !AJ proc0 unpacks the received field data and updates it's fields from it.
       !AJ Because proc0 now has all the field data it can update both the gf_lo
       !AJ fields and the g_lo fields.
       if(proc0) then
          
          tempfield = 1
          if(has_phi) then
             ph = tempdata(:,tempfield,:,:)
             phnew = ph
             gf_ph = ph
             gf_phnew = ph
             tempfield = tempfield + 1
          end if
             
          if(has_apar) then
             ap = tempdata(:,tempfield,:,:)
             apnew = ap 
             gf_ap = ap
             gf_apnew = ap
             tempfield = tempfield + 1
          end if
          
          if(has_bpar) then
             bp = tempdata(:,tempfield,:,:)
             bpnew = bp
             gf_bp = bp
             gf_bpnew = bp
          end if
          
       end if
       
    end if

       
    call initialise_requests(send_handles)
    call initialise_requests(recv_handles)
    
    !AJ Now send the data from the supercell head in gf_lo to the first process in the list of owners for
    !AJ the same point in g_lo
    recvreqnum = 1
    do ikit=1, g_lo%ikitrange
       ik=g_lo%local_ikit_points(ikit)%ik
       it=g_lo%local_ikit_points(ikit)%it
       if(kwork_filter(it,ik)) cycle
       sender = self%heads(it,ik)
       if(sender /= iproc .and. .not. proc0) then
          if(g_lo%ikit_procs_assignment(it,ik)%proc_list(1) == iproc) then
             tag = ik*self%ntheta0 + it          
             call nbrecv(tempdata(:,:,it,ik),sender,tag,recv_handles(recvreqnum))             
             recvreqnum = recvreqnum + 1
          end if
       end if
    end do
    
    sendreqnum = 1
    
    do ik=1,self%naky
       if(self%kyb(ik)%is_empty) cycle
       do is=1,self%kyb(ik)%nsupercell
          if(self%kyb(ik)%supercells(is)%is_empty) cycle
          if(self%kyb(ik)%supercells(is)%is_head) then
             do ic=1,self%kyb(ik)%supercells(is)%ncell                
                it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind
                if(kwork_filter(it,ik)) cycle
                receiver = g_lo%ikit_procs_assignment(it,ik)%proc_list(1) 
                if(receiver /= iproc .and. receiver /= 0) then
                   tag = ik*self%ntheta0 + it    
                   call nbsend(tempdata(:,:,it,ik),receiver,tag,send_handles(sendreqnum))
                   sendreqnum = sendreqnum + 1
                !AJ If I'm to be sending to myself then just copy the data from gf_lo arrays (where 
                !AJ is was put in the communication to get it to supercell heads) to the correct g_lo arrays.
                else if(receiver == iproc .and. .not. proc0) then
                   if(has_phi) then
                      phnew(:,it,ik) = gf_ph(:,it,ik)
                      ph(:,it,ik) = phnew(:,it,ik)
                   end if
                   if(has_apar) then
                      apnew(:,it,ik) = gf_ap(:,it,ik)
                      ap(:,it,ik) = apnew(:,it,ik)
                   end if
                   if(has_bpar) then
                      bpnew(:,it,ik) = gf_bp(:,it,ik)
                      bp(:,it,ik) = bpnew(:,it,ik)
                   end if
                end if
             end do
          end if
       end do
    end do
    
    call waitall(recvreqnum-1,recv_handles)
    
    do ikit=1, g_lo%ikitrange
       ik=g_lo%local_ikit_points(ikit)%ik
       it=g_lo%local_ikit_points(ikit)%it
       if(kwork_filter(it,ik)) cycle
       sender = self%heads(it,ik)
       receiver = g_lo%ikit_procs_assignment(it,ik)%proc_list(1) 
       if(sender /= iproc .and. receiver == iproc .and. .not. proc0) then
          tempfield = 1
          if(has_phi) then
             phnew(:,it,ik) = tempdata(:,tempfield,it,ik)
             ph(:,it,ik) = phnew(:,it,ik)
             tempfield = tempfield + 1
          end if
          if(has_apar) then
             apnew(:,it,ik) = tempdata(:,tempfield,it,ik)
             ap(:,it,ik) = apnew(:,it,ik)
             tempfield = tempfield + 1
          end if
          if(has_bpar) then
             bpnew(:,it,ik) = tempdata(:,tempfield,it,ik)
             bp(:,it,ik) = bpnew(:,it,ik)
          end if
       end if
    end do
    
    call waitall(sendreqnum-1,send_handles)

    deallocate(send_handles, recv_handles)
    
    !AJ Now get the data from the process in g_lo that has received a given ik,it point to the rest of the 
    !AJ processes that own it.
    !AJ This could be optimised if the xyblock_comm is available, but as a first start the code below works
    !AJ and doesn't involve global collectives.
    do ik = 1,self%naky
       do it = 1,self%ntheta0       
          
          if(kwork_filter(it,ik)) cycle

          if(g_lo%ikit_procs_assignment(it,ik)%mine) then

             if(g_lo%ikit_procs_assignment(it,ik)%num_procs > 1) then
                sender = g_lo%ikit_procs_assignment(it,ik)%sub_proc_list(1)
                local_iproc = g_lo%ikit_procs_assignment(it,ik)%comm%iproc
                
                !AJ This could be avoided on some proceses as tempdata will already have the correct data 
                !AJ (providing sender .ne. receiver in the above calls)
                if(sender == local_iproc .and. .not. proc0) then
                   tempfield = 1
                   if(has_phi) then
                      tempdata(:,tempfield,it,ik) = phnew(:,it,ik)
                      tempfield = tempfield + 1
                   end if
                   
                   if(has_apar) then
                      tempdata(:,tempfield,it,ik) = apnew(:,it,ik)
                      tempfield = tempfield + 1
                   end if
                   
                   if(has_bpar) then
                      tempdata(:,tempfield,it,ik) = bpnew(:,it,ik)
                   end if
                   
                end if
                call broadcast_sub(tempdata(:,:,it,ik),sender,g_lo%ikit_procs_assignment(it,ik)%comm%id)
                
                if(sender /= local_iproc .and. .not. proc0) then
                   tempfield = 1
                   if(has_phi) then
                      phnew(:,it,ik) = tempdata(:,tempfield,it,ik)
                      ph(:,it,ik) = phnew(:,it,ik)
                      tempfield = tempfield + 1
                   end if
                   
                   if(has_apar) then
                      apnew(:,it,ik) = tempdata(:,tempfield,it,ik)
                      ap(:,it,ik) = apnew(:,it,ik)
                      tempfield = tempfield + 1
                   end if
                   
                   if(has_bpar) then
                      bpnew(:,it,ik) = tempdata(:,tempfield,it,ik)
                      bp(:,it,ik) = bpnew(:,it,ik)
                   end if
                end if
             end if
          end if
       end do
    end do
    
    deallocate(tempdata)

  end subroutine fm_gather_init_fields
  

  !> Redistribute fields from gf_lo to g_lo.  This first involves unpacking the calculated 
  !! fields from tmpsum on the supercell heads to the field arrays and then sending them back 
  !! to the processes that calculated them in gf_lo and the processes that need them in g_lo.
  subroutine fm_gather_fields_gf(self,gf_ph,gf_ap,gf_bp,ph,ap,bp)
    use theta_grid, only: ntgrid
    use kt_grids, only: naky, ntheta0, kwork_filter
    use mp, only: nbsend, nbrecv, waitall, broadcast_sub, sum_reduce_sub
    use mp, only: rank_comm, comm_type, proc0, iproc, initialise_requests
    use run_parameters, only: has_phi, has_apar, has_bpar
    use gs2_layouts, only: gf_lo, g_lo, proc_id, idx, ik_idx, it_idx
    implicit none
    class(fieldmat_gf_type), intent(inout) :: self
    !AJ We have two sets of field arrays; ph,ap,bp store the data we hold in g_lo (i.e. 
    !AJ the field points we have in the g_lo decomposition), and gf_ph,gf_ap,gf_bp hold 
    !AJ the field points we have in the gf_lo decomposition.  This was done for safety (to 
    !AJ ensure that updating fields in gf_lo or g_lo did not interfere with each other), however
    !AJ I THINK it should be possible to do this in a single array.
    complex, dimension(-ntgrid:,:,:), intent(inout) :: ph,ap,bp,gf_ph,gf_ap,gf_bp
    integer :: is, ic, iex, ig, bnd
    integer :: ik, it, ikit, sendreqnum, recvreqnum, sender, receiver, tag
    integer, dimension (:), allocatable :: recv_handles, send_handles
    integer :: tempfield, head, sub, local_iproc
    complex, dimension(:,:,:,:), allocatable :: tempdata,tempdata3
    complex, dimension(:,:,:), allocatable :: tempdata2
    logical, parameter :: use_broadcast = .false.
    allocate(tempdata(-ntgrid:ntgrid,self%nfield,ntheta0,naky))
    allocate(tempdata3(-ntgrid:ntgrid,self%nfield,ntheta0,naky))
    
    tempdata = 0.0
   
!CMRQ: cost of Supercell Head Packing?
!CMR: BEGIN  Supercell Head Packing
    !AJ Unpack tmp_sum on the supercell head, tmp_sum has the calculated field update on the supercell head
    do ik=1,self%naky
       if(.not.self%kyb(ik)%is_local) cycle
       do is=1,self%kyb(ik)%nsupercell
          !Skip if we're not the head supercell
          if(.not.self%kyb(ik)%supercells(is)%is_head) cycle
          
          iex=0
          tempfield = 1
          if(has_phi) then
             do ic=1,self%kyb(ik)%supercells(is)%ncell
                it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind
                bnd=0
                if(ic/=self%kyb(ik)%supercells(is)%ncell) bnd=1
                do ig=-ntgrid,ntgrid-bnd
                   iex=iex+1
                   tempdata(ig,tempfield,it,ik)=self%kyb(ik)%supercells(is)%tmp_sum(iex)
                enddo
             enddo
             tempfield = tempfield + 1
          end if
          
          if(has_apar) then
             do ic=1,self%kyb(ik)%supercells(is)%ncell
                it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind
                bnd=0
                if(ic/=self%kyb(ik)%supercells(is)%ncell) bnd=1
                do ig=-ntgrid,ntgrid-bnd
                   iex=iex+1
                   tempdata(ig,tempfield,it,ik)=self%kyb(ik)%supercells(is)%tmp_sum(iex)
                enddo
             enddo
             tempfield = tempfield + 1
          end if
          
          if(has_bpar) then
             do ic=1,self%kyb(ik)%supercells(is)%ncell
                it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind
                bnd=0
                if(ic/=self%kyb(ik)%supercells(is)%ncell) bnd=1
                do ig=-ntgrid,ntgrid-bnd
                   iex=iex+1
                   tempdata(ig,tempfield,it,ik)=self%kyb(ik)%supercells(is)%tmp_sum(iex)
                enddo
             enddo
          end if
          
          
          tempfield  = 1
          
          !<DD>TAGGED:Worth looking at improving this bad memory pattern
          !/Now fix boundary points
          if(self%kyb(ik)%supercells(is)%ncell>1) then
             if(has_phi) then
                do ic=1,self%kyb(ik)%supercells(is)%ncell-1
                   tempdata(ntgrid,tempfield,self%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik)=&
                        tempdata(-ntgrid,tempfield,self%kyb(ik)%supercells(is)%cells(ic+1)%it_ind,ik)
                enddo
                tempfield = tempfield + 1
             end if
             if(has_apar) then
                do ic=1,self%kyb(ik)%supercells(is)%ncell-1
                   tempdata(ntgrid,tempfield,self%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik)=&
                        tempdata(-ntgrid,tempfield,self%kyb(ik)%supercells(is)%cells(ic+1)%it_ind,ik)
                enddo
                tempfield = tempfield + 1
             end if
             if(has_bpar) then
                do ic=1,self%kyb(ik)%supercells(is)%ncell-1
                   tempdata(ntgrid,tempfield,self%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik)=&
                        tempdata(-ntgrid,tempfield,self%kyb(ik)%supercells(is)%cells(ic+1)%it_ind,ik)
                enddo
             end if
             
          endif
       end do
    enddo
    
    !AJ At this point tempdata has the calculated field update for a supercell on the supercell head.
!CMR: END   Supercell Head Packing

!CMRQ: cost of Supercell Head => proc0?
!CMR: BEGIN All Supercell Heads => proc0
    allocate(recv_handles(naky*ntheta0))
    allocate(send_handles(naky*ntheta0))

    call initialise_requests(send_handles)
    call initialise_requests(recv_handles)

    recvreqnum = 1

    !AJ Now we collect all the data from the supercell heads to proc0 for diagnostics.  If
    !AJ we implemented distributed diagnostics then this would not be necessary.
    
    if(self%fm_sub_headsc_p0%nproc>0)then
       call sum_reduce_sub(tempdata,0,self%fm_sub_headsc_p0)
       
       !AJ proc0 unpacks the received field update data and updates it's fields from it.
       !AJ Because proc0 now has all the field update data it can update both the gf_lo
       !AJ fields and the g_lo fields.
       if(proc0) then
          
          tempfield = 1
          if(has_phi) then
             ph = tempdata(:,tempfield,:,:)
             gf_ph = ph
             tempfield = tempfield + 1
          end if
             
          if(has_apar) then
             ap = tempdata(:,tempfield,:,:)
             gf_ap = ap
             tempfield = tempfield + 1
          end if
          
          if(has_bpar) then
             bp = tempdata(:,tempfield,:,:)
             gf_bp = bp
          end if
          
       end if
       
    end if
!CMR: END All Supercell Heads => proc0
       
    !AJ Now send the data back to all processes in the supercell
    !AJ This is less efficient than it needs to be.
    !AJ We could also make use of the fact that in gf_lo a cell is only
    !AJ ever owned by a single owner so we could just send specific 
    !AJ it,ik points to the processes that own them with point-to-point 
    !AJ communications (see the else part of this)

!CMR: BEGIN Supercell Heads => cells
!
!CMR: Note AJ Comment above that USE_BROADCAST=T approach (default) is inefficient!
!
!CMR: Observation, probably need MPI rank to participate in SINGLE supercell
!     MPI rank holding multiple cells might be OK (and avoid comms contention) sometimes:
!        -- if all cells in SAME supercell
!        -- if cells in different supercells that need no COMMS (e.g. supercells with 1 cell)
    if(use_broadcast) then
       do ik=1,self%naky
!CMR: Loop over each supercell (labelled ik, self%kyb(ik)%nsupercell), 
          if(self%kyb(ik)%is_empty) cycle
          do is=1,self%kyb(ik)%nsupercell
             if(self%kyb(ik)%supercells(is)%is_empty) cycle
!CMR: if get here, iproc participates 
             head = self%kyb(ik)%supercells(is)%head_iproc
             sub = self%kyb(ik)%supercells(is)%sc_sub_all%id
             allocate(tempdata2(-ntgrid:ntgrid,self%nfield,self%kyb(ik)%supercells(is)%ncell))
!CMR: head does all packing ready for broadcast to all supercell
             if(self%kyb(ik)%supercells(is)%is_head) then
                do ic=1,self%kyb(ik)%supercells(is)%ncell                
                   it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind
                   tempfield = 1
                   if(has_phi) then
                      tempdata2(:,tempfield,ic) = tempdata(:,tempfield,it,ik)
                      tempfield = tempfield + 1
                   end if
                   if(has_apar) then
                      tempdata2(:,tempfield,ic) = tempdata(:,tempfield,it,ik)
                      tempfield = tempfield + 1
                   end if
                   if(has_bpar) then
                      tempdata2(:,tempfield,ic) = tempdata(:,tempfield,it,ik)
                   end if
                end do
             end if
             call broadcast_sub(tempdata2,head,sub)
             if(.not.self%kyb(ik)%supercells(is)%is_head) then
!CMR: everyone else unpacks result of broadcast
                do ic=1,self%kyb(ik)%supercells(is)%ncell
                   it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind
                   tempfield = 1
                   if(has_phi) then
                      gf_ph(:,it,ik) = tempdata2(:,tempfield,ic)
                      tempfield = tempfield + 1
                   end if
                   if(has_apar) then
                      gf_ap(:,it,ik) = tempdata2(:,tempfield,ic)
                      tempfield = tempfield + 1
                   end if
                   if(has_bpar) then
                      gf_bp(:,it,ik) = tempdata2(:,tempfield,ic)
                   end if
                end do
             end if
             deallocate(tempdata2)
          end do
       end do

       !AJ Don't use the broadcast to do the sending back of data in the supercell,
       !AJ use point to point communications instead.
    else
!CMR: NB this (more efficient?) approach is OFF by default!
!CMRQ:  Is alternative to USE_BROADCAST=F working?  Is it more efficient? Why not default?
       
       call initialise_requests(send_handles)
       call initialise_requests(recv_handles)
       
       recvreqnum = 1
       sendreqnum = 1
       
       do ik=1,self%naky
          if(self%kyb(ik)%is_empty) cycle
          do is=1,self%kyb(ik)%nsupercell
             if(self%kyb(ik)%supercells(is)%is_empty) cycle
             sender = self%kyb(ik)%supercells(is)%head_iproc_world
             do ic=1,self%kyb(ik)%supercells(is)%ncell                
                it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind
                receiver = proc_id(gf_lo,idx(gf_lo,ik,it))
                !AJ For each supercell that I am head of send the data to the process that owns it in gf_lo
                if(self%kyb(ik)%supercells(is)%is_head) then                
                   !AJ If I am the supercell head and I'm not sending this to proc0 (as that already has all the data)
                   !AJ or to myself then send the data.
                   if(receiver /= iproc .and. receiver /= 0) then
                      tag = ik*self%ntheta0 + it
                      call nbsend(tempdata(:,:,it,ik),receiver,tag,send_handles(sendreqnum))
                      sendreqnum = sendreqnum + 1
                   !AJ Otherwise, if I'm sending to myself and I'm not proc0 then unpack the 
                   !AJ the data directly from my buffer into the gf fields array.
                   else if(receiver == iproc .and. receiver /= 0) then
                      tempfield = 1
                      if(has_phi) then
                         gf_ph(:,it,ik) = tempdata(:,tempfield,it,ik)
                         tempfield = tempfield + 1
                      end if
                      if(has_apar) then
                         gf_ap(:,it,ik) = tempdata(:,tempfield,it,ik)
                         tempfield = tempfield + 1
                      end if
                      if(has_bpar) then
                         gf_bp(:,it,ik) = tempdata(:,tempfield,it,ik)
                      end if
                   end if
                else if(iproc == receiver .and. iproc /= 0) then
                   tag = ik*self%ntheta0 + it
                   call nbrecv(tempdata3(:,:,it,ik),sender,tag,recv_handles(recvreqnum))
                   recvreqnum = recvreqnum + 1                   
                end if
             end do
          end do
       end do
       
       call waitall(recvreqnum-1,recv_handles)
       
       do ik=1,self%naky
          if(self%kyb(ik)%is_empty) cycle
          do is=1,self%kyb(ik)%nsupercell
             if(self%kyb(ik)%supercells(is)%is_empty) cycle
             if(self%kyb(ik)%supercells(is)%is_head) cycle
             do ic=1,self%kyb(ik)%supercells(is)%ncell                
                it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind
                receiver = proc_id(gf_lo,idx(gf_lo,ik,it))
                if(receiver == iproc .and. receiver /= 0) then
                   tempfield = 1
                   if(has_phi) then
                      gf_ph(:,it,ik) = tempdata3(:,tempfield,it,ik)
                      tempfield = tempfield + 1
                   end if
                   if(has_apar) then
                      gf_ap(:,it,ik) = tempdata3(:,tempfield,it,ik)
                      tempfield = tempfield + 1
                   end if
                   if(has_bpar) then
                      gf_bp(:,it,ik) = tempdata3(:,tempfield,it,ik)
                   end if
                end if
             end do
          end do
       end do
       
       call waitall(sendreqnum-1,send_handles)
       
    end if

!CMR: END Supercell Heads => cells (gf_fields)

!CMR: BEGIN Supercell Heads => FIRST g_lo rank(ik,it)
       
    call initialise_requests(send_handles)
    call initialise_requests(recv_handles)
    
    recvreqnum = 1
    do ikit=1, g_lo%ikitrange
       ik=g_lo%local_ikit_points(ikit)%ik
       it=g_lo%local_ikit_points(ikit)%it
       if(kwork_filter(it,ik)) cycle
       sender = self%heads(it,ik)
       if(sender /= iproc .and. iproc /= 0) then
          if(g_lo%ikit_procs_assignment(it,ik)%proc_list(1) == iproc) then
!CMR: if iproc is FIRST processor on list that must receive ik,it in g_lo, set up nbrecv
             tag = ik*self%ntheta0 + it          
             call nbrecv(tempdata3(:,:,it,ik),sender,tag,recv_handles(recvreqnum))             
             recvreqnum = recvreqnum + 1
          end if
       end if
    end do
    
    sendreqnum = 1
    
    do ik=1,self%naky
       if(self%kyb(ik)%is_empty) cycle
       do is=1,self%kyb(ik)%nsupercell
          if(self%kyb(ik)%supercells(is)%is_empty) cycle
          if(self%kyb(ik)%supercells(is)%is_head) then
!CMR: if iproc is supercell head for it,ik, proceed to set up nbsends
             do ic=1,self%kyb(ik)%supercells(is)%ncell                
                it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind
                if(kwork_filter(it,ik)) cycle
                receiver = g_lo%ikit_procs_assignment(it,ik)%proc_list(1) 
                if(receiver /= iproc .and. receiver /= 0) then
                   tag = ik*self%ntheta0 + it    
!CMR: nbsend from supercell head to where needed in g_lo
                   call nbsend(tempdata(:,:,it,ik),receiver,tag,send_handles(sendreqnum))
                   sendreqnum = sendreqnum + 1
                else if(receiver == iproc .and. receiver /= 0) then
!CMR: unpack data supercell head sends to ITSELF in g_lo
                   tempfield = 1
                   if(has_phi) then
                      ph(:,it,ik) = tempdata(:,tempfield,it,ik)
                      tempfield = tempfield + 1
                   end if
                   if(has_apar) then
                      ap(:,it,ik) = tempdata(:,tempfield,it,ik)
                      tempfield = tempfield + 1
                   end if
                   if(has_bpar) then
                      bp(:,it,ik) = tempdata(:,tempfield,it,ik)
                   end if
                end if
             end do
          end if
       end do
    end do
    
    call waitall(recvreqnum-1,recv_handles)
    
    do ikit=1, g_lo%ikitrange
!CMR: all other cores unpack received data in g_lo
       ik=g_lo%local_ikit_points(ikit)%ik
       it=g_lo%local_ikit_points(ikit)%it
       if(kwork_filter(it,ik)) cycle
       sender = self%heads(it,ik)
       receiver = g_lo%ikit_procs_assignment(it,ik)%proc_list(1) 
       if(sender /= iproc .and. receiver == iproc .and. .not. proc0) then
          tempfield = 1
          if(has_phi) then
             ph(:,it,ik) = tempdata3(:,tempfield,it,ik)
             tempfield = tempfield + 1
          end if
          if(has_apar) then
             ap(:,it,ik) = tempdata3(:,tempfield,it,ik)
             tempfield = tempfield + 1
          end if
          if(has_bpar) then
             bp(:,it,ik) = tempdata3(:,tempfield,it,ik)
          end if
       end if
    end do
    
    call waitall(sendreqnum-1,send_handles)

    deallocate(send_handles, recv_handles)
    deallocate(tempdata3)
!CMR: END Supercell Heads => FIRST g_lo rank(ik,it)
    
!CMR: BEGIN   FIRST g_lo rank(ik,it) => OTHER g_lo ranks(ik,it)

    !AJ Now get the data from the process in g_lo that has received a given ik,it point to the rest of the 
    !AJ processes that own it.
    !AJ This could be optimised if the xyblock_comm is available, but as a first start the code below works
    !AJ and doesn't involve global collectives.
    do ik = 1,self%naky
       do it = 1,self%ntheta0       
          
          if(kwork_filter(it,ik)) cycle

          if(g_lo%ikit_procs_assignment(it,ik)%mine) then
             
             sender = g_lo%ikit_procs_assignment(it,ik)%sub_proc_list(1)
             local_iproc = g_lo%ikit_procs_assignment(it,ik)%comm%iproc

             if(sender == local_iproc) then
                tempfield = 1
                if(has_phi) then
                   tempdata(:,tempfield,it,ik) = ph(:,it,ik)
                   tempfield = tempfield + 1
                end if
                
                if(has_apar) then
                   tempdata(:,tempfield,it,ik) = ap(:,it,ik)
                   tempfield = tempfield + 1
                end if
                
                if(has_bpar) then
                   tempdata(:,tempfield,it,ik) = bp(:,it,ik)
                end if
                
             end if

             call broadcast_sub(tempdata(:,:,it,ik),sender,g_lo%ikit_procs_assignment(it,ik)%comm%id)
             
             if(sender /= local_iproc .and. .not. proc0) then
                tempfield = 1
                if(has_phi) then
                   ph(:,it,ik) = tempdata(:,tempfield,it,ik)
                   tempfield = tempfield + 1
                end if
                
                if(has_apar) then
                   ap(:,it,ik) = tempdata(:,tempfield,it,ik)
                   tempfield = tempfield + 1
                end if
                
                if(has_bpar) then
                   bp(:,it,ik) = tempdata(:,tempfield,it,ik)
                end if
             end if
          end if
       end do
    end do
    
    deallocate(tempdata)
!CMR: END   FIRST g_lo rank(ik,it) => OTHER g_lo ranks(ik,it)
  end subroutine fm_gather_fields_gf

  !> Update the fields using calculated update
  !! We update both the g_lo (phi,apar,bpar) and gf_lo (gf_phi,gf_apar,gf_bpar) fields 
  !! as these are needed to make sure we can progress the fields in g_lo and gf_lo both
  !! times and thereby not have to transfer the full fields for all processes every iteration 
  !! only the required updates.
  !! With some refactoring it should be possible to do all the work on a single set of field 
  !! arrays as they are the same size regardless of the layout being used.
  subroutine fm_update_fields_gf(phi, apar, bpar, gf_phi, gf_apar, gf_bpar)
    use fields_arrays, only: orig_phi=>phi, orig_apar=>apar, orig_bpar=>bpar
    use fields_arrays, only: orig_gf_phi=>gf_phi, orig_gf_apar=>gf_apar, orig_gf_bpar=>gf_bpar
    use run_parameters, only: has_phi, has_apar, has_bpar
    use kt_grids, only: kwork_filter
    use theta_grid, only: ntgrid
    use gs2_layouts, only: g_lo, gf_lo, ik_idx, it_idx
    use mp, only: proc0
    implicit none
    complex,dimension(-ntgrid:,:,:),intent(inout) :: phi,apar,bpar
    complex,dimension(-ntgrid:,:,:),intent(inout) :: gf_phi,gf_apar,gf_bpar
    integer :: ik,it,ikit,igf

    !If we're proc0 then we need to do full array (for diagnostics)
    if(proc0) then
       if(has_phi) then
          phi=phi+orig_phi
          gf_phi=gf_phi+orig_gf_phi
       end if
       if(has_apar) then
          apar=apar+orig_apar
          gf_apar=gf_apar+orig_gf_apar
       end if
       if(has_bpar) then
          bpar=bpar+orig_bpar
          gf_bpar=gf_bpar+orig_gf_bpar
       end if
       return
    end if

    !AJ Iterate through each distinct ik,it point I have in g_lo and update the associated fields
    do ikit=1,g_lo%ikitrange
       ik = g_lo%local_ikit_points(ikit)%ik 
       it = g_lo%local_ikit_points(ikit)%it
       if(kwork_filter(it,ik)) cycle
       if(has_phi) phi(:,it,ik)=phi(:,it,ik)+orig_phi(:,it,ik)
       if(has_apar) apar(:,it,ik)=apar(:,it,ik)+orig_apar(:,it,ik)
       if(has_bpar) bpar(:,it,ik)=bpar(:,it,ik)+orig_bpar(:,it,ik)
    end do

    !AJ Iterate through each gf_lo point I have and update the associated fields
    do igf=gf_lo%llim_proc,gf_lo%ulim_proc
       ik = ik_idx(gf_lo,igf)
       it = it_idx(gf_lo,igf)
       if(kwork_filter(it,ik)) cycle
       if(has_phi) gf_phi(:,it,ik)=gf_phi(:,it,ik)+orig_gf_phi(:,it,ik)
       if(has_apar) gf_apar(:,it,ik)=gf_apar(:,it,ik)+orig_gf_apar(:,it,ik)
       if(has_bpar) gf_bpar(:,it,ik)=gf_bpar(:,it,ik)+orig_gf_bpar(:,it,ik)
    end do

  end subroutine fm_update_fields_gf

  !> Update the fields using the new fields
  subroutine fm_update_fields_newstep(self)
    use fields_arrays, only: phi,apar,bpar,phinew,aparnew,bparnew
    use fields_arrays, only: gf_phi,gf_apar,gf_bpar,gf_phinew,gf_aparnew,gf_bparnew
    use run_parameters, only: has_phi, has_apar, has_bpar
    use kt_grids, only: kwork_filter
    use gs2_layouts, only: g_lo, ik_idx, it_idx, gf_lo
    use mp, only: proc0
    use array_utils, only: copy
    implicit none
    class(fieldmat_gf_type), intent(in) :: self
    integer :: ik,it,ikit,igf
    UNUSED_DUMMY(self)
    !If we're proc0 then we need to do full array (for diagnostics)
    if(proc0) then
       if(has_phi) then
          call copy(phinew, phi)
          call copy(gf_phinew, gf_phi)
       end if
       if(has_apar) then
          call copy(aparnew, apar)
          call copy(gf_aparnew, gf_apar)
       end if
       if(has_bpar) then
          call copy(bparnew, bpar)
          call copy(gf_bparnew, gf_bpar)
       end if
       return
    end if

    !AJ Iterate through each distinct ik,it point I have in g_lo and update the associated fields
    do ikit=1,g_lo%ikitrange
       ik = g_lo%local_ikit_points(ikit)%ik 
       it = g_lo%local_ikit_points(ikit)%it
       if(kwork_filter(it,ik)) cycle
       if(has_phi) phi(:,it,ik)=phinew(:,it,ik)
       if(has_apar) apar(:,it,ik)=aparnew(:,it,ik)
       if(has_bpar) bpar(:,it,ik)=bparnew(:,it,ik)
    end do

    !AJ Iterate through each gf_lo point I have and update the associated fields
    do igf=gf_lo%llim_proc,gf_lo%ulim_proc
       ik = ik_idx(gf_lo,igf)
       it = it_idx(gf_lo,igf)
       if(kwork_filter(it,ik)) cycle       
       if(has_phi) gf_phi(:,it,ik)=gf_phinew(:,it,ik)
       if(has_apar) gf_apar(:,it,ik)=gf_aparnew(:,it,ik)
       if(has_bpar) gf_bpar(:,it,ik)=gf_bparnew(:,it,ik)
    end do

  end subroutine fm_update_fields_newstep

  !> FIXME : Add documentation
  subroutine fm_getfieldeq_nogath (self,phi, apar, bpar, fieldeq, fieldeqa, fieldeqp)
    use theta_grid, only: ntgrid
    use dist_fn, only: getan_nogath
    use dist_fn_arrays, only: antot, antota, antotp
    implicit none
    class(fieldmat_gf_type), intent(in) :: self
    complex, dimension (-ntgrid:,:,:), intent (in) :: phi, apar, bpar
    complex, dimension (-ntgrid:,:,:), intent (in out) ::fieldeq,fieldeqa,fieldeqp

    call getan_nogath(antot, antota, antotp)
    call self%getfieldeq1_nogath (phi, apar, bpar, antot, antota, antotp, &
         fieldeq, fieldeqa, fieldeqp)

  end subroutine fm_getfieldeq_nogath  

!------------------------------------------------------------------------ 

!================
!PROC_DECOMP
!================

  !> Work out which cells are local
  subroutine pc_find_locality(self)
    use gs2_layouts, only: gf_lo, ik_idx, it_idx
    use mp, only: sum_allreduce
    implicit none
    class(pc_gf_type),intent(inout) :: self
    integer :: it,ik,igf

    !Initialise
    self%is_local=0

    do igf=gf_lo%llim_proc,gf_lo%ulim_proc
       ik=ik_idx(gf_lo,igf)
       it=it_idx(gf_lo,igf)
       self%is_local(it,ik)=1
    enddo

    !Now count how many procs have each cell
    self%nproc_per_cell=self%is_local
    call sum_allreduce(self%nproc_per_cell)
  end subroutine pc_find_locality

  !> A wrapper routine to do the decomposition
  subroutine pc_make_decomp(self, fieldmat)
    use mp, only: mp_abort
    implicit none
    class(pc_gf_type), intent(inout) :: self
    class(fieldmat_type), intent(in out) :: fieldmat
    call self%decomp(fieldmat)
  end subroutine pc_make_decomp

  !---------------------------
  !Decomposition routines
  !---------------------------

  !> The process decomposition works on the gf_lo layout, so if you have a 
  !! point in gf_lo that is in a supercell you are in that supercell and you 
  !! own the whole cell associated with that point, otherwise you don't have anything 
  !! in that supercell, i.e. each cell has a single owner.
  subroutine pc_decomp(self, fieldmat)
    use mp, only: split, comm_type, free_comm
    implicit none
    class(pc_gf_type), intent(inout) :: self
    class(fieldmat_type), intent(in out) :: fieldmat
    integer :: ik, is, ic, ifq, it, llim, ulim
    integer :: nrow_tmp
    !/Don't need to change pc%is_local as this is already correct

    !Make sure we do the invert locally as mpi based version not implemented!
    self%force_local_invert=.true.

    !Now we (re)calculate how much data is available
    call self%count_avail(fieldmat)

    !Initialise the number of rows responsible
    self%nresp_per_cell=0

    ! For fields gf force has_to_gather to .false.
    self%has_to_gather = .false.

    !Now loop over all cells, setting the row_llim of
    !their row blocks
    do ik=1,fieldmat%naky
       do is=1,fieldmat%kyb(ik)%nsupercell
          nrow_tmp=fieldmat%kyb(ik)%supercells(is)%nrow
          do ic=1,fieldmat%kyb(ik)%supercells(is)%ncell
             !Store it index
             it=fieldmat%kyb(ik)%supercells(is)%cells(ic)%it_ind

             !AJ cell locality is restricted to a single process in gf_lo, meaning for each cell there is only 
             !AJ one process that will match this if statement.
             if(fieldmat%kyb(ik)%supercells(is)%cells(ic)%is_local)then
                
                !Now work out limits
                llim=1
                ulim=llim+nrow_tmp-1


                !Now loop over row blocks to set index
                do ifq=1,fieldmat%kyb(ik)%supercells(is)%cells(ic)%nrb
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_llim=llim
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_ulim=ulim
                   call fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%set_nrow
                enddo
             else
                do ifq=1,fieldmat%kyb(ik)%supercells(is)%cells(ic)%nrb
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_llim=0
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_ulim=0
                   call fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%set_nrow
                enddo
             endif

             !Record the number of responsible rows, note we assume all rb have same size
             if(size(fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb)>0)then
                self%nresp_per_cell(it,ik)=fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(1)%nrow
             else
                self%nresp_per_cell(it,ik)=0
             endif
          enddo
       enddo
    enddo
  end subroutine pc_decomp

!------------------------------------------------------------------------ 

!//////////////////////////////
!// MODULE LEVEL ROUTINES
!//////////////////////////////

  !> Routine to initialise
  subroutine init_fields_gf_local
    use antenna, only: init_antenna
    use theta_grid, only: init_theta_grid
    use kt_grids, only: init_kt_grids
    use gs2_layouts, only: init_gs2_layouts
    use mp, only: proc0, mp_abort, barrier
    use file_utils, only: error_unit
    use fields_local, only: init_fields_matrixlocal
    implicit none

    !Early exit if possible
    if (initialised) return

    !Exit if local fields not supported (e.g. compiled with pgi)
    if(.not.fields_gf_local_functional()) call mp_abort("field_option='gf local' not supported with this build")

    !Make sure any dependencies are already initialised and
    !initialise the field matrix object.
    if (debug.and.proc0) write(6,*) "init_fields_gf_local: gs2_layouts"
    call init_gs2_layouts
    if (debug.and.proc0) write(6,*) "init_fields_gf_local: theta_grid"
    call init_theta_grid
    if (debug.and.proc0) write(6,*) "init_fields_gf_local: kt_grids"
    call init_kt_grids
    if (debug.and.proc0) write(6,*) "init_fields_gf_local: init_fields_matrixlocal"
    call init_fields_matrixlocal(fieldmat, pc)
    if (debug.and.proc0) write(6,*) "init_fields_gf_local: antenna"
    call init_antenna

    !Set the initialised state
    initialised = .true.
    reinit = .false.
  end subroutine init_fields_gf_local

  !> Reset the fields_gf_local module
  subroutine reset_fields_gf_local
    implicit none
    initialised=.false.
    reinit=.true.
  end subroutine reset_fields_gf_local

  !> Finish the fields_gf_local module
  subroutine finish_fields_gf_local
    implicit none
    call pc%reset
    call fieldmat%reset
    reinit = .false.
    initialised=.false.
  end subroutine finish_fields_gf_local

  !> Calculate the update to the fields
  subroutine getfield_gf_local(phi,apar,bpar,gf_phi,gf_apar,gf_bpar,do_update_in)
    use theta_grid, only: ntgrid
    use optionals, only: get_option_with_default
    implicit none
    complex, dimension(-ntgrid:,:,:), intent(out) :: phi,apar,bpar,gf_phi,gf_apar,gf_bpar !Note, these are actually phinew,... in typical usage
    logical, optional, intent(in) :: do_update_in
    logical :: do_update

    do_update = get_option_with_default(do_update_in, .false.)

    !Use fieldmat routine to calculate the field update
    !AJ phi(new),apar(new),bpar(new) are in gf_lo here
    call fieldmat%get_field_update(gf_phi,gf_apar,gf_bpar,pc)
   
    !NOTE: We currently calculate omega at every time step so we
    !actually need to gather everytime, which is a pain!
    !We also fill in the empties here.
    !AJ before this  phi(new),apar(new),bpar(new) are in gf_lo
    !AJ before this  phi(new),apar(new),bpar(new) are in g_lo
    call fieldmat%gather_fields_gf(gf_phi,gf_apar,gf_bpar,phi,apar,bpar)

    !AJ phi(new),apar(new),bpar(new) are in g_lo
    !Thi s routine updates *new fields using gathered update
    if(do_update) call fieldmat%update_fields_gf(phi,apar,bpar,gf_phi,gf_apar,gf_bpar)
  end subroutine getfield_gf_local

  !> Initialise the fields from the initial g, just uses the
  !! fields_implicit routine
  subroutine init_allfields_gf_local
    use fields_implicit, only: init_allfields_implicit
    use fields_arrays, only: phinew, aparnew, bparnew, gf_phinew, gf_aparnew, gf_bparnew
    use fields_arrays, only: phi, apar, bpar, gf_phi, gf_apar, gf_bpar
    implicit none
    call init_allfields_implicit(.true.)
    !AJ After the call above phinew, aparnew, bparnew have the initial fields, but in gf_lo layout
    !AJ The call below redistributed them so that phinew,aparnew,bparnew,phi,apar,bpar have the correct 
    !AJ fields in g_lo layout and gf_phinew,gf_aparnew,gf_bparnew,gf_phi,gf_apar,gf_bpar have the correct 
    !AJ fields in gf_lo layouts.  Note that the new and original arrays (i.e. phinew and phi) will have the same
    !AJ values at this point
    call fieldmat%gather_init_fields(gf_phinew,gf_aparnew,gf_bparnew,gf_phi,gf_apar,gf_bpar,phinew,aparnew,bparnew,phi,apar,bpar)
  end subroutine init_allfields_gf_local

  !> This routine advances the solution by one full time step
  subroutine advance_gf_local(istep, remove_zonal_flows_switch)
    use run_parameters, only: has_phi, has_apar, has_bpar, reset
    use fields_implicit, only: remove_zonal_flows
    use fields_arrays, only: phi, apar, bpar, phinew
    use fields_arrays, only: aparnew, bparnew, apar_ext
    use fields_arrays, only: gf_phi, gf_apar, gf_bpar
    use fields_arrays, only: gf_phinew, gf_aparnew, gf_bparnew
    use fields_arrays, only: time_field, time_field_mpi
    use dist_fn, only: timeadv, exb_shear, g_exb, g_exbfac
    use dist_fn, only: collisions_advance
    use dist_fn_arrays, only: g, gnew
    use mp, only: get_mp_times
    use job_manage, only: time_message
    use antenna, only: antenna_amplitudes, no_driver
    use array_utils, only: copy
    implicit none
    integer, intent(in) :: istep
    logical, intent(in) :: remove_zonal_flows_switch
    integer, parameter :: diagnostics = 1
    logical :: do_update
    real :: mp_total, mp_total_after

    do_update=.true.

    !GGH NOTE: apar_ext is initialized in this call
    if(.not.no_driver) then
       !AJ THIS MAY NOT WORK FOR GF_LO
       write(*,*) 'antenna has not been configured for gf_lo, may produce incorrect results'
       call antenna_amplitudes (apar_ext)
    end if

    !Apply flow shear if active
    if(abs(g_exb*g_exbfac)>epsilon(0.)) then
       !AJ THIS MAY NOT WORK FOR GF_LO
       write(*,*) 'Flow shear has not been configured for gf_lo, may produce incorrect results'
       call exb_shear(gnew,phinew,aparnew,bparnew,istep,field_local_allreduce_sub)
    end if

    !Update g and fields
    !AJ IS THIS FULL COPY NECESSARY?
    call copy(gnew, g)
    if(do_update)then !Here we tie the "smart" update to do_update
       call fieldmat%update_fields_newstep
    else
       if(has_phi) then
          call copy(phinew, phi)
          call copy(gf_phinew, gf_phi)
       end if
       if(has_apar) then
          call copy(aparnew, apar)
          call copy(gf_aparnew, gf_apar)
       end if
       if(has_bpar) then
          call copy(bparnew, bpar)
          call copy(gf_bparnew, gf_bpar)
       end if
    endif

    !Find gnew given fields at time step midpoint
    call timeadv (phi, apar, bpar, phinew, &
         aparnew, bparnew, istep)
    if(reset) return !Return is resetting

    !Add in antenna driving if present
    !<DD>TAGGED: Should we only this is fapar>0 as well?
    if(.not.no_driver) then
       !AJ THIS MAY NOT WORK FOR GF_LO
       write(*,*) 'antenna has not been configured for gf_lo, may produce incorrect results'
       aparnew=aparnew+apar_ext
    end if

    call time_message(.false.,time_field,' Field Solver')
    call get_mp_times(total_time = mp_total)

    !Calculate the fields at the next time point
    call getfield_gf_local(phinew,aparnew,bparnew,gf_phinew,gf_aparnew,gf_bparnew,do_update)

    !If we do the update in getfield_gf_local don't do it here
    if(.not.do_update)then
       if(has_phi) then
          phinew=phinew+phi
          gf_phinew=gf_phinew+gf_phi
       end if
       if(has_apar) then
          aparnew=aparnew+apar
          gf_aparnew=gf_aparnew+gf_apar
       end if
       if(has_bpar) then
          bparnew=bparnew+bpar
          gf_bparnew=gf_bparnew+gf_bpar
       end if
    endif

    call time_message(.false.,time_field,' Field Solver')
    call get_mp_times(total_time = mp_total_after)
    time_field_mpi = time_field_mpi + (mp_total_after - mp_total)

    !Remove zonal component if requested
    if(remove_zonal_flows_switch) then
       !AJ THIS MAY NOT WORK IN GF_LO
       write(*,*) 'zonal flows have not been configured for gf_lo, may produce incorrect results'
       call remove_zonal_flows
    end if

    !Evolve gnew to next half time point
    call timeadv (phi, apar, bpar, phinew, &
         aparnew, bparnew, istep, diagnostics) 

    ! Advance collisions, if separate from timeadv
    call collisions_advance (phi, bpar, phinew, aparnew, bparnew, diagnostics)

  end subroutine advance_gf_local

  subroutine dump_response_to_file_gf_local(suffix)
    use fields_local, only: dump_response_to_file_local
    implicit none
    !> If passed then use as part of file suffix
    character(len=*), optional, intent(in) :: suffix
    call dump_response_to_file_local(fieldmat, suffix)
  end subroutine dump_response_to_file_gf_local

  !> Returns true if GS2 was built in such a way
  !! as to allow this module to work.
  !! Currently does not work with PGI compilers. 
  !! See online discussions.
  logical function fields_gf_local_functional()
    use fields_local, only: fields_local_functional
    fields_gf_local_functional = fields_local_functional()
  end function fields_gf_local_functional
end module fields_gf_local