!> This module implements the same implicit algorithm as fields_implicit.
!! However, it is an almost complete rewrite of the implementation,
!! which can give large gains in the speed of calculating the response
!! matrix, as well as significant gains in the advance steps.
module fields_local
  use mp, only: comm_type
  implicit none
  private

  ! Module level routines
  public :: init_fields_local, init_allfields_local, finish_fields_local
  public :: advance_local, reset_fields_local, dump_response_to_file_local
  public :: fields_local_functional, read_response_from_file_local
  public :: do_smart_update, field_local_allreduce, field_local_allreduce_sub
  public :: field_local_tuneminnrow,  field_local_nonblocking_collectives
  public :: minNRow, init_fields_matrixlocal
  public :: rowblock_type, cell_type, supercell_type, ky_type, fieldmat_type, pc_type

  ! User set parameters
  public :: dump_response, read_response
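
  ! These user-set flags (together with the tuning switches declared below,
  ! e.g. minNRow, do_smart_update and the various field_local_* options) are
  ! expected to be set by the input-reading code elsewhere (the fields
  ! namelist handling) before init_fields_local is called. Purely as a hedged
  ! illustration -- assuming these map to input variables of the same name in
  ! the fields namelist (check the actual namelist reader) -- an input file
  ! might contain something like:
  !   &fields_knobs
  !     field_option = 'local'
  !     minnrow = 64
  !     do_smart_update = .true.
  !     dump_response = .true.
  !   /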

  ! Made public for testing
  public :: fieldmat

  !> Public type allowing tests to access otherwise-private routines
  type, public :: fields_local_testing
    contains
      procedure, nopass :: getfield_local

  end type fields_local_testing

  interface dump_response_to_file_local
     module procedure dump_response_to_file_local_passed
     module procedure dump_response_to_file_local_nopass
  end interface dump_response_to_file_local

  !//////////////////////////////
  !// CUSTOM TYPES
  !//////////////////////////////

  !> This is the lowest level of data object and will hold the actual data
  !! involved with the field matrix
  type :: rowblock_type
     complex, dimension(:,:), allocatable :: data !< The field (response) matrix data for this row block
     integer :: row_llim, row_ulim !< The row limits this block corresponds to
     integer :: col_llim, col_ulim !< The col limits this block corresponds to
     integer :: nrow, ncol !< The size of the row block
     complex, dimension(:),allocatable :: tmp_sum !< Work space for field update
     !/Row and parent properties. Mostly for debug printing
     integer :: ir_ind !< Property of rowblock
     integer :: it_ind, ik_ind !< Details of parents
     integer :: is_ind, ic_ind !< Details of parents so cell can find itself
   contains
     procedure :: deallocate => rb_deallocate
     procedure :: allocate => rb_allocate
     procedure :: init => rb_init
     procedure :: debug_print => rb_debug_print
     procedure :: mv_mult_fun => rb_mv_mult_fun
     procedure :: mv_mult => rb_mv_mult
     procedure :: irex_to_ir => rb_irex_to_irloc
     procedure :: reset => rb_reset
     procedure :: set_nrow => rb_set_nrow
  end type rowblock_type

  !> This is the next level up of data and represents the cell type.
  !! A cell has a unique it,ik pair and represents all the fields/field equations
  !! on the extended domain for the given it/ik.
  type :: cell_type
     type(rowblock_type), dimension(:), allocatable :: rb !< These are the row blocks, currently one for each field equation
     integer :: nrb !< How many row blocks, equal to nfield
     !//NOTE: probably don't want these as this data should be stored by the row blocks only
     integer :: row_llim, row_ulim !< The row limits, always 1 to nfield*nextend
     integer, dimension(:), allocatable :: col_llim, col_ulim !< These are the column limits for each field equation
     !//////
     integer :: nrow, ncol !< nrow is the total number of rows; ncol is the number of columns per field equation.
     integer :: ncol_tot !< The total number of columns (nfield*ncol)
     logical :: is_local !< Does this cell have any data on this proc?
     logical :: is_empty !< Is this cell empty on this proc (i.e. no local data)?
     logical :: is_all_local !< Is all of this cell's data on this proc?
     logical :: ignore_boundary !< Do we ignore the boundary point (column) in this cell?
     complex, dimension(:),allocatable :: tmp_sum !< Work space for field update
     type(comm_type) :: parent_sub !< Sub communicator involving all processors in parent
     !Cell and parent properties. Mostly for debug printing.
     integer :: ic_ind, it_ind !< Cell properties
     integer :: ik_ind, is_ind !< Parent properties
   contains
     procedure :: deallocate => c_deallocate
     procedure :: allocate => c_allocate
     procedure :: init => c_init
     procedure :: debug_print => c_debug_print
     procedure :: mv_mult_rb => c_mv_mult_rb
     procedure :: get_field_update => c_get_field_update
     procedure :: reset => c_reset
     procedure :: set_locality => c_set_locality
     procedure :: has_row => c_has_row
  end type cell_type

  !> This is the next level up of data and represents the supercell type.
  !! A supercell represents a collection of connected cells
  type :: supercell_type
     type(cell_type), dimension(:), allocatable :: cells !< These are the cells
     integer :: ncell !< Number of cells in this supercell
     integer :: nextend !< Length of the extended domain
     integer :: nrow, ncol !< The number of rows and columns. Equal to nextend*nfield
     integer :: it_leftmost !< It index of leftmost cell
     integer :: head_iproc_world !< The proc id (in comm world) of the head proc
     integer :: head_iproc !< The proc id (in sc_sub_all) of the head proc
     integer :: head_iproc_pd !< The proc id (in sc_sub_pd) of the head proc
     logical :: is_local !< Does this supercell have any data on this proc?
     logical :: is_empty !< Is this supercell empty on this proc (i.e. no local data)?
     logical :: is_all_local !< Is all of this supercell's data on this proc?
     complex, dimension(:),allocatable :: tmp_sum !< Work space for field update
     type(comm_type) :: sc_sub_all !< Sub communicator involving all processors with this supercell
     type(comm_type) :: sc_sub_pd !< Sub communicator for all procs with some data but not all of it
     type(comm_type) :: parent_sub !< Sub communicator involving all processors in parent
     integer, dimension(:),allocatable :: nb_req_hand !< For non-blocking broadcast request handle storage
     logical :: initdone !< Have we finished initialising this block?
     logical, dimension(:), allocatable :: initialised !< Have we initialised each point?
     logical :: is_head = .false. !< Are we the head of this supercell?
     !Cell and parent properties. Mostly for debug printing.
     !> The is_ind value is the index of the supercell in the parent ky_type's
     !> supercell array.
     integer :: is_ind
     !> The is_label is the supercell_label of this supercell as determined
     !> by calculate_supercell_labels from elsewhere.
     integer :: is_label
     integer :: ik_ind !< Parent properties
     integer :: collective_request !< Request handle for non-blocking collective operations
     real :: condition_number = -1 !< Condition number of the associated response matrix. Only valid on the head.
   contains
     procedure :: deallocate => sc_deallocate
     procedure :: allocate => sc_allocate
     procedure :: init => sc_init
     procedure :: debug_print => sc_debug_print
     procedure :: get_field_update => sc_get_field_update
     procedure :: reduce_tmpsum => sc_reduce_tmpsum
     procedure :: iex_to_dims => sc_iex_to_dims
     procedure :: iex_to_ifl => sc_iex_to_ifl
     procedure :: iex_to_ic => sc_iex_to_ic
     procedure :: iex_to_ig => sc_iex_to_ig
     procedure :: iex_to_it => sc_iex_to_it
     procedure :: has_it => sc_has_it
     procedure :: reset => sc_reset
     procedure :: set_locality => sc_set_locality
     procedure :: get_left_it => sc_get_left_it
     procedure :: store_fq => sc_store_fq
     procedure :: pull_rows_to_arr => sc_pull_rows_to_arr
     procedure :: push_arr_to_rows => sc_push_arr_to_rows
     procedure :: prepare => sc_prepare
     procedure :: invert => sc_invert
     procedure :: invert_local => sc_invert_local
     procedure :: invert_mpi => sc_invert_mpi
     procedure :: dump => sc_dump
     procedure :: make_subcom_1 => sc_make_subcom_1
     procedure :: make_subcom_2 => sc_make_subcom_2
     procedure :: dump_to_file => sc_dump_to_file
     procedure :: read_from_file => sc_read_from_file
  end type supercell_type

  !> This is the next level up of data and represents the ky type.
  !! A ky block consists of a number of supercells
  type :: ky_type
     type(supercell_type), dimension(:), allocatable :: supercells !< These are the supercells
     integer :: nsupercell !< Number of supercells in this ky block
     logical :: is_local !< Does this ky block have any data on this proc?
     logical :: is_empty !< Is this ky block empty on this proc (i.e. no local data)?
     logical :: is_all_local !< Is all of this ky block's data on this proc?
     logical :: initdone !< Have we finished initialising this block?
     type(comm_type) :: ky_sub_all !< Sub communicator involving all processors with this ky
     type(comm_type) :: parent_sub !< Sub communicator involving all processors in parent
     !Cell and parent properties. Mostly for debug printing.
     integer :: ik_ind
   contains
     procedure :: deallocate => ky_deallocate
     procedure :: allocate => ky_allocate
     procedure :: init => ky_init
     procedure :: debug_print => ky_debug_print
     procedure :: get_field_update => ky_get_field_update
     procedure :: wait_for_nonblocking_collectives => ky_wait_for_nonblocking_collectives
     procedure :: reset => ky_reset
     procedure :: set_locality => ky_set_locality
     procedure :: is_from_it => ky_is_from_it
     procedure :: store_fq => ky_store_fq
     procedure :: prepare => ky_prepare
     procedure :: make_subcom_1 => ky_make_subcom_1
     procedure :: make_subcom_2 => ky_make_subcom_2
  end type ky_type

  !> This is the top level object, consisting of a collection of
  !! ky blocks.
  type :: fieldmat_type
     type(ky_type), dimension(:), allocatable :: kyb !< The ky blocks
     integer :: naky !< Number of ky blocks
     integer :: ntheta0 !< Number of theta0 (kx) points
     integer :: npts !< Total number of theta grid points on all extended domains
     integer :: nbound !< Number of ignored boundary points
     logical :: is_local !< Does the fieldmat have any data on this proc?
     logical :: is_empty !< Is the fieldmat empty on this proc (i.e. no local data)?
     logical :: is_all_local !< Is all of the fieldmat's data on this proc?
     type(comm_type) :: fm_sub_all !< Sub communicator involving all processors with fieldmat
     type(comm_type) :: fm_sub_headsc_p0 !< Sub communicator involving the supercell heads and proc0
     integer :: prepare_type=0 !< What sort of preparation do we do to the field matrix
     !Currently only support:
     !0   : Invert the field matrix
     !In the future may wish to do things like LU decomposition etc.
     !Could also do things like eigenvalue analysis etc.
     integer, dimension(:,:), allocatable :: heads !<  An array that holds the iproc (from comm world) of the supercell head for a given ik,it point
     logical :: no_populate=.false. !< Advanced usage only, if true then don't populate response
     logical :: no_prepare=.false. !< Advanced usage only, if true then don't prepare (invert) response
     integer :: nfield !< Number of fields we're working with
   contains
     procedure :: deallocate => fm_deallocate
     procedure :: allocate => fm_allocate
     procedure :: debug_print => fm_debug_print
     procedure :: get_field_update => fm_get_field_update
     procedure :: init => fm_init
     procedure :: reset => fm_reset
     procedure :: populate => fm_populate
     procedure :: init_next_field_points => fm_init_next_field_points
     procedure :: prepare => fm_prepare
     procedure :: make_subcom_1 => fm_make_subcom_1
     procedure :: make_subcom_2 => fm_make_subcom_2
     procedure :: calc_sc_heads => fm_calc_sc_heads
     procedure :: gather_fields => fm_gather_fields
     procedure :: unpack_to_field => fm_unpack_to_field
     procedure :: write_debug_data => fm_write_debug_data
     procedure :: set_is_local => fm_set_is_local
     procedure :: count_subcom => fm_count_subcom
     procedure :: getfieldeq_nogath => fm_getfieldeq_nogath
     procedure :: reduce_an => fm_reduce_an_head_send_broadcast
     procedure :: check_an => fm_check_an
     procedure :: getfieldeq1_nogath => fm_getfieldeq1_nogath
     procedure :: update_fields => fm_update_fields
     procedure :: update_fields_newstep => fm_update_fields_newstep
     procedure :: get_condition_numbers => fm_get_condition_numbers
  end type fieldmat_type
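
  ! To summarise the container hierarchy defined above: the fieldmat_type
  ! holds ky blocks (kyb), each ky block holds supercells, each supercell
  ! holds its connected cells and each cell holds one row block per field
  ! equation. Purely as an illustrative access path (indices assumed valid
  ! and local to this processor), an element of the stored response data in
  ! the module-level fieldmat object declared below can be reached as
  !   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%data(icol, irow)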

  !> This is the type which controls/organises the parallel
  !! data decomposition etc.
  type :: pc_type
     integer, dimension(:,:), allocatable :: is_local !< Is the given it,ik on this proc?
     !NOTE: We have is_local as integer rather than logical so we can reduce across procs
     !to count how many procs have a given cell
     type(comm_type), dimension(:,:), allocatable :: itik_subcom !< Cell level sub-communicators
     integer, dimension(:,:), allocatable :: nproc_per_cell !< How many procs have this cell
     integer, dimension(:,:), allocatable :: nresp_per_cell !< How many rows are we personally responsible for in each cell
     integer, dimension(:,:), allocatable :: navail_per_cell !< How many rows could we be responsible for in each cell
     integer :: nresp_tot, navail_tot !< Total number of rows available/potentially available
     logical :: force_local_invert=.false. !< If true then we force local (serial) inversion of the response matrix
     !NOTE: If we're doing force local invert then really when we populate the supercell we should store all of the
     !data locally so we don't need to fetch it later.
     logical :: has_to_gather=.true. !< If true we have to gather when calling getfieldeq, determined by decomp routine
     integer :: decomp_type=3 !< This sets what type of decomposition is done
     !By defining your own decomposition routine and associating it with a
     !particular value of decomp_type you should be able to add new
     !parallel decompositions easily without having to rewrite any other
     !routines (although I don't promise this!)
     !//List of currently supported decomp_types
     !0  : Force everything to be done locally and serially, i.e. every proc has all field mat data
     !1  : Force local/serial but only for the supercells we would have anyway
     !2  : Force local/serial but only for the cells we would have anyway
     !3  : Simple mpi, minimal attempt to load balance or avoid splitting small blocks
     !4  : As above but with attempts to avoid splitting small blocks 
   contains
     procedure :: deallocate => pc_deallocate
     procedure :: allocate => pc_allocate
     procedure :: debug_print => pc_debug_print
     procedure :: current_nresp => pc_current_nresp
     procedure :: find_locality => pc_find_locality
     procedure :: count_avail => pc_count_avail
     procedure :: has_ik => pc_has_ik
     procedure :: has_it => pc_has_it
     procedure :: make_decomp => pc_make_decomp
     procedure :: init => pc_init
     procedure :: reset => pc_reset
     !//The different decomposition routines
     procedure :: decomp_all_serial_local => pc_decomp_all_serial_local
     procedure :: decomp_own_serial_local => pc_decomp_own_serial_local
     procedure :: decomp_owncells_serial_local => pc_decomp_owncells_serial_local
     procedure :: decomp_owncells_simplempi => pc_decomp_owncells_simplempi
  end type pc_type

  !//////////////////////////////
  !// MODULE VARIABLES
  !//////////////////////////////
  integer :: dlun=6 !< File unit for debug printing
  !Tuning this can improve advance/init time (usually one at the expense of the other)
  logical :: do_smart_update=.false. !< If true and x/y distributed then use a "smart" update which only updates
  !the local fields. This may impact diagnostics/parallel writing as it means only the processor's g_lo local section
  !of the fields are evolved on this processor.
  logical,parameter :: debug=.false. !< Do we do debug stuff?
  logical :: initialised=.false. !< Have we initialised yet?
  logical :: reinit=.false. !< Are we reinitialising?
  logical :: dump_response=.false. !< Do we dump the response matrix?
  logical :: read_response=.false. !< Do we read the response matrix from dump?
  logical :: field_local_allreduce = .false. !< If true use an allreduce to gather field else use reduce+broadcast
  logical :: field_local_allreduce_sub = .false. !< If true and field_local_allreduce true then do two sub comm all reduces rather than 1
  logical :: field_local_tuneminnrow = .false. !< Are we auto-tuning minNRow?
  logical :: field_local_nonblocking_collectives = .false. !< If true use non-blocking collectives in the field update reduction
  integer :: minNRow = 16 !< Tuning this can influence both init and advance times
  type(pc_type),save :: pc !< This is the parallel control object
  type(fieldmat_type),save :: fieldmat !< This is the top level field matrix object
  !> Condition numbers above this limit will trigger a warning
  real, parameter :: condition_number_warning_limit = sqrt(1.0/epsilon(1.0))
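  ! For orientation: if the working real kind is double precision
  ! (epsilon ~ 2.2e-16) this limit is roughly 6.7e7, while for single
  ! precision (epsilon ~ 1.2e-7) it is roughly 2.9e3. Condition numbers
  ! above this suggest the inverted response matrix may be inaccurate at
  ! the working precision.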
contains

!//////////////////////////////
!// CUSTOM TYPES
!//////////////////////////////
!
!================
!ROWBLOCK
!================
  
  !> Allocate storage space
  subroutine rb_allocate(self)
    implicit none
    class(rowblock_type), intent(inout) :: self
    allocate(self%data(self%ncol,self%nrow))
    allocate(self%tmp_sum(self%nrow))
    self%data = 0.
    self%tmp_sum = 0.
  end subroutine rb_allocate

  !> Deallocate storage space
  subroutine rb_deallocate(self)
    implicit none
    class(rowblock_type), intent(inout) :: self
    if(allocated(self%data)) deallocate(self%data)
    if(allocated(self%tmp_sum)) deallocate(self%tmp_sum)
  end subroutine rb_deallocate

  !> Initialise the members of the rowblock instance
  subroutine rb_init(self, ifq, ic, it, is, ik, ncol, nextend)
    use theta_grid, only: ntgrid
    implicit none
    class(rowblock_type), intent(in out) :: self
    integer, intent(in) :: ifq, ic, it, is, ik, ncol
    !> The length of the parent supercell's real space
    !> domain. Represents the spacing between the start of different
    !> field/field equations as we store data as the full real space
    !> domain for each field, e.g. [fieldeq(:), fieldeqa(:),
    !> fieldeqp(:)], rather than say interleaved, e.g. [[fieldeq(1),
    !> fieldeqa(1), fieldeqp(1)], [fieldeq(2), fieldeqa(2),
    !> fieldeqp(2)],...]
    integer, intent(in) :: nextend
    ! Store rowblock index, essentially which fieldeq we represent
    self%ir_ind = ifq
    ! Store indices of parent types
    self%ic_ind = ic
    self%it_ind = it
    self%is_ind = is
    self%ik_ind = ik
    ! Store data limits
    self%ncol = ncol
    self%col_llim = 1 + (ic - 1) * (2 * ntgrid) + (ifq - 1) * nextend
    self%col_ulim = self%col_llim + ncol - 1
  end subroutine rb_init
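
  ! A hedged worked example of the column limits set above (hypothetical
  ! sizes, not taken from any particular run): with ntgrid = 16
  ! (2*ntgrid = 32 points per cell excluding the shared boundary) a 3-cell
  ! supercell has nextend = 1 + 2*ntgrid*ncell = 97. For the second field
  ! equation (ifq = 2) in the second cell (ic = 2, so ncol = 32 as the
  ! boundary is ignored):
  !   col_llim = 1 + (2-1)*32 + (2-1)*97 = 130
  !   col_ulim = 130 + 32 - 1            = 161
  ! i.e. this row block multiplies columns 130 to 161 of the supercell matrix.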

  !> Debug printing
  subroutine rb_debug_print(self)
    implicit none
    class(rowblock_type), intent(in) :: self
    write(dlun,'("Rowblock debug print. Index ir=",I0," nrow=",I0," ncol=",I0," rl,ru=",I0," ",I0, " and cl,cu=",I0," ",I0)') &
         self%ir_ind, self%nrow, self%ncol, &
         self%row_llim, self%row_ulim, &
         self%col_llim, self%col_ulim
  end subroutine rb_debug_print

  !> Matrix vector multiplication
  function rb_mv_mult_fun(self,vect)
    implicit none
    class(rowblock_type), intent(in) :: self
    complex,dimension(self%ncol), intent(in) :: vect
    complex,dimension(self%nrow) :: rb_mv_mult_fun
!    integer :: ir
    !As a 1d vector in fortran acts as a "row vector" in matmul (and the
    !first index of data represents columns), the matrix-vector product is
    !computed as matmul(vect, self%data), equivalent to taking
    !sum(self%data(:,ir)*vect) for each local row ir (see below).
    rb_mv_mult_fun = matmul(vect,self%data)
!    do ir=1,self%nrow
!       rb_mv_mult_fun(ir)=sum(self%data(:,ir)*vect)
!    enddo
  end function rb_mv_mult_fun

  !> Matrix vector multiplication stored in tmp_sum
  subroutine rb_mv_mult(self,vect)
    implicit none
    class(rowblock_type), intent(inout) :: self
    complex,dimension(self%ncol), intent(in) :: vect
    self%tmp_sum=self%mv_mult_fun(vect)
  end subroutine rb_mv_mult
  
  !> Convert extended row index to local row index
  function rb_irex_to_irloc(self,irex)
    implicit none
    class(rowblock_type), intent(in) :: self
    integer, intent(in) :: irex
    integer :: rb_irex_to_irloc
    rb_irex_to_irloc=1+(irex-self%row_llim)
  end function rb_irex_to_irloc

  !> Set the nrow variables
  subroutine rb_set_nrow(self)
    implicit none
    class(rowblock_type), intent(inout) :: self
    if(self%row_ulim<=0)then
       self%nrow=0
    else
       self%nrow=1+self%row_ulim-self%row_llim
    endif
  end subroutine rb_set_nrow

  !> A routine to reset the object
  subroutine rb_reset(self)
    implicit none
    class(rowblock_type), intent(inout) :: self

    !deallocate
    call self%deallocate

    !Could zero out variables but no real need
    self%row_llim=0
    self%row_ulim=0
  end subroutine rb_reset

!------------------------------------------------------------------------

!================
!CELL
!================

  !> Allocate storage space
  subroutine c_allocate(self)
    implicit none
    class(cell_type), intent(inout) :: self
    integer :: ir
    allocate(self%tmp_sum(self%nrow))
    do ir=1,self%nrb
       call self%rb(ir)%allocate
    enddo
  end subroutine c_allocate

  !> Deallocate storage space
  subroutine c_deallocate(self)
    implicit none
    class(cell_type), intent(inout) :: self
    if(allocated(self%tmp_sum)) deallocate(self%tmp_sum)
    if(allocated(self%rb)) deallocate(self%rb)
  end subroutine c_deallocate

  !> Initialise the members of the cell instance
  subroutine c_init(self, ic, it, is, ik, nfield, nrow, nextend, ignore_boundary)
    use theta_grid, only: ntgrid
    implicit none
    class(cell_type), intent(in out) :: self
    integer, intent(in) :: ic, it, is, ik, nfield, nrow, nextend
    logical, intent(in) :: ignore_boundary
    integer :: ifq

    ! Store basic cell properties
    self%nrb = nfield
    self%ic_ind = ic
    self%it_ind = it
    self%ignore_boundary = ignore_boundary
    ! Store parent properties
    self%is_ind = is
    self%ik_ind = ik

    ! Store related/derived properties
    self%row_llim = 1
    self%row_ulim = nrow
    self%nrow = nrow

    self%ncol = 2 * ntgrid
    if (.not. ignore_boundary) self%ncol = self%ncol + 1

    allocate (self%col_llim(nfield), self%col_ulim(nfield))
    do ifq = 1, nfield
       !Column limits per field equation; this matches the formula used in rb_init
       self%col_llim(ifq) = 1 + (ic - 1) * (2 * ntgrid) + (ifq - 1) * nextend
       self%col_ulim(ifq) = self%col_llim(ifq) + self%ncol - 1
    end do
    self%ncol_tot = self%ncol * nfield

    allocate (self%rb(self%nrb))

    !Now make row block objects
    do ifq = 1, nfield
       call self%rb(ifq)%init(ifq, ic, it, is, ik, self%ncol, nextend)
    end do
  end subroutine c_init

  !> Debug printing
  subroutine c_debug_print(self)
    implicit none
    class(cell_type), intent(in) :: self
    write(dlun,'("Cell debug print. Index ic=",I0," nrow=",I0," ncol=",I0)') &
         self%ic_ind, self%nrow, self%ncol
  end subroutine c_debug_print

  !> Do matrix vector multiplication at rowblock level
  subroutine c_mv_mult_rb(self,vect,ifq)
    implicit none
    class(cell_type), intent(inout) :: self
    complex, dimension(self%ncol), intent(in) :: vect
    integer, intent(in) :: ifq
    call self%rb(ifq)%mv_mult(vect)
  end subroutine c_mv_mult_rb

  !> Get the field update for this cell's data.
  !! Note we still need to reduce across the other cells in this
  !! supercell.
  subroutine c_get_field_update(self,fq,fqa,fqp)
    use run_parameters, only: has_phi, has_apar, has_bpar
    implicit none
    class(cell_type), intent(inout) :: self
    complex, dimension(self%ncol), intent(in) :: fq,fqa,fqp
    integer :: ifq

    !If we don't have any data then exit
    if(self%is_empty) return

    !First do the multiplication at rowblock level
    ifq=0
    if(has_phi) then
       ifq=ifq+1
       call self%mv_mult_rb(fq,ifq)
    endif
    if(has_apar) then
       ifq=ifq+1
       call self%mv_mult_rb(fqa,ifq)
    endif
    if(has_bpar) then
       ifq=ifq+1
       call self%mv_mult_rb(fqp,ifq)
    endif

    !Now store at cell level
    self%tmp_sum=0
    do ifq=1,self%nrb
       self%tmp_sum(self%rb(ifq)%row_llim:self%rb(ifq)%row_ulim)=&
            self%tmp_sum(self%rb(ifq)%row_llim:self%rb(ifq)%row_ulim)&
            -self%rb(ifq)%tmp_sum
    enddo
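
    !In effect, for each row this processor owns, the above accumulates
    !  tmp_sum(irow) = - sum over ifq and icol of rb(ifq)%data(icol,ir)*fq_ifq(icol)
    !where ir is the local row index and fq_ifq is whichever of fq/fqa/fqp
    !feeds row block ifq, i.e. minus the matrix-vector product of each stored
    !row block with its field equation data.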

  end subroutine c_get_field_update

  !> A routine to reset the object
  subroutine c_reset(self)
    implicit none
    class(cell_type), intent(inout) :: self
    integer :: ir

    !Call reset on all children
    do ir=1,self%nrb
       call self%rb(ir)%reset
    enddo

    !deallocate
    call self%deallocate

    !Could zero out variables but no real need
  end subroutine c_reset

  !> Set the locality of each object
  subroutine c_set_locality(self, pc)
    use kt_grids, only: aky, akx
    implicit none
    class(cell_type), intent(inout) :: self
    class(pc_type), intent(in) :: pc
    
    !Set our locality. Note here we assume all rb have same size
    if(size(self%rb)>0)then
       self%is_empty=self%rb(1)%nrow==0
       self%is_all_local=self%rb(1)%nrow==self%nrow
    else
       self%is_empty=.true.
       self%is_all_local=.false.
    endif

    !/Also ignore the ky=kx=0 mode
    self%is_empty=(self%is_empty.or.(abs(aky(self%ik_ind))<epsilon(0.0).and.abs(akx(self%it_ind))<epsilon(0.0)))
    self%is_local=pc%is_local(self%it_ind,self%ik_ind)==1

  end subroutine c_set_locality

  !> Test if a given row belongs to the current cell
  function c_has_row(self,irow)
    implicit none
    class(cell_type), intent(in) :: self
    integer, intent(in) :: irow
    logical :: c_has_row
    !NOTE: Here we assume all row blocks in the cell have the same
    !row limits
    if(size(self%rb)>0)then
       c_has_row=(irow>=self%rb(1)%row_llim.and.irow<=self%rb(1)%row_ulim)
    else
       c_has_row=.false.
    endif
  end function c_has_row

!------------------------------------------------------------------------

!================
!SUPERCELL
!================

  !> Allocate storage space
  subroutine sc_allocate(self)
    implicit none
    class(supercell_type), intent(inout) :: self
    integer :: ic
    allocate(self%tmp_sum(self%nrow))
    do ic=1,self%ncell
       call self%cells(ic)%allocate
    enddo
  end subroutine sc_allocate

  !> Deallocate storage space
  subroutine sc_deallocate(self)
    implicit none
    class(supercell_type), intent(inout) :: self
    if(allocated(self%tmp_sum)) deallocate(self%tmp_sum)
    if(allocated(self%cells)) deallocate(self%cells)
  end subroutine sc_deallocate

  !> Initialise the supercell instance by setting and calculating
  !> some basic properties. Does not deal with allocating all
  !> storage etc.
  subroutine sc_init(self, is, itmin, ik, nfield, nbound)
    use theta_grid, only: ntgrid
    use mp, only: mp_request_null
    use dist_fn, only: itright, supercell_labels, n_cells
    implicit none
    class(supercell_type), intent(in out) :: self
    integer, intent(in) :: is, ik, itmin, nfield
    integer, intent(in out) :: nbound
    integer :: it, ic
    logical :: ignore_boundary

    self%is_ind = is
    self%it_leftmost = itmin
    self%ncell = n_cells(itmin, ik)
    self%ik_ind = ik
    self%is_label = supercell_labels(itmin, ik)

    self%nextend = 1 + 2 * ntgrid * self%ncell
    self%nrow = self%nextend * nfield
    self%ncol = self%nrow

    self%collective_request = mp_request_null

    allocate (self%initialised(self%nextend))
    self%initialised = .false.

    ! Now setup cells
    allocate (self%cells(self%ncell))

    it = itmin
    do ic = 1, self%ncell
       !We ignore the boundary for all cells except the rightmost
       ignore_boundary = ic < self%ncell
       if (ignore_boundary) nbound = nbound + 1

       call self%cells(ic)%init( ic, it, is, ik, nfield, &
            self%nrow, self%nextend, ignore_boundary)

       !Update it
       if (ic < self%ncell) it = itright(it, ik)
    end do

  end subroutine sc_init
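
  ! A hedged worked example of the sizes set above (hypothetical grid, not
  ! from any particular input): with ntgrid = 16 and ncell = 3 the extended
  ! domain has nextend = 1 + 2*16*3 = 97 theta points, and with nfield = 2
  ! (say phi and apar) the supercell response matrix is square with
  ! nrow = ncol = 194.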

  !> Debug printing
  subroutine sc_debug_print(self)
    implicit none
    class(supercell_type), intent(in) :: self
    write(dlun,'("Supercell debug print. Index is=",I0," nrow=",I0," ncol=",I0)') &
         self%is_ind, self%nrow, self%ncol
  end subroutine sc_debug_print

  !> Get the field update
  subroutine sc_get_field_update(self,fq,fqa,fqp)
    implicit none
    class(supercell_type), intent(inout) :: self
    complex, dimension(:,:), intent(in) :: fq,fqa,fqp
    integer :: ic, it_ind, ulim

    !Exit if we don't have any of the data locally
    if(.not.self%is_local) return

    if(.not.self%is_empty)then
       ! Now loop over cells and trigger field update. We add OpenMP here
       ! to help distribute the work. Note that cells%get_field_update ends
       ! up calling matmul, so in the situation where matmul is replaced
       ! with a call to equivalent BLAS routines _and_ an OpenMP enabled
       ! BLAS is linked this _could_ lead to an attempt to use nested
       ! parallelism, which might be slow.
       !$OMP PARALLEL DO DEFAULT(none) &
       !$OMP PRIVATE(ic, it_ind, ulim) &
       !$OMP SHARED(self, fq, fqa, fqp) &
       !$OMP SCHEDULE(static)
       do ic=1,self%ncell
          !Get cell properties
          it_ind=self%cells(ic)%it_ind
          ulim=self%cells(ic)%ncol
          !Do update
          call self%cells(ic)%get_field_update(fq(:ulim,it_ind),fqa(:ulim,it_ind),fqp(:ulim,it_ind))
       end do
       !$OMP END PARALLEL DO
    end if

    !<DD>TAGGED
    !/If we're going to reduce why not reduce once on the top level object --> If we do
    !local reduction and store in the top level ky_field_update then we can reduce the fields
    !in order to do everything. This may require a new class of subcommunicators

    !Now we need to reduce across cells
    !AJ If non-blocking collective communications are active then they are called here
    call self%reduce_tmpsum

  end subroutine sc_get_field_update

  !> Reduce the field update across cells to give the final answer
  subroutine sc_reduce_tmpsum(self)
    use mp, only: sum_allreduce_sub, sum_reduce_sub, nb_sum_reduce_sub
    implicit none
    class(supercell_type), intent(in out) :: self
    integer :: ic
    complex, dimension(self%nrow) :: local_tmp_sum

    ! Zero out the local tmp_sum
    local_tmp_sum = 0.0

    !NOTE: Here we rely on the cells having not reduced/
    !gathered their data yet so there are no repeated rows.
    ! Clearly cells%tmp_sum must be the same size as self%tmp_sum
    ! at this point. If cells only updated a subset of their tmp_sum
    ! entries we could perhaps save some work here by only summing over
    ! the set regions. Currently the work involved here does not scale
    ! at all with increasing processor count _beyond_ changes to cell
    ! locality (i.e. when is_empty triggers).
    !First do local sums
    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP PRIVATE(ic) &
    !$OMP SHARED(self) &
    !$OMP REDUCTION(+: local_tmp_sum) &
    !$OMP SCHEDULE(static)
    do ic = 1, self%ncell
       if (self%cells(ic)%is_empty) cycle
       local_tmp_sum = local_tmp_sum + self%cells(ic)%tmp_sum
    end do
    !$OMP END PARALLEL DO

    ! Copy local tmp_sum into super cell variant. Note this is an added cost
    ! due to OpenMP requiring us to use a non-member variable in reduction
    ! clauses. If this file was preprocessed then we could imagine offering
    ! an alternative code-path here for non-OpenMP builds.
    self%tmp_sum = local_tmp_sum

    !<DD>TAGGED: As we currently have to do fm_gather_fields on every time step we only need
    !to reduce to the head of the supercell, which may be an all_local proc in which case we don't
    !want to do any reduction. If it's a pd proc then really we should just do reduce_sub rather
    !than allreduce_sub as we're going to broadcast the result later anyway.
    !if(self%sc_sub_pd%nproc.gt.0) call sum_allreduce_sub(self%tmp_sum,self%sc_sub_pd%id)
    if (.not.(self%is_empty .or. self%is_all_local)) then
       !AJ If non-blocking collectives are enabled and available on the system then call them here
       if (field_local_nonblocking_collectives) then
          call nb_sum_reduce_sub(self%tmp_sum, 0, self%sc_sub_pd, self%collective_request)
       else
          call sum_reduce_sub(self%tmp_sum, 0, self%sc_sub_pd)
       end if
    end if
    !<DD> At this point the head of the supercell has the field update stored in self%tmp_sum
    !all other procs have partial/no data
  end subroutine sc_reduce_tmpsum

  !> Convert the extended domain index to ig, it and ifl
  !!
  !! @note The iex_to routines all need testing!
  subroutine sc_iex_to_dims(self,iex,ig,ic,it,ifl)
    use theta_grid, only: ntgrid
    implicit none
    class(supercell_type), intent(in) :: self
    integer, intent(in) :: iex
    integer, intent(out) :: ig,ic,it,ifl

    !Get field index
    ifl=1+int((iex-1)/self%nextend)
    
    !Get cell and it index
    ic=min(self%ncell,1+int((iex-(ifl-1)*self%nextend-1)/(2*ntgrid)))
    it=self%cells(ic)%it_ind

    !Get ig index
    ig=-ntgrid+iex-(ifl-1)*self%nextend-1-2*ntgrid*(ic-1)
  end subroutine sc_iex_to_dims
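
  ! A hedged worked example of this mapping (hypothetical sizes ntgrid = 16,
  ! ncell = 3, so nextend = 97; note the warning above that these routines
  ! need testing):
  !   iex = 33 -> ifl = 1, ic = 2, ig = -16  (first point of the second cell)
  !   iex = 97 -> ifl = 1, ic = 3, ig = +16  (rightmost boundary point)
  !   iex = 98 -> ifl = 2, ic = 1, ig = -16  (first point of the second field)
  ! The point shared by two adjacent cells is indexed as the ig = -ntgrid
  ! point of the right-hand cell.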
    
  !> Convert the extended domain index to ifl
  subroutine sc_iex_to_ifl(self,iex,ifl)
    implicit none
    class(supercell_type), intent(in) :: self
    integer, intent(in) :: iex
    integer, intent(out) :: ifl
    ifl=1+(iex-1)/self%nextend
  end subroutine sc_iex_to_ifl

  !> Convert the extended domain index to ic
  subroutine sc_iex_to_ic(self,iex,ic)
    use theta_grid, only: ntgrid
    implicit none
    class(supercell_type), intent(in) :: self
    integer, intent(in) :: iex
    integer, intent(out) :: ic
    integer :: ifl, iex_tmp
    !First work out how many fields there are
    call self%iex_to_ifl(iex,ifl)
    !Ensure we're looking between 1 and self%nextend
    iex_tmp=iex-(ifl-1)*self%nextend
    ic=MIN(int((iex_tmp-1)/(2*ntgrid))+1,self%ncell)
  end subroutine sc_iex_to_ic

  !> Convert the extended domain index to it
  subroutine sc_iex_to_it(self,iex,it)
    use theta_grid, only: ntgrid
    implicit none
    class(supercell_type), intent(in) :: self
    integer, intent(in) :: iex
    integer, intent(out) :: it
    integer :: ifl, iex_tmp, ic
    !First work out how many fields there are
    call self%iex_to_ifl(iex,ifl)
    !Ensure we're looking between 1 and self%nextend
    iex_tmp=iex-(ifl-1)*self%nextend
    ic=MIN(int((iex_tmp-1)/(2*ntgrid))+1,self%ncell)
    it=self%cells(ic)%it_ind
  end subroutine sc_iex_to_it

  !> Convert the extended domain index to ig
  subroutine sc_iex_to_ig(self,iex,ig)
    use theta_grid, only: ntgrid
    implicit none
    class(supercell_type), intent(in) :: self
    integer, intent(in) :: iex
    integer, intent(out) :: ig
    integer :: iex_tmp, ifl
    !First get ifl
    call self%iex_to_ifl(iex,ifl)
    !Limit to 1 and self%nextend
    iex_tmp=iex-(ifl-1)*self%nextend
    
    if(iex_tmp==self%nextend) then
       ig=ntgrid
    else
       ig=-ntgrid+mod(iex_tmp-1,2*ntgrid)
    endif
  end subroutine sc_iex_to_ig

  !> Is the passed it a member of this supercell?
  function sc_has_it(self,it)
    implicit none
    class(supercell_type), intent(in) :: self
    integer, intent(in) :: it
    logical :: sc_has_it
    sc_has_it=any(self%cells%it_ind==it)
  end function sc_has_it

  !> A routine to reset the object
  subroutine sc_reset(self)
    use mp, only: free_comm
    implicit none
    class(supercell_type), intent(inout) :: self
    integer :: ic

    !Call reset on all children
    do ic=1,self%ncell
       call self%cells(ic)%reset
    enddo

    ! Free up communicators created for this supercell
    if(self%sc_sub_all%nproc>0 .and. self%sc_sub_all%id/=self%parent_sub%id) then
       call free_comm(self%sc_sub_all)
    end if
    if(self%sc_sub_pd%nproc>0) then
       call free_comm(self%sc_sub_pd)
    end if

    !deallocate
    call self%deallocate

    !Could zero out variables but no real need
  end subroutine sc_reset

  !> Set the locality of each object
  subroutine sc_set_locality(self, pc)
    implicit none
    class(supercell_type), intent(inout) :: self
    class(pc_type), intent(in) :: pc
    integer :: ic

    !Set the locality of children
    do ic=1,self%ncell
       call self%cells(ic)%set_locality(pc)
    enddo
    
    !Set our locality
    self%is_empty=all(self%cells%is_empty)
    self%is_local=any(self%cells%is_local)
    self%is_all_local=all(self%cells%is_all_local)

  end subroutine sc_set_locality

  !> Given an it value get the it of the left connected cell
  function sc_get_left_it(self,it)
    implicit none
    class(supercell_type),intent(in) :: self
    integer,intent(in)::it
    integer :: sc_get_left_it, ic, tmp
    !Default to no connection
    tmp=-1

    !Look for early exit
    if((.not.self%has_it(it)).or.(self%ncell==1)) then
       tmp=-1
    else
       !Now see which cell has this it
       do ic=1,self%ncell
          if(self%cells(ic)%it_ind==it) then
             tmp=ic-1
             exit
          endif
       enddo

       !Now set appropriate value
       if(tmp<=0) then
          tmp=-1
       else
          tmp=self%cells(tmp)%it_ind
       endif

    endif

    !Update result
    sc_get_left_it=tmp
  end function sc_get_left_it

  !> Debug routine to dump the current supercell
  subroutine sc_dump(self,prefix)
    use mp, only: proc0
    use optionals, only: get_option_with_default
    implicit none
    class(supercell_type), intent(inout) :: self
    character(len=*), optional, intent(in) :: prefix
    character(len=80) :: fname
    complex, dimension(:,:), allocatable :: tmp
    integer :: lun

    !Make filename
    write(fname,'(A,"sc_is_",I0,"_ik_",I0,".dat")') &
         trim(get_option_with_default(prefix, '')), self%is_ind, self%ik_ind
    
    !Allocate array and fetch data
    allocate(tmp(self%ncol,self%nrow))

    !Fetch data
    call self%pull_rows_to_arr(tmp)

    !Now we've fetched data we can write
    if(proc0) then
       !Write
       open(newunit=lun,file=fname,status='replace',form="unformatted")
       write(lun) tmp
       close(lun)
    endif

    !Free memory
    deallocate(tmp)
  end subroutine sc_dump

  !> Routine to write the response matrix for this supercell to netcdf file.
  !>
  !> @note Only the head processor actually writes to file. However, in current
  !> use we only call this routine on processors which have this supercell local
  !> and which don't consider it empty. This is usually ok, but for box runs the
  !> ky=kx=0 (super)cell is forced to be considered empty, which means
  !> that, as it stands, this supercell won't be dumped here (although it is for
  !> the fields_implicit approach). This could cause issues for restoring these
  !> response matrices as we end up recalculating these if we can't load any
  !> expected file. As such we have to be careful how we deal with this matrix.
  subroutine sc_dump_to_file(self, suffix)
    use gs2_save, only: gs2_save_response
    use fields_arrays, only: get_specific_response_file_name, time_dump_response
    use gs2_time, only: code_dt
    use job_manage, only: time_message
    implicit none
    !> The instance of the supercell class
    class(supercell_type), intent(in out) :: self
    !> If passed then use as part of file suffix
    character(len=*), optional, intent(in) :: suffix
    complex, dimension(:,:), allocatable :: tmp_arr

    ! Exit early if we're not involved in this supercell
    if(.not.self%is_local) return
    call time_message(.false.,time_dump_response,' Field Dump')

    !Allocate array -- for big problems this might be an issue
    allocate(tmp_arr(self%nrow,self%ncol))

    !First have to pull all row level data
    call self%pull_rows_to_arr(tmp_arr)

    !Now save if on head proc -- could probably only allocate tmp_arr and call
    !pull_rows_to_arr on this head proc as well, at least for now.
    if(self%is_head)then
       call gs2_save_response(tmp_arr, get_specific_response_file_name(self%ik_ind, self%is_ind, code_dt, suffix), code_dt, self%condition_number)
    endif

    deallocate(tmp_arr)
    call time_message(.false.,time_dump_response,' Field Dump')
  end subroutine sc_dump_to_file

  !> Routine to read the response matrix for this supercell from netcdf file.
  subroutine sc_read_from_file(self, could_read, suffix)
    use gs2_save, only: gs2_restore_response
    use mp, only: broadcast_sub, proc0
    use fields_arrays, only: get_specific_response_file_name, time_read_response
    use gs2_time, only: code_dt
    use file_utils, only: error_unit
    use job_manage, only: time_message
    implicit none
    !> The instance of the supercell class
    class(supercell_type), intent(in out) :: self
    !> Flag to indicate if the file was successfully read
    logical, intent(out) :: could_read
    !> If passed then use as part of file suffix
    character(len=*), optional, intent(in) :: suffix

    complex, dimension(:,:), allocatable :: tmp_arr
    character(len = 256) :: file_name
    logical :: file_found
    real :: file_time_step

    ! Initialise reading state.
    could_read = .true.

    ! Construct the expected file name
    file_name = adjustl(get_specific_response_file_name(self%ik_ind, self%is_ind, code_dt, suffix))

    ! First check if the expected file exists, if not then exit
    inquire( file = trim(file_name), exist = file_found)
    if (.not. file_found) then
       if(proc0) write(error_unit(), &
            '("Could not find response file ",A,", reverting to calculation.")') trim(file_name)
       ! Set the flag indicating if we could read the file. Note if this processor
       ! doesn't require this data (i.e. it considers this empty) then we ignore
       ! the presence of the file as we don't care. This is mostly important for
       ! the ky=kx=0 file which may not be present but also isn't required for any
       ! processor.
       could_read = .false. .or. self%is_empty
       return
    end if

    ! We start timing after an initial early exit
    call time_message(.false.,time_read_response,' Field Read')

    !Now allocate array -- for big problems this might be an issue
    allocate(tmp_arr(self%nrow,self%ncol))

    !Now if on head proc read
    if (self%is_head) then
       call gs2_restore_response(tmp_arr, file_name, file_time_step, self%condition_number)
       ! Should perhaps double check that file_time_step == code_dt?
    end if

    !Now we need to broadcast the data to everyone with supercell.
    !NOTE: Really we would want a supercell bound method which will do
    !this so that this module level routine is insensitive to communicators etc.
    !Furthermore we are lazy and use a broadcast_sub to send data to everyone
    !NOTE: The processors which consider this supercell to be empty (i.e. which
    !don't have any associated work) shouldn't need to take part in this communication.
    !If we can promise that only non-empty processors could be the head that should
    !work fine, provided we have a suitable communicator (sc_sub_pd?).
    call broadcast_sub(tmp_arr,self%head_iproc,self%sc_sub_all%id)

    !Now push array to local storage
    call self%push_arr_to_rows(tmp_arr)

    deallocate(tmp_arr)
    call time_message(.false.,time_read_response,' Field Read')
  end subroutine sc_read_from_file

  !> Store the field equations at row level
  subroutine sc_store_fq(self, fq, fqa, fqp, ifl_in, it_in, ig_in)
    use theta_grid, only: ntgrid
    use run_parameters, only: has_phi, has_apar, has_bpar
    implicit none
    class(supercell_type), intent(inout) :: self
    complex, dimension(:, :), intent(in) :: fq, fqa, fqp
    integer, intent(in) :: ifl_in, it_in, ig_in
    integer :: ic, ic_in, irow, ifq, ulim, ir

    !If we don't have anything in this supercell then exit
    if(self%is_empty) return

    !Find out which cell has our it
    ic_in=0
    do ic=1,self%ncell
       if(self%cells(ic)%it_ind==it_in) then
          ic_in=ic
          exit
       endif
    enddo

    !Work out the row we want to put data in
    irow=(ig_in+ntgrid+1)+(ic_in-1)*(2*ntgrid)+(ifl_in-1)*self%nextend
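
    !For illustration (hypothetical sizes ntgrid = 16, ncell = 3, nextend = 97,
    !matching the example given for sc_iex_to_dims): ig_in = 0, ic_in = 2 and
    !ifl_in = 2 gives irow = (0+16+1) + 1*32 + 1*97 = 146, which sc_iex_to_dims
    !maps back to ifl = 2, ic = 2, ig = 0 as expected.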

    ! Loop over cells
    do ic=1,self%ncell
       !If we don't have this cell cycle
       if(self%cells(ic)%is_empty) then
          cycle
       endif

       !Check if we have this row, if not cycle
       if(.not.self%cells(ic)%has_row(irow)) then
          cycle
       endif

       !Find upper limit of column
       ulim=self%cells(ic)%ncol
       
       ifq=0
       if(has_phi)then
          !Increment counter
          ifq=ifq+1

          !Convert extended irow to local
          ir=self%cells(ic)%rb(ifq)%irex_to_ir(irow)

          !Store data
          self%cells(ic)%rb(ifq)%data(:,ir) = fq(:ulim, self%cells(ic)%it_ind)
       end if

       if(has_apar)then
          !Increment counter
          ifq=ifq+1

          !Convert extended irow to local
          ir=self%cells(ic)%rb(ifq)%irex_to_ir(irow)

          !Store data
          self%cells(ic)%rb(ifq)%data(:,ir) = fqa(:ulim, self%cells(ic)%it_ind)
       end if

       if(has_bpar)then
          !Increment counter
          ifq=ifq+1

          !Convert extended irow to local
          ir=self%cells(ic)%rb(ifq)%irex_to_ir(irow)

          !Store data
          self%cells(ic)%rb(ifq)%data(:,ir) = fqp(:ulim, self%cells(ic)%it_ind)
       end if
    end do

  end subroutine sc_store_fq

  !> Prepare the field matrix for calculating field updates
  subroutine sc_prepare(self,prepare_type,pc)
    use mp, only: mp_abort
    implicit none
    class(supercell_type), intent(inout) :: self
    class(pc_type), intent(in) :: pc
    integer, intent(in) :: prepare_type
    character (len=40) :: errmesg

    !Exit early if we're empty
    if(self%is_empty) return
    if(.not.self%is_local) return

    !Need to decide what method we're using to prepare the
    !field matrix
    select case(prepare_type)
    case(0) !Invert
       call self%invert(pc)
    case default
       write(errmesg,'("ERROR: Invalid prepare_type : ",I0)') prepare_type
       call mp_abort(trim(errmesg))
    end select

    ! Now dump the matrix
    if (dump_response) call self%dump_to_file
  end subroutine sc_prepare

  !> A routine to invert the field matrix
  subroutine sc_invert(self, pc)
    use job_manage, only: time_message
    use fields_arrays, only: time_field_invert, time_field_invert_mpi
    use mp, only: get_mp_times
    implicit none
    class(supercell_type), intent(inout) :: self
    class(pc_type), intent(in) :: pc
    real :: mp_total, mp_total_after

    if(self%is_empty) return
    if(.not.self%is_local) return
    call time_message(.false.,time_field_invert,' Field Matrix Invert')
    call get_mp_times(total_time = mp_total)
    
    !If we have all of the supercell local then use an explicitly local
    !matrix inversion
    !Note could have a variable which forces local inversion
    if(self%is_all_local.or.pc%force_local_invert) then
!       call self%dump(prefix="orig_")
       call self%invert_local
!       call self%dump(prefix="inv_")
    else
       call self%invert_mpi
    endif

    call time_message(.false.,time_field_invert,' Field Matrix Invert')
    call get_mp_times(total_time = mp_total_after)
    time_field_invert_mpi = time_field_invert_mpi + (mp_total_after - mp_total)
  end subroutine sc_invert

  !> A routine to invert the field matrix locally
  !>
  !> @note Currently every processor which is responsible for a chunk
  !> of the response matrix at the solve stage will do this inversion.
  !> This is obviously duplicated work but allows us to skip any
  !> communication in the push_arr_to_rows method. If only one
  !> processor did the inversion then we would need to communicate the
  !> result back. We are unlikely to want to do this in a non-blocking
  !> manner as it would require that we hold onto the tmp_arr for each
  !> response matrix, only freeing this memory after the non-blocking
  !> communications have completed. As these matrices are large we
  !> don't want them hanging around longer than required. In the typical
  !> use case where we have large minnrow, we would expect most matrices
  !> to belong to just one or a few processors so this point may not be
  !> important in most cases.
  subroutine sc_invert_local(self)
    use mat_inv, only: invert_serial
    use mp, only: broadcast_sub
    implicit none
    class(supercell_type), intent(inout) :: self
    complex, dimension(:,:), allocatable :: tmp_arr

    !Exit early if possible
    if(self%is_empty) return
    if(.not.self%is_local) return

    !Allocate temporary array
    allocate(tmp_arr(self%ncol,self%nrow))

    !First have to pull all row level data
    call self%pull_rows_to_arr(tmp_arr)

    !It may be better to hook into the GK_VERBOSITY level system
    !rather than relying on the debug flag.
    if (self%is_head) then
       self%condition_number = l2norm(tmp_arr)
    end if

    !Now invert in place
    if (self%is_head) then
       call invert_serial(tmp_arr, self%nrow)
       tmp_arr=transpose(tmp_arr)
    end if

    !It would be quite nice if we could do the following in a non-blocking manner
    !but this would require us to either keep the potentially large tmp_arr allocated
    !for all supercells or for us to be able to do this "in-place" (i.e. we use something
    !like non-blocking scatterv rather than broadcast).
    if (.not. self%is_all_local) then
       if (self%sc_sub_pd%nproc > 0) call broadcast_sub(tmp_arr, &
            self%head_iproc_pd, self%sc_sub_pd%id)
    end if

    if (self%is_head) then
       self%condition_number = self%condition_number * l2norm(tmp_arr)
    end if
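
    !Note self%condition_number is now the product of the norms (as computed
    !by l2norm) of the matrix and of its inverse, i.e. an estimate of
    !cond(A) = ||A|| * ||A^-1||. Values above condition_number_warning_limit
    !suggest the inversion may be unreliable at the working precision.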

    !Now push back to row level
    call self%push_arr_to_rows(tmp_arr)

    !Free memory
    deallocate(tmp_arr)

  end subroutine sc_invert_local

  !> A routine to invert the field matrix using mpi
  subroutine sc_invert_mpi(self)
    use mp, only: mp_abort
    implicit none
    class(supercell_type), intent(inout) :: self
    self%is_empty=.true. !Until implemented, treat everything as empty.
    call mp_abort("ERROR: Invert with mpi not yet implemented.")
  end subroutine sc_invert_mpi

  !> A routine to collect all the row level data and store in passed array
  !! Gather the row blocks up for this cell to fill an array
  subroutine sc_pull_rows_to_arr(self,arr)
    use mp, only: sum_allreduce_sub
    use array_utils, only: zero_array
    implicit none
    class(supercell_type), intent(in) :: self
    complex, dimension(:,:), intent(out) :: arr !Note: Dimensions should be self%ncol, self%nrow
    integer :: ic, rl,ru, cl,cu, irb

    ! Zero out arr |--> Not needed if gathering
    ! This may look like it is only needed for the sum_allreduce_sub branch below,
    ! however without this line we would end up not setting arr in cases where we
    ! think of this supercell as empty. This is usually fine, but for the ky=kx=0
    ! mode all processors consider the supercell to be empty, but we still need to
    ! initialise this array in some cases (e.g. when dumping to file). We could
    ! attempt to only do this zeroing in this special case to save some time, but
    ! we don't expect to be calling this routine much so opt for simpler approach.
    call zero_array(arr)

    !Now see if we want to exit
    if (self%is_empty) return
    if (.not.self%is_local) return

    !Place local piece in arr
    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP PRIVATE(ic, irb, rl, ru, cl, cu) &
    !$OMP SHARED(self, arr) &
    !$OMP SCHEDULE(static)
    do ic = 1, self%ncell
       if (self%cells(ic)%is_empty) cycle
       do irb = 1, self%cells(ic)%nrb
          rl=self%cells(ic)%rb(irb)%row_llim
          ru=self%cells(ic)%rb(irb)%row_ulim
          cl=self%cells(ic)%rb(irb)%col_llim
          cu=self%cells(ic)%rb(irb)%col_ulim
          arr(cl:cu,rl:ru)=self%cells(ic)%rb(irb)%data
       end do
    end do
    !$OMP END PARALLEL DO

    !If we're not all local then we also need comms
    if (.not. self%is_all_local) then
       !Now use MPI to get the rest of the data from other procs
       !<DD>FOR NOW USE ALL_REDUCE AS EASIER, BUT SHOULD BE ABLE
       !TO DO THIS WITH ALL_GATHERV WHICH SHOULD BE FASTER
       ! We only actually need the result on the head of the supercell
       ! as this is the only process which operates on the result.
       ! We could therefore replace allreduce with reduce.
       if (self%sc_sub_pd%nproc > 0) call sum_allreduce_sub(arr, self%sc_sub_pd%id)
    end if

  end subroutine sc_pull_rows_to_arr

  !> A routine to distribute an array to appropriate row blocks
  subroutine sc_push_arr_to_rows(self,arr)
    implicit none
    class(supercell_type), intent(inout) :: self
    complex, dimension(self%nrow,self%ncol), intent(in) :: arr
    integer :: ic, ir
    integer :: rl,ru,cl,cu
    !If empty don't have anywhere to store data so exit
    if(self%is_empty) return

    !Store all local sections
    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP PRIVATE(ic, ir, rl, ru, cl, cu) &
    !$OMP SHARED(self, arr) &
    !$OMP SCHEDULE(static)
    do ic = 1, self%ncell
       if (self%cells(ic)%is_empty) cycle

       do ir = 1, self%cells(ic)%nrb
          !Get row/col limits
          rl = self%cells(ic)%rb(ir)%row_llim
          ru = self%cells(ic)%rb(ir)%row_ulim
          cl = self%cells(ic)%rb(ir)%col_llim
          cu = self%cells(ic)%rb(ir)%col_ulim
          
          !Store data
          self%cells(ic)%rb(ir)%data = arr(cl:cu, rl:ru)
       end do
    end do
    !$OMP END PARALLEL DO
  end subroutine sc_push_arr_to_rows

  !> Create primary (top level) sub communicators
  subroutine sc_make_subcom_1(self)
    use mp, only: comm_type, split, free_comm, mp_undefined, sum_allreduce_sub
    implicit none
    class(supercell_type), intent(inout) :: self
    integer :: ic, colour
    type(comm_type) :: tmp

    !/First make a subcommunicator involving any procs
    !which can see this supercell
    colour = 0
    if(self%is_local) colour = 1
    !Here we check if the new subcom would have the same number of members as
    !the parent; if so we can carry on using the parent instead
    call sum_allreduce_sub(colour, self%parent_sub%id)

    if (colour == self%parent_sub%nproc) then
       self%sc_sub_all=self%parent_sub
    else
       colour = mp_undefined
       if(self%is_local) colour = 1

       call split(colour,tmp,self%parent_sub%id)
       !Note we only store the subcom for those procs which will use it
       !this ensures an error will occur if we try to use the subcom in
       !the wrong place
       if(self%is_local)then
          self%sc_sub_all=tmp
       else
          !If we can't see any of this object then
          !return
          return
       endif
    endif

    !Now make the supercell sub communicators
    do ic=1,self%ncell
       !Set parent subcom
       self%cells(ic)%parent_sub=self%sc_sub_all
    enddo

  end subroutine sc_make_subcom_1

  !> Create the secondary (intraobject) subcommunicators
  subroutine sc_make_subcom_2(self)
    use mp, only: comm_type, split, sum_allreduce_sub, proc0
    use mp, only: mp_undefined, min_allreduce_sub, nproc, iproc
    implicit none
    class(supercell_type), intent(inout) :: self
    integer :: colour, cc
    integer, dimension(3) :: colours
    type(comm_type) :: tmp
    logical :: some_empty, some_pd, some_all, is_partial_data
    !Exit if we don't have this supercell
    if (.not. self%is_local) return

    ! Set local flags
    colours = 0
    is_partial_data = .not. (self%is_empty .or. self%is_all_local)
    if (self%is_empty) colours(1) = 1
    if (is_partial_data) colours(2) = 1
    if (self%is_all_local) colours(3) = 1

    ! Get global flags
    call sum_allreduce_sub(colours, self%sc_sub_all%id)

    !/Work out if we have any empty procs
    some_empty = colours(1) /= 0
    !/Work out if we have any partial data procs
    some_pd = colours(2) /= 0
    !/Work out if we have any all local procs
    some_all = colours(3) /= 0

    !/Now make a subcommunicator involving all procs
    !which have part of the supercell data range (but not all of it)
    if (some_pd) then
       colour = mp_undefined
       if (is_partial_data) colour = 1
       call split(colour, tmp, self%sc_sub_all%id)
       if (colour /= mp_undefined) then
          !/Store communicator
          self%sc_sub_pd = tmp
       end if
    end if

    !Decide if we're the head proc
    !Note: We also decide which proc is in charge of filling the empty
    !procs with data. By allowing this to be different from the head proc
    !we have a better chance of being able to overlap comms and computations
    !Note: We try to make proc0 the head proc if it is a member of the supercell
    !which has data as it means we don't need to send this supercell to anyone.
    if (some_all) then
       ! Here we try to determine which processor will be the head.
       ! If proc0 is included and has is_all_local we prefer that. If not we can
       ! identify the head as the first proc in sc_sub_all which has is_all_local.
       ! 1. We can determine if proc0 is present and has is_all_local
       !    by simply setting cc = (proc0 .and. self%is_all_local) ? 1 : 0
       !    and then reducing that on sc_sub_all.
       ! 2. If proc0 is not present then the head will just be the proc with
       !    the lowest rank in sc_sub_all with is_all_local true. Therefore we
       !    instead take min_allreduce of cc = is_all_local ? sc_sub_all%iproc : nproc + 1
       !    and then set self%is_head = sc_sub_all%iproc == cc
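        ! For illustration, suppose sc_sub_all has four members (ranks 0-3),
        ! proc0 is not among them, and only ranks 1 and 3 have is_all_local:
        ! cc takes the values (nproc+1, 1, nproc+1, 3), the min reduction
        ! returns 1, and rank 1 of sc_sub_all becomes the head.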
       ! Work out if proc0 is a member of sc_sub_all _and_ has is_all_local
       cc = 0
       if (proc0 .and. self%is_all_local) cc = 1
       call sum_allreduce_sub(cc, self%sc_sub_all%id)
       if (cc == 1) then
          ! If we have proc0 then that is head
          self%is_head = proc0
       else
          if (self%is_all_local) then
             cc = self%sc_sub_all%iproc
          else
             cc = nproc + 1
          end if
           ! If we don't have proc0 then the head is the processor
           ! with the lowest rank in sc_sub_all which has is_all_local
           call min_allreduce_sub(cc, self%sc_sub_all%id)
          self%is_head = cc == self%sc_sub_all%iproc
       end if
    else
       if (self%sc_sub_pd%nproc > 0)then
          cc = 0
          if (proc0) cc = 1
          call sum_allreduce_sub(cc, self%sc_sub_pd%id)

          !Decide if we're the head, prefer proc0 to be head
          !if it's a member of our group
          if (cc /= 0) then
             self%is_head = proc0
          else
             self%is_head = self%sc_sub_pd%proc0
          end if
       end if
    end if

    !Store the head_iproc values
    colours = 0
    if (self%is_head) then
       colours(1) = self%sc_sub_all%iproc
       colours(2) = self%sc_sub_pd%iproc
       colours(3) = iproc
    end if

    call sum_allreduce_sub(colours, self%sc_sub_all%id)
    self%head_iproc = colours(1)
    self%head_iproc_pd = colours(2)
    self%head_iproc_world = colours(3)
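    !Only the head proc set non-trivial values above (everyone else
    !contributed zeroes), so after the reduction all members of sc_sub_all
    !hold the head's location in each of these communicators.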
  end subroutine sc_make_subcom_2

!------------------------------------------------------------------------

!================
!KYBLOCK
!================

  !> Allocate storage space
  subroutine ky_allocate(self)
    implicit none
    class(ky_type), intent(inout) :: self
    integer :: is
    do is=1,self%nsupercell
       call self%supercells(is)%allocate
    enddo
  end subroutine ky_allocate

  !> Deallocate storage space
  subroutine ky_deallocate(self)
    implicit none
    class(ky_type), intent(inout) :: self
    if(allocated(self%supercells)) deallocate(self%supercells)
  end subroutine ky_deallocate

  !> Setup properties of ky_type instance
  subroutine ky_init(self, ik, itmins, nsupercell, nfield, nbound)
    implicit none
    class(ky_type), intent(in out) :: self
    integer, intent(in) :: ik, nsupercell, nfield
    integer, dimension(:), intent(in) :: itmins
    integer, intent(in out) :: nbound
    integer :: is

    self%ik_ind = ik
    self%nsupercell = nsupercell

    allocate (self%supercells(nsupercell))

    do is = 1, nsupercell
       call self%supercells(is)%init(is, itmins(is), ik, nfield, nbound)
    end do

  end subroutine ky_init

  !> Debug printing
  subroutine ky_debug_print(self)
    implicit none
    class(ky_type), intent(in) :: self
    write(dlun,'("Ky block debug print. Index ik=",I0," nsupercell=",I0)') &
         self%ik_ind, self%nsupercell
  end subroutine ky_debug_print

  !> Get the field update for this ik
  subroutine ky_get_field_update(self,fq,fqa,fqp)
    implicit none
    class(ky_type), intent(inout) :: self
    complex, dimension(:,:), intent(in) :: fq,fqa,fqp
    integer :: is

    !First trigger calculation of field update.
    !Note: Empty supercells don't do anything
    do is=1,self%nsupercell
       !/Calculate the field update
       call self%supercells(is)%get_field_update(fq,fqa,fqp)
    enddo
  end subroutine ky_get_field_update

  !> Wait on the nonblocking collective communications if they have been used.
  subroutine ky_wait_for_nonblocking_collectives(self)
    use mp, only : wait
    implicit none
    class(ky_type), intent(inout) :: self
    integer :: is

    !AJ This could be done more efficiently if the requests 
    !AJ are put into a single array and waitany is called on that
    !AJ array.
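    ! A sketch of that alternative (illustrative only; "waitany" here is a
    ! hypothetical wrapper around MPI_Waitany which mp may not provide):
    !   requests = self%supercells(:)%collective_request
    !   do is = 1, self%nsupercell
    !      call waitany(requests, idx)
    !      ! ...handle the completed supercell idx...
    !   end do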
    do is=1,self%nsupercell
       call wait(self%supercells(is)%collective_request)
    enddo
  end subroutine ky_wait_for_nonblocking_collectives

  !> Store the field equation at row level
  subroutine ky_store_fq(self,fq,fqa,fqp,ifl_in,it_in,ig_in)
    implicit none
    class(ky_type), intent(in out) :: self
    complex, dimension(:, :), intent(in) :: fq, fqa, fqp
    integer, intent(in) :: ifl_in, it_in, ig_in
    integer :: is

    !If we don't have anything in this ky block then exit
    if (self%is_empty) return

    !Delegate work to supercell
    do is = 1,self%nsupercell
       if (.not.self%supercells(is)%has_it(it_in)) cycle
       call self%supercells(is)%store_fq(fq, fqa, fqp, ifl_in, it_in, ig_in)
    end do
  end subroutine ky_store_fq

  !> Given it say what the supercell id is
  function ky_is_from_it(self,it)
    implicit none
    class(ky_type), intent(inout) :: self
    integer, intent(in) :: it
    integer :: ky_is_from_it
    integer :: is
    ky_is_from_it=-1
    do is=1,self%nsupercell
       if(self%supercells(is)%has_it(it))then
          ky_is_from_it=is
          exit
       endif
    enddo
  end function ky_is_from_it

  !> A routine to reset the object
  subroutine ky_reset(self)
    use mp, only: free_comm, mp_comm_null
    implicit none
    class(ky_type), intent(inout) :: self
    integer :: is

    !Call reset on all children
    do is=1,self%nsupercell
       call self%supercells(is)%reset
    enddo

    !AJ Free communicators associated with this ky block
    if(self%ky_sub_all%id /= self%parent_sub%id .and. self%ky_sub_all%nproc > 0 .and. self%ky_sub_all%id /= mp_comm_null) then
       call free_comm(self%ky_sub_all)
    end if
    !deallocate
    call self%deallocate

    !Could zero out variables but no real need
  end subroutine ky_reset

  !> Set the locality of each object
  subroutine ky_set_locality(self, pc)
    implicit none
    class(ky_type), intent(inout) :: self
    class(pc_type), intent(in) :: pc
    integer :: is

    !Set the locality of children
    do is=1,self%nsupercell
       call self%supercells(is)%set_locality(pc)
    enddo
    
    !Set our locality
    self%is_empty=all(self%supercells%is_empty)
    self%is_local=any(self%supercells%is_local)
    self%is_all_local=all(self%supercells%is_all_local)

  end subroutine ky_set_locality

  !> Prepare the field matrix for calculating field updates
  subroutine ky_prepare(self,prepare_type,pc)
    implicit none
    class(ky_type), intent(inout) :: self
    integer, intent(in) :: prepare_type
    class(pc_type), intent(in) :: pc
    integer :: is

    !Exit early if we're empty
    if(self%is_empty) return
    if(.not.self%is_local) return

    !Tell each supercell to prepare
    ! This might be a good place to add OpenMP pragmas. However, doing so
    ! could increase peak memory consumption, as multiple matrices would then
    ! have to be held in memory at once, potentially negating some of the
    ! memory-per-process benefit that motivates using OpenMP in the first
    ! place. More testing would be required to explore this further (see the
    ! sketch below).
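    ! A minimal sketch of such a pragma (illustrative only, deliberately left
    ! commented out here):
    !   !$OMP PARALLEL DO DEFAULT(none) PRIVATE(is) &
    !   !$OMP SHARED(self, pc, prepare_type) SCHEDULE(dynamic)
    ! with dynamic scheduling chosen because the cost per supercell can vary
    ! widely with the supercell length.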
    do is=1,self%nsupercell
       call self%supercells(is)%prepare(prepare_type, pc)
    enddo
  end subroutine ky_prepare

  !> Create the primary subcommunicators
  subroutine ky_make_subcom_1(self)
    use mp, only: comm_type, split, free_comm, mp_undefined, sum_allreduce_sub
    implicit none
    class(ky_type), intent(inout) :: self
    integer :: is, colour
    type(comm_type) :: tmp
    !/First make a subcommunicator involving any procs
    !which can see this ky block
    colour = 0
    if(self%is_local) colour = 1
    call sum_allreduce_sub(colour, self%parent_sub%id)

    if (colour == self%parent_sub%nproc) then
       self%ky_sub_all=self%parent_sub
    else
       colour = mp_undefined
       if(self%is_local) colour = 1
       call split(colour,tmp,self%parent_sub%id)

       !Note we only store the subcom for those procs which will use it
       !this ensures an error will occur if we try to use the subcom in
       !the wrong place
       if(self%is_local)then
          self%ky_sub_all=tmp
       else
          return
       endif
    endif

    !Now make the supercell sub communicators
    do is=1,self%nsupercell
       !Set parent subcom
       self%supercells(is)%parent_sub=self%ky_sub_all

       !Now make child subcom
       call self%supercells(is)%make_subcom_1
    enddo
  end subroutine ky_make_subcom_1

  !> Create the secondary subcommunicators
  subroutine ky_make_subcom_2(self)
    implicit none
    class(ky_type), intent(inout) :: self
    integer :: is

    !Exit if we don't have this kyb
    if(.not.self%is_local)return

    !Now make the supercell sub communicators
    do is=1,self%nsupercell
       !Now make child subcom
       call self%supercells(is)%make_subcom_2
    enddo
  end subroutine ky_make_subcom_2

!------------------------------------------------------------------------

  !AJ Autotune MinNRow size to obtain optimal row size
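  ! The tuning sweep below doubles minNRow each iteration, testing the
  ! candidate sizes 2, 4, 8, ..., 1024. For each candidate the response
  ! matrix is rebuilt and one field solve is timed; the worst time over all
  ! processes is used as the cost, with the mean time as a tie-breaker, and
  ! the best candidate found on proc0 is broadcast to (and adopted by) all
  ! processes.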
  subroutine tuneMinNRow()
    use job_manage, only: timer_local
    use mp, only: nproc, max_reduce, min_reduce, sum_reduce, broadcast, proc0
    use fields_arrays, only: phinew, aparnew, bparnew
    use gs2_layouts, only: g_lo
    use warning_helpers, only: exactly_equal
    implicit none
    integer :: i
    integer :: current_best_size
    real :: start_time, end_time, current_best, temp_best
    real :: min_best, max_best, av_best, current_av_best
    logical :: do_update, do_gather
    do_gather = .true.
    do_update=do_smart_update
    if(g_lo%x_local.and.g_lo%y_local) do_update=.false.
    minNRow = 2
    current_best = -1
    do i = 1,10         
       call init_fields_matrixlocal(fieldmat, pc, tuning_in=.true.)
       start_time = timer_local()
       call getfield_local(phinew,aparnew,bparnew,do_gather,do_update)
       end_time = timer_local()
       call finish_fields_local()
       temp_best = end_time - start_time
       max_best = temp_best
       call max_reduce(max_best, 0)
       min_best = temp_best
       call min_reduce(min_best, 0)
       av_best = temp_best
       call sum_reduce(av_best, 0)
       if(proc0) then
          av_best = av_best/nproc
          if(current_best < 0) then
             current_best = max_best
             current_av_best = av_best
             current_best_size = minNRow
          else
             if(max_best < current_best) then
                current_best = max_best
                current_av_best = av_best
                current_best_size = minNRow
             else if(exactly_equal(max_best, current_best)) then
                if(av_best < current_av_best) then
                   current_best = max_best
                   current_av_best = av_best
                   current_best_size = minNRow
                end if
             end if
          end if
       end if
       minNRow = minNRow*2
    end do
    call broadcast(current_best_size)
    minNRow = current_best_size
    if(proc0) then
       write(*,*) 'Chosen rowsize for matrix-vector operations in fields update',minNRow,current_best
    end if
  end subroutine tuneMinNRow


!================
!FIELDMAT
!================

  !> Allocate storage space
  subroutine fm_allocate(self)
    implicit none
    class(fieldmat_type), intent(inout) :: self
    integer :: ik
    allocate(self%heads(self%ntheta0, self%naky))
    if(.not.self%is_local) return
    do ik=1,self%naky
       call self%kyb(ik)%allocate
    enddo
  end subroutine fm_allocate

  !> Deallocate storage space
  subroutine fm_deallocate(self)
    implicit none
    class(fieldmat_type), intent(inout) :: self
    if(allocated(self%kyb)) deallocate(self%kyb)
    if(allocated(self%heads)) deallocate(self%heads)
  end subroutine fm_deallocate

  !> Debug printing
  subroutine fm_debug_print(self)
    implicit none
    class(fieldmat_type), intent(in) :: self
    write(dlun,'("Field debug print. naky=",I0)') &
         self%naky
  end subroutine fm_debug_print

  !> Just work out the locality (this also sets is_empty etc., though that is not its primary purpose)
  subroutine fm_set_is_local(self, pc)
    implicit none
    class(fieldmat_type), intent(inout) :: self
    class(pc_type), intent(in) :: pc
    integer :: ik

    !Set the locality of children
    do ik=1,self%naky
       call self%kyb(ik)%set_locality(pc)
    enddo
    
    !Set our locality
    self%is_empty=all(self%kyb%is_empty)
    self%is_local=any(self%kyb%is_local)
    self%is_all_local=all(self%kyb%is_local)
  end subroutine fm_set_is_local

  !> Initialise the field objects
  subroutine fm_init(self)
    use kt_grids, only: naky, ntheta0
    use dist_fn, only: get_leftmost_it, n_supercells
    use run_parameters, only: has_phi, has_apar, has_bpar
    use theta_grid, only: ntgrid
    implicit none
    class(fieldmat_type), intent(inout) :: self
    integer :: it, cur_id, it_start, is_count, ik
    integer, dimension(:), allocatable :: tmp_supercell_ids
    integer, dimension(:,:), allocatable :: supercell_ids
    integer, dimension(:,:), allocatable :: is_to_itmin

    !Count the fields
    self%nfield = count([has_phi, has_apar, has_bpar])

    !Allocate temp arrays
    allocate(supercell_ids(ntheta0, naky))
    ! Here the first dimension is allocated as ntheta0
    ! but for each ky we only use the first nsupercell(ik)
    ! entries.
    allocate(is_to_itmin(ntheta0, naky))

    !Initialise arrays
    is_to_itmin = 0
    supercell_ids = -1

    !Find the leftmost it for each supercell and use this to label the
    !supercell.  Note that this is different from the supercell_label
    !from dist_fn.
    do ik = 1, naky
       do it = 1, ntheta0
          supercell_ids(it, ik) = get_leftmost_it(it, ik)
       end do
    end do

    !Now work out how many supercells there are for each
    !ik and what their properties are.
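    !For illustration, suppose ntheta0=5 and, for some ik, get_leftmost_it
    !returned (1,2,1,4,2): the scan below would then identify three distinct
    !supercells and set is_to_itmin(1:3,ik) = (1,2,4).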
    do ik = 1, naky
       !Store the leftmost it values for current ik
       tmp_supercell_ids = supercell_ids(:, ik)
       
       !Now loop over all it values and count how many
       !have which it as the leftmost one.
       it_start = 0
       is_count = 0
       do while (any(tmp_supercell_ids >= 0))
          !Increase the it value
          it_start = it_start + 1

          !Get the leftmost it for supercell with it_start in it
          cur_id = tmp_supercell_ids(it_start)

          !If we've seen this supercell then cycle
          if (cur_id == -1) cycle

          !Increase supercell counter
          is_count = is_count + 1

          !Store the leftmost it value
          is_to_itmin(is_count, ik) = cur_id

          !Set all matching entries to -1 so we know to skip
          !this supercell again
          do it = it_start, ntheta0
             if (tmp_supercell_ids(it) == cur_id) then
                tmp_supercell_ids(it) = -1
             end if
          end do
       end do
    end do
    
    !Free memory
    deallocate(tmp_supercell_ids)

    !Now we want to allocate the basic field structure
    !Store the basic properties
    self%naky = naky
    self%ntheta0 = ntheta0
    self%npts = naky * ntheta0 * (2 * ntgrid + 1)
    self%nbound = 0 !Calculated whilst we create objects
    allocate(self%kyb(naky))
    
    !/Make the kyb objects
    do ik = 1, naky
       call self%kyb(ik)%init(ik, is_to_itmin(:, ik), n_supercells(ik), self%nfield, self%nbound)
    end do

    !Free memory
    deallocate(supercell_ids, is_to_itmin)
  end subroutine fm_init

  !> Find the response of g to delta-fn field perturbations 
  !! and store at the row level
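  !! For each field in use this repeatedly calls init_next_field_points,
  !! which sets (at most) one delta-function per supercell per call, advances
  !! the distribution function and stores the resulting field equations at
  !! the row level, until every point of that field has been handled.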
  subroutine fm_populate(self, pc)
    use run_parameters, only: has_phi, has_apar, has_bpar
    use fields_arrays, only: phi, apar, bpar, phinew, aparnew, bparnew
    use dist_fn_arrays, only: g
    use kt_grids, only: kwork_filter
    use array_utils, only: zero_array
    implicit none
    class(fieldmat_type), intent(inout) :: self
    class(pc_type), intent(in) :: pc
    integer :: ifl, pts_remain, ik, is

    if(self%no_populate) return

    !First initialise everything to 0
    call zero_array(g)
    call zero_array(phi) ; call zero_array(phinew)
    call zero_array(apar) ; call zero_array(aparnew)
    call zero_array(bpar) ; call zero_array(bparnew)

    !Initialise field counter
    ifl=0

    !Do phi
    if(has_phi) then
       !Set the number of points left
       pts_remain=self%npts-self%nbound

       !Increment the field counter
       ifl=ifl+1

       !Reset the filter array
       kwork_filter=.false.

       !Reset the init state arrays
       self%kyb%initdone=.false.
       do ik=1,self%naky
          self%kyb(ik)%supercells%initdone=.false.
          do is=1,self%kyb(ik)%nsupercell
             self%kyb(ik)%supercells(is)%initialised=.false.
          enddo
       enddo

       !Now loop over all points and initialise
       do while(pts_remain>0)
          call self%init_next_field_points(phinew,pts_remain,kwork_filter,ifl,pc)
       enddo
    endif

    !Do apar
    if(has_apar) then
       !Set the number of points left
       pts_remain=self%npts-self%nbound

       !Increment the field counter
       ifl=ifl+1

       !Reset the filter array
       kwork_filter=.false.

       !Reset the init state arrays
       self%kyb%initdone=.false.
       do ik=1,self%naky
          self%kyb(ik)%supercells%initdone=.false.
          do is=1,self%kyb(ik)%nsupercell
             self%kyb(ik)%supercells(is)%initialised=.false.
          enddo
       enddo

       !Now loop over all points and initialise
       do while(pts_remain>0)
          call self%init_next_field_points(aparnew,pts_remain,kwork_filter,ifl,pc)
       enddo
    endif

    !Do bpar
    if(has_bpar) then
       !Set the number of points left
       pts_remain=self%npts-self%nbound

       !Increment the field counter
       ifl=ifl+1

       !Reset the filter array
       kwork_filter=.false.

       !Reset the init state arrays
       self%kyb%initdone=.false.
       do ik=1,self%naky
          self%kyb(ik)%supercells%initdone=.false.
          do is=1,self%kyb(ik)%nsupercell
             self%kyb(ik)%supercells(is)%initialised=.false.
          enddo
       enddo

       !Now loop over all points and initialise
       do while(pts_remain>0)
          call self%init_next_field_points(bparnew,pts_remain,kwork_filter,ifl,pc)
       enddo
    endif
    
    !Reset the filter array
    kwork_filter=.false.
  end subroutine fm_populate

  !> Initialise the next set of delta functions, find the response
  !! and store in the appropriate rows.
  subroutine fm_init_next_field_points(self,field,pts_remain,kwork_filter,ifl,pc)
    use kt_grids, only: naky, ntheta0
    use theta_grid, only: ntgrid
    use dist_fn, only: getfieldeq, getfieldeq_nogath, timeadv
    use dist_fn_arrays, only: fieldeq, fieldeqa, fieldeqp
    use fields_arrays, only: phi, apar, bpar, phinew, aparnew, bparnew
    use array_utils, only: zero_array
    implicit none
    class(fieldmat_type), intent(inout) :: self
    complex, dimension(-ntgrid:ntgrid,ntheta0,naky), intent(inout) :: field
    integer, intent(inout) :: pts_remain
    integer, intent(in) :: ifl
    logical, dimension(ntheta0, naky), intent(inout) :: kwork_filter
    class(pc_type), intent(in) :: pc
    integer :: ik, is, iex, ic, iex_init, ig, it
    integer :: it_cur, ig_cur
    integer :: pts_set_here
    logical :: found_ig, found_it

    pts_set_here = 0

    !Loop over all ik
    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP PRIVATE(ik, is, ic, iex_init, iex, ig, it) &
    !$OMP SHARED(self, kwork_filter, field, ntgrid) &
    !$OMP REDUCTION(+:pts_set_here) &
    !$OMP SCHEDULE(static)
    do ik = 1, self%naky
       !Technically should be able to skip ik values which are not local (though could be empty)
       !/would have to remember to decrease pts_remain

       !If we've finished this ky set the 
       !filter and move on to next ik
       if (self%kyb(ik)%initdone) then
          kwork_filter(:, ik) = .true.
          cycle
       end if

       !Now look at all supercells with this ik
       do is = 1, self%kyb(ik)%nsupercell
          !If we've finished with this supercell
          !then set the filter and move on to next
          if (self%kyb(ik)%supercells(is)%initdone) then
             !Set filter for all it on supercell
             do ic = 1, self%kyb(ik)%supercells(is)%ncell
                kwork_filter(self%kyb(ik)%supercells(is)%cells(ic)%it_ind, ik) = .true.
             end do
             cycle
          end if

          !//If we get to this point then we know that we have at least
          !one grid point on supercell for which we've not made a delta-fn

          !Now look for the first point that we've not initialised
          iex_init = 0
          do iex = 1, self%kyb(ik)%supercells(is)%nextend
             !If we've already done this point then cycle
             if (self%kyb(ik)%supercells(is)%initialised(iex)) cycle
             !Else this is the first point so store it and exit
             iex_init = iex
             exit
          end do

          !If this is the last point then mark supercell as done
          !but don't set filter yet, this will be automatically
          !applied on next loop
          if (iex_init == self%kyb(ik)%supercells(is)%nextend) then
             self%kyb(ik)%supercells(is)%initdone = .true.
          endif

          !Mark this point as initialised
          self%kyb(ik)%supercells(is)%initialised(iex_init) = .true.

          !Increment the number of points handled in this call
          pts_set_here = pts_set_here + 1

          !Get the ig and it indices corresponding to this point
          call self%kyb(ik)%supercells(is)%iex_to_ig(iex_init,ig)
          call self%kyb(ik)%supercells(is)%iex_to_it(iex_init,it)

          !Initialise field
          field(ig, it, ik) = 1.0

          !Set the duplicate boundary point if we're at the left
          !boundary of this call and we are not the leftmost cell in
          !the chain.
          if ((ig == -ntgrid) .and. &
               (it /= self%kyb(ik)%supercells(is)%it_leftmost)) then
             !Get the left connection
             it = self%kyb(ik)%supercells(is)%get_left_it(it)
             field(ntgrid, it, ik) = 1.0
          end if
       end do !Ends loop over supercells

       !Decide if we've done this kyb
       self%kyb(ik)%initdone = all(self%kyb(ik)%supercells%initdone)

    end do !Ends loop over ik
    !$OMP END PARALLEL DO

    ! Update the number of points remaining
    pts_remain = pts_remain - pts_set_here

    !Now do the time advance
    call timeadv(phi, apar, bpar, phinew, aparnew, bparnew, 0)

    !Now get the field equations
    !In some(/most) situations the nogath version should
    !be ok, and will be better for scaling.
    if (pc%has_to_gather) then
       call getfieldeq(phinew, aparnew, bparnew, fieldeq, fieldeqa, fieldeqp)
    else
       !       call getfieldeq_nogath(phinew,aparnew,bparnew,fq,fqa,fqp)
       !<DD>Could improve performance by using a "smart" routine which only operates on local/not empty data
       call self%getfieldeq_nogath(phinew, aparnew, bparnew, fieldeq, fieldeqa, fieldeqp)
    end if

    !Now we need to store the field equations in the appropriate places
    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP PRIVATE(ik, ig_cur, it_cur, found_ig, found_it, it, is) &
    !$OMP SHARED(self, field, ntgrid, ntheta0, ifl, fieldeq, fieldeqa, fieldeqp) &
    !$OMP SCHEDULE(static)
    do ik = 1, self%naky
       !If no data then skip ik -- because we don't skip setting
       !the field in the first loop of this routine but we do skip
       !zeroing it out here on processors which don't have any interest
       !in this ky, we need to zero the field after this loop to
       !ensure it has been fully reset to 0 on all procs. We might
       !consider skipping the setting of the field in the first loop
       !instead, although this could mean the field is different on
       !different processors (which should be fine) and might complicate
       !how we track the number of points we need to visit etc.
       if (self%kyb(ik)%is_empty) cycle

       !Loop until we've done all supercells
        ! Note we don't use gs2_max_abs here as we are already inside
        ! an OpenMP region.
        do while (maxval(abs(field(:, :, ik))) > 0)
          !Reset variables
          ig_cur = -ntgrid-1
          it_cur = 0
          found_ig = .false.
          found_it = .false.
          
          !Look for first entry which was set to a delta-fn
          theta0: do it = 1, ntheta0
             ! Find the first theta grid point equal to 1.0
             ig_cur = findloc(field(:, it, ik), cmplx(1.0, 0.0), dim = 1)
             found_ig = ig_cur > 0
             ! If we didn't find a theta grid value move on to
             ! next it
             if (.not. found_ig) cycle theta0
             ! Adjust ig_cur to correct index range (findloc
             ! assumes arrays are 1 indexed)
             ig_cur = ig_cur - (ntgrid + 1)
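              ! For example, a delta-fn at the left boundary (ig = -ntgrid)
              ! gives findloc == 1, so ig_cur = 1 - (ntgrid + 1) = -ntgrid.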
             if (found_ig) then
                it_cur = it
                found_it = .true.
                exit theta0
             end if
          end do theta0

          !Now check we've found a point, else cycle
          !NOTE: If this triggers we're probably in trouble
          !as nothing will be different on the next iteration
          !and hence we will be stuck in an infinite loop.
          !As such this should really trigger an abort.
          if (.not. found_it) cycle

          !Zero out the field at this point so we ignore
          !it on future iterations
          field(ig_cur, it_cur, ik)=0.0

          !Find the appropriate supercell
          is = self%kyb(ik)%is_from_it(it_cur)

          !Also zero out the boundary point if needed
          if((ig_cur == -ntgrid) .and. &
               (it_cur /= self%kyb(ik)%supercells(is)%it_leftmost)) then
             it = self%kyb(ik)%supercells(is)%get_left_it(it_cur)
             field(ntgrid, it, ik) = 0.0
          end if

          !Now store data
          call self%kyb(ik)%store_fq(fieldeq(:, :, ik), fieldeqa(:, :, ik), &
               fieldeqp(:, :, ik), ifl, it_cur, ig_cur)
       end do !Ends loop over points
    end do !Ends loop over ky
    !$OMP END PARALLEL DO

    ! See comment in above loop for why this is needed
    call zero_array(field)
  end subroutine fm_init_next_field_points

  !> Prepare the field matrix for calculating field updates
  subroutine fm_prepare(self, pc)
    implicit none
    class(fieldmat_type), intent(inout) :: self
    class(pc_type), intent(in) :: pc
    integer :: ik

    !Exit early if we're empty
    if(self%is_empty) return
    if(.not.self%is_local) return
    if(self%no_prepare) return

    !Tell each ky to prepare
    !This might be a good place to add OpenMP as each prepare (inversion)
    !has the potential to be quite slow. However, as the cost per inversion
    !can be quite variable (due to very variable length supercell domains)
    !it may make more sense to OpenMP at the next level down (i.e. the kyb)
    !so we're more likely to be sharing work a bit more fairly. Also, at scale
    !one might expect most of the ky to be "empty" on a given processor.
    do ik=1,self%naky
       call self%kyb(ik)%prepare(self%prepare_type, pc)
    enddo
  end subroutine fm_prepare

  !> A routine to calculate the update to the fields
  subroutine fm_get_field_update(self, phi, apar, bpar, pc)
    use dist_fn, only: getfieldeq, getfieldeq_nogath
    use dist_fn_arrays, only: fieldeq, fieldeqa, fieldeqp
    implicit none
    class(fieldmat_type), intent(inout) :: self
    complex, dimension(:,:,:), intent(inout) :: phi,apar,bpar
    class(pc_type), intent(in) :: pc
    integer :: ik

    !First get the field eq
    if(pc%has_to_gather)then
       call getfieldeq(phi,apar,bpar,fieldeq,fieldeqa,fieldeqp)
    else
!<DD>Could improve performance by using a "smart" routine which only operates on local/not empty data
       call self%getfieldeq_nogath(phi,apar,bpar,fieldeq,fieldeqa,fieldeqp)
    endif

    ! Now get the field update for each ik. Note that we don't OpenMP
    ! here as at scale the majority of kyb will not be local so it is
    ! expected to be better to OpenMP at the supercell level when
    ! iterating over cells.
    do ik=1,self%naky
       !Skip non-local ky
       if(.not.self%kyb(ik)%is_local) cycle

       !Trigger the supercell field updates
       !AJ Non-blocking can be done here
       call self%kyb(ik)%get_field_update(fieldeq(:,:,ik),fieldeqa(:,:,ik),fieldeqp(:,:,ik))
    enddo

    !AJ Wait for collective comms here (these are set up in get_field_update)
    if(field_local_nonblocking_collectives) then
       !$OMP PARALLEL DO DEFAULT(none) &
       !$OMP PRIVATE(ik) &
       !$OMP SHARED(self) &
       !$OMP SCHEDULE(static)
       do ik=1,self%naky
          !Skip non-local ky
          if(.not.self%kyb(ik)%is_local) cycle          
          call self%kyb(ik)%wait_for_nonblocking_collectives()
       enddo
       !$OMP END PARALLEL DO
    end if

  end subroutine fm_get_field_update

  !> A routine to unpack the supercell tmp_sum vectors to
  !! full field arrays
  subroutine fm_unpack_to_field(self,ph,ap,bp)
    use theta_grid, only: ntgrid
    use run_parameters, only: has_phi, has_apar, has_bpar
    use mp, only: proc0
    implicit none
    class(fieldmat_type), intent(inout) :: self
    complex,dimension(-ntgrid:,:,:), intent(inout) :: ph,ap,bp
    integer :: ik,is,ic,iex,ig,it,bnd,ifl

    !We don't need to unpack if we're proc0 as we've done this already
    if(.not.proc0)then
       !$OMP PARALLEL DO DEFAULT(none) &
       !$OMP PRIVATE(ik, is, iex, ifl, ic, it, bnd, ig) &
       !$OMP SHARED(self, ph, ap, bp, has_phi, has_apar, has_bpar, ntgrid) &
       !$OMP SCHEDULE(static)
       do ik=1,self%naky
          if(.not.self%kyb(ik)%is_local) cycle
          do is=1,self%kyb(ik)%nsupercell
             if(.not.self%kyb(ik)%supercells(is)%is_local) cycle
             
             iex=0
             ifl=0
             if(has_phi)then
                ifl=ifl+1
                do ic=1,self%kyb(ik)%supercells(is)%ncell
                   it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind
                   bnd=0
                   if(ic/=self%kyb(ik)%supercells(is)%ncell) bnd=1
                   do ig=-ntgrid,ntgrid-bnd
                      iex=iex+1
                      ph(ig,it,ik)=self%kyb(ik)%supercells(is)%tmp_sum(iex)
                   enddo
                enddo
                
                !/Fix boundary points
                if(self%kyb(ik)%supercells(is)%ncell>1) then
                   do ic=1,self%kyb(ik)%supercells(is)%ncell-1
                      ph(ntgrid,self%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik)=&
                           ph(-ntgrid,self%kyb(ik)%supercells(is)%cells(ic+1)%it_ind,ik)
                   enddo
                endif
                
             endif
             
             if(has_apar)then
                ifl=ifl+1
                do ic=1,self%kyb(ik)%supercells(is)%ncell
                   it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind
                   bnd=0
                   if(ic/=self%kyb(ik)%supercells(is)%ncell) bnd=1
                   do ig=-ntgrid,ntgrid-bnd
                      iex=iex+1
                      ap(ig,it,ik)=self%kyb(ik)%supercells(is)%tmp_sum(iex)
                   enddo
                enddo
                
                !/Fix boundary points
                if(self%kyb(ik)%supercells(is)%ncell>1) then
                   do ic=1,self%kyb(ik)%supercells(is)%ncell-1
                      ap(ntgrid,self%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik)=&
                           ap(-ntgrid,self%kyb(ik)%supercells(is)%cells(ic+1)%it_ind,ik)
                   enddo
                endif
                
             endif
             
             if(has_bpar)then
                ifl=ifl+1
                do ic=1,self%kyb(ik)%supercells(is)%ncell
                   it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind
                   bnd=0
                   if(ic/=self%kyb(ik)%supercells(is)%ncell) bnd=1
                   do ig=-ntgrid,ntgrid-bnd
                      iex=iex+1
                      bp(ig,it,ik)=self%kyb(ik)%supercells(is)%tmp_sum(iex)
                   enddo
                enddo
                
                !/Fix boundary points
                if(self%kyb(ik)%supercells(is)%ncell>1) then
                   do ic=1,self%kyb(ik)%supercells(is)%ncell-1
                      bp(ntgrid,self%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik)=&
                           bp(-ntgrid,self%kyb(ik)%supercells(is)%cells(ic+1)%it_ind,ik)
                   enddo
                endif
                
             endif
          enddo
       enddo
       !$OMP END PARALLEL DO
    endif
  end subroutine fm_unpack_to_field

  !> A routine to reset the object
  subroutine fm_reset(self)
    use mp, only: free_comm
    implicit none
    class(fieldmat_type), intent(inout) :: self
    integer :: ik

    !Call reset on all children
    do ik=1,self%naky
       call self%kyb(ik)%reset
    enddo

    !deallocate
    call self%deallocate
!AJ Free the communicators associated with this field matrix object.
    if(self%fm_sub_headsc_p0%nproc>0) then
       call free_comm(self%fm_sub_headsc_p0)
    end if
    !Could zero out variables but no real need
  end subroutine fm_reset

  !> Count how many subcommunicators will be created
  subroutine fm_count_subcom(self)
    use mp, only: iproc, barrier, nproc
    implicit none
    class(fieldmat_type), intent(inout) :: self
    integer :: ik, is, ip
    integer :: nsub_tot, nsub_proc

    !Initialise
    nsub_tot=2 !Subcom belonging to fieldmat object
    nsub_proc=nsub_tot

    !Now get children
    do ik=1,self%naky
       nsub_tot=nsub_tot+2
       if(self%kyb(ik)%is_local) nsub_proc=nsub_proc+2

       do is=1,self%kyb(ik)%nsupercell
          nsub_tot=nsub_tot+2
          if(self%kyb(ik)%supercells(is)%is_local) nsub_proc=nsub_proc+2
       enddo
    enddo

    !Now report
    if(iproc==0) write(dlun,'("Total number of subcommunicators is ",I0)') nsub_tot
    do ip=0,nproc-1
       if(ip==iproc) write(dlun,'("Number of subcommunicators on ",I0," is ",I0)') iproc, nsub_proc
       call barrier
    enddo
    if(iproc==0)write(dlun,'()')
  end subroutine fm_count_subcom

  !> Create all the necessary subcommunicators
  subroutine fm_make_subcom_1(self)
    use mp, only: iproc, nproc, mp_comm, proc0
    implicit none
    class(fieldmat_type), intent(inout) :: self
    integer :: ik

    !Set the all comm to be mp_comm
    self%fm_sub_all%id=mp_comm
    self%fm_sub_all%iproc=iproc
    self%fm_sub_all%nproc=nproc
    self%fm_sub_all%proc0=proc0

    !NOTE: We may actually want to allow a communicator to be given as input
    !as this would allow us to split off a group of procs to do the field
    !calculations for separate fieldmat objects. This may be one way to look
    !at preparing response matrices for different time steps etc.
    !NOTE: We'd need to take this into account in the decomposition routines
    !as well where we split up the work. and also in any fieldmat routine which uses
    !iproc,nproc,proc0 etc.
    !NOTE: Best way to achieve this is to always reference self%fm_sub_all when wanting
    !iproc,nproc,proc0 etc.

    !Set the parents and make their subcommunicators
    do ik=1,self%naky
       !Set parent subcom
       self%kyb(ik)%parent_sub=self%fm_sub_all

       !Make individual subcom
       call self%kyb(ik)%make_subcom_1
    enddo

  end subroutine fm_make_subcom_1

  !> Create the secondary subcommunicators
  subroutine fm_make_subcom_2(self)
    use mp, only: split, free_comm, mp_undefined
    implicit none
    class(fieldmat_type), intent(inout) :: self
    integer :: ik, colour
    type(comm_type) :: tmp
    logical :: has_head_sc

    !Make child subcommunicators
    do ik=1,self%naky
       !Make individual subcom
       call self%kyb(ik)%make_subcom_2
    enddo

    !Now make the supercell head communicator used for field gathers
    !/First work out if this proc is the head of any supercells
    has_head_sc=.false.
    do ik=1,self%naky
       has_head_sc=has_head_sc.or.any(self%kyb(ik)%supercells%is_head)
    enddo
    !/Now split to make a subcom
    colour = mp_undefined
    !NOTE: Here we assume that the supercell head will always
    !have the data we want *but* currently in ky_get_field we don't
    !store anything on the procs for which the supercell is local but 
    !empty. This should really be changed as whilst empty procs don't
    !have any field data to calculate they will still need it when
    !advancing g etc.
    if(has_head_sc.or.self%fm_sub_all%proc0) colour=1
    call split(colour,tmp,self%fm_sub_all%id)
    if(colour /= mp_undefined) then
       !/Store subcom
       self%fm_sub_headsc_p0=tmp
    endif

  end subroutine fm_make_subcom_2

  !AJ Record the process that is the supercell head associated with a given ik,it point
  !AJ Supercells have a single ik but may have many it points so a given supercell head 
  !AJ may be responsible for a range of points
  subroutine fm_calc_sc_heads(self)
    use mp, only: iproc, sum_allreduce
    implicit none

    class(fieldmat_type), intent(inout) :: self
    integer :: ik, is, ic, it
    !AJ There may be a problem here if any supercell has no assigned head.
    !AJ This is the case (I think) for 1,1; but it shouldn't actually cause 
    !AJ a problem as 1,1 shouldn't be communicated anyway as only one process
    !AJ is involved in it.
    self%heads = 0
    do ik=1,self%naky
       do is=1,self%kyb(ik)%nsupercell
          if(self%kyb(ik)%supercells(is)%is_head) then
             do ic=1,self%kyb(ik)%supercells(is)%ncell               
                it = self%kyb(ik)%supercells(is)%cells(ic)%it_ind
                self%heads(it,ik) = iproc
             end do
          end if
       end do
    end do

    call sum_allreduce(self%heads)
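    !Each (it,ik) entry was set only by its supercell head, so after this
    !global sum every process knows the world rank of the head for each
    !point (entries with no head simply remain zero, as noted above).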

  end subroutine fm_calc_sc_heads
  
  !> Gather all the fields to proc0/all for diagnostics etc.
  subroutine fm_gather_fields(self,ph,ap,bp,to_all_in,do_allreduce_in)
    use theta_grid, only: ntgrid
    use kt_grids, only: naky, ntheta0
    use mp, only: broadcast, sum_reduce_sub, sum_allreduce, sum_allreduce_sub
    use mp, only: rank_comm, comm_type, proc0
    use run_parameters, only: has_phi, has_apar, has_bpar
    use gs2_layouts, only: g_lo, intspec_sub
    use array_utils, only: zero_array, copy
    use optionals, only: get_option_with_default
    implicit none
    class(fieldmat_type), intent(inout) :: self
    complex, dimension(:,:,:), intent(inout) :: ph,ap,bp
    logical, optional, intent(in) :: to_all_in, do_allreduce_in
    logical :: to_all, do_allreduce
    complex, dimension(:,:,:,:), allocatable :: tmp, tmp_tmp
    integer :: ifl, ik, is, ic, it, iex, bnd, ig
    logical :: use_sub
    type(comm_type), save :: les_comm
    logical, save :: first=.true.
    logical, save :: sub_has_proc0=.false.
    logical :: xy_block_comm_is_available
    logical :: x_is_not_split

    !Set the to_all flag
    to_all = get_option_with_default(to_all_in, .false.)

    !Set the do_allreduce flag
    do_allreduce = get_option_with_default(do_allreduce_in, .false.)
    
    !Shouldn't do to_all if do_allreduce
    if(do_allreduce) to_all=.false.

    !Work out if we want to use sub communicators in reduction and
    !if we are allowed to.
    xy_block_comm_is_available = .not. (g_lo%x_local .and. g_lo%y_local) .and. intspec_sub
    x_is_not_split = g_lo%x_local
    use_sub=( &
         xy_block_comm_is_available .and. x_is_not_split .and. & !Parallel decomposition constraints
         do_allreduce .and. field_local_allreduce_sub) !Runtime choices
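    !i.e. the sub-communicator approach is only used when ky (but not kx) is
    !distributed, the intspec sub-communicators exist, we are doing an
    !allreduce and field_local_allreduce_sub has been requested.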

    !If using sub-communicators set up object and work out which procs
    !have proc0 as a member
    if(use_sub.and.first)then
       !Setup type
       les_comm%id=g_lo%lesblock_comm
       call rank_comm(les_comm%id,les_comm%iproc)
       !Decide who has proc0
       ig=0
       if(proc0) ig=1
       call sum_allreduce_sub(ig,les_comm%id)
       if(ig/=0) sub_has_proc0=.true.
    endif
    first=.false.

    !Allocate the tmp array if we're part of the head subcom
    !or if we're sending to all
    if((self%fm_sub_headsc_p0%nproc>0).or.to_all.or.do_allreduce) then
       if(use_sub)then
          !Use a smaller array so we ignore zero entries. Really we would like to reuse some of
          !the integrate species code to do the reduction and gather, but we have an extra
          !dimension here due to the fields.
          allocate(tmp_tmp(-ntgrid:ntgrid,g_lo%it_min:g_lo%it_max,g_lo%ik_min:g_lo%ik_max,self%nfield))
       else
          allocate(tmp_tmp(-ntgrid:ntgrid,ntheta0,naky,self%nfield))
       endif
       allocate(tmp(-ntgrid:ntgrid,ntheta0,naky,self%nfield))
       call zero_array(tmp_tmp)
       if(use_sub) call zero_array(tmp) !Only need to zero if tmp_tmp is not same size as tmp
    endif

    !Now loop over supercells, and store sections
    !on procs which are part of the head subcom
    if(self%fm_sub_headsc_p0%nproc>0)then
       !$OMP PARALLEL DO DEFAULT(none) &
       !$OMP PRIVATE(ik, is, iex, ifl, it, bnd, ig) &
       !$OMP SHARED(self, ntgrid, tmp_tmp) &
       !$OMP SCHEDULE(static)
       do ik=1,self%naky
          do is=1,self%kyb(ik)%nsupercell
             !Skip if we're not the head supercell
             if(.not.self%kyb(ik)%supercells(is)%is_head) cycle
  
             iex=0
             do ifl=1,self%nfield
                do ic=1,self%kyb(ik)%supercells(is)%ncell
                   it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind
                   bnd=0
                   if(ic/=self%kyb(ik)%supercells(is)%ncell) bnd=1
                   do ig=-ntgrid,ntgrid-bnd
                      iex=iex+1
                      tmp_tmp(ig,it,ik,ifl)=self%kyb(ik)%supercells(is)%tmp_sum(iex)
!                      if(ifl .eq. 1) write(*,*) self%kyb(ik)%supercells(is)%tmp_sum(iex)
                  enddo
                enddo
             enddo
!<DD>TAGGED:Worth looking at improving this bad memory pattern
             !/Now fix boundary points
             if(self%kyb(ik)%supercells(is)%ncell>1) then
                do ic=1,self%kyb(ik)%supercells(is)%ncell-1
                   tmp_tmp(ntgrid,self%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik,:)=&
                        tmp_tmp(-ntgrid,self%kyb(ik)%supercells(is)%cells(ic+1)%it_ind,ik,:)
                enddo
             endif
          enddo
       enddo
       !$OMP END PARALLEL DO

       !Now reduce
       !Really we should be able to do some form of gather here
       !as every proc knows how long the supercell is and where to put it
!       call sum_allreduce_sub(tmp,self%fm_sub_headsc_p0%id)
       if(.not.do_allreduce)call sum_reduce_sub(tmp_tmp,0,self%fm_sub_headsc_p0)
    endif

!    write(*,*) 'tmp_tmp',tmp_tmp(:,:,:,1)

    !Should really be able to do this on the xyblock sub communicator
    !but proc0 needs to know full result for diagnostics so would need
    !some way to tell proc0 about the other parts of the field (or
    !update diagnostics to work in parallel). We have the lesblock comm
    !which should help with this, but we'd need to work out which procs
    !are in the same comm as proc0. 
    if(do_allreduce) then
       if(use_sub) then
          !This reduction (and copy) ensures that every proc knows the field
          !in its local x-y cells. It could be implemented as an allgatherv which
          !would be slightly more efficient but is more complicated and may involve 
          !more local operations which outweigh the benefits. 
          call sum_allreduce_sub(tmp_tmp,g_lo%xyblock_comm)
       else
          call sum_allreduce(tmp_tmp)
       endif
    endif

    !Now copy tmp_tmp into tmp if we have it
    if((self%fm_sub_headsc_p0%nproc>0).or.to_all.or.do_allreduce) then
       if(use_sub)then
          tmp(:,g_lo%it_min:g_lo%it_max,g_lo%ik_min:g_lo%ik_max,:)=tmp_tmp
          
          !Here we reduce to proc0 of the lesblock comm (which should include the
          !global proc0). Should only really need to call this on procs which
          !have proc0 in their subcomm but no way to determine this currently
          !NOTE: We don't need to know the result of this reduction until we
          !reach the diagnostics. With non-blocking communications we may be able
          !to do this required communication in the background whilst we proceed
          !with the second call to timeadv. There is a slight complication in that
          !we're communicating the change in the fields so we have to remember to 
          !increment the field before diagnostics are called.
          if(sub_has_proc0) call sum_reduce_sub(tmp,0,les_comm) 
       else
          call copy(tmp_tmp, tmp) !It would be nice to avoid this copy if possible
       endif
    endif

    !////////////////////////
    !// Distribute and unpack
    !////////////////////////
    !Note: Currently use a broadcast from proc0 to distribute data to all procs
    !May be more efficient to use a scatter type approach so we send the minimum
    !amount of data. This may end up being slower as we don't take advantage of
    !optimised broadcast which improves bandwidth (branching), as we're doing 1 to many.
    !Might expect a broadcast from the supercell head on sc_sub_all to be most
    !efficient, but it doesn't appear to be better than global broadcast.
    !As the supercell head has knowledge of the result quite early on in the
    !field update algorithm a non-blocking broadcast (MPI-3) on sc_sub_all
    !may be useful and would allow the following broadcasts to be replaced
    !with wait variants plus unpacking.
    !Note: By doing the broadcast at supercell level we would end up sending
    !more data than required (if x is parallel) but to do it at cell level 
    !would require a subcommunicator for each cell, which can easily exceed
    !the maximum number allowed.

    !Fetch to all procs
    if(to_all)then
       call broadcast(tmp)
    endif

    !Finally, unpack tmp and deallocate arrays
    ! We could perhaps use copy here to add OpenMP acceleration
    ! but I worry this could result in the creation of a temporary
    ! in some cases, leading to double copying?
    if(self%fm_sub_all%proc0.or.to_all.or.do_allreduce)then
       ifl=0
       if(has_phi) then
          ifl=ifl+1
          call copy(tmp(:,:,:,ifl), ph)
       endif
       if(has_apar) then
          ifl=ifl+1
          call copy(tmp(:,:,:,ifl), ap)
       endif
       if(has_bpar) then
          ifl=ifl+1
          call copy(tmp(:,:,:,ifl), bp)
       endif
    endif

    if(allocated(tmp)) deallocate(tmp)
    if(allocated(tmp_tmp)) deallocate(tmp_tmp)

    !/Fill the empties by broadcasting from proc0 if we
    !didn't do it with the earlier broadcast. Note that here we
    !have up to three broadcasts whilst with to_all we only have one.
    !However with to_all we have the additional cost of unpacking data.
    !It is possible that for some values of nfield to_all is faster than
    !.not.to_all, this is also likely to depend on the machine
    if(.not.(to_all.or.do_allreduce)) then
       if(has_phi) call broadcast(ph)
       if(has_apar) call broadcast(ap)
       if(has_bpar) call broadcast(bp)
    endif
  end subroutine fm_gather_fields

  !> Update the fields using calculated update
  subroutine fm_update_fields(self,phi,apar,bpar)
    use fields_arrays, only: orig_phi=>phi, orig_apar=>apar, orig_bpar=>bpar
    use run_parameters, only: has_phi, has_apar, has_bpar
    use kt_grids, only: kwork_filter
    use mp, only: proc0
    implicit none
    class(fieldmat_type), intent(in) ::  self
    complex,dimension(:,:,:),intent(inout) :: phi,apar,bpar
    integer :: ik,it,is,ic

    !If we're proc0 then we need to do full array (for diagnostics)
    if(proc0) then
       if(has_phi) phi=phi+orig_phi
       if(has_apar) apar=apar+orig_apar
       if(has_bpar) bpar=bpar+orig_bpar
       return
    endif

    !Now loop over cells and calculate field equation as required
    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP PRIVATE(ik, is, ic, it) &
    !$OMP SHARED(self, kwork_filter, has_phi, has_apar, has_bpar, &
    !$OMP phi, apar, bpar, orig_phi, orig_apar, orig_bpar) &
    !$OMP SCHEDULE(static)
    do ik=1,self%naky
       !Skip not local cells
       if(.not.self%kyb(ik)%is_local) cycle
       do is=1,self%kyb(ik)%nsupercell
          if(.not.self%kyb(ik)%supercells(is)%is_local) cycle
          do ic=1,self%kyb(ik)%supercells(is)%ncell
             if(.not.self%kyb(ik)%supercells(is)%cells(ic)%is_local) cycle

             it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind

             if(kwork_filter(it,ik)) cycle

             !Increment fields
             if(has_phi) phi(:,it,ik)=phi(:,it,ik)+orig_phi(:,it,ik)
             if(has_apar) apar(:,it,ik)=apar(:,it,ik)+orig_apar(:,it,ik)
             if(has_bpar) bpar(:,it,ik)=bpar(:,it,ik)+orig_bpar(:,it,ik)
          enddo
       enddo
    enddo
    !$OMP END PARALLEL DO

  end subroutine fm_update_fields

  !> Update the fields using the new fields
  subroutine fm_update_fields_newstep(self)
    use fields_arrays, only: phi,apar,bpar,phinew,aparnew,bparnew
    use run_parameters, only: has_phi, has_apar, has_bpar
    use kt_grids, only: kwork_filter
    use mp, only: proc0
    use array_utils, only: copy
    implicit none
    class(fieldmat_type), intent(in) ::  self
    integer :: ik,it,is,ic

    !If we're proc0 then we need to do full array (for diagnostics)
    if(proc0) then
       if(has_phi) call copy(phinew, phi)
       if(has_apar) call copy(aparnew,apar)
       if(has_bpar) call copy(bparnew, bpar)
       return
    endif

    !Now loop over cells and calculate field equation as required
    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP PRIVATE(ik, is, ic, it) &
    !$OMP SHARED(self, kwork_filter, has_phi, has_apar, has_bpar, &
    !$OMP phi, apar, bpar, phinew, aparnew, bparnew) &
    !$OMP SCHEDULE(static)
    do ik=1,self%naky
       !Skip not local cells
       if(.not.self%kyb(ik)%is_local) cycle
       do is=1,self%kyb(ik)%nsupercell
          if(.not.self%kyb(ik)%supercells(is)%is_local) cycle
          do ic=1,self%kyb(ik)%supercells(is)%ncell
             if(.not.self%kyb(ik)%supercells(is)%cells(ic)%is_local) cycle

             it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind

             if(kwork_filter(it,ik)) cycle

             !Increment fields 
             if(has_phi) phi(:,it,ik)=phinew(:,it,ik)
             if(has_apar) apar(:,it,ik)=aparnew(:,it,ik)
             if(has_bpar) bpar(:,it,ik)=bparnew(:,it,ik)
         enddo
       enddo
    enddo
    !$OMP END PARALLEL DO
  end subroutine fm_update_fields_newstep

  !> Fetch all condition numbers to proc0 and report on
  !> worrying values as required.
  subroutine fm_get_condition_numbers(self)
    use file_utils, only: error_unit
    use gs2_time, only: code_dt
    use mp, only: proc0, sum_reduce
    use unit_tests, only: debug_message
    implicit none
    type ky_condition_type
       real, dimension(:), allocatable ::  conditions
    end type ky_condition_type
    class(fieldmat_type), intent(in) :: self
    integer, parameter :: verb = 1
    type(ky_condition_type), dimension(:), allocatable :: ky_conditions
    integer :: ik, is
    character(len = 100) :: condition_number_message
    real :: condition_number
    allocate(ky_conditions(self%naky))

    ! Note that here we communicate over comm_world. It is likely that we
    ! could instead reduce individual `ky_conditions(:)%conditions` arrays
    ! amongst just the supercell heads of the ky block and then reduce
    ! across the ky heads to proc0. Currently we don't worry about this as
    ! we just do this once per initialisation, but this could become expensive
    ! when running at large scale?

    do ik = 1, self%naky
       allocate(ky_conditions(ik)%conditions(self%kyb(ik)%nsupercell))
       ky_conditions(ik)%conditions = 0.0
       do is = 1, self%kyb(ik)%nsupercell
          if (self%kyb(ik)%supercells(is)%is_head) then
             ky_conditions(ik)%conditions(is) = self%kyb(ik)%supercells(is)%condition_number
          end if
       end do
       call sum_reduce(ky_conditions(ik)%conditions, 0)
    end do

    if (proc0) then
       do ik = 1, self%naky
          do is = 1, self%kyb(ik)%nsupercell
             condition_number = ky_conditions(ik)%conditions(is)
             write(condition_number_message,'("Condition number for response with ik = ",I0,", is = ",I0," and dt = ",E14.6E3," is ",E14.6E3)') ik, is, code_dt, condition_number
             call debug_message(verb, "fields_local::fm_get_condition_numbers "//condition_number_message)
             if (condition_number > condition_number_warning_limit) then
                write(error_unit(), '("Warning: ",A)') trim(condition_number_message)
             end if

             ! In debug mode always display the condition number message to screen
             if (debug) then
                write(*,'(A)') trim(condition_number_message)
             end if
          end do
       end do
    end if

    do ik = 1, self%naky
       deallocate(ky_conditions(ik)%conditions)
    end do
    deallocate(ky_conditions)
  end subroutine fm_get_condition_numbers

  !> Write some debug data to file
  subroutine fm_write_debug_data(self)
    use file_utils, only: open_output_file, close_output_file
    use mp, only: iproc, nproc_comm, rank_comm
    implicit none
    class(fieldmat_type), intent(in) :: self
    character(len=20) :: ext
    integer :: unit
    integer :: ik,is,ic,np,ip

    !Make the file extension
    write(ext,'(".debug.",I0,".md")') iproc

    !Open output file
    call open_output_file(unit,ext)

    !Now write data
    write(unit,'("Field matrix debug data for proc : ",I0)') iproc
    write(unit,'(80("="))')
    write(unit,'()')

    !/Here we have details of what cells this proc can see and how
    !they're marked
    write(unit,'("Details of object locality and state")')
    write(unit,'(80("-"))')
    write(unit,'()')
    write(unit,'(7(" ",A13," "))') "ik", "is", "ic", "is_local", "is_empty", "is_all_local", "is_head"
    write(unit,'(7(" ",14("-")))')
    do ik=1,self%naky
       do is=1,self%kyb(ik)%nsupercell
          do ic=1,self%kyb(ik)%supercells(is)%ncell
             write(unit,'(3(I14," "),4(L14," "))') ik,is,ic,&
                  self%kyb(ik)%supercells(is)%cells(ic)%is_local,&
                  self%kyb(ik)%supercells(is)%cells(ic)%is_empty,&
                  self%kyb(ik)%supercells(is)%cells(ic)%is_all_local,&
                  self%kyb(ik)%supercells(is)%is_head
          enddo
       enddo
    enddo
    write(unit,'(" ",7(14("-")," "))')
    write(unit,'()')

    !/Here we see what limits the row blocks have for the cells
    !we actually have
    write(unit,'()')
    write(unit,'("Row limits")')
    write(unit,'(80("-"))')
    write(unit,'()')
    do ik=1,self%naky
       write(unit,'("* ik : ",I0)') ik
!       if(.not.self%kyb(ik)%is_local) cycle
       do is=1,self%kyb(ik)%nsupercell
          write(unit,'("    * is : ",I0)') is
!          if(.not.self%kyb(ik)%supercells(is)%is_local) cycle
          write(unit,'("           ","is_head      - ",L1)') self%kyb(ik)%supercells(is)%is_head
          write(unit,'("           ","is_empty     - ",L1)') self%kyb(ik)%supercells(is)%is_empty
          write(unit,'("           ","is_local     - ",L1)') self%kyb(ik)%supercells(is)%is_local
          write(unit,'("           ","is_all_local - ",L1)') self%kyb(ik)%supercells(is)%is_all_local
          write(unit,'()')
          write(unit,'("        ",4(" ",A8," "))') "ic", "rllim", "rulim", "nrow"
          write(unit,'("        ",4(" ",9("-")))')
          do ic=1,self%kyb(ik)%supercells(is)%ncell
             if(size(self%kyb(ik)%supercells(is)%cells(ic)%rb)>0)then
                write(unit,'("        ",4(I9," "))') ic,&
                     self%kyb(ik)%supercells(is)%cells(ic)%rb(1)%row_llim,&
                     self%kyb(ik)%supercells(is)%cells(ic)%rb(1)%row_ulim,&
                     self%kyb(ik)%supercells(is)%cells(ic)%rb(1)%nrow
             endif
          enddo
          write(unit,'("        ",4(" ",9("-")))')
       enddo
    enddo
    write(unit,'()')

    !/Subcommunicators
    write(unit,'()')
    write(unit,'("Subcommunicators")')
    write(unit,'(80("-"))')
    write(unit,'()')
    do ik=1,self%naky
       write(unit,'("* ik : ",I0)') ik
       do is=1,self%kyb(ik)%nsupercell
          write(unit,'("    * is : ",I0)') is
          write(unit,'()')
          write(unit,'("        ",6(" ",A7," "))') "name", "nproc", "iproc", "proc0", "REnproc", "REiproc"
          write(unit,'("        ",6(" ",8("-")))')

          if(self%kyb(ik)%supercells(is)%sc_sub_all%nproc>0)then
             call nproc_comm(self%kyb(ik)%supercells(is)%sc_sub_all%id,np)
             call rank_comm(self%kyb(ik)%supercells(is)%sc_sub_all%id,ip)
          else
             np=-1
             ip=-1
          endif
          write(unit,'("        ",(" ",A8),2(I8," "),1(L8," "),2(I8," "))') "all",&
               self%kyb(ik)%supercells(is)%sc_sub_all%nproc,&
               self%kyb(ik)%supercells(is)%sc_sub_all%iproc,&
               self%kyb(ik)%supercells(is)%sc_sub_all%proc0,&
               np,ip

          if(self%kyb(ik)%supercells(is)%sc_sub_pd%nproc>0)then
             call nproc_comm(self%kyb(ik)%supercells(is)%sc_sub_pd%id,np)
             call rank_comm(self%kyb(ik)%supercells(is)%sc_sub_pd%id,ip)
          else
             np=-1
             ip=-1
          endif
          write(unit,'("        ",(" ",A8),2(I8," "),1(L8," "),2(I8," "))') "pd",&
               self%kyb(ik)%supercells(is)%sc_sub_pd%nproc,&
               self%kyb(ik)%supercells(is)%sc_sub_pd%iproc,&
               self%kyb(ik)%supercells(is)%sc_sub_pd%proc0,&
               np,ip

          write(unit,'("        ",6(" ",8("-")))')
          write(unit,'()')
       enddo
    enddo
    write(unit,'()')

    !Now close output file
    call close_output_file(unit)
  end subroutine fm_write_debug_data

  subroutine fm_getfieldeq_nogath (self,phi, apar, bpar, fieldeq, fieldeqa, fieldeqp)
    use theta_grid, only: ntgrid
    use dist_fn, only: getan_nogath, gf_lo_integrate
    use dist_fn_arrays, only: antot, antota, antotp
    implicit none
    class(fieldmat_type), intent(in) :: self
    complex, dimension (-ntgrid:,:,:), intent (in) :: phi, apar, bpar
    complex, dimension (-ntgrid:,:,:), intent (in out) ::fieldeq,fieldeqa,fieldeqp

    call getan_nogath(antot, antota, antotp)
    !AJ If we are integrating in the gf_lo data distribution then the results of antot etc. from
    !AJ getan_nogath above will be in the gf_lo distribution, so perform a communication step to
    !AJ redistribute the result such that the owners of the data in g_lo end up with the data.
    if(gf_lo_integrate) then
       call self%reduce_an(antot, antota, antotp)
    end if
    call self%getfieldeq1_nogath (phi, apar, bpar, antot, antota, antotp, &
         fieldeq, fieldeqa, fieldeqp)
  end subroutine fm_getfieldeq_nogath

  !AJ This routine takes in data that has been calculated in gf_lo layout (i.e. the antot etc... arrays 
  !AJ have been partitioned in gf_lo format so that each naky,ntheta point is on a single process) and 
  !AJ redistributes it to a g_lo layout (i.e. so the processes that own the naky,ntheta points in g_lo 
  !AJ have the data at the end).
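  !The exchange proceeds in roughly three phases: (1) each supercell head posts
  !non-blocking receives for every unfiltered cell in its supercells (copying
  !directly when it already owns the point in gf_lo), (2) the gf_lo owners of
  !each it,ik point send their antot/antota/antotp slices to the relevant head,
  !and (3) each head broadcasts the assembled data over the supercell's pd
  !sub-communicator so that all processes holding part of that supercell end up
  !with the data for those it,ik points.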
  subroutine fm_reduce_an_head_send_broadcast (self,antot, antota, antotp)
    use mp, only: broadcast, iproc, nbsend, nbrecv, initialise_requests, waitall, broadcast_sub
    use theta_grid, only: ntgrid
    use run_parameters, only: has_phi, has_apar, has_bpar
    use gs2_layouts, only: gf_lo, proc_id, idx, ik_idx, it_idx
    use kt_grids, only: kwork_filter
    implicit none
    class(fieldmat_type), intent(in) :: self
    complex, dimension (-ntgrid:,:,:), intent (in out) :: antot, antota, antotp

    integer :: igf, ik, it, ic, is, root, sendreqnum, recvreqnum, sender, receiver, tag, taglim
    integer, dimension (gf_lo%xypoints*3) :: send_handles
    !AJ This array is bigger than it needs to be but does mean we do not
    !AJ need to calculate how many supercells a given process is head for
    integer, dimension (gf_lo%naky*gf_lo%ntheta0*3) :: recv_handles
    integer :: tempit, tempis, tempismax
    complex, dimension(:,:,:,:,:), allocatable :: tempdata

    tempis = 0
    tempismax = 0
    do ik = 1,self%naky
       tempis = self%kyb(ik)%nsupercell
       if(tempis > tempismax) then
          tempismax = tempis
       end if
       
    end do

    allocate(tempdata(-ntgrid:ntgrid,gf_lo%ntheta0,self%nfield,gf_lo%naky,tempismax))
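    !tempdata is indexed as (theta, cell index within the supercell,
    !source term (1=antot, 2=antota, 3=antotp), ik, supercell index within this ik)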

    call initialise_requests(send_handles)
    call initialise_requests(recv_handles)

    recvreqnum = 1

    taglim = self%naky*self%ntheta0+1
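    !Message tags are built from the it,ik indices; taglim offsets the tag range
    !used for each field so that the antot, antota and antotp messages exchanged
    !between the same pair of processes carry distinct tags.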

    !AJ Iterate through the supercells and each supercell that I am a head for 
    !AJ post a receive for data for a given element of the field array.
    do ik = 1,self%naky
       do is=1,self%kyb(ik)%nsupercell
          if(self%kyb(ik)%supercells(is)%is_head) then
             tempit = 1
             do ic=1,self%kyb(ik)%supercells(is)%ncell

                it = self%kyb(ik)%supercells(is)%cells(ic)%it_ind

                if(kwork_filter(it,ik)) cycle

                !AJ Calculate which process owns this it,ik point in gf_lo space
                !AJ i.e. which process will be sending the data to this supercell head
                sender = proc_id(gf_lo,idx(gf_lo,ik,it))

                if(sender /= iproc) then
                
                   tag = ik*self%ntheta0 + it
                
                   if(has_phi) then
                      call nbrecv(tempdata(:,tempit,1,ik,is),sender,tag,recv_handles(recvreqnum))
                      recvreqnum = recvreqnum + 1
                   end if

                   tag = tag + taglim
                   
                   if(has_apar) then
                      call nbrecv(tempdata(:,tempit,2,ik,is),sender,tag,recv_handles(recvreqnum))
                      recvreqnum = recvreqnum + 1
                   end if
                   
                   tag = tag + taglim
                   
                   if(has_bpar) then
                      call nbrecv(tempdata(:,tempit,3,ik,is),sender,tag,recv_handles(recvreqnum))
                      recvreqnum = recvreqnum + 1
                   end if

                else

                   if(has_phi) then
                      tempdata(:,tempit,1,ik,is) = antot(:,it,ik)
                   end if

                   if(has_apar) then
                      tempdata(:,tempit,2,ik,is) = antota(:,it,ik)
                   end if

                   if(has_bpar) then
                      tempdata(:,tempit,3,ik,is) = antotp(:,it,ik)
                   end if

                end if

                tempit = tempit + 1

             end do
          end if
       end do
    end do

    sendreqnum = 1

    !AJ Iterate through all the points I own in gf_lo and send that part of the field 
    !AJ arrays to the process that is the supercell head for the supercell that contains 
    !AJ that it, ik point.
    do igf = gf_lo%llim_proc, gf_lo%ulim_proc
       ik = ik_idx(gf_lo,igf)
       it = it_idx(gf_lo,igf)

       if(kwork_filter(it,ik)) cycle

       tag = ik*self%ntheta0 + it

       receiver = self%heads(it,ik)

       if(iproc /= receiver) then

          if(has_phi) then
             call nbsend(antot(:,it,ik),receiver,tag,send_handles(sendreqnum))
             sendreqnum = sendreqnum + 1
          end if
          
          tag = tag + taglim
          
          if(has_apar) then
             call nbsend(antota(:,it,ik),receiver,tag,send_handles(sendreqnum))
             sendreqnum = sendreqnum + 1
          end if
          
          tag = tag + taglim
          
          if(has_bpar) then
             call nbsend(antotp(:,it,ik),receiver,tag,send_handles(sendreqnum))
             sendreqnum = sendreqnum + 1
          end if
          
       end if
       
    end do

    call waitall(sendreqnum-1,send_handles)
    call waitall(recvreqnum-1,recv_handles)

    do ik = 1,self%naky
       if(.not.self%kyb(ik)%is_local) cycle
       do is=1,self%kyb(ik)%nsupercell
          if(.not. self%kyb(ik)%supercells(is)%is_local) cycle
          root = self%kyb(ik)%supercells(is)%head_iproc_pd
          
          if((self%kyb(ik)%supercells(is)%is_empty.or.self%kyb(ik)%supercells(is)%is_all_local)) cycle

          tempit = 1
          do ic=1,self%kyb(ik)%supercells(is)%ncell
             it = self%kyb(ik)%supercells(is)%cells(ic)%it_ind

             if(kwork_filter(it,ik)) cycle

             tempit = tempit + 1

          end do

          call broadcast_sub(tempdata(:,1:tempit,1:self%nfield,ik,is),root,self%kyb(ik)%supercells(is)%sc_sub_pd%id)

          if(has_phi) then
             tempit = 1
             do ic=1,self%kyb(ik)%supercells(is)%ncell
                it = self%kyb(ik)%supercells(is)%cells(ic)%it_ind

                if(kwork_filter(it,ik)) cycle

                antot(:,it,ik) = tempdata(:,tempit,1,ik,is)
                tempit = tempit + 1
             end do
          end if
             
          if(has_apar) then
             tempit = 1
             do ic=1,self%kyb(ik)%supercells(is)%ncell
                it = self%kyb(ik)%supercells(is)%cells(ic)%it_ind

                if(kwork_filter(it,ik)) cycle

                antota(:,it,ik) = tempdata(:,tempit,2,ik,is)
                tempit = tempit + 1
             end do
          end if
          
          if(has_bpar) then
             tempit = 1
             do ic=1,self%kyb(ik)%supercells(is)%ncell
                it = self%kyb(ik)%supercells(is)%cells(ic)%it_ind

                if(kwork_filter(it,ik)) cycle

                antotp(:,it,ik) = tempdata(:,tempit,3,ik,is)
                tempit = tempit + 1
             end do
          end if
             
       end do
    end do


    deallocate(tempdata)

  end subroutine fm_reduce_an_head_send_broadcast

  subroutine fm_check_an (self,antot, tempantot, antota, tempantota, antotp, tempantotp)
    use mp, only: broadcast_sub, iproc, sum_allreduce_sub, mp_abort
    use theta_grid, only: ntgrid
    use run_parameters, only: has_phi, has_apar, has_bpar
    use gs2_layouts, only: proc_id, idx
    use kt_grids, only: kwork_filter
    implicit none
    class(fieldmat_type), intent(in) :: self
    complex, dimension (-ntgrid:,:,:), intent (in) :: antot, antota, antotp, tempantot, tempantota, tempantotp
    complex, parameter :: tol = (1e-14,1e-14)
    integer :: ik, it, is, ic, ig
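    !The real and imaginary parts of the difference between the two sets of
    !arrays are checked separately against the corresponding parts of tol.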

    loop1: do ik=1,self%naky
       !Skip empty cells, note this is slightly different to skipping
       !.not.is_local. Skipping empty is probably faster but may be more dangerous
       if(self%kyb(ik)%is_empty) cycle loop1
       do is=1,self%kyb(ik)%nsupercell
          if(self%kyb(ik)%supercells(is)%is_empty) cycle
          do ic=1,self%kyb(ik)%supercells(is)%ncell
             if(self%kyb(ik)%supercells(is)%cells(ic)%is_empty) cycle

             it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind
             if(kwork_filter(it,ik)) cycle
             
             if(has_phi) then
                do ig=-ntgrid,ntgrid
                   if(abs(aimag(antot(ig,it,ik)-tempantot(ig,it,ik)))>aimag(tol) .or. abs(real(antot(ig,it,ik)-tempantot(ig,it,ik)))>real(tol)) then
                      write(*,*) iproc,'problem with antot',it,ik,antot(ig,it,ik),tempantot(ig,it,ik)
                      call mp_abort('Problem with antot')
                      exit loop1
                   end if
                end do
             endif
             
             if(has_apar) then
                do ig=-ntgrid,ntgrid
                   if(abs(aimag(antota(ig,it,ik)-tempantota(ig,it,ik)))>aimag(tol) .or. abs(real(antota(ig,it,ik)-tempantota(ig,it,ik)))>real(tol)) then
                      write(*,*) iproc,'problem with antota',it,ik,antota(ig,it,ik),tempantota(ig,it,ik)
                      call mp_abort('Problem with antota')
                       exit loop1
                   end if
                end do
             endif
             
             if(has_bpar)then
                do ig=-ntgrid,ntgrid
                   if(abs(aimag(antotp(ig,it,ik)-tempantotp(ig,it,ik)))>aimag(tol) .or. abs(real(antotp(ig,it,ik)-tempantotp(ig,it,ik)))>real(tol)) then
                      write(*,*) iproc,'problem with antotp',it,ik,antotp(ig,it,ik),tempantotp(ig,it,ik)
                      call mp_abort('Problem with antotp')
                      exit loop1
                   end if
                end do
             end if
          end do
       enddo
    enddo loop1


  end subroutine fm_check_an

  subroutine fm_getfieldeq1_nogath (self,phi, apar, bpar, antot, antota, antotp, &
       fieldeq, fieldeqa, fieldeqp)
    use theta_grid, only: ntgrid, bmag
    use kt_grids, only: naky, ntheta0, kperp2, kwork_filter
    use run_parameters, only: has_phi, has_apar, has_bpar, beta
    use species, only: spec, has_electron_species
    use dist_fn, only: gamtot,gamtot1, gamtot2, fl_avg, apfac
    use dist_fn, only: adiabatic_option_switch, adiabatic_option_fieldlineavg, calculate_flux_surface_average
    implicit none
    class(fieldmat_type), intent(in) :: self
    complex, dimension (-ntgrid:,:,:), intent (in) :: phi, apar, bpar
    complex, dimension (-ntgrid:,:,:), intent (in) :: antot, antota, antotp
    complex, dimension (-ntgrid:,:,:), intent (in out) ::fieldeq,fieldeqa,fieldeqp

    integer :: ik, it, is, ic
    if (.not. allocated(fl_avg)) allocate (fl_avg(ntheta0, naky))
    fl_avg = 0.

    if (.not. has_electron_species(spec)) then
       if (adiabatic_option_switch == adiabatic_option_fieldlineavg) then
          call calculate_flux_surface_average(fl_avg,antot)
       end if
    end if

    !Now loop over cells and calculate field equation as required.
    !Note we only populate the parts of the fieldeq arrays which
    !are local to this processor.
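    !Schematically (optional terms in square brackets):
    !  fieldeq  = antot  - gamtot*phi + fl_avg [+ gamtot1*bpar]
    !  fieldeqa = antota - kperp2*apar
    !  fieldeqp = bpar + (antotp + gamtot2*bpar + 0.5*gamtot1*phi)*beta*apfac/bmag**2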

    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP PRIVATE(ik, it, is, ic) &
    !$OMP SHARED(self, kwork_filter, has_phi, antot, gamtot, fl_avg, &
    !$OMP fieldeq, bpar, gamtot1, has_apar, has_bpar, fieldeqa, antota, kperp2, apar, &
    !$OMP fieldeqp, gamtot2, beta, apfac, bmag, phi, antotp) &
    !$OMP SCHEDULE(static)
    do ik=1,self%naky
       !Skip empty cells, note this is slightly different to skipping
       !.not.is_local. Skipping empty is probably faster but may be more dangerous
       if(self%kyb(ik)%is_empty) cycle
       do is=1,self%kyb(ik)%nsupercell
          if(self%kyb(ik)%supercells(is)%is_empty) cycle
          do ic=1,self%kyb(ik)%supercells(is)%ncell
             if(self%kyb(ik)%supercells(is)%cells(ic)%is_empty) cycle

             it=self%kyb(ik)%supercells(is)%cells(ic)%it_ind

             if(kwork_filter(it,ik)) cycle

             !Now fill in data
             if(has_phi) then
                fieldeq(:,it,ik) = antot(:,it,ik) - &
                     gamtot(:,it,ik)*phi(:,it,ik) + &
                     fl_avg(it, ik)
                if(has_bpar) fieldeq(:,it,ik)=fieldeq(:,it,ik)+bpar(:,it,ik)*gamtot1(:,it,ik)
             endif

             if(has_apar) then
                fieldeqa(:,it,ik) = antota(:,it,ik) - &
                     kperp2(:,it,ik)*apar(:,it,ik)
             endif

             if(has_bpar)then
                fieldeqp(:,it,ik) = bpar(:,it,ik) + &
                     (antotp(:,it,ik)+bpar(:,it,ik)*gamtot2(:,it,ik) + &
                     0.5*phi(:,it,ik)*gamtot1(:,it,ik))*(beta*apfac/bmag**2)
             endif
          enddo
       enddo
    enddo
    !$OMP END PARALLEL DO
  end subroutine fm_getfieldeq1_nogath

!------------------------------------------------------------------------ 

!================
!PROC_DECOMP
!================
  !> Allocate storage space
  subroutine pc_allocate(self)
    use kt_grids, only: naky, ntheta0
    implicit none
    class(pc_type), intent(inout) :: self
    allocate(self%is_local(ntheta0,naky))
    allocate(self%itik_subcom(ntheta0,naky))
    allocate(self%nproc_per_cell(ntheta0,naky))
    allocate(self%nresp_per_cell(ntheta0,naky))
    allocate(self%navail_per_cell(ntheta0,naky))
  end subroutine pc_allocate

  !> Deallocate storage space
  subroutine pc_deallocate(self)
    implicit none
    class(pc_type), intent(inout) :: self
    if(allocated(self%is_local)) deallocate(self%is_local)
    if(allocated(self%itik_subcom)) deallocate(self%itik_subcom)
    if(allocated(self%nproc_per_cell)) deallocate(self%nproc_per_cell)
    if(allocated(self%nresp_per_cell)) deallocate(self%nresp_per_cell)
    if(allocated(self%navail_per_cell)) deallocate(self%navail_per_cell)
  end subroutine pc_deallocate

  !> Debug printing
  subroutine pc_debug_print(self)
    implicit none
    class(pc_type), intent(in) :: self
    write(dlun,'("Proc type debug print. Index nrow=",I0)') &
         self%current_nresp()
  end subroutine pc_debug_print

  !> Return the current number of rows we're responsible for
  function pc_current_nresp(self)
    implicit none
    class(pc_type), intent(in) :: self
    integer :: pc_current_nresp
    pc_current_nresp=sum(self%nresp_per_cell)
  end function pc_current_nresp

  !> Work out which cells are local
  subroutine pc_find_locality(self)
    use gs2_layouts, only: g_lo, ik_idx, it_idx
    use mp, only: sum_allreduce
    implicit none
    class(pc_type),intent(inout) :: self
    integer :: it,ik,iglo

    !Initialise
    self%is_local=0

    !Fill in values, this may be fairly slow
    do iglo=1,g_lo%ikitrange
       ik=g_lo%local_ikit_points(iglo)%ik
       it=g_lo%local_ikit_points(iglo)%it
       self%is_local(it,ik)=1
    end do
!    do iglo=g_lo%llim_proc,g_lo%ulim_proc
!       ik=ik_idx(g_lo,iglo)
!       it=it_idx(g_lo,iglo)
!       self%is_local(it,ik)=1
!    enddo

    !Now count how many procs have each cell
    self%nproc_per_cell=self%is_local
    call sum_allreduce(self%nproc_per_cell)
  end subroutine pc_find_locality

  !> Work out how many rows are available in each cell
  subroutine pc_count_avail(self, fieldmat)
    use kt_grids, only: ntheta0, naky
    implicit none
    class(pc_type),intent(inout) :: self
    class(fieldmat_type), intent(in) :: fieldmat
    integer :: it,ik,is

    !Initialise
    self%navail_per_cell=0

    !Loop over cells
    do ik=1,naky
       do it=1,ntheta0
          !If cell isn't local then cycle
          if(self%is_local(it,ik)/=1) cycle

          !Now find the supercell object with this it,ik
          do is=1,fieldmat%kyb(ik)%nsupercell
             !If this supercell contains this it then store the
             !size of the supercell and exit the loop
             if(fieldmat%kyb(ik)%supercells(is)%has_it(it)) then
                self%navail_per_cell(it,ik)=fieldmat%kyb(ik)%supercells(is)%nrow
                exit
             endif
          enddo
       enddo
    enddo

    !Set the total number of responsible rows
    self%navail_tot=sum(self%navail_per_cell)
  end subroutine pc_count_avail

  !> Determine if we have any cells with passed ik.
  !!
  !! @note This just says whether we can see this index,
  !! not that we have any data there
  function pc_has_ik(self,ik)
    implicit none
    class(pc_type), intent(in) :: self
    integer, intent(in) :: ik
    logical :: pc_has_ik
    pc_has_ik=any(self%is_local(:,ik)==1)
  end function pc_has_ik

  !> Determine if we have any cells with passed it.
  !!
  !! @note This just says whether we can see this index,
  !! not that we have any data there
  function pc_has_it(self,it)
    implicit none
    class(pc_type), intent(in) :: self
    integer, intent(in) :: it
    logical :: pc_has_it
    pc_has_it=any(self%is_local(it,:)==1)
  end function pc_has_it
 
  !> A wrapper routine to do the decomposition
  subroutine pc_make_decomp(self, fieldmat)
    use mp, only: mp_abort
    implicit none
    class(pc_type), intent(inout) :: self
    class(fieldmat_type), intent(in out) :: fieldmat
    character (len=40) :: errmesg

    write(errmesg,'("decomp_type ",I0," not yet implemented.")') self%decomp_type

    !Select the appropriate routine to use
    select case(self%decomp_type)
    case(0)
       !Entirely local and serial
       call self%decomp_all_serial_local(fieldmat)
    case(1)
       !All supercells we can see a bit of are done
       !serially/locally
       call self%decomp_own_serial_local(fieldmat)
    case(2)
       !Just do the cells that we can see
       call self%decomp_owncells_serial_local(fieldmat)
    case(3)
       !Simple mpi with no attempt at load balance
       !or avoiding splitting small blocks
       call self%decomp_owncells_simplempi(fieldmat)
    case(4)
       !Simple mpi with no attempt at load balance
       !but we attempt to avoid splitting small blocks
       call mp_abort(trim(errmesg))
    case default
       !Invalid decomp_type
       write(errmesg,'("Invalid decomp_type : ",I0)') self%decomp_type
       call mp_abort(trim(errmesg))
    end select
  end subroutine pc_make_decomp

  !> A routine to reset the pc object
  subroutine pc_reset(self)
    implicit none
    class(pc_type), intent(inout) :: self
    
    !deallocate
    call self%deallocate

    !Could zero out variables but no real need
  end subroutine pc_reset

  !> A routine to initialise the object
  subroutine pc_init(self, fieldmat)
    implicit none
    class(pc_type), intent(inout) :: self
    class(fieldmat_type), intent(in) :: fieldmat

    !First allocate arrays
    call self%allocate

    !Now find cell locality
    call self%find_locality

    !Count avail data
    call self%count_avail(fieldmat)
  end subroutine pc_init

  !---------------------------
  !Decomposition routines
  !---------------------------

  !> All serial and local || Decomp_type=0
  subroutine pc_decomp_all_serial_local(self, fieldmat)
    implicit none
    class(pc_type), intent(inout) :: self
    class(fieldmat_type), intent(in out) :: fieldmat
    integer :: ik, is, ic, ifq
    integer :: nrow_tmp
    !First of all override previously found is_local array
    !to force all cells to be considered on every processor
    self%is_local=1

    !Now we calculate how much data is available
    call self%count_avail(fieldmat)

    !Set the number of rows responsible
    self%nresp_per_cell=self%navail_per_cell

    !Now loop over all cells, setting the row_llim of
    !their row blocks
    do ik=1,fieldmat%naky
       do is=1,fieldmat%kyb(ik)%nsupercell
          nrow_tmp=fieldmat%kyb(ik)%supercells(is)%nrow
          do ic=1,fieldmat%kyb(ik)%supercells(is)%ncell
             do ifq=1,fieldmat%kyb(ik)%supercells(is)%cells(ic)%nrb
                fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_llim=1
                fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_ulim=nrow_tmp
                call fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%set_nrow
             enddo
          enddo
       enddo
    enddo
  end subroutine pc_decomp_all_serial_local

  !> Serial and local but only for the supercells  || Decomp_type=1
  !! for which we have some data
  subroutine pc_decomp_own_serial_local(self,fieldmat)
    implicit none
    class(pc_type), intent(inout) :: self
    class(fieldmat_type), intent(in out) :: fieldmat
    integer :: ik, is, ic, ifq
    integer :: nrow_tmp
    logical :: have_some

    !First need to work out which supercells we have part of
    do ik=1,fieldmat%naky
       if(.not.any(self%is_local(:,ik)==1)) cycle
       do is=1,fieldmat%kyb(ik)%nsupercell
          have_some=.false.
          !/See if we have any parts of this supercell
          do ic=1,fieldmat%kyb(ik)%supercells(is)%ncell
             if(self%is_local(fieldmat%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik)==1) then
                have_some=.true.
             endif
          enddo
          !/If so then make sure we mark is_local
          if(have_some)then
             do ic=1,fieldmat%kyb(ik)%supercells(is)%ncell
                self%is_local(fieldmat%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik)=1
             enddo
          endif
       enddo
    enddo

    !Now we calculate how much data is available
    call self%count_avail(fieldmat)

    !Set the number of rows responsible
    self%nresp_per_cell=self%navail_per_cell

    !Now loop over all cells, setting the row_llim of
    !their row blocks
    do ik=1,fieldmat%naky
       do is=1,fieldmat%kyb(ik)%nsupercell
          nrow_tmp=fieldmat%kyb(ik)%supercells(is)%nrow
          do ic=1,fieldmat%kyb(ik)%supercells(is)%ncell
             do ifq=1,fieldmat%kyb(ik)%supercells(is)%cells(ic)%nrb
                if(self%is_local(fieldmat%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik)==1) then
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_llim=1
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_ulim=nrow_tmp
                else
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_llim=0
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_ulim=0
                endif
                call fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%set_nrow
             enddo
          enddo
       enddo
    enddo
  end subroutine pc_decomp_own_serial_local

  !> Serial and local but only for the cells  || Decomp_type=2
  !! for which we have some data
  subroutine pc_decomp_owncells_serial_local(self, fieldmat)
    implicit none
    class(pc_type), intent(inout) :: self
    class(fieldmat_type), intent(in out) :: fieldmat
    integer :: ik, is, ic, ifq
    integer :: nrow_tmp

    !/Don't need to change pc%is_local as this is already correct

    !Make sure we do the invert locally as the MPI based version is not implemented!
    self%force_local_invert=.true.

    !Record that we can use the no-gather based getfieldeq
    self%has_to_gather=.false.

    !Now we calculate how much data is available
    call self%count_avail(fieldmat)

    !Set the number of rows responsible
    self%nresp_per_cell=self%navail_per_cell

    !Now loop over all cells, setting the row_llim of
    !their row blocks
    do ik=1,fieldmat%naky
       do is=1,fieldmat%kyb(ik)%nsupercell
          nrow_tmp=fieldmat%kyb(ik)%supercells(is)%nrow
          do ic=1,fieldmat%kyb(ik)%supercells(is)%ncell
             do ifq=1,fieldmat%kyb(ik)%supercells(is)%cells(ic)%nrb
                if(self%is_local(fieldmat%kyb(ik)%supercells(is)%cells(ic)%it_ind,ik)==1) then
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_llim=1
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_ulim=nrow_tmp
                else
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_llim=0
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_ulim=0
                endif
                call fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%set_nrow
             enddo
          enddo
       enddo
    enddo
  end subroutine pc_decomp_owncells_serial_local

  !> Cells are decomposed based on proc || Decomp_type=3
  subroutine pc_decomp_owncells_simplempi(self, fieldmat)
    use mp, only: comm_type, sum_allreduce_sub
    use sorting, only: quicksort
    implicit none
    class(pc_type), intent(inout) :: self
    class(fieldmat_type), intent(in out) :: fieldmat
    integer :: ik, is, ic, ifq, it, ip, llim, ulim
    integer :: nrow_tmp, nrow_loc, remainder, np
    integer, dimension(:), allocatable :: nwork, iproc_work
    type(comm_type) :: supercell_comm
    logical :: cell_is_local

    !/Don't need to change pc%is_local as this is already correct

    !Make sure we do the invert locally as the MPI based version is not implemented!
    self%force_local_invert=.true.

    !Record that we can use the no-gather based getfieldeq
    self%has_to_gather=.false.

    !Now we (re)calculate how much data is available
    call self%count_avail(fieldmat)

    !Initialise the number of rows responsible
    self%nresp_per_cell=0

    !Now loop over all cells, setting the row_llim of
    !their row blocks
    do ik=1,fieldmat%naky
       do is=1,fieldmat%kyb(ik)%nsupercell
          nrow_tmp=fieldmat%kyb(ik)%supercells(is)%nrow
          do ic=1,fieldmat%kyb(ik)%supercells(is)%ncell
             ! If we have this supercell then determine which processors have
             ! this cell and decide which processors get which work. We attempt
             ! to distribute work to those processors with the least amount of
             ! work already allocated.
             if(fieldmat%kyb(ik)%supercells(is)%is_local)then
                ! Copy out a couple of values to make the code more readable
                supercell_comm = fieldmat%kyb(ik)%supercells(is)%cells(ic)%parent_sub
                cell_is_local = fieldmat%kyb(ik)%supercells(is)%cells(ic)%is_local

                ! First determine how many processors in the supercell communicator
                ! have this cell
                np = 0
                if (cell_is_local) np = 1

                call sum_allreduce_sub(np, supercell_comm%id)

                ! Now we can determine what the effective processor rank should be for distributing work.
                 ! Essentially we just want to number the processors with this cell from 0 to np-1 in order of
                ! how little work they currently have.

                 ! First record how much work each processor has, forcing those without this cell to report negative work
                allocate(nwork(0:supercell_comm%nproc-1))
                nwork = 0

                if (cell_is_local) then
                   nwork(supercell_comm%iproc) = self%current_nresp()
                else
                   nwork(supercell_comm%iproc) = -1
                end if

                ! Now all processors in supercell communicator know how much work each processor has
                ! except those which aren't local and hence can't take part.
                call sum_allreduce_sub(nwork, supercell_comm%id)

                ! Now we have to label the processors in sorted order. To do this we first create an array
                ! of sequential processor ids and we then sort this based on the amount of work, as stored in nwork
                allocate(iproc_work(0:supercell_comm%nproc-1))
                iproc_work(:) = [(ip, ip=0, supercell_comm%nproc-1)]

                ! Now sort iproc_work based on nwork
                call quicksort(supercell_comm%nproc, nwork, iproc_work)
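                 ! Illustrative example (assuming quicksort orders nwork ascending
                 ! and applies the same swaps to iproc_work): with
                 ! nwork = (/3, -1, 0/) and iproc_work = (/0, 1, 2/) we obtain
                 ! nwork = (/-1, 0, 3/) and iproc_work = (/1, 2, 0/), so the
                 ! processors holding this cell are ranked by how little work they
                 ! already have, after discounting the leading -1 entries.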

                if (cell_is_local) then
                   ! With 2008 standard can just do:
                   ! ip_expect = findloc(iproc_work, supercell_comm%iproc)
                   ! instead of this loop
                   ! Search the processor list to find where the processor's id is found
                   do ip = 0, supercell_comm%nproc-1
                      if(iproc_work(ip) == supercell_comm%iproc) exit
                   end do

                   ! Here we have to remove any offset in ip arising
                   ! from the processors without this cell. As they
                   ! are assigned -1 work they come at the start of
                   ! the array.
                   ip = ip - count(nwork == -1)
                end if
                deallocate(nwork, iproc_work)
             endif

             !NOTE: By sorting the procs as done above we end up with lots of different procs
             !which are proc0 for at least one supercell. This means that later when we assign
             !the supercell heads we have lots of members, whilst if we try to minimise the number
             !of unique procs which are proc0 for 1 or more supercells then there will be fewer
             !members and the subsequent allreduce will be cheaper.

             !Store it index
             it=fieldmat%kyb(ik)%supercells(is)%cells(ic)%it_ind

             !If local work out limits, if not just set to 0.
             if(fieldmat%kyb(ik)%supercells(is)%cells(ic)%is_local)then

                !Calculate row limits based on cell size, nprocs and
                !proc rank
                !/Simple block size
                nrow_loc=max(ceiling(nrow_tmp*1.0/np),minNRow)
                !By enforcing a minimum value of nrow_loc we can ensure
                !that blocks don't get too small. Note it's easy to work
                !out the limits for any of the other procs *without* comms.
                !Hence should be quite cheap to check how many empty procs
                !we have, how small the smallest block is etc. So shouldn't
                !be too expensive to work out if we should do an extra row
                !or two if it will remove procs with little work.
                remainder=nrow_tmp-nrow_loc*np !Note this should be -ve due to use of ceiling
                
                !Now work out limits
                llim=1+ip*(nrow_loc)
                ulim=llim+nrow_loc-1

                !Now make sure that we don't overstep the cell limits
                if(llim>nrow_tmp) then
                   !If the lower limit is too big then this proc should
                   !be made empty
                   llim=0
                   ulim=0
                else if(ulim>nrow_tmp) then
                   !If just the upper limit is too big then
                   !lower it
                   ulim=nrow_tmp
                endif
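                !Illustrative example (hypothetical sizes): with nrow_tmp=10,
                !np=3 and minNRow=4 we get nrow_loc=4, so ip=0 takes rows 1:4,
                !ip=1 takes rows 5:8 and ip=2 takes rows 9:10 (upper limit
                !clamped to nrow_tmp).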

                !Now loop over row blocks to set index
                do ifq=1,fieldmat%kyb(ik)%supercells(is)%cells(ic)%nrb
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_llim=llim
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_ulim=ulim
                   call fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%set_nrow
                enddo

             else
                do ifq=1,fieldmat%kyb(ik)%supercells(is)%cells(ic)%nrb
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_llim=0
                   fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%row_ulim=0
                   call fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(ifq)%set_nrow
                enddo
             endif

             !Record the number of responsible rows, note we assume all rb have same size
             if(size(fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb)>0)then
                self%nresp_per_cell(it,ik)=fieldmat%kyb(ik)%supercells(is)%cells(ic)%rb(1)%nrow
             else
                self%nresp_per_cell(it,ik)=0
             endif
          enddo
       enddo
    enddo
  end subroutine pc_decomp_owncells_simplempi


!------------------------------------------------------------------------ 

!//////////////////////////////
!// MODULE LEVEL ROUTINES
!//////////////////////////////

  !> Routine to initialise
  subroutine init_fields_local
    use antenna, only: init_antenna
    use theta_grid, only: init_theta_grid
    use kt_grids, only: init_kt_grids
    use gs2_layouts, only: init_gs2_layouts, g_lo
    use mp, only: proc0, mp_abort
    use file_utils, only: error_unit
    implicit none

    !Early exit if possible
    if (initialised) return

    !Exit if local fields not supported (e.g. compiled with pgi)
    if(.not.fields_local_functional()) call mp_abort("field_option='local' not supported with this build")

    !Make sure any dependencies are already initialised and
    !initialise the field matrix object.
    if (debug.and.proc0) write(6,*) "init_fields_local: gs2_layouts"
    call init_gs2_layouts
    if (debug.and.proc0) write(6,*) "init_fields_local: theta_grid"
    call init_theta_grid
    if (debug.and.proc0) write(6,*) "init_fields_local: kt_grids"
    call init_kt_grids
    if (debug.and.proc0) write(6,*) "init_fields_local: init_fields_matrixlocal"
    if(field_local_tuneminnrow) then
       call tuneMinNRow()       
    end if
    call init_fields_matrixlocal(fieldmat, pc)
    if (debug.and.proc0) write(6,*) "init_fields_local: antenna"
    call init_antenna

    !Print a warning message if the x and y dimensions of g_lo are not both local
    if((.not.(g_lo%x_local.and.g_lo%y_local)).and.field_local_allreduce_sub) then
       if(proc0)write(error_unit(),'("Warning : In this run not all procs will hold the full field data (only proc0)")')
    endif

    !Set the initialised state
    initialised = .true.
    reinit = .false.
  end subroutine init_fields_local

  !> Routine used to initialise field matrix data
  subroutine init_fields_matrixlocal(fieldmat, pc, tuning_in)
    use mp, only: proc0, mp_abort, land_allreduce
    use file_utils, only: error_unit
    use optionals, only: get_option_with_default
    implicit none
    class(fieldmat_type), intent(in out) :: fieldmat
    class(pc_type), intent(in out) :: pc
    logical, intent(in), optional :: tuning_in
    logical :: tuning
    logical :: response_was_read
    real :: ts,te

    !Exit if we've already initialised
    if(initialised) return

    tuning = get_option_with_default(tuning_in, .false.)

    if(.not.reinit)then
       !Use the fieldmat init routine to create fieldmat
       !structure and fill some of the basic variables
       !if(debug)call barrier
       if(proc0.and.debug) then
          write(dlun,'("Initialising fieldmat.")')
          call cpu_time(ts)
       endif
       call fieldmat%init
       !if(debug)call barrier
       if(proc0.and.debug) then 
          call cpu_time(te)
          write(dlun,'("--Done in ",F12.4," units")') te-ts
       endif

       !Now initialise the parallel decomposition object
       !Note: We don't actually decompose until we make some
       !of the primary subcoms so we know how many procs are 
       !available to each object etc.
       !if(debug)call barrier
       if(proc0.and.debug) then
          write(dlun,'("Initialising pc.")')
          call cpu_time(ts)
       endif
       call pc%init(fieldmat)
       !if(debug)call barrier
       if(proc0.and.debug) then 
          call cpu_time(te)
          write(dlun,'("--Done in ",F12.4," units")') te-ts
       endif

       !Now use the pc object to set field object locality
       !and make the primary subcommunicators
       !if(debug)call barrier
       if(proc0.and.debug) then
          write(dlun,'("Setting initial locality and making primary subcommunicators.")')
          call cpu_time(ts)
       endif
       call fieldmat%set_is_local(pc)
       call fieldmat%make_subcom_1
       !if(debug)call barrier
       if(proc0.and.debug) then 
          call cpu_time(te)
          write(dlun,'("--Done in ",F12.4," units")') te-ts
       endif

       !Now we can setup the decomposition, which basically
       !consists of setting the row limit of row block objects
       !if(debug)call barrier
       if(proc0.and.debug) then
          write(dlun,'("Initialising decomposition.")')
          call cpu_time(ts)
       endif
       call pc%make_decomp(fieldmat)
       !if(debug)call barrier
       if(proc0.and.debug) then 
          call cpu_time(te)
          write(dlun,'("--Done in ",F12.4," units")') te-ts
       endif

       !Now set fieldmat locality and allocate data arrays
       !if(debug)call barrier
       if(proc0.and.debug) then
          write(dlun,'("Setting locality and allocating.")')
          call cpu_time(ts)
       endif
       call fieldmat%set_is_local(pc)
       call fieldmat%allocate
       !if(debug)call barrier
       if(proc0.and.debug) then 
          call cpu_time(te)
          write(dlun,'("--Done in ",F12.4," units")') te-ts
       endif

       !Need to make cell/supercell level sub-communicators
       !at some point before we try to invert.
       !Can make them at any point after locality defined

       !/Note that to split a sub-comm everybody in the original
       !communicator needs to make the call. We can either split
       !all groups off from WORLD or we can take "nested subsets"
       !Not sure if there is any performance benefit in the actual
       !comms (I'd guess not) but it should be faster to split smaller
       !groups and can do it in parallel so the nested subset approach
       !may be better.
       !if(debug)call barrier
       if(proc0.and.debug)then
          write(dlun,'("Creating the secondary subcoms.")')
          call cpu_time(ts)
       endif
       call fieldmat%make_subcom_2

       call fieldmat%calc_sc_heads

       !if(debug)call barrier
       if(proc0.and.debug) then 
          call cpu_time(te)
          write(dlun,'("--Done in ",F12.4," units")') te-ts
       endif
    endif
!!!!!!!!!!
!!NOTE: All of the above should be a one-off setup cost unless something
!!fundamental changes with the parallel layout (a change in nproc/layout etc.)
!!The remaining two tasks (populate+prepare) must be repeated each time the
!!relevant physics and numerical parameters change. In practice this just means
!!when the time step changes.
!!!!!!!!!!
    if(.not. tuning) then

       response_was_read = .false.

       !Now fill the fieldmat with data
       if(read_response)then
          if(proc0.and.debug)then
             write(dlun,'("Populating from dump.")')
             call cpu_time(ts)
          endif
          call read_response_from_file_local(fieldmat, could_read = response_was_read)
          if(proc0.and.debug) then
             call cpu_time(te)
             write(dlun,'("--Done in ",F12.4," units")') te-ts
          endif

          ! Ensure we reduce response_was_read across all processors so if _any_
          ! processors couldn't read the matrices we recalculate on all processors.
          call land_allreduce(response_was_read)
          if((.not. response_was_read) .and. proc0) write(error_unit(), &
               '("Could not find response file so reverting to calculation.")')

       end if

       !If we weren't able to read the response file, or didn't try to,
       !then we need to calculate the response matrix instead
       if (.not. response_was_read) then
          !if(debug)call barrier
          if(proc0.and.debug)then
             write(dlun,'("Populating.")')
             call cpu_time(ts)
          endif
          call fieldmat%populate(pc)
          !if(debug)call barrier
          if(proc0.and.debug) then
             call cpu_time(te)
             write(dlun,'("--Done in ",F12.4," units")') te-ts
          endif
          
          !Now prepare the response matrices
          !Note: Currently prepare just means invert
          !but at some point we may wish to LU decompose
          !or other approaches etc.
          !if(debug)call barrier
          if(proc0.and.debug) then
             write(dlun,'("Preparing.")')
             call cpu_time(ts)
          endif
          call fieldmat%prepare(pc)
          !if(debug)call barrier
          if(proc0.and.debug) then 
             call cpu_time(te)
             write(dlun,'("--Done in ",F12.4," units")') te-ts
          endif
          call fieldmat%get_condition_numbers
       endif

       !Dump response to file
       if(dump_response)then
          if(proc0.and.debug)then
             write(dlun,'("Dumping to file.")')
             call cpu_time(ts)
          endif
          call dump_response_to_file_local(fieldmat)
          if(proc0.and.debug) then
             call cpu_time(te)
             write(dlun,'("--Done in ",F12.4," units")') te-ts
          endif
       endif

    endif
    !Now write debug data
!    call fieldmat%write_debug_data
  end subroutine init_fields_matrixlocal

  !> Reset the fields_local module
  subroutine reset_fields_local
    implicit none
    initialised=.false.
    reinit=.true.
  end subroutine reset_fields_local

  !> Finish the fields_local module
  subroutine finish_fields_local()
    implicit none
    call pc%reset
    call fieldmat%reset
    reinit = .false.
    initialised=.false.
  end subroutine finish_fields_local

  !> Calculate the update to the fields
  subroutine getfield_local(phi,apar,bpar,do_gather_in,do_update_in)
    use unit_tests, only: debug_message
    use optionals, only: get_option_with_default
    implicit none
    complex, dimension(:,:,:), intent(out) :: phi,apar,bpar !Note, these are actually phinew,... in typical usage
    logical, optional, intent(in) :: do_gather_in, do_update_in
    logical :: do_gather, do_update
    integer, parameter :: verb = 4

    !Set gather flag, this currently always needs to be true for
    !correct operation.
    do_gather = get_option_with_default(do_gather_in, .false.)
    do_update = get_option_with_default(do_update_in, .false.)

    call debug_message(verb, &
        'fields_local::getfield_local calling get_field_update')
    !Use fieldmat routine to calculate the field update
    call fieldmat%get_field_update(phi,apar,bpar,pc)

    call debug_message(verb, &
        'fields_local::getfield_local calling gather_fields')
    !Gather to proc0 if requested
    !NOTE: We currently calculate omega at every time step so we
    !actually need to gather every time, which is a pain!
    !We also fill in the empties here.
    do_gather = .true.
    if(do_gather) call fieldmat%gather_fields(phi,apar,bpar,&
         to_all_in=.false.,do_allreduce_in=field_local_allreduce)

    call debug_message(verb, &
        'fields_local::getfield_local calling update_fields')
    !This routine updates *new fields using gathered update
    if(do_update) call fieldmat%update_fields(phi,apar,bpar)

    call debug_message(verb, &
        'fields_local::getfield_local finished')
  end subroutine getfield_local

  !> Initialise the fields from the initial g, just uses the
  !! fields_implicit routine
  subroutine init_allfields_local
    use fields_implicit, only: init_allfields_implicit
    implicit none
    !EGH Note that this will fail if someone has set
    ! the parameter new_field_init in init_g to .false.
    ! Add a warning/check?
    call init_allfields_implicit
  end subroutine init_allfields_local

  !> This routine advances the solution by one full time step
  subroutine advance_local(istep, remove_zonal_flows_switch)
    use run_parameters, only: has_phi, has_apar, has_bpar, reset
    use fields_implicit, only: remove_zonal_flows
    use fields_arrays, only: phi, apar, bpar, phinew, time_field
    use fields_arrays, only: aparnew, bparnew, apar_ext, time_field_mpi
    use dist_fn, only: timeadv, exb_shear, g_exb, g_exbfac
    use dist_fn, only: collisions_advance
    use dist_fn_arrays, only: g, gnew
    use antenna, only: antenna_amplitudes, no_driver
    use gs2_layouts, only: g_lo
    use unit_tests, only: debug_message
    use mp, only: get_mp_times
    use job_manage, only: time_message
    use array_utils, only: copy
    implicit none
    integer, intent(in) :: istep
    logical, intent(in) :: remove_zonal_flows_switch
    integer, parameter :: diagnostics = 1
    logical, parameter :: do_gather=.true.
    logical :: do_update
    integer, parameter :: verb = 4
    real :: mp_total, mp_total_after
    !do_gather=.true. => fields are collected from all procs and distributed to everyone (required)
    !do_update=.true. => "Smart" update routines are used which only update the local parts of the
    !field arrays. This could help when xy are strongly parallelised but is likely to be slower
    !when all local, hence we should turn it off if xy all local.
    do_update=do_smart_update

    if(g_lo%x_local.and.g_lo%y_local) do_update=.false.

    !GGH NOTE: apar_ext is initialized in this call
    if(.not.no_driver) call antenna_amplitudes (apar_ext)

    call debug_message(verb, &
      'fields_local::advance_local calling g_exb')
    !Apply flow shear if active
    if(abs(g_exb*g_exbfac)>epsilon(0.)) call exb_shear(gnew,phinew,aparnew,bparnew,istep,field_local_allreduce_sub)

    !Update g and fields
    call copy(gnew,g)
    if(do_update)then !Here we tie the "smart" update to do_update
       call fieldmat%update_fields_newstep
    else
       if (has_phi) call copy(phinew, phi)
       if (has_apar) call copy(aparnew, apar)
       if (has_bpar) call copy(bparnew, bpar)
    endif

    call debug_message(verb, &
      'fields_local::advance_local calling timeadv 1')

    !Find gnew given fields at time step midpoint
    call timeadv (phi, apar, bpar, phinew, &
         aparnew, bparnew, istep)
    if(reset) return !Return if resetting

    !Add in antenna driving if present
    !<DD>TAGGED: Should we only do this if fapar>0 as well?
    if(.not.no_driver) aparnew=aparnew+apar_ext

    call debug_message(verb, &
         'fields_local::advance_local calling getfield_local')

    !For timing
    !if(debug)call barrier !!/These barriers influence the reported time
    call time_message(.false.,time_field,' Field Solver')
    call get_mp_times(total_time = mp_total)

    !Calculate the fields at the next time point
    call getfield_local(phinew,aparnew,bparnew,do_gather,do_update)

    !If we do the update in getfield_local don't do it here
    if(.not.do_update)then
       if (has_phi) phinew = phinew + phi
       if (has_apar) aparnew = aparnew + apar
       if (has_bpar) bparnew = bparnew + bpar
    endif

    !For timing
    !Barrier usage ensures that proc0 measures the longest time taken on
    !any proc.
    !if(debug)call barrier !!/These barriers influence the reported time
    call time_message(.false.,time_field,' Field Solver')
    call get_mp_times(total_time = mp_total_after)
    time_field_mpi = time_field_mpi + (mp_total_after - mp_total)

    !Remove zonal component if requested
    if(remove_zonal_flows_switch) call remove_zonal_flows

    call debug_message(verb, &
      'fields_local::advance_local calling timeadv 2')
    !Evolve gnew to next half time point
    call timeadv (phi, apar, bpar, phinew, &
         aparnew, bparnew, istep, diagnostics) 

    ! Advance collisions, if separate from timeadv
    call collisions_advance (phi, bpar, phinew, aparnew, bparnew, diagnostics)

  end subroutine advance_local

  subroutine dump_response_to_file_local_nopass(suffix)
    implicit none
    !> If passed then use as part of file suffix
    character(len=*), optional, intent(in) :: suffix
    call dump_response_to_file_local(fieldmat, suffix)
  end subroutine dump_response_to_file_local_nopass

  !> Routine to dump the current response matrix data
  !! to file. One file per connected domain. Each written
  !! by the head of the supercell.
  subroutine dump_response_to_file_local_passed(fieldmat, suffix)
    implicit none
    class(fieldmat_type), intent(in out) :: fieldmat
    !> If passed then use as part of file suffix
    character(len=*), optional, intent(in) :: suffix
    integer :: ik, is

    !First we have to pull data so that the head of each supercell has full data
    !Currently we are lazy and just ensure all procs in the supercell have full data
    do ik=1,fieldmat%naky
       !If ik isn't local then cycle
       if(.not.fieldmat%kyb(ik)%is_local) cycle

       !Loop over supercells
       do is=1,fieldmat%kyb(ik)%nsupercell
          !If is isn't local then cycle
          if(.not.fieldmat%kyb(ik)%supercells(is)%is_local) cycle

          !Call supercell bound method to dump response
          call fieldmat%kyb(ik)%supercells(is)%dump_to_file(suffix)
       enddo
    enddo
  end subroutine dump_response_to_file_local_passed

  !> Routine to read the current response matrix data
  !> from file dump. One file per connected domain. Each written
  !> by the head of the supercell.
  !> NOTE: Have to have setup communicators etc.
  subroutine read_response_from_file_local(fieldmat, could_read, suffix)
    implicit none
    class(fieldmat_type), intent(in out) :: fieldmat
    !> Flag to indicate if all files successfully read
    logical, intent(out) :: could_read
    !> If passed then use as part of file suffix
    character(len=*), optional, intent(in) :: suffix
    integer :: ik, is

    could_read = .true.

    !First we have to pull data so that the head of each supercell has full data
    !Currently we are lazy and just ensure all procs in the supercell have full data
    do ik=1,fieldmat%naky
       !If ik isn't local then cycle
       if(.not.fieldmat%kyb(ik)%is_local) cycle

       !Loop over supercells
       do is=1,fieldmat%kyb(ik)%nsupercell
          !If is isn't local then cycle
          if (.not.fieldmat%kyb(ik)%supercells(is)%is_local) cycle

          call fieldmat%kyb(ik)%supercells(is)%read_from_file(could_read = could_read, suffix = suffix)
          ! If we failed to read this file then leave immediately
          if (.not. could_read) return
       end do
    end do
  end subroutine read_response_from_file_local

  !> Returns true if GS2 was built in such a way
  !! as to allow this module to work.
  !! Currently does not work with PGI compilers. 
  !! See online discussions.
  function fields_local_functional()
    use runtime_tests, only: compiler_pgi
    logical :: fields_local_functional
    fields_local_functional = (.not. compiler_pgi())
  end function fields_local_functional

  !> Calculate the L2/Frobenius/Euclidean norm of a 2D complex array
  real function l2norm(matrix)
    !> The matrix to calculate the norm of
    complex, dimension(:, :), intent(in) :: matrix
    !> The shape of the matrix
    integer, dimension(2) :: matrix_shape
    !> Indices for indexing the matrix
    integer :: i, j
    l2norm = 0.
    matrix_shape = shape(matrix)
    do j = 1, matrix_shape(2)
       do i = 1, matrix_shape(1)
          l2norm = l2norm + real(matrix(i,j) * conjg(matrix(i,j)))
       end do
    end do
    l2norm = sqrt(l2norm)

    ! We could write all of the above as
    !    l2norm = sqrt(sum(abs(matrix)**2))
    ! but this could run over the matrix twice
    ! and require a matrix sized temporary. Our
    ! approach avoids this. We could have an intermediate
    ! approach of
    !  do j = 1, matrix_shape(2)
    !     l2norm = l2norm + real(sum(matrix(:,j) * conjg(matrix(:,j))))
    !  end do
    !  l2norm = sqrt(l2norm)
    ! which might offer a bit of a middle ground.
    ! Also note that a norm2 intrinsic was introduced in Fortran 2008
    ! but that only accepts real valued arguments.
  end function l2norm

end module fields_local