split_nonlinear_terms.fpp Source File


Contents


Source Code

!> A module to deal with advancing the nonlinear term
!> separately from the linear terms.
module split_nonlinear_terms
  use abstract_config, only: abstract_config_type, CONFIG_MAX_NAME_LEN
  use rk_schemes, only: rk_scheme_type
  implicit none
  private

  public :: init_split_nonlinear_terms, finish_split_nonlinear_terms, set_split_nonlinear_terms_config
  public :: advance_nonlinear_term, advance_nonadiabatic_dfn, strang_split
  public :: split_nonlinear_terms_config_type, get_split_nonlinear_terms_config

  logical :: initialized = .false.

  !> Statistics for the number of steps taken in the integrators
  integer :: nfailed_steps = 0, nrhs_calls = 0, nsuccessful_steps = 0

  !> Interface to describe the call signatures for routines to calculate the
  !> field from gnew (invert_field_func) and to calculate the source term used
  !> in the advance_nonlinear_term methods. Used to allow us to pass in these
  !> methods, giving us a way to test the integrators on different problems.
  interface
     subroutine invert_field_func (g_in, phi, apar, bpar, gf_lo, local_only)
       implicit none
       complex, dimension (:,:,:), intent(in) :: g_in
       complex, dimension (:,:,:), intent (out) :: phi, apar, bpar
       logical, intent(in), optional :: gf_lo, local_only
     end subroutine invert_field_func

     subroutine source_term_func(g_in, g1, phi, apar, bpar, max_vel_local, &
          need_to_adjust, calculate_cfl_limit)
       implicit none
       complex, dimension (:,:,:), intent (in) :: g_in
       complex, dimension (:,:,:), intent (out) :: g1
       complex, dimension (:,:,:), intent (in) :: phi, apar, bpar
       real, intent(out) :: max_vel_local
       logical, intent(in) :: need_to_adjust
       logical, intent(in), optional :: calculate_cfl_limit
     end subroutine source_term_func
  end interface

  ! Related to inputs
  integer :: split_method_switch
  integer, parameter :: split_method_ab3 = 1, split_method_backwards_euler = 2, &
       split_method_rk = 3, split_method_picard = 4
  type(rk_scheme_type) :: the_rk_scheme
  logical :: show_statistics, advance_nonadiabatic_dfn, strang_split
  real :: convergence_tolerance, time_step_safety_factor
  real :: relative_tolerance, absolute_tolerance

  !> Used to represent the input configuration of split_nonlinear_terms
  type, extends(abstract_config_type) :: split_nonlinear_terms_config_type
     ! namelist : split_nonlinear_terms_knobs
     ! indexed : false
     !> The absolute tolerance used in error controlled schemes. Attempt to adjust
     !> time step to keep error below this tolerance.
     real :: absolute_tolerance = 1.0e-3
     !> If true then nonlinear integrator advances the nonadiabatic dfn, h,
     !> rather than the modified distribution function, g.
     logical :: advance_nonadiabatic_dfn = .false.
     !> The tolerance used in convergence checks. Currently only used
     !> for halting the fixed-point iteration in the beuler method
     real :: convergence_tolerance = 1.0e-1
     !> The relative tolerance used in error controlled schemes. Attempt to adjust
     !> time step to keep error below this tolerance.
     real :: relative_tolerance = 1.0e-2
     !> Choose which specific RK scheme to use in the RK method.
     !> Generally if the tolerances are small a higher order scheme
     !> will be more effective than low order. At loose tolerances
     !> low order schemes are generally more efficient
     !> Valid options include:
     !>
     !>  - 'cashkarp' -- Cash-Karp scheme of order 4(5)
     !>  - 'default' -- Same as heun
     !>  - 'heun' -- Heun scheme of order 1(2)
     !>
     !> See [[rk_schemes]] for other options.
     character(len = 20) :: rk_method = 'default'
     !> If true then reports the number of right hand side (NL term) calculations,
     !> the number of failed internal steps and the number of successful steps at the
     !> end of each linear size step.
     logical :: show_statistics = .false.
     !> What algorithm do we use to advance the nonlinear term if
     !> split_nonlinear is true.
     !> Valid options include:
     !>
     !>  - 'AB3' -- Adams-Bashforth (up to) 3rd order. CLF controlled time step.
     !>  - 'beuler' -- Backwards Euler. Relative and absolute error controlled time step.
     !>            Newton iteration controlled by convergence_tolerance.
     !>  - 'default' -- The same as 'RK'.
     !>  - 'RK' -- RK scheme with embedded error estimate. Exact scheme can be switched out
     !>            using rk_method switch. Relative and absolute error controlled time step.
     !>  - 'picard' -- Use simple Picard iteration. Not generally recommended but available
     !>            for experimentation.
     !>
     character(len = 20) :: split_method = 'default'
     !> If true then indicates that we want to use a strang split approach where
     !> we advance the NL operator by a half step, linear by a full step and then
     !> NL by another half step. This _should_ be second order accurate whilst the
     !> regular split is only first order accurate.
     logical :: strang_split = .false.
     !> Multiplies the new time step calculated from either the error or
     !> cfl limits. Smaller values are more conservative but may help avoid
     !> repeated violation of the error/cfl limits and hence reduce the
     !> number of failed steps.
     real :: time_step_safety_factor = 0.9
   contains
     procedure, public :: read => read_split_nonlinear_terms_config
     procedure, public :: write => write_split_nonlinear_terms_config
     procedure, public :: reset => reset_split_nonlinear_terms_config
     procedure, public :: broadcast => broadcast_split_nonlinear_terms_config
     procedure, public, nopass :: get_default_name => get_default_name_split_nonlinear_terms_config
     procedure, public, nopass :: get_default_requires_index => get_default_requires_index_split_nonlinear_terms_config
  end type split_nonlinear_terms_config_type

  type(split_nonlinear_terms_config_type) :: split_nonlinear_terms_config

contains

  !> Initialises the split_nonlinear_terms module. Primarily just reading input
  subroutine init_split_nonlinear_terms(split_nonlinear_terms_config_in)
    implicit none
    type(split_nonlinear_terms_config_type), intent(in), optional :: split_nonlinear_terms_config_in

    if (initialized) return
    initialized = .true.

    call read_parameters(split_nonlinear_terms_config_in)

  end subroutine init_split_nonlinear_terms

  !> Read the split_nonlinear_terms namelist
  subroutine read_parameters(split_nonlinear_terms_config_in)
    use file_utils, only: error_unit
    use text_options, only: text_option, get_option_value
    use rk_schemes, only: get_rk_schemes_as_text_options, get_rk_scheme_by_id
    implicit none
    type(split_nonlinear_terms_config_type), intent(in), optional :: split_nonlinear_terms_config_in
    type(text_option), dimension (*), parameter :: split_method_opts = &
         [ &
         text_option('default', split_method_rk), &
         text_option('AB3', split_method_ab3), &
         text_option('RK', split_method_rk), &
         text_option('beuler', split_method_backwards_euler),  &
         text_option('picard', split_method_picard)  &
         ]
    character(len = 20) :: rk_method, split_method
    integer :: ierr, rk_method_switch

    if (present(split_nonlinear_terms_config_in)) split_nonlinear_terms_config = split_nonlinear_terms_config_in

    call split_nonlinear_terms_config%init(name = 'split_nonlinear_terms_knobs', requires_index = .false.)

    ! Copy out internal values into module level parameters
    associate(self => split_nonlinear_terms_config)
#include "split_nonlinear_terms_copy_out_auto_gen.inc"
    end associate

    ierr = error_unit()

    call get_option_value &
         (split_method, split_method_opts, split_method_switch, &
         ierr, "split_method in split_nonlinear_terms_knobs",.true.)

    call get_option_value &
         (rk_method, get_rk_schemes_as_text_options(), rk_method_switch, &
         ierr, "rk_method in split_nonlinear_terms_knobs",.true.)
    the_rk_scheme = get_rk_scheme_by_id(rk_method_switch)
  end subroutine read_parameters

  !> Reset the module, freeing memory etc.
  subroutine finish_split_nonlinear_terms
    implicit none
    initialized = .false.
    call split_nonlinear_terms_config%reset()
  end subroutine finish_split_nonlinear_terms

  !> Reset the step statistics
  subroutine reset_step_statistics
    nfailed_steps = 0 ; nrhs_calls = 0 ; nsuccessful_steps = 0
  end subroutine reset_step_statistics

  subroutine report_step_statistics(istep, unit_in)
    use iso_fortran_env, only: output_unit
    use optionals, only: get_option_with_default
    implicit none
    integer, intent(in) :: istep
    integer, intent(in), optional :: unit_in
    integer :: unit
    unit = get_option_with_default(unit_in, output_unit)
    write(unit,'("Iteration : ",I0," Number of internal steps : ",I0," Number of failed steps : ",I0," Number of RHS calls : ",I0)') &
         istep, nsuccessful_steps, nfailed_steps, nrhs_calls
  end subroutine report_step_statistics

  !> Advances dg/dt = NL(g,chi) from g_state, chi from t -> t+dt by calling
  !> specific requested method.
  !> On output g_state contains new g at next step, chi is left unchanged
  subroutine advance_nonlinear_term(g_state, istep, phi, apar, bpar, &
       dt, fields_func, source_func)
    use dist_fn_arrays, only: g_adjust, to_g_gs2, from_g_gs2
    use mp, only: mp_abort, get_mp_times, proc0
    use job_manage, only: time_message
    use nonlinear_terms, only: time_add_explicit_terms, time_add_explicit_terms_mpi
    use run_parameters, only: has_phi, has_apar, has_bpar
    use array_utils, only: copy
    implicit none
    integer, intent(in) :: istep
    complex, dimension (:,:,:), intent(in out) :: g_state
    complex, dimension (:,:,:), intent(in) :: phi, apar, bpar
    real, intent(in) :: dt
    complex, dimension (:,:,:), allocatable :: phinew, aparnew, bparnew
    real :: mp_total_after, mp_total
    procedure(invert_field_func) :: fields_func
    procedure(source_term_func) :: source_func

    call time_message(.false., time_add_explicit_terms, 'Explicit terms')
    call get_mp_times(total_time = mp_total)
    call reset_step_statistics

    ! Copy current fields to local copies
    allocate(phinew, mold = phi)
    allocate(aparnew, mold = apar)
    allocate(bparnew, mold = bpar)
    if (has_phi) call copy(phi, phinew)
    if (has_apar) call copy(apar, aparnew)
    if (has_bpar) call copy(bpar, bparnew)

    ! If we want to advance h rather than g then adjust
    ! here to get h. Note we rely on the call site also
    ! providing the correct fields_func to calculate fields
    ! from h rather than g. A different option would be
    ! to require the call site to always pass a way to calculate
    ! from g and a way from h and we then decide here which to use.
    if (advance_nonadiabatic_dfn) then
       call g_adjust(g_state, phinew, bparnew, direction = from_g_gs2)
    end if

    ! Call requested method for advancing nonlinear term
    select case(split_method_switch)
    case(split_method_ab3)
       call advance_nonlinear_term_ab3(g_state, phinew, aparnew, bparnew, dt, &
            fields_func, source_func)
    case(split_method_backwards_euler)
       call advance_nonlinear_term_beuler(g_state, phinew, aparnew, bparnew, dt, &
            fields_func, source_func)
    case(split_method_rk)
       call advance_nonlinear_term_rk(g_state, phinew, aparnew, bparnew, dt, &
            fields_func, source_func)
    case(split_method_picard)
       call advance_nonlinear_term_picard(g_state, phinew, aparnew, bparnew, dt, &
            fields_func, source_func)
    case default
       call mp_abort("Invalid split_method_switch", .true.)
    end select

    if (proc0 .and. show_statistics) call report_step_statistics(istep)

    ! If we advanced h then we need to reconstruct g from this
    if (advance_nonadiabatic_dfn) then
       call g_adjust(g_state, phi, bpar, direction = to_g_gs2)
    end if

    ! Now that we've advanced the explicit terms by dt and set the
    ! current g_state to this new result. Note that we do not update the
    ! fields, instead resetting them to their original values. The
    ! update to the fields is calculated as a part of the implicit
    ! field solve and including the NL contribution here leads to
    ! "double counting".

    call time_message(.false., time_add_explicit_terms, 'Explicit terms')
    call get_mp_times(total_time = mp_total_after)
    time_add_explicit_terms_mpi = time_add_explicit_terms_mpi + (mp_total_after - mp_total)
  end subroutine advance_nonlinear_term

  !> Advances dg/dt = NL(g,chi) from current g_state, chi from t -> t+dt by using
  !> AB scheme of orders up to 3rd.
  subroutine advance_nonlinear_term_ab3(g_state, phinew, aparnew, bparnew, dt, &
       fields_func, source_func)
    use theta_grid, only: ntgrid
    use gs2_time, only: get_adams_bashforth_coefficients
    use gs2_time, only: code_dt_min, dt_not_set, code_dt_prev1, code_dt_prev2
    use mp, only: mp_abort, max_allreduce
    use gs2_layouts, only: g_lo
    use array_utils, only: copy, zero_array
    implicit none
    complex, dimension (-ntgrid:, :, g_lo%llim_proc:), intent(in out) :: g_state
    complex, dimension (-ntgrid:,:,:), intent (in out) :: phinew, aparnew, bparnew
    !> Arrays to hold the nonlinear source history similar to gexp_1/2/3
    complex, dimension(:, :, :), allocatable :: gnl_1, gnl_2, gnl_3
    real, intent(in) :: dt
    real :: internal_time, time_step, max_vel, half_time
    real, dimension(:), allocatable :: ab_coeffs
    integer :: counter, iglo
    procedure(invert_field_func) :: fields_func
    procedure(source_term_func) :: source_func

    ! Initialise internal variables
    internal_time = 0.0
    counter = 1

    allocate(gnl_1(-ntgrid:ntgrid, 2, g_lo%llim_proc:g_lo%ulim_alloc)) ; call zero_array(gnl_1)
    allocate(gnl_2(-ntgrid:ntgrid, 2, g_lo%llim_proc:g_lo%ulim_alloc)) ; call zero_array(gnl_2)
    allocate(gnl_3(-ntgrid:ntgrid, 2, g_lo%llim_proc:g_lo%ulim_alloc)) ; call zero_array(gnl_3)

    ! Reset time step sizes to reflect fact we're restarting the explicit scheme
    code_dt_prev1 = dt_not_set
    code_dt_prev2 = dt_not_set

    ! Keep taking explicit steps until we have advanced by dt
    do while (internal_time < dt)
       ! Get the current nonlinear source term and cfl limit
       call source_func(g_state, gnl_1, phinew, aparnew, bparnew, &
            max_vel, need_to_adjust = .not. advance_nonadiabatic_dfn, &
            calculate_cfl_limit = .true.)
       nrhs_calls = nrhs_calls + 1

       ! Get the maximum velocity across all processors
       ! and convert to time_step estimate.
       call max_allreduce(max_vel)
       time_step = 1.0 / max_vel
       time_step = time_step * time_step_safety_factor

       ! If the cfl time step is too small then abort
       if (time_step < code_dt_min) call mp_abort("Time step has become too small in advance_nonlinear_term.", .true.)

       ! Now make sure we're don't overshoot the target time by limiting time step.
       time_step = min(time_step, dt - internal_time)

       ! Get the Adams-Bashforth coefficients for the current collection of time step sizes
       ab_coeffs = get_adams_bashforth_coefficients(target_dt = time_step)

       half_time = 0.5 * time_step

       ! Advance solution by time_step
       select case(counter)
       case(1)
          !$OMP PARALLEL DO DEFAULT(none) &
          !$OMP PRIVATE(iglo) &
          !$OMP SHARED(g_lo, g_state, half_time, ab_coeffs, gnl_1) &
          !$OMP SCHEDULE(static)
          do iglo = g_lo%llim_proc, g_lo%ulim_proc
             g_state(:, :, iglo) = g_state(:, :, iglo) + half_time *( &
                  ab_coeffs(1) * gnl_1(:, :, iglo)       &
                  )
          end do
          !$OMP END PARALLEL DO
       case(2)
          !$OMP PARALLEL DO DEFAULT(none) &
          !$OMP PRIVATE(iglo) &
          !$OMP SHARED(g_lo, g_state, half_time, ab_coeffs, gnl_1, gnl_2) &
          !$OMP SCHEDULE(static)
          do iglo = g_lo%llim_proc, g_lo%ulim_proc
             g_state(:, :, iglo) = g_state(:, :, iglo) + half_time *( &
                  ab_coeffs(1) * gnl_1(:, :, iglo) +     &
                  ab_coeffs(2) * gnl_2(:, :, iglo)       &
                  )
          end do
          !$OMP END PARALLEL DO
       case default
          !$OMP PARALLEL DO DEFAULT(none) &
          !$OMP PRIVATE(iglo) &
          !$OMP SHARED(g_lo, g_state, half_time, ab_coeffs, gnl_1, gnl_2, gnl_3) &
          !$OMP SCHEDULE(static)
          do iglo = g_lo%llim_proc, g_lo%ulim_proc
             g_state(:, :, iglo) = g_state(:, :, iglo) + half_time *( &
                  ab_coeffs(1) * gnl_1(:, :, iglo) +     &
                  ab_coeffs(2) * gnl_2(:, :, iglo) +     &
                  ab_coeffs(3) * gnl_3(:, :, iglo)       &
                  )
          end do
          !$OMP END PARALLEL DO
       end select

       ! Calculate the fields consistent with the new g
       call calculate_fields(fields_func, g_state, phinew, aparnew, bparnew)

       ! Increment internal state
       internal_time = internal_time + time_step
       counter = counter + 1
       nsuccessful_steps = nsuccessful_steps + 1

       ! Shift time history along one point
       code_dt_prev2 = code_dt_prev1
       code_dt_prev1 = time_step
       call copy(gnl_2, gnl_3)
       call copy(gnl_1, gnl_2)
    end do

  end subroutine advance_nonlinear_term_ab3

  !> Advances dg/dt = NL(g,chi) from current g_state, chi from t -> t+dt by using
  !> backwards Euler with fixed-point iteration.
  subroutine advance_nonlinear_term_beuler(g_state, phinew, aparnew, bparnew, dt, &
       fields_func, source_func)
    use theta_grid, only: ntgrid
    use gs2_time, only: code_dt_min
    use mp, only: mp_abort, max_allreduce, proc0
    use run_parameters, only: immediate_reset
    use gs2_layouts, only: g_lo
    use array_utils, only: copy, gs2_max_abs, zero_array
    use warning_helpers, only: exactly_equal
    implicit none
    complex, dimension (-ntgrid:, :, g_lo%llim_proc:), intent(in out) :: g_state
    complex, dimension (-ntgrid:,:,:), intent (in out) :: phinew, aparnew, bparnew
    complex, dimension(:, :, :), allocatable :: g_cur, g_local, gnl_1, gnl_2
    real, intent(in) :: dt
    real :: internal_time, time_step, max_vel
    integer :: counter, iglo
    integer, parameter :: iteration_limit = 20
    real, parameter :: time_adjust_factor = 2
    logical, parameter :: debug = .false.
    real :: inverse_error, error, actual_dt, half_time, cfl_time_step
    real, dimension(3) :: errors
    real, save :: time_step_last = -1
    procedure(invert_field_func) :: fields_func
    procedure(source_term_func) :: source_func
    logical :: reject_step, cfl_limited
    complex, dimension(:, :, :), allocatable :: error_array

    ! Initialise internal variables
    if (time_step_last < 0) time_step_last = dt
    internal_time = 0.0
    time_step = time_step_last

    allocate(error_array(-ntgrid:ntgrid, 2, g_lo%llim_proc:g_lo%ulim_alloc))
    allocate(g_cur(-ntgrid:ntgrid, 2, g_lo%llim_proc:g_lo%ulim_alloc))
    allocate(g_local(-ntgrid:ntgrid, 2, g_lo%llim_proc:g_lo%ulim_alloc))
    allocate(gnl_1(-ntgrid:ntgrid, 2, g_lo%llim_proc:g_lo%ulim_alloc))
    allocate(gnl_2(-ntgrid:ntgrid, 2, g_lo%llim_proc:g_lo%ulim_alloc))
    call copy(g_state, g_cur)
    call zero_array(error_array) ; call zero_array(g_local)
    call zero_array(gnl_1); call zero_array(gnl_2)

    ! Keep on advancing until we have advanced by dt
    do while (internal_time < dt)
       ! Reset counter and error
       counter = 0
       error = 1.0
       errors = 0.0

       ! Limit the actual time step that we take to avoid overshooting (i.e.
       ! to keep internal_time <= dt).
       actual_dt = min(time_step, dt - internal_time)
       half_time = 0.5 * actual_dt

       ! Calculate the current NL source term, for use in error
       ! estimates later.
       call source_func(g_state, gnl_2, phinew, aparnew, bparnew, &
            max_vel, need_to_adjust = .not. advance_nonadiabatic_dfn, &
            calculate_cfl_limit = .false.)
       nrhs_calls = nrhs_calls + 1

       ! Try to take a step of size actual_dt using fixed point iteration with
       ! backwards Euler.
       do while (error > convergence_tolerance .and. counter < iteration_limit)

          ! Get the current nonlinear source term and cfl limit -- note we don't use the
          ! cfl limit here
          call source_func(g_state, gnl_1, phinew, aparnew, bparnew, &
               max_vel, need_to_adjust = .not. advance_nonadiabatic_dfn, &
               calculate_cfl_limit = .false.)
          nrhs_calls = nrhs_calls + 1

          ! Calculate the estimate of the new g
          !$OMP PARALLEL DO DEFAULT(none) &
          !$OMP PRIVATE(iglo) &
          !$OMP SHARED(g_lo, g_local, g_cur, half_time, gnl_1) &
          !$OMP SCHEDULE(static)
          do iglo = g_lo%llim_proc, g_lo%ulim_proc
             g_local(:, :, iglo) = g_cur(:, :, iglo) + half_time * gnl_1(:, :, iglo)
          end do
          !$OMP END PARALLEL DO

          ! Calculate the maximum absolute error and the maximum value of the previous solution

          ! Note we _could_ merge the following loop witth the gs2_max_abs kernel
          ! so that we only loop over the error_array once (and actually don't
          ! need to explicitly store the full array)
          !$OMP PARALLEL DO DEFAULT(none) &
          !$OMP PRIVATE(iglo) &
          !$OMP SHARED(g_lo, error_array, g_state, g_local) &
          !$OMP SCHEDULE(static)
          do iglo = g_lo%llim_proc, g_lo%ulim_proc
             error_array(:, :, iglo) = g_state(:, :, iglo) - g_local(:, :, iglo)
          end do
          !$OMP END PARALLEL DO
          errors(1) = gs2_max_abs(error_array)
          errors(2) = gs2_max_abs(g_local)
          call max_allreduce(errors)

          ! Calculate an approximate relative error
          error = errors(1) / errors(2)

          ! Update internals -- increment counter and update the 'previous' solution with the new one.
          counter = counter + 1
          call copy(g_local, g_state)

          ! Update the fields consistently with the current solution
          call fields_func(g_state, phinew, aparnew, bparnew, local_only = .true.)
       end do

       ! Now we need to decide if the last step was successful or not. There are two exit conditions, either the
       ! error was small enough (good step) or the iteration limit was exceeded (bad step).
       if (counter >= iteration_limit) then
          ! If it was a bad step then throw away these changes and reduce the time step
          if (proc0 .and. debug) print*,'Too many iterations, reducing step from',time_step,'to',time_step/time_adjust_factor,'with internal_time = ',internal_time

          ! Reduce the time step
          time_step = time_step / time_adjust_factor
          ! If the time step is now too small then abort
          if (time_step < code_dt_min) call mp_abort("Time step has become too small in advance_nonlinear_term.", .true.)

          ! If we've taken too many iterations then we want to drop the time step and try
          ! again. To try again we must ensure that gnew has been reset to g, and the fields
          ! are consistent with this.
          reject_step = .true.
       else
          ! If we have met the convergence condition then we should estimate the error
          ! on the solution and see if we need to reject the step or how we should adjust
          ! the step size.
          ! To estimate the error we compare the backward Euler update with an approximate
          ! Crank-Nicolson update.
          ! g_new_beuler = g + 0.5 * actual_dt * gnl_1(g_new_beuler)
          ! g_new_cn = g + 0.5 * actual_dt * (gnl_1(g_new_beuler) + gnl_1(g))/2
          ! abs_error  = |g_new_beuler - g_new_cn|
          !      = 0.5 * actual_dt * 0.5 * (gnl_1(g_new_beuler) - gnl_1(g)
          ! Where gnl_1(h) is the source func evaluated with gnew=h.

          ! Here we calculate gnl_1(g_new_beuler) exactly. We could probably
          ! rely on using the existing value of gnl_1(g_local) as, whilst this is
          ! strictly from the previous iteration of the fixed-point scheme, we know
          ! it is within the convergence tolerance of the true value.
          call source_func(g_state, gnl_1, phinew, aparnew, bparnew, &
               max_vel, need_to_adjust = .not. advance_nonadiabatic_dfn, &
               calculate_cfl_limit = .true.)
          nrhs_calls = nrhs_calls + 1

          ! Note we _could_ merge the following loop witth the gs2_max_abs kernel
          ! so that we only loop over the error_array once (and actually don't
          ! need to explicitly store the full array)
          !$OMP PARALLEL DO DEFAULT(none) &
          !$OMP PRIVATE(iglo) &
          !$OMP SHARED(g_lo, error_array, gnl_1, gnl_2) &
          !$OMP SCHEDULE(static)
          do iglo = g_lo%llim_proc, g_lo%ulim_proc
             error_array(:, :, iglo) = gnl_1(:, :, iglo) - gnl_2(:, :, iglo)
          end do
          !$OMP END PARALLEL DO
          errors(1) = gs2_max_abs(error_array) * (0.5 * half_time)
          errors(3) = max_vel
          call max_allreduce(errors)
          inverse_error = get_inverse_error(errors)
          cfl_time_step = 1.0 / errors(3)

          !Calculate new time step based on error
          time_step = min(actual_dt * sqrt(inverse_error), cfl_time_step)
          cfl_limited = time_step < actual_dt .and. exactly_equal(time_step, cfl_time_step)
          time_step = time_step * time_step_safety_factor

          ! If error is too large reject the step
          if ( inverse_error < 1.0 .or. (cfl_limited .and. immediate_reset)) then
             if (proc0 .and. debug) print*,'Attempted beuler step failed due to error or cfl, adapting time step'
             ! Reset as we're retrying the step.
             reject_step = .true.
          else
             reject_step = .false.
          end if
       end if

       if (reject_step) then
          ! If the time step is now too small then abort
          if (time_step < code_dt_min) call mp_abort("Time step has become too small in advance_nonlinear_term.", .true.)
          call copy(g_cur, g_state)
          ! If we store the initial field then we could perhaps just copy here instead.
          ! This could be faster as it avoids communications involved in the velocity integration.
          call calculate_fields(fields_func, g_state, phinew, aparnew, bparnew)
          nfailed_steps = nfailed_steps + 1
       else
          ! If we have met the error condition and not taken too many steps
          ! then g_state represents our new solution so make sure we update g to reflect this
          internal_time = internal_time + actual_dt
          call copy(g_state, g_cur)
          nsuccessful_steps = nsuccessful_steps + 1
       end if

    end do

    ! Store the final time step size so we can start from this in the next callg
    time_step_last = time_step

    if(proc0.and.debug) print*,'Explicit done in ',counter,'steps with final error',inverse_error,'and step',time_step,'final time',internal_time

  end subroutine advance_nonlinear_term_beuler

  !> Advances dg/dt = NL(g,chi) from current g_state, chi from t -> t+dt by using
  !> an Runge-Kutta scheme with embedded error estimate for error control. This method
  !> is a wrapper to the true generic RK implementation.
  subroutine advance_nonlinear_term_rk(g_state, phinew, aparnew, bparnew, dt, &
       fields_func, source_func)
    implicit none
    complex, dimension (:,:,:), intent(in out) :: g_state
    complex, dimension (:,:,:), intent (in out) :: phinew, aparnew, bparnew
    real, intent(in) :: dt
    procedure(invert_field_func) :: fields_func
    procedure(source_term_func) :: source_func

    call advance_nonlinear_term_rk_implementation( &
         g_state, phinew, aparnew, bparnew, dt, &
         fields_func, source_func, the_rk_scheme)
  end subroutine advance_nonlinear_term_rk

  !> Advances dg/dt = NL(g,chi) from current g_state, chi from t -> t+dt by using
  !> an Runge-Kutta scheme with embedded error estimate for error control
  subroutine advance_nonlinear_term_rk_implementation(g_state, phinew, &
       aparnew, bparnew, dt, fields_func, source_func, scheme)
    use theta_grid, only: ntgrid
    use gs2_time, only: code_dt_min
    use mp, only: mp_abort, proc0, max_allreduce
    use gs2_layouts, only: g_lo
    use rk_schemes, only: rk_scheme_type
    use run_parameters, only: immediate_reset
    use array_utils, only: copy, gs2_max_abs, zero_array
    use warning_helpers, only: exactly_equal
    implicit none
    complex, dimension (-ntgrid:, :, g_lo%llim_proc:), intent(in out) :: g_state
    complex, dimension (-ntgrid:,:,:), intent (in out) :: phinew, aparnew, bparnew
    complex, dimension(:, :, :), allocatable :: g_cur, gnl_1
    real, intent(in) :: dt
    type(rk_scheme_type), intent(in) :: scheme
    complex, dimension (:, :, :, :), allocatable :: stages
    real :: internal_time, time_step, max_vel, inverse_error
    real :: new_time_step, cfl_time_step
    integer :: counter, nstages
    logical, parameter :: debug = .false.
    integer :: istage, istage_sub, iglo
    real, dimension(:), allocatable :: solution_coeffs, error_coeffs
    real, dimension(3) :: errors
    real, save :: time_step_last = -1
    procedure(invert_field_func) :: fields_func
    procedure(source_term_func) :: source_func
    logical :: cfl_limited
    real :: half_time
    real, dimension(:), allocatable :: stage_coeffs

    ! Initialise internal variables
    if (time_step_last < 0) time_step_last = dt
    internal_time = 0.0
    counter = 1
    time_step = time_step_last

    ! Copy the relevant coefficients for forming the solution to reduce
    ! later duplication.
    if (scheme%follow_high_order) then
       allocate(solution_coeffs, source = scheme%high_order_coeffs)
    else
       allocate(solution_coeffs, source = scheme%lower_order_coeffs)
    end if

    ! Get the error coefficients
    allocate(error_coeffs, source = scheme%high_order_coeffs - scheme%lower_order_coeffs)

    nstages = scheme%number_of_stages
    allocate(stages(-ntgrid:ntgrid, 2, g_lo%llim_proc:g_lo%ulim_alloc, nstages))
    allocate(gnl_1(-ntgrid:ntgrid, 2, g_lo%llim_proc:g_lo%ulim_alloc))
    allocate(g_cur(-ntgrid:ntgrid, 2, g_lo%llim_proc:g_lo%ulim_alloc))
    call copy(g_state, g_cur) ; call zero_array(stages) ; call zero_array(gnl_1)

    ! Keep taking explicit steps until we have advanced by dt
    do while (internal_time < dt)
       if(proc0.and.debug) print*,'Taking step with size',time_step,'at time',internal_time
       half_time = 0.5 * time_step

       do istage = 1, nstages
          call copy(g_cur, g_state)
          stage_coeffs = half_time * scheme%coeffs(:, istage)

          !$OMP PARALLEL DO DEFAULT(none) &
          !$OMP PRIVATE(istage_sub, iglo) &
          !$OMP SHARED(g_lo, g_state, half_time, scheme, stages, istage, stage_coeffs) &
          !$OMP COLLAPSE(2) &
          !$OMP SCHEDULE(static)
          do istage_sub = 1, istage-1
             do iglo = g_lo%llim_proc, g_lo%ulim_proc
                g_state(:, :, iglo) = g_state(:, :, iglo) + stage_coeffs(istage_sub) * stages(:, :, iglo, istage_sub)
             end do
          end do
          !$OMP END PARALLEL DO
          call calculate_fields(fields_func, g_state, phinew, aparnew, bparnew)

          ! Get the current nonlinear source term
          call source_func(g_state,  stages(:, :, :, istage), &
               phinew, aparnew, bparnew, &
               max_vel, need_to_adjust = .not. advance_nonadiabatic_dfn, &
               calculate_cfl_limit = (istage == 1))

          nrhs_calls = nrhs_calls + 1
          if (istage == 1) errors(3) = max_vel

       end do

       ! One can directly form the error by simply forming error =
       ! time_step * 0.5 * sum(stages_i * (high_order_coeffs_i -
       ! low_order_coeffs_i)). Here we store this in gnl_1
       ! without the time_step * 0.5 factor (to avoid extra
       ! array operations). We multiply the final error by
       ! time_step/2 to ensure full consistency.
       ! Could we merge this OpenMP parallel region with the
       ! other loop directly below to avoid some overhead?
       !$OMP PARALLEL DO DEFAULT(none) &
       !$OMP PRIVATE(iglo) &
       !$OMP SHARED(g_lo, gnl_1, error_coeffs, stages) &
       !$OMP SCHEDULE(static)
       do iglo = g_lo%llim_proc, g_lo%ulim_proc
          gnl_1(:, :, iglo) = error_coeffs(1) * stages(:, :, iglo, 1)
       end do
       !$OMP END PARALLEL DO

       !$OMP PARALLEL DO DEFAULT(none) &
       !$OMP PRIVATE(iglo, istage) &
       !$OMP SHARED(g_lo, gnl_1, error_coeffs, nstages, stages) &
       !$OMP COLLAPSE(2) &
       !$OMP SCHEDULE(static)
       do istage = 2, nstages
          do iglo = g_lo%llim_proc, g_lo%ulim_proc
             gnl_1(:, :, iglo) = gnl_1(:, :, iglo) + error_coeffs(istage) * stages(:, :, iglo, istage)
          end do
       end do
       !$OMP END PARALLEL DO

       ! Store the absolute error
       errors(1) = gs2_max_abs(gnl_1) * half_time
       ! Estimate the absolution size of the solution using
       ! the old solution
       errors(2) = gs2_max_abs(g_cur)
       call max_allreduce(errors)
       inverse_error = get_inverse_error(errors)
       cfl_time_step = 1.0 / errors(3)

       !Calculate the new_time_step
       new_time_step = min(time_step * (inverse_error)**(1.0/scheme%order), cfl_time_step)
       cfl_limited = new_time_step < time_step .and. exactly_equal(new_time_step, cfl_time_step)
       new_time_step = new_time_step * time_step_safety_factor

       if(proc0 .and. debug) print*,'Inverse error is ',inverse_error,'at internal_time', &
            internal_time,'with step',time_step,'new time step is',new_time_step, &
            'counter=',counter

       ! If the cfl time step is too small then abort
       if (new_time_step < code_dt_min) &
            call mp_abort("Time step has become too small in advance_nonlinear_term.", .true.)

       ! If the error is too large we need to retry with a smaller step, so don't
       ! update anything
       if (inverse_error < 1.0 .or. (cfl_limited .and. immediate_reset)) then
          if(proc0.and.debug) print*,'Time step with size',time_step, &
               'failed at internal_time',internal_time,'and counter',counter, &
               'retrying with step',new_time_step
          nfailed_steps = nfailed_steps + 1
       else
          stage_coeffs = solution_coeffs * half_time
          !Update the solution. Note we could probably merge the two
          !OpenMP loops into a single OpenMP parallel region and then
          !just add OMP DO to each loop. This would save a bit of overhead.
          !$OMP PARALLEL DO DEFAULT(none) &
          !$OMP PRIVATE(iglo) &
          !$OMP SHARED(g_lo, g_cur, solution_coeffs, stage_coeffs, stages) &
          !$OMP SCHEDULE(static)
          do iglo = g_lo%llim_proc, g_lo%ulim_proc
             g_cur(:, :, iglo) = g_cur(:, :, iglo) + stage_coeffs(1) * stages(:, :, iglo, 1)
          end do
          !$OMP END PARALLEL DO

          !$OMP PARALLEL DO DEFAULT(none) &
          !$OMP PRIVATE(istage, iglo) &
          !$OMP SHARED(g_lo, g_cur, solution_coeffs, nstages, half_time, &
          !$OMP stages, stage_coeffs) &
          !$OMP COLLAPSE(2) &
          !$OMP SCHEDULE(static)
          do istage = 2, nstages
             do iglo = g_lo%llim_proc, g_lo%ulim_proc
                g_cur(:, :, iglo) = g_cur(:, :, iglo) + stage_coeffs(istage) * stages(:, :, iglo, istage)
             end do
          end do
          !$OMP END PARALLEL DO

          internal_time = internal_time + time_step
          nsuccessful_steps = nsuccessful_steps + 1
       end if

       !Now update the time step, capped to dt
       time_step = min(new_time_step, dt)

       ! Make sure we don't overshoot the target time by limiting time step.
       if(internal_time < dt) then
          time_step = min(time_step, dt - internal_time)
          counter = counter + 1
       end if

    end do

    if(proc0.and.debug) print*,'Completed rk step in',counter,'steps to',internal_time

    ! We may be able to avoid this copy and field solve on exit if we restructure the main
    ! loop slightly so g_state represents the new solution. This may require use of g_work
    ! or similar to hold intermediate values.
    call copy(g_cur, g_state)

    time_step_last = time_step
  end subroutine advance_nonlinear_term_rk_implementation

  !> Advances dg/dt = NL(g,chi) from current g_state, chi from t -> t+dt by using
  !> a simple Picard iteration scheme
  subroutine advance_nonlinear_term_picard(g_state, phinew, &
       aparnew, bparnew, dt, fields_func, source_func)
    use theta_grid, only: ntgrid
    use gs2_time, only: code_dt_min
    use mp, only: mp_abort, proc0, max_allreduce
    use gs2_layouts, only: g_lo
    use array_utils, only: copy, gs2_max_abs, zero_array
    implicit none
    complex, dimension (-ntgrid:, :, g_lo%llim_proc:), intent(in out) :: g_state
    complex, dimension (-ntgrid:,:,:), intent (in out) :: phinew, aparnew, bparnew
    complex, dimension(:, :, :), allocatable :: gnl_1
    real, intent(in) :: dt
    real :: internal_time, time_step, max_vel, cfl_limit
    integer :: counter, ipic, iglo
    logical, parameter :: debug = .false.
    real, dimension(4) :: errors
    real, save :: time_step_last = -1
    procedure(invert_field_func) :: fields_func
    procedure(source_term_func) :: source_func
    complex, dimension(:, :, :), allocatable :: g_start
    integer, parameter :: npic_limit = 15, npic_low_limit = 3
    real, parameter :: time_step_factor = 2.0
    logical, parameter :: check_cfl = .true.
    logical :: converged
    real :: half_time

    ! Initialise internal variables
    if (time_step_last < 0) time_step_last = dt
    internal_time = 0.0
    counter = 1
    time_step = time_step_last

    allocate(g_start(-ntgrid:ntgrid, 2, g_lo%llim_proc:g_lo%ulim_alloc))
    allocate(gnl_1(-ntgrid:ntgrid, 2, g_lo%llim_proc:g_lo%ulim_alloc))
    call zero_array(g_start) ; call zero_array(gnl_1)
    ! Keep taking explicit steps until we have advanced by dt
    do while (internal_time < dt)
       call copy(g_state, g_start)
       half_time = 0.5 * time_step

       if(proc0.and.debug) print*,'Taking step with size',time_step,'at time',internal_time
       ipic = 1

       call calculate_fields(fields_func, g_state, phinew, aparnew, bparnew)

       ! Get the current nonlinear source term
       call source_func(g_state, gnl_1, &
            phinew, aparnew, bparnew, &
            max_vel, need_to_adjust = .not. advance_nonadiabatic_dfn, &
            calculate_cfl_limit = check_cfl)
       nrhs_calls = nrhs_calls + 1

       errors(1) = 0.0
       errors(2) = gs2_max_abs(gnl_1) * half_time
       errors(3) = gs2_max_abs(g_start)
       errors(4) = max_vel
       call max_allreduce(errors)

       ! Check for CFL limit - we can simply rescale the change in g and carry on
       ! so here we simply rescale the effective time step,
       cfl_limit = 1.0 / errors(4)
       if (time_step > cfl_limit .and. check_cfl) then
          ! Drop the effective time step a little from the limit
          cfl_limit = 0.9 * cfl_limit
          if(proc0.and.debug) print*,'Dropping time step due to cfl from',time_step,'to',cfl_limit
          errors(2) = errors(2) * cfl_limit / time_step
          time_step = cfl_limit
          half_time = time_step * 0.5
       end if

       converged = picard_converged(errors)

       !$OMP PARALLEL DO DEFAULT(none) &
       !$OMP PRIVATE(iglo) &
       !$OMP SHARED(g_state, g_start, gnl_1, half_time, g_lo)&
       !$OMP SCHEDULE(static)
       do iglo = g_lo%llim_proc, g_lo%ulim_proc
          g_state(:, :, iglo) = g_start(:, :, iglo) + gnl_1(:, :, iglo) * half_time
       end do
       !$OMP END PARALLEL DO

       do while(.not. converged)
          ! Note the first step (i.e. call before this loop) is included in
          ! ipic here (i.e. it counts towards our approach towards npic_limit).
          ipic = ipic + 1
          if (ipic > npic_limit) exit

          call calculate_fields(fields_func, g_state, phinew, aparnew, bparnew)

          ! Get the current nonlinear source term
          call source_func(g_state, gnl_1, &
               phinew, aparnew, bparnew, &
               max_vel, need_to_adjust = .not. advance_nonadiabatic_dfn, &
               calculate_cfl_limit = check_cfl)
          nrhs_calls = nrhs_calls + 1

          errors(1) = errors(2)
          errors(2) = gs2_max_abs(gnl_1) * half_time
          errors(4) = max_vel
          call max_allreduce(errors)
          cfl_limit = 1.0 / errors(4)

          converged = picard_converged(errors)

          if(proc0.and.debug) print*,'ipic',ipic,converged,'|',errors

          if (time_step > cfl_limit .and. .not. converged) exit

          !$OMP PARALLEL DO DEFAULT(none) &
          !$OMP PRIVATE(iglo) &
          !$OMP SHARED(g_state, g_start, gnl_1, half_time, g_lo)&
          !$OMP SCHEDULE(static)
          do iglo = g_lo%llim_proc, g_lo%ulim_proc
             g_state(:, :, iglo) = g_start(:, :, iglo) + gnl_1(:, :, iglo) * half_time
          end do
          !$OMP END PARALLEL DO
       end do

       if (.not. converged) then
          ! If we didn't converge in the limit then reset state
          ! to the start of the step and reduce the time step.
          call copy(g_start, g_state)
          time_step = time_step / time_step_factor
          nfailed_steps = nfailed_steps + 1

          ! If the time step is too small then abort
          if (time_step < code_dt_min) &
               call mp_abort("Time step has become too small in advance_nonlinear_term.", .true.)
       else
          internal_time = internal_time + time_step
          nsuccessful_steps = nsuccessful_steps + 1
          ! If we converged quickly then increase the time step
          if (ipic <= npic_low_limit) time_step = time_step * time_step_factor
       end if

       !Now update the time step, capped to dt
       time_step = min(time_step, dt)

       ! Make sure we don't overshoot the target time by limiting time step.
       if(internal_time < dt) then
          time_step = min(time_step, dt - internal_time)
          counter = counter + 1
       end if

    end do

    if(proc0.and.debug) print*,'Completed Picard step in',counter,'steps to',internal_time,'last time step',time_step

    time_step_last = time_step
  contains
    pure logical function picard_converged(errors) result(converged)
      implicit none
      real, dimension(4), intent(in) :: errors
      converged = &
           !Change in solution less than abs tol
           (errors(2) < absolute_tolerance) .or.  &
           ! Change in the change, less than abs tol
           (abs(errors(2) - errors(1)) < absolute_tolerance) .or. &
           ! Change is less than the relative tolerance
           (errors(2) < relative_tolerance * errors(3)) .or. &
           ! Change in the change less than the relative tolerance
           (abs(errors(2) - errors(1)) < relative_tolerance * errors(3))
    end function picard_converged

  end subroutine advance_nonlinear_term_picard

  !> Returns the normalised inverse error estimate. In other words
  !> the error tolerance, rtol*errors(2) + atol, divided by the error
  !> estimate (plus a small number to avoid divide by zero).
  !>
  !> Assumes that errors contains the global magnitude of the error
  !> estimate in the first element and the global maximum of the solution
  !> in the second element.
  real pure function get_inverse_error(errors) result(inverse_error)
    implicit none
    real, dimension(:), intent(in) :: errors
    inverse_error = (relative_tolerance * errors(2) + absolute_tolerance) / &
         (errors(1) + epsilon(errors(1)))
  end function get_inverse_error

  !> Calculates the fields consistent with the passed distribution function
  subroutine calculate_fields(fields_func, g_in, phi_out, apar_out, bpar_out)
    use nonlinear_terms, only: time_add_explicit_terms_field
    use job_manage, only: time_message
    procedure(invert_field_func) :: fields_func
    complex, dimension(:, :, :), intent(in) :: g_in
    complex, dimension (:,:,:), intent (out) :: phi_out, apar_out, bpar_out
    call time_message(.false., time_add_explicit_terms_field, 'Explicit terms - field')
    call fields_func(g_in, phi_out, apar_out, bpar_out, local_only = .true.)
    call time_message(.false., time_add_explicit_terms_field, 'Explicit terms - field')
  end subroutine calculate_fields

#include "split_nonlinear_terms_auto_gen.inc"
end module split_nonlinear_terms