!> FIXME : Add documentation
module nonlinear_terms
  !
  ! missing factors of B_a/B(theta) in A_perp terms??
  !

  use abstract_config, only: abstract_config_type, CONFIG_MAX_NAME_LEN
  use mp, only: mp_request_null
  use iso_fortran_env, only: real32
  implicit none

  private

  public :: init_nonlinear_terms, finish_nonlinear_terms
  public :: read_parameters, wnml_nonlinear_terms, check_nonlinear_terms
  public :: add_explicit_terms, nonlinear_mode_switch, split_nonlinear
  public :: finish_init, reset_init, nonlin, accelerated
  public :: cfl, time_add_explicit_terms, time_add_explicit_terms_mpi
  public :: time_add_explicit_terms_field, nonlinear_mode_none, nonlinear_mode_on
  public :: gryfx_zonal, calculate_current_nl_source_and_cfl_limit
  public :: cfl_req_hand, nb_check_time_step_too_large
  public :: nonlinear_terms_testing

  public :: nonlinear_terms_config_type
  public :: set_nonlinear_terms_config
  public :: get_nonlinear_terms_config
  
  !> Public type allowing tests to access otherwise-private routines
  type :: nonlinear_terms_testing
    contains
      procedure, nopass :: add_nl

  end type nonlinear_terms_testing

  !> FIXME : Add documentation  
  type gs2_gryfx_zonal_type
    logical :: on = .false.
    complex(real32), dimension (:), pointer :: NLdens_ky0, NLupar_ky0, NLtpar_ky0, &
                                         NLtprp_ky0, NLqpar_ky0, NLqprp_ky0
    logical :: first_half_step = .false.
  end type gs2_gryfx_zonal_type

  type(gs2_gryfx_zonal_type) :: gryfx_zonal

  integer :: istep_last = 0

  ! knobs
  integer :: nonlinear_mode_switch

  integer, parameter :: nonlinear_mode_none = 1, nonlinear_mode_on = 2

#ifndef SHMEM
  real, dimension (:,:), allocatable :: ba, gb, bracket
  ! yxf_lo%ny, yxf_lo%llim_proc:yxf_lo%ulim_alloc

  real, dimension (:,:,:), allocatable :: aba, agb, abracket
  ! 2*ntgrid+1, 2, accelx_lo%llim_proc:accelx_lo%ulim_alloc

#else
  real, save, dimension (:,:), pointer :: ba => null(), gb => null(), bracket => null()
  ! yxf_lo%ny, yxf_lo%llim_proc:yxf_lo%ulim_alloc

  real, save, dimension (:,:,:), pointer, contiguous :: aba => null(), &
       agb => null(), abracket => null()
  ! 2*ntgrid+1, 2, accelx_lo%llim_proc:accelx_lo%ulim_alloc
#endif

! CFL coefficients
  real :: cfl, cflx, cfly
  !> Maximum velocity of the nonlinear term, the reciprocal of the maximum timestep that
  !> satisfies the CFL condition. This currently has to be module level due to potential
  !> non-blocking reduction.
  real :: max_vel
  integer :: cfl_req_hand = mp_request_null
  logical :: use_cfl_limit

  !> Variables related to the order based error estimate -- see config documentation
  real :: error_target
  integer :: istep_error_start
  logical :: use_order_based_error

! for gryfx
  real :: densfac, uparfac, tparfac, tprpfac, qparfac, qprpfac

  logical :: split_nonlinear
  logical :: include_apar, include_bpar, include_phi

  real :: time_add_explicit_terms(2) = 0., time_add_explicit_terms_mpi = 0.
  real :: time_add_explicit_terms_field(2) = 0.

  logical :: nonlin = .false.
  logical :: initialized = .false.
  logical :: initializing = .true.
  logical :: alloc = .true.
  logical :: zip = .false.
  logical :: nl_forbid_force_zero = .true.
  logical :: accelerated = .false.

  logical :: exist = .false.

  !> The "large" cfl time step limit to use when we don't
  !> have a valid cfl limit from the NL term.
  real, parameter :: dt_cfl_default_large = 1.e8

  !> Used to represent the input configuration of nonlinear_terms
  type, extends(abstract_config_type) :: nonlinear_terms_config_type
     ! namelist : nonlinear_terms_knobs
     ! indexed : false
     !> Scales the estimate CFL limit used to determine when the
     !> timestep should be changed. The maximum allowed timestep
     !> satisfies `delt < cfl * min(Delta_perp/v_perp)` where `v_perp
     !> * delt` is the maximum distance travelled in a single
     !> timestep.
     real :: cfl = 0.1
     !> GRYFX only. No documentation available
     real :: densfac = 1.
     !> Set the error threshold used to determine when the timestep should change if
     !> [[nonlinear_terms_knobs:use_order_based_error]] is true.
     real :: error_target = 0.1
     !> Flag for testing. If false do not include apar contribution to nonlinear term.
     logical :: include_apar = .true.
     !> Flag for testing. If false do not include bpar contribution to nonlinear term.
     logical :: include_bpar = .true.
     !> Flag for testing. If false do not include phi contribution to nonlinear term.
     logical :: include_phi = .true.
     !> Set the first timestep for which the order based error checks are made if
     !> [[nonlinear_terms_knobs:use_order_based_error]] is true.
     integer :: istep_error_start = 30
     !> If `true` (default) then forces the nonlinear source term to
     !> zero in the forbidden region.
     logical :: nl_forbid_force_zero = .true.
     !> Determines if the nonlinear terms should be calculated. Must
     !> be one of:
     !>
     !> - 'none' - Do not include nonlinear terms, i.e. run a linear calculation.
     !> - 'default' - The same as 'none'
     !> - 'off' - The same as 'none'
     !> - 'on' - Include nonlinear terms.
     !>
     !> Could consider changing this to a logical.
     character(len = 20) :: nonlinear_mode = 'default'
     !> GRYFX only. No documentation available
     real :: qparfac = 1./sqrt(8.)
     !> GRYFX only. No documentation available
     real :: qprpfac = 1./sqrt(8.)
     !> Do we evolve the nonlinear term separately from the linear terms (true) or
     !> include the nonlinear term as a source in the standard algorithm (false).
     logical :: split_nonlinear = .false.
     !> GRYFX only. No documentation available
     real :: tparfac = 1./2.
     !> GRYFX only. No documentation available
     real :: tprpfac = 1./2.
     !> GRYFX only. No documentation available
     real :: uparfac = 1./sqrt(2.)
     !> If true then use the cfl limit to set the maximum timestep allowed.
     logical :: use_cfl_limit = .true.
     !> If true then use an error estimate from comparing the nonlinear source
     !> calculated using 2nd and 3rd order Adams-Bashforth schemes to control
     !> the timestep in use. This does not disable the CFL estimate.
     logical :: use_order_based_error = .false.
     !> Not currently used, should consider removing.  Original
     !> documentation was "Experts only (for secondary/tertiary
     !> calculations)."  which suggests a close relation to the
     !> `eqzip` option of [[knobs]].
     logical :: zip = .false.
   contains
     procedure, public :: read => read_nonlinear_terms_config
     procedure, public :: write => write_nonlinear_terms_config
     procedure, public :: reset => reset_nonlinear_terms_config
     procedure, public :: broadcast => broadcast_nonlinear_terms_config
     procedure, public, nopass :: get_default_name => get_default_name_nonlinear_terms_config
     procedure, public, nopass :: get_default_requires_index => get_default_requires_index_nonlinear_terms_config
  end type nonlinear_terms_config_type

  type(nonlinear_terms_config_type) :: nonlinear_terms_config
  
contains

  !> FIXME : Add documentation
  subroutine check_nonlinear_terms(report_unit,delt_adj)
    use gs2_time, only: code_dt_min, code_dt_max
    use kt_grids, only: box
    use run_parameters, only: nstep, wstar_units
    use theta_grid, only: nperiod
    implicit none
    integer, intent(in) :: report_unit
    real, intent(in) :: delt_adj
    if (nonlin) then
       write (report_unit, *) 
       write (report_unit, fmt="('This is a nonlinear simulation.')")
       write (report_unit, *) 
       if (wstar_units) then
          write (report_unit, *) 
          write (report_unit, fmt="('################# WARNING #######################')")
          write (report_unit, fmt="('Nonlinear runs require wstar_units = .false. in the knobs namelist.')") 
          write (report_unit, fmt="('THIS IS AN ERROR.')") 
          write (report_unit, fmt="('################# WARNING #######################')")
          write (report_unit, *) 
       end if
       if ( .not. box) then
          write (report_unit, *) 
          write (report_unit, fmt="('################# WARNING #######################')")
          write (report_unit, fmt="('Nonlinear runs must be carried out in a box.')") 
          write (report_unit, fmt="('Set grid_option to box in the kt_grids_knobs namelist.')") 
          write (report_unit, fmt="('THIS IS AN ERROR.')") 
          write (report_unit, fmt="('################# WARNING #######################')")
          write (report_unit, *) 
       end if

       if (split_nonlinear) then
          write (report_unit, *)
          write (report_unit, fmt="('################# WARNING #######################')")
          write (report_unit, fmt="('The nonlinear term will be evolved separately from')")
          write (report_unit, fmt="('the linear terms in a crude IMEX style.')")
          write (report_unit, fmt="('This is an experimental feature.')")
          write (report_unit, fmt="('################# WARNING #######################')")
          write (report_unit, *)
       end if

       if (.not. include_apar) then
          write (report_unit, *)
          write (report_unit, fmt="('################# WARNING #######################')")
          write (report_unit, fmt="('The A|| potential contributions to the nonlinear term are disabled.')")
          write (report_unit, fmt="('This is probably an error.')")
          write (report_unit, fmt="('################# WARNING #######################')")
          write (report_unit, *)
       end if

       if (.not. include_bpar) then
          write (report_unit, *)
          write (report_unit, fmt="('################# WARNING #######################')")
          write (report_unit, fmt="('The B|| potential contributions to the nonlinear term are disabled.')")
          write (report_unit, fmt="('This is probably an error.')")
          write (report_unit, fmt="('################# WARNING #######################')")
          write (report_unit, *)
       end if

       if (.not. include_phi) then
          write (report_unit, *)
          write (report_unit, fmt="('################# WARNING #######################')")
          write (report_unit, fmt="('The electrostatic potential contributions to the nonlinear term are disabled.')")
          write (report_unit, fmt="('This is probably an error.')")
          write (report_unit, fmt="('################# WARNING #######################')")
          write (report_unit, *)
       end if

       if (nperiod > 1) then
          write (report_unit, *) 
          write (report_unit, fmt="('################# WARNING #######################')")
          write (report_unit, fmt="('Nonlinear runs usually have nperiod = 1.')") 
          write (report_unit, fmt="('THIS MAY BE AN ERROR.')") 
          write (report_unit, fmt="('################# WARNING #######################')")
          write (report_unit, *) 
       end if

       if (use_cfl_limit) then
          write (report_unit, *)
          write (report_unit, fmt="('The timestep will be adjusted to satisfy the CFL limit.')")
          write (report_unit, fmt="('The maximum delt < ',f10.4,' * min(Delta_perp/v_perp). (cfl)')") cfl
          write (report_unit, *)
       else
          write (report_unit, *)
          write (report_unit, fmt="('################# WARNING #######################')")
          write (report_unit, fmt="('Nonlinear usually have use_cfl_limit = T')")
          write (report_unit, fmt="('################# WARNING #######################')")
          write (report_unit, *)
       end if

       if (use_order_based_error) then
          write (report_unit, *)
          write (report_unit, fmt="('The timestep will be adjusted to keep the estimated error below',e11.4)") error_target
          write (report_unit, fmt="('beginning on step',I0)") istep_error_start

          write (report_unit, *)
       else
          if (.not. use_cfl_limit) then
             write (report_unit, *)
             write (report_unit, fmt="('################# WARNING #######################')")
             write (report_unit, fmt="('Both CFL and error methods for controlling explicit timestep are disabled.')")
             write (report_unit, fmt="('THIS IS PROBABLY AN ERROR.')")
             write (report_unit, fmt="('################# WARNING #######################')")
             write (report_unit, *)
          end if
       end if

       write (report_unit, fmt="('The minimum delt ( code_dt_min ) = ',e11.4)") code_dt_min
       write (report_unit, fmt="('The maximum delt (code_dt_max) = ',e11.4)") code_dt_max
       write (report_unit, fmt="('When the time step needs to be changed, it is adjusted by a factor of ',f10.4)") delt_adj
       write (report_unit, fmt="('The number of time steps nstep = ',i7)") nstep
    endif
  end subroutine check_nonlinear_terms

  !> FIXME : Add documentation  
  subroutine wnml_nonlinear_terms(unit)
    implicit none
    integer, intent(in) :: unit
    if (.not. exist) return
    if (nonlinear_mode_switch == nonlinear_mode_on) then
       write (unit, *)
       write (unit, fmt="(' &',a)") "nonlinear_terms_knobs"
       write (unit, fmt="(' nonlinear_mode = ',a)") '"on"'
       write (unit, fmt="(' cfl = ',e17.10)") cfl
       write (unit, fmt="(' nl_forbid_force_zero = ',L1)") nl_forbid_force_zero
       if (zip) write (unit, fmt="(' zip = ',L1)") zip
       write (unit, fmt="(' /')")
    endif
  end subroutine wnml_nonlinear_terms

  !> FIXME : Add documentation  
  subroutine init_nonlinear_terms(nonlinear_terms_config_in) 
    use theta_grid, only: init_theta_grid, ntgrid
    use kt_grids, only: init_kt_grids, naky, ntheta0, nx, ny, akx, aky
    use le_grids, only: init_le_grids, nlambda, negrid
    use species, only: init_species, nspec
    use gs2_layouts, only: init_dist_fn_layouts, yxf_lo, accelx_lo
    use gs2_layouts, only: init_gs2_layouts
    use gs2_transforms, only: init_transforms
    use mp, only: nproc, iproc
    use array_utils, only: zero_array
#ifdef SHMEM
    use shm_mpi3, only : shm_alloc
#endif
    implicit none
    type(nonlinear_terms_config_type), intent(in), optional :: nonlinear_terms_config_in
    logical, parameter :: debug=.false.

    if (initialized) return
    initialized = .true.
    
    if (debug) write(6,*) "init_nonlinear_terms: init_gs2_layouts"
    call init_gs2_layouts
    if (debug) write(6,*) "init_nonlinear_terms: init_theta_grid"
    call init_theta_grid
    if (debug) write(6,*) "init_nonlinear_terms: init_kt_grids"
    call init_kt_grids
    if (debug) write(6,*) "init_nonlinear_terms: init_le_grids"
    call init_le_grids
    if (debug) write(6,*) "init_nonlinear_terms: init_species"
    call init_species
    if (debug) write(6,*) "init_nonlinear_terms: init_dist_fn_layouts"
    call init_dist_fn_layouts (ntgrid, naky, ntheta0, nlambda, negrid, nspec, nproc, iproc)
    
    call read_parameters(nonlinear_terms_config_in)

    if (debug) write(6,*) "init_nonlinear_terms: init_transforms"
    if (nonlinear_mode_switch /= nonlinear_mode_none) then
       call init_transforms (ntgrid, naky, ntheta0, nlambda, negrid, nspec, nx, ny, accelerated)

       if (debug) write(6,*) "init_nonlinear_terms: allocations"
       if (alloc) then
          if (accelerated) then
#ifndef SHMEM
             allocate (     aba(2*ntgrid+1, 2, accelx_lo%llim_proc:accelx_lo%ulim_alloc))
             allocate (     agb(2*ntgrid+1, 2, accelx_lo%llim_proc:accelx_lo%ulim_alloc))
             allocate (abracket(2*ntgrid+1, 2, accelx_lo%llim_proc:accelx_lo%ulim_alloc))
#else
             call shm_alloc(aba, (/ 1, 2*ntgrid+1, 1, 2, accelx_lo%llim_proc, accelx_lo%ulim_alloc /))
             call shm_alloc(agb, (/ 1, 2*ntgrid+1, 1, 2, accelx_lo%llim_proc, accelx_lo%ulim_alloc /))
             call shm_alloc(abracket, (/ 1, 2*ntgrid+1, 1, 2, accelx_lo%llim_proc, accelx_lo%ulim_alloc /))
#endif
             call zero_array(aba) ; call zero_array(agb) ; call zero_array(abracket)
          else
#ifndef SHMEM
             allocate (     ba(yxf_lo%ny,yxf_lo%llim_proc:yxf_lo%ulim_alloc))
             allocate (     gb(yxf_lo%ny,yxf_lo%llim_proc:yxf_lo%ulim_alloc))
             allocate (bracket(yxf_lo%ny,yxf_lo%llim_proc:yxf_lo%ulim_alloc))
#else
             call shm_alloc(ba, (/ 1,yxf_lo%ny,yxf_lo%llim_proc,yxf_lo%ulim_alloc /))
             call shm_alloc(gb, (/ 1,yxf_lo%ny,yxf_lo%llim_proc,yxf_lo%ulim_alloc /))
             call shm_alloc(bracket, (/ 1,yxf_lo%ny,yxf_lo%llim_proc,yxf_lo%ulim_alloc /))
#endif
             call zero_array(ba) ; call zero_array(gb) ; call zero_array(bracket)
          end if
          alloc = .false.
       end if

       cfly = (aky(naky)/cfl)*0.5
       cflx = (akx((ntheta0+1)/2)/cfl)*0.5
    end if

  end subroutine init_nonlinear_terms

  !> FIXME : Add documentation  
  subroutine read_parameters(nonlinear_terms_config_in)
    use file_utils, only: input_unit_exist, error_unit
    use text_options, only: text_option, get_option_value
    implicit none
    type(nonlinear_terms_config_type), intent(in), optional :: nonlinear_terms_config_in
    type (text_option), dimension (4), parameter :: nonlinearopts = &
         [ text_option('default', nonlinear_mode_none), &
            text_option('none', nonlinear_mode_none), &
            text_option('off', nonlinear_mode_none), &
            text_option('on', nonlinear_mode_on) ]
    character(len = 20) :: nonlinear_mode
    integer :: ierr

    if (present(nonlinear_terms_config_in)) nonlinear_terms_config = nonlinear_terms_config_in

    call nonlinear_terms_config%init(name = 'nonlinear_terms_knobs', requires_index = .false.)

    ! Copy out internal values into module level parameters
    cfl = nonlinear_terms_config%cfl
    densfac = nonlinear_terms_config%densfac
    error_target = nonlinear_terms_config%error_target
    include_apar = nonlinear_terms_config%include_apar
    include_bpar = nonlinear_terms_config%include_bpar
    include_phi = nonlinear_terms_config%include_phi
    istep_error_start = nonlinear_terms_config%istep_error_start
    nl_forbid_force_zero = nonlinear_terms_config%nl_forbid_force_zero
    nonlinear_mode = nonlinear_terms_config%nonlinear_mode
    qparfac = nonlinear_terms_config%qparfac
    qprpfac = nonlinear_terms_config%qprpfac
    split_nonlinear = nonlinear_terms_config%split_nonlinear
    tparfac = nonlinear_terms_config%tparfac
    tprpfac = nonlinear_terms_config%tprpfac
    uparfac = nonlinear_terms_config%uparfac
    use_cfl_limit = nonlinear_terms_config%use_cfl_limit
    use_order_based_error = nonlinear_terms_config%use_order_based_error
    zip = nonlinear_terms_config%zip

    exist = nonlinear_terms_config%exist

    ierr = error_unit()
    call get_option_value &
         (nonlinear_mode, nonlinearopts, nonlinear_mode_switch, &
         ierr, "nonlinear_mode in nonlinear_terms_knobs",.true.)

    if (nonlinear_mode_switch == nonlinear_mode_on)  then
       nonlin = .true.
    end if

  end subroutine read_parameters

  !> FIXME : Add documentation  
  subroutine add_explicit_terms (g1, g2, g3, phi, apar, bpar, istep, bd)
    use theta_grid, only: ntgrid
    use gs2_layouts, only: g_lo
    use gs2_time, only: save_dt_cfl
    use job_manage, only: time_message
    use mp, only: get_mp_times
    implicit none
    complex, dimension (-ntgrid:,:,g_lo%llim_proc:), intent (in out) :: g1, g2, g3
    complex, dimension (-ntgrid:,:,:), intent (in) :: phi,    apar,    bpar
    integer, intent (in) :: istep
    real, intent (in) :: bd
    real :: dt_cfl
    logical, parameter :: nl = .true.
    real :: mp_total_after, mp_total
    call time_message(.false., time_add_explicit_terms, 'Explicit terms')
    call get_mp_times(total_time = mp_total)

    select case (nonlinear_mode_switch)
    case (nonlinear_mode_none)
!!! NEED TO DO SOMETHING HERE...  BD GGH
       dt_cfl = dt_cfl_default_large
       call save_dt_cfl (dt_cfl)
#ifdef LOWFLOW
       if (istep /=0) &
            call add_explicit (g1, g2, g3, phi, apar, bpar, istep, bd)
#endif
    case (nonlinear_mode_on)
       if (istep /= 0) then
         call add_explicit (g1, g2, g3, phi, apar, bpar, istep, bd, nl)
       endif
    end select
    call time_message(.false., time_add_explicit_terms, 'Explicit terms')
    call get_mp_times(total_time = mp_total_after)
    time_add_explicit_terms_mpi = time_add_explicit_terms_mpi + (mp_total_after - mp_total)
  end subroutine add_explicit_terms

  !> FIXME : Add documentation  
  subroutine add_explicit (g1, g2, g3, phi, apar, bpar, istep, bd,  nl)
    use theta_grid, only: ntgrid
    use gs2_layouts, only: g_lo, ik_idx, it_idx, il_idx, is_idx
    use dist_fn_arrays, only: g
    use gs2_time, only: save_dt_cfl, code_dt, check_time_step_too_large
    use run_parameters, only: reset, immediate_reset
    use unit_tests, only: debug_message
    use mp, only: nb_max_allreduce, max_allreduce
    use array_utils, only: copy, zero_array
    implicit none
    complex, dimension (-ntgrid:,:,g_lo%llim_proc:), intent (in out) :: g1, g2, g3
    complex, dimension (-ntgrid:,:,:), intent (in) :: phi, apar, bpar
    integer, intent (in) :: istep
    real, intent (in) :: bd
    logical, intent (in), optional :: nl
    integer, parameter :: verb = 4
    integer :: iglo, ik
    real :: dt_cfl

    real :: error
    real :: target_dt

    call debug_message(verb, 'nonlinear_terms::add_explicit starting')

    if (initializing) then
       if (present(nl)) then
          dt_cfl = dt_cfl_default_large
          call save_dt_cfl (dt_cfl)

          ! Ensure the module level max_vel is consistent with
          ! the selected cfl limiting timestep.
          max_vel = 1.0/dt_cfl
       end if
       return
    endif


    !
    ! @todo Currently not self-starting.  Need to fix this.
    !

    call debug_message(verb, 'nonlinear_terms::add_explicit copying old terms')

    ! The following if statement ensures that the nonlinear term is calculated only once
    ! per timestep. It guards against calculating the nonlinear term in two cases:
    !   1. the second call to `timeadv` in a usual timestep
    !   2. both calls to `timeadv` after the CFL condition check has triggered a reset
    !      of this timestep (when `immediate_reset = .true.`). As the nonlinear term is
    !      independent of `delt`, it does not need to be calculated again.
    !
    ! N.B. When `immediate_reset = .false.`, the timestep is completed even though the
    ! CFL condition is broken. `delt` and the response matrix are changed at the end of
    ! the timestep; the following timestep is "normal", and the nonlinear term is
    ! calculated.
    if (istep /= istep_last) then

       istep_last = istep

       ! Shuffle time history along -- this is a place pointers might help
       ! avoid some copying.
       ! Should we only need to do this if we have a nonlinear simulation?
       ! It's possible we might have other explicit sources, in which case we
       ! probably always need to do this shuffling, however in that case this
       ! subroutine probably best belongs in a different module.
       call copy(g2, g3)
       call copy(g1, g2)

       call debug_message(verb, 'nonlinear_terms::add_explicit copied old terms')

       ! if running nonlinearly, then compute the nonlinear term at grid points
       ! and store it in g1
       if (present(nl)) then
          call debug_message(verb, 'nonlinear_terms::add_explicit calling add_nl')
          if(gryfx_zonal%on) then
             call add_nl_gryfx (g1)
          else
             call add_nl (g, g1, phi, apar, bpar)
          endif
          ! takes g1 at grid points and returns 2*g1 at cell centers

          call nl_center (g1, bd)

          ! Currently the module level max_vel contains the maximum
          ! velocity found from the nonlinear term on this
          ! processor. This can be used to find the cfl limting time
          ! step.

          ! If we're not using the cfl limit then we'd like to zero
          ! out the maximum velocity however, we later take 1/max_vel
          ! so let's just set it small.
          if (.not. use_cfl_limit) then
             max_vel = epsilon(0.0)
          end if

          ! Calculate the effective maximum velocity based on the error
          ! based time step limit (really we estimate the maximum time step
          ! to keep our errors below our target and then convert to an
          ! effective velocity).
          if (use_order_based_error .and. (istep > istep_error_start)) then
             ! Get the current order based error estimate
             error = estimate_error(g1, g2, g3)

             ! Based on the current error estimate and the expected error scaling
             ! work out what time step we expect to give us the target error.
             ! Note that we don't scale by cfl here as the user has direct control
             ! over error_target already. Also note that by taking the square root
             ! we are assuming that the error looks like dt^2. We should probably
             ! look at what order schemes we're actually comparing to determine
             ! the appropriate dependency.
             target_dt =code_dt*sqrt(error_target/(error))

             ! Now we have an error limited time step lets convert this
             ! to an effective maximum velocity
             max_vel = max(max_vel, 1.0/target_dt)
          end if

          ! Now we can reduce our local limiting velocities to get
          ! a global value for the maximum velocity. We only need to
          ! reduce if at least one time limit check is active.
          if (use_cfl_limit .or. use_order_based_error) then
             ! Estimate the global cfl limit based on max_vel calculated
             ! in `add_nl`.If immediate_reset is true, check the max
             ! allreduce immediately.  If not, overlap comms and work,
             ! and check for reset at the end of the timestep.
             if(immediate_reset)then
                call max_allreduce(max_vel)
                ! This call can update the value of reset.
                call set_cfl_time_step_and_check_if_too_large
             else
                ! Do a nonblocking reduce and check later in
                ! [[gs2_reinit:check_time_step]].
                call nb_max_allreduce(max_vel, cfl_req_hand)
             end if
          else
             ! If we don't have any checks active then we better make sure that the
             ! limiting time step is set. We will have set max_vel earlier to
             ! epsilon(0.0) so we can just use the usual routine to set this, but
             ! can skip communications. Really we probably just need to do this once
             ! but for now just put this here to be on the safe side. The cost should
             ! be low.
             call set_cfl_time_step_and_check_if_too_large
          end if

          if(reset) then
             return !Return if resetting
          endif

       else
          call zero_array(g1)
       end if

#ifdef LOWFLOW
       ! do something
#endif

    end if

    if (zip) then
       do iglo = g_lo%llim_proc, g_lo%ulim_proc
          ik = ik_idx(g_lo,iglo)
          if (ik /= 1) then
             g (:,1,iglo) = 0.
             g (:,2,iglo) = 0.
             g1(:,1,iglo) = 0.
             g1(:,2,iglo) = 0.
          end if
       end do
    end if

    istep_last = istep
  end subroutine add_explicit

  !> Get an estimate of the error on the NL source term by comparing
  !> 2nd and 3rd order calculations of the source term.
  !>
  !> Here we're calculating g at t+code_dt using g(t) = g1, g(t-dt1) = g2
  !> and g(t-dt2) = g3 using 2nd and 3rd order methods and comparing
  !> the difference to estimate the error.
  real function estimate_error(g1, g2, g3) result(error)
    use gs2_time, only: get_adams_bashforth_coefficients, time_history_available
    use mp, only: max_allreduce
    use array_utils, only: gs2_max_abs
    implicit none
    complex, dimension(:, :, :), intent(in) :: g1, g2, g3
    complex, dimension(:, :, :), allocatable :: source_order_3, source_order_2
    real, dimension(2) :: intermediate_error
    real, dimension(:), allocatable :: coeff_3rd, coeff_2nd

    allocate(coeff_3rd(min(time_history_available(), 3)))
    allocate(coeff_2nd(min(time_history_available(), 2)))

    ! Check that we actually have 3rd and 2nd order coefficients.
    ! This could fail on the first two time steps of a simulation, for
    ! example, where we don't have enough time history to allow a
    ! 2nd/3rd order calculation. For now just return an error of
    ! error_target times cfl squared. This is chosen such the target
    ! timestep remains the current timestep.
    if ( (size(coeff_3rd) /= 3) .or. (size(coeff_2nd) /= 2) ) then
       error = error_target *cfl*cfl
       return
    end if

    ! Calculate the coefficients required for variable time step schemes
    coeff_3rd = get_adams_bashforth_coefficients(maximum_order = 3)
    coeff_2nd = get_adams_bashforth_coefficients(maximum_order = 2)

    ! Calculate the source terms using 3rd and 2nd order methods
    allocate(source_order_3, mold = g1)
    allocate(source_order_2, mold = g1)
    source_order_3 = coeff_3rd(1) * g1 + coeff_3rd(2) * g2 + coeff_3rd(3) * g3
    source_order_2 = coeff_2nd(1) * g1 + coeff_2nd(2) * g2

    ! Here we find the maximum absolute error
    intermediate_error(1) = gs2_max_abs(source_order_3 - source_order_2)

    ! Now we find the maximum of source_order_2 to scale the absolute
    ! error with.  Note we use this rather than source_order_3 as the
    ! higher order method has a smaller region of stability so is
    ! perhaps more likely to blow up.
    intermediate_error(2) = gs2_max_abs(source_order_2)

    ! Reduce over all processors
    call max_allreduce(intermediate_error)

    ! Now we form a crude relative error by dividing by the maximum
    ! source value. We do this rather than forming the true relative
    ! error (i.e. dividing the entire absolute difference by the source)
    ! to avoid issues with dividing by small numbers. This will tend to
    ! underestimate the real relative error.
    error = intermediate_error(1) / intermediate_error(2)
  end function estimate_error

  !> Takes input array evaluated at theta grid points
  !! and overwrites it with array evaluated at cell centers
  !! note that there is an extra factor of 2 in output array
  subroutine nl_center (gtmp, bd)
    !
    !CMR, 13/10/2014
    ! Fixing some (cosmetic) issues from prior to R3021.
    ! (1) forbid(-ntgrid:ntgrid) refers to grid points at cell boundaries
    !     gtmp(-ntgrid:ntgrid-1) refers to cell centers
    !     => applying forbid to output gtmp did not zero the correct things!!!
    ! (2) totally trapped particles are special, so return source without upwinding is fine
    !     BUT source for other trapped particles should NOT enter forbidden region.
    !     if ig or ig+1 forbidden (i.e. ig+1 or ig is bounce point) now return gtmp(ig,:,iglo)=0
    !
    use gs2_layouts, only: g_lo, il_idx
    use theta_grid, only: ntgrid
    use le_grids, only: forbid, grid_has_trapped_particles, jend, is_ttp
    use mp, only: mp_abort

    implicit none
    real, intent (in) :: bd    
    complex, dimension (-ntgrid:,:,g_lo%llim_proc:), intent (in out) :: gtmp
    integer :: iglo, il, ig
    logical :: trapped

    trapped = grid_has_trapped_particles()

    ! factor of one-half appears elsewhere
    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP PRIVATE(iglo, il, ig) &
    !$OMP SHARED(g_lo, nl_forbid_force_zero, forbid, gtmp, ntgrid, is_ttp, jend, trapped, bd) &
    !$OMP SCHEDULE(static)
    do iglo = g_lo%llim_proc, g_lo%ulim_proc
       il = il_idx(g_lo, iglo)

       !
       !CMR, 7/10/2014: 
       ! Incoming gtmp SHOULD vanish in forbidden region, 
       ! New logical in nonlinear_terms_knobs namelist: nl_forbid_force_zero
       !     nl_forbid_force_zero =.t. : force zeroing    (default)
       !     nl_forbid_force_zero =.f. : NO forced zeroing
       !                                 ie assume forbidden gtmp is zero on entry 
       !

       if ( nl_forbid_force_zero ) then
          ! force spurious gtmp outside trapped boundary to be zero
          where (forbid(:,il))
             gtmp(:,1,iglo) = 0.0
             gtmp(:,2,iglo) = 0.0
          end where
       endif

       do ig = -ntgrid, ntgrid-1
          !
          !CMR, 7/10/2014: 
          ! loop sets gtmp to value at upwinded cell center RIGHT of theta(ig)
          !           except for ttp where upwinding makes no sense!
          !
          ! Note this cycle doesn't skip when ig is in the forbidden region
          ! even if the pitch angle is greater than the totally trapped one
          ! at this location. This ensures that whilst we leave the (allowed)
          ! totally trapped source alone we still zero the NL source for forbidden
          ! points. Previously we cycled simply if the pitch angle index was larger
          ! than the totally trapped one. When there are multiple totally trapped
          ! pitch angles this may not be equivalent.
          if (is_ttp(ig, il)) cycle
          ! The below might be better off using forbid instead of checking jend
          if ( trapped .and. ( il > jend(ig) .or. il > jend(ig+1)) ) then
             !
             !CMR, 7/10/2014: 
             !   if either ig or ig+1 is forbidden, no source possible in a cell RIGHT of theta(ig) 
             !   => gtmp(ig,1:2,iglo)=0
             !
             gtmp(ig,1:2,iglo) = 0.0
          else 
             !
             !CMR, 7/10/2014: 
             !    otherwise ig and ig+1 BOTH allowed, and upwinding in cell RIGHT of theta(ig) is fine
             !
             gtmp(ig,1,iglo) = (1.+bd)*gtmp(ig+1,1,iglo) + (1.-bd)*gtmp(ig,1,iglo)
             gtmp(ig,2,iglo) = (1.-bd)*gtmp(ig+1,2,iglo) + (1.+bd)*gtmp(ig,2,iglo)

          endif
       end do
    end do
    !$OMP END PARALLEL DO
  end subroutine nl_center

  !> Calculates the current nonlinear source term by calling add_nl
  !> and the associated cfl limit.
  subroutine calculate_current_nl_source_and_cfl_limit(g_in, g1, phi, apar, bpar, &
       max_vel_local, need_to_adjust, calculate_cfl_limit)
    use optionals, only: get_option_with_default
    implicit none
    complex, dimension (:,:,:), intent (in) :: g_in
    complex, dimension (:,:,:), intent (out) :: g1
    complex, dimension (:,:,:), intent (in) :: phi, apar, bpar
    real, intent(out) :: max_vel_local
    logical, intent(in) :: need_to_adjust
    logical, intent(in), optional :: calculate_cfl_limit

    use_cfl_limit = get_option_with_default(calculate_cfl_limit, .true.)
    call add_nl(g_in, g1, phi, apar, bpar, need_to_adjust)

    if (use_cfl_limit) then
       ! This code may need changing if the handling of the cfl limit changes
       max_vel_local = max_vel
    else
       max_vel_local = 1.0e-8
    end if

  end subroutine calculate_current_nl_source_and_cfl_limit

  !> Calculate the nonlinear term and part of the corresponding CFL condition
  subroutine add_nl (g_in, g1, phi, apar, bpar, adjust)
    use theta_grid, only: ntgrid, kxfac
    use gs2_layouts, only: g_lo, ik_idx, it_idx
    use dist_fn_arrays, only: g_adjust, from_g_gs2
    use gs2_transforms, only: transform2, inverse2
    use kt_grids, only: aky, akx
    use gs2_time, only: save_dt_cfl, check_time_step_too_large
    use constants, only: zi
    use unit_tests, only: debug_message
    use optionals, only: get_option_with_default
    use array_utils, only: copy
    implicit none
    complex, dimension (-ntgrid:,:,g_lo%llim_proc:), intent (in) :: g_in
    complex, dimension (-ntgrid:,:,g_lo%llim_proc:), intent (out) :: g1
    !> Local array used to store the adjusted input dfn if needed. Saves
    !> calling g_adjust twice.
    complex, dimension (-ntgrid:ntgrid, 2, g_lo%llim_proc:g_lo%ulim_alloc) :: g2
    complex, dimension (-ntgrid:,:,:), intent (in) :: phi, apar, bpar
    logical, intent(in), optional :: adjust
    logical :: should_adjust
    integer, parameter :: verb = 4

    integer :: iglo, ik, it, ia
    
    call debug_message(verb, 'nonlinear_terms::add_nl starting')
    should_adjust = get_option_with_default(adjust, .true.)

    !Form g1=i*kx*chi
    call load_kx_phi(g1, phi)    
    call load_kx_bpar(g1, bpar)    
    call load_kx_apar(g1, apar)    

    call debug_message(verb, 'nonlinear_terms::add_nl calling transform2')
    !Transform to real space
    if (accelerated) then
       call transform2 (g1, aba, ia)
    else
       call transform2 (g1, ba)
    end if

    call debug_message(verb, 'nonlinear_terms::add_nl calculated dphi/dx')
    
    !Form g1=i*ky*g_wesson
    call copy(g_in, g1)
    if (should_adjust) call g_adjust(g1, phi, bpar, direction = from_g_gs2)
    call copy(g1, g2)

    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP PRIVATE(iglo, ik) &
    !$OMP SHARED(g_lo, g1, aky) &
    !$OMP SCHEDULE(static)
    do iglo = g_lo%llim_proc, g_lo%ulim_proc
       ik = ik_idx(g_lo,iglo)
       g1(:,:,iglo)=g1(:,:,iglo)*zi*aky(ik)
    enddo
    !$OMP END PARALLEL DO

    !Transform to real space
    if (accelerated) then
       call transform2 (g1, agb, ia)
    else
       call transform2 (g1, gb)
    end if

    call debug_message(verb, 'nonlinear_terms::add_nl calculated dg/dy')

    if (use_cfl_limit) then
       max_vel = 0.
       call get_max_vel(cfly)
    end if

    call calculate_bracket(.true.)
    
    call debug_message(verb, 'nonlinear_terms::add_nl calculated dphi/dx.dg/dy')

    !Form g1=i*ky*chi
    call load_ky_phi(g1, phi)
    call load_ky_bpar(g1, bpar)
    call load_ky_apar(g1, apar)

    call debug_message(verb, 'nonlinear_terms::add_nl calculated dphi/dy')

    !Transform to real space    
    if (accelerated) then
       call transform2 (g1, aba, ia)
    else
       call transform2 (g1, ba)
    end if

    !Form g1=i*kx*g_wesson noting that g2 holds our starting g_wesson

    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP PRIVATE(iglo, it) &
    !$OMP SHARED(g_lo, g2, akx) &
    !$OMP SCHEDULE(static)
    do iglo = g_lo%llim_proc, g_lo%ulim_proc
       it = it_idx(g_lo,iglo)
       g2(:,:,iglo) = g2(:,:,iglo) * zi * akx(it)
    enddo
    !$OMP END PARALLEL DO

    !Transform to real space
    if (accelerated) then
       call transform2 (g2, agb, ia)
    else
       call transform2 (g2, gb)
    end if

    call debug_message(verb, 'nonlinear_terms::add_nl calculated dg/dx')

    if (use_cfl_limit) call get_max_vel(cflx)

    call calculate_bracket(.false.)
    
    call debug_message(verb, &
      'nonlinear_terms::add_nl calculated dphi/dx.dg/dy.dphi/dy.dg/dx')

    !Transform NL source back to spectral space
    if (accelerated) then
       call inverse2 (abracket, g1, ia)
    else
       call inverse2 (bracket, g1)
    end if

    ! Scale by kxfac
    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP PRIVATE(iglo) &
    !$OMP SHARED(g_lo, g1, kxfac) &
    !$OMP SCHEDULE(static)
    do iglo = g_lo%llim_proc, g_lo%ulim_proc
       g1(:, :, iglo) = g1(:, :, iglo) * kxfac
    end do
    !$OMP END PARALLEL DO

    call debug_message(verb, &
      'nonlinear_terms::add_nl calculated nonlinear term')

  end subroutine add_nl

  !> FIXME : Add documentation    
  subroutine load_kx_phi(g1, phi)
    use theta_grid, only: ntgrid
    use gs2_layouts, only: g_lo, ik_idx, it_idx
    use dist_fn_arrays, only: aj0
    use run_parameters, only: has_phi, fphi
    use kt_grids, only: akx
    use constants, only: zi
    use array_utils, only: zero_array
    implicit none
    complex, dimension (-ntgrid:,:,g_lo%llim_proc:), intent (in out) :: g1
    complex, dimension (-ntgrid:,:,:), intent (in) :: phi      
    complex, dimension(-ntgrid:ntgrid) :: fac
    integer :: iglo, it, ik

    if (has_phi .and. include_phi) then
       !$OMP PARALLEL DO DEFAULT(none) &
       !$OMP PRIVATE(iglo, it, ik, fac) &
       !$OMP SHARED(g_lo, g1, akx, aj0, phi, fphi) &
       !$OMP SCHEDULE(static)
       do iglo = g_lo%llim_proc, g_lo%ulim_proc
          it = it_idx(g_lo,iglo)
          ik = ik_idx(g_lo,iglo)
          fac = zi*akx(it)*aj0(:,iglo)*phi(:,it,ik)*fphi
          g1(:,1,iglo) = fac
          g1(:,2,iglo) = fac
       end do
       !$OMP END PARALLEL DO
    else
       call zero_array(g1)
    endif
  end subroutine load_kx_phi

  !> FIXME : Add documentation    
  subroutine load_ky_phi(g1, phi)
    use theta_grid, only: ntgrid
    use gs2_layouts, only: g_lo, ik_idx, it_idx
    use dist_fn_arrays, only: aj0
    use run_parameters, only: has_phi, fphi
    use kt_grids, only: aky
    use constants, only: zi
    use array_utils, only: zero_array
    implicit none
    complex, dimension (-ntgrid:,:,g_lo%llim_proc:), intent (in out) :: g1
    complex, dimension (-ntgrid:,:,:), intent (in) :: phi      
    complex, dimension(-ntgrid:ntgrid) :: fac
    integer :: iglo, it, ik

    if (has_phi .and. include_phi) then
       !$OMP PARALLEL DO DEFAULT(none) &
       !$OMP PRIVATE(iglo, it, ik, fac) &
       !$OMP SHARED(g_lo, g1, aky, aj0, phi, fphi) &
       !$OMP SCHEDULE(static)
       do iglo = g_lo%llim_proc, g_lo%ulim_proc
          it = it_idx(g_lo,iglo)
          ik = ik_idx(g_lo,iglo)
          fac = zi*aky(ik)*aj0(:,iglo)*phi(:,it,ik)*fphi
          g1(:,1,iglo) = fac
          g1(:,2,iglo) = fac
       end do
       !$OMP END PARALLEL DO
    else
       call zero_array(g1)
    endif

  end subroutine load_ky_phi

  ! should I use vpa or vpac in next two routines??

  !> FIXME : Add documentation    
  subroutine load_kx_apar(g1, apar)
    use theta_grid, only: ntgrid
    use gs2_layouts, only: g_lo, ik_idx, it_idx, is_idx
    use dist_fn_arrays, only: vpa, aj0
    use species, only: spec
    use run_parameters, only: has_apar, fapar
    use kt_grids, only: akx
    use constants, only: zi
    implicit none
    complex, dimension (-ntgrid:,:,g_lo%llim_proc:), intent (in out) :: g1
    complex, dimension (-ntgrid:,:,:), intent (in) :: apar
    complex, dimension(-ntgrid:ntgrid) :: fac    
    integer iglo, it, ik, is

    if(.not. (include_apar .and. has_apar) ) return

    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP PRIVATE(iglo, it, ik, is, fac) &
    !$OMP SHARED(g_lo, g1, vpa, akx, aj0, spec, apar, fapar) &
    !$OMP SCHEDULE(static)
    do iglo = g_lo%llim_proc, g_lo%ulim_proc
       it = it_idx(g_lo,iglo)
       ik = ik_idx(g_lo,iglo)
       is = is_idx(g_lo,iglo)

       fac = zi*akx(it)*aj0(:,iglo)*spec(is)%stm*apar(:,it,ik)*fapar
       g1(:,1,iglo) = g1(:,1,iglo) - fac*vpa(:,1,iglo)
       g1(:,2,iglo) = g1(:,2,iglo) - fac*vpa(:,2,iglo)
    end do
    !$OMP END PARALLEL DO
  end subroutine load_kx_apar

  !> FIXME : Add documentation    
  subroutine load_ky_apar(g1, apar)
    use theta_grid, only: ntgrid
    use gs2_layouts, only: g_lo, ik_idx, it_idx, is_idx
    use dist_fn_arrays, only: vpa, aj0
    use species, only: spec
    use run_parameters, only: has_apar, fapar
    use kt_grids, only: aky
    use constants, only: zi
    implicit none
    complex, dimension (-ntgrid:,:,g_lo%llim_proc:), intent (in out) :: g1
    complex, dimension (-ntgrid:,:,:), intent (in) :: apar            
    complex, dimension(-ntgrid:ntgrid) :: fac    
    integer iglo, it, ik, is

    if(.not. (include_apar .and. has_apar)) return

    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP PRIVATE(iglo, it, ik, is, fac) &
    !$OMP SHARED(g_lo, g1, vpa, aky, aj0, spec, apar, fapar) &
    !$OMP SCHEDULE(static)
    do iglo = g_lo%llim_proc, g_lo%ulim_proc
       it = it_idx(g_lo,iglo)
       ik = ik_idx(g_lo,iglo)
       is = is_idx(g_lo,iglo)

       fac = zi*aky(ik)*aj0(:,iglo)*spec(is)%stm*apar(:,it,ik)*fapar
       g1(:,1,iglo) = g1(:,1,iglo) - fac*vpa(:,1,iglo)
       g1(:,2,iglo) = g1(:,2,iglo) - fac*vpa(:,2,iglo)
    end do
    !$OMP END PARALLEL DO
  end subroutine load_ky_apar

  !> FIXME : Add documentation    
  subroutine load_kx_bpar(g1, bpar)
    use theta_grid, only: ntgrid
    use gs2_layouts, only: g_lo, ik_idx, it_idx, is_idx
    use dist_fn_arrays, only: vperp2, aj1
    use species, only: spec
    use run_parameters, only: has_bpar, fbpar
    use kt_grids, only: akx
    use constants, only: zi
    implicit none
    complex, dimension (-ntgrid:,:,g_lo%llim_proc:), intent (in out) :: g1
    complex, dimension (-ntgrid:,:,:), intent (in) :: bpar      
    complex, dimension(-ntgrid:ntgrid) :: fac    
    integer iglo, it, ik, is

    if (.not. (include_bpar .and. has_bpar)) return

    ! Is this factor of two from the old normalization?

    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP PRIVATE(iglo, it, ik, is, fac) &
    !$OMP SHARED(g_lo, g1, akx, aj1, vperp2, spec, bpar, fbpar) &
    !$OMP SCHEDULE(static)
    do iglo = g_lo%llim_proc, g_lo%ulim_proc
       it = it_idx(g_lo,iglo)
       ik = ik_idx(g_lo,iglo)
       is = is_idx(g_lo,iglo)
       fac = zi*akx(it)*aj1(:,iglo) &
            *2.0*vperp2(:,iglo)*spec(is)%tz*bpar(:,it,ik)*fbpar
       g1(:,1,iglo) = g1(:,1,iglo) + fac
       g1(:,2,iglo) = g1(:,2,iglo) + fac
    end do
    !$OMP END PARALLEL DO
  end subroutine load_kx_bpar

  !> FIXME : Add documentation    
  subroutine load_ky_bpar(g1, bpar)
    use theta_grid, only: ntgrid
    use gs2_layouts, only: g_lo, ik_idx, it_idx, is_idx
    use dist_fn_arrays, only: vperp2, aj1
    use species, only: spec
    use run_parameters, only: has_bpar, fbpar
    use kt_grids, only: aky
    use constants, only: zi
    implicit none
    complex, dimension (-ntgrid:,:,g_lo%llim_proc:), intent (in out) :: g1
    complex, dimension (-ntgrid:,:,:), intent (in) :: bpar
    complex, dimension(-ntgrid:ntgrid) :: fac
    integer iglo, it, ik, is

    if (.not. (include_bpar .and. has_bpar)) return

    ! Is this factor of two from the old normalization?

    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP PRIVATE(iglo, it, ik, is, fac) &
    !$OMP SHARED(g_lo, g1, aky, aj1, vperp2, spec, bpar, fbpar) &
    !$OMP SCHEDULE(static)
    do iglo = g_lo%llim_proc, g_lo%ulim_proc
       it = it_idx(g_lo,iglo)
       ik = ik_idx(g_lo,iglo)
       is = is_idx(g_lo,iglo)
       fac =  zi*aky(ik)*aj1(:,iglo) &
            *2.0*vperp2(:,iglo)*spec(is)%tz*bpar(:,it,ik)*fbpar
       g1(:,1,iglo) = g1(:,1,iglo) + fac
       g1(:,2,iglo) = g1(:,2,iglo) + fac
    end do
    !$OMP END PARALLEL DO
  end subroutine load_ky_bpar

  !> Calculates an estimate of the current maximum velocity due to the
  !! perturbed potentials. This is currenly a processor local estimate.
  subroutine get_max_vel(scaling)
    use array_utils, only: gs2_max_abs
    implicit none
    real, intent(in) :: scaling
    real :: local_max_vel
    if (accelerated) then
       local_max_vel = gs2_max_abs(aba)
    else
       local_max_vel = gs2_max_abs(ba)
    end if
    
    local_max_vel = local_max_vel * scaling

    max_vel = max(max_vel, local_max_vel)
  end subroutine get_max_vel

  !> Calculate (d Chi /dx).(d g_wesson/dy) and store in bracket if is_first_term = .true.
  !> else calculate (d Chi /dy).(d g_wesson/dx) and subtract from bracket
  !>
  !> @note The final bracket value must be scaled by kxfac to produce
  !> the correct value to be used.
  subroutine calculate_bracket(is_first_term)
    use gs2_layouts, only: accelx_lo, yxf_lo
    implicit none
    logical :: is_first_term
    integer :: iyxf
    if (is_first_term) then
       if (accelerated) then
          !$OMP PARALLEL DO DEFAULT(none) &
          !$OMP PRIVATE(iyxf) &
          !$OMP SHARED(accelx_lo, agb, aba, abracket) &
          !$OMP SCHEDULE(static)
          do iyxf = accelx_lo%llim_proc, accelx_lo%ulim_proc
             abracket(:, :, iyxf) = aba(:, :, iyxf)*agb(:, :, iyxf)
          end do
          !$OMP END PARALLEL DO
       else
          !$OMP PARALLEL DO DEFAULT(none) &
          !$OMP PRIVATE(iyxf) &
          !$OMP SHARED(yxf_lo, bracket, ba, gb) &
          !$OMP SCHEDULE(static)
          do iyxf = yxf_lo%llim_proc, yxf_lo%ulim_proc
             bracket(:, iyxf) = ba(:, iyxf)*gb(:, iyxf)
          end do
          !$OMP END PARALLEL DO
       endif
    else
       if (accelerated) then
          !$OMP PARALLEL DO DEFAULT(none) &
          !$OMP PRIVATE(iyxf) &
          !$OMP SHARED(accelx_lo, agb, aba, abracket) &
          !$OMP SCHEDULE(static)
          do iyxf = accelx_lo%llim_proc, accelx_lo%ulim_proc
             abracket(:, :, iyxf) = abracket(:, :, iyxf) - aba(:, :, iyxf)*agb(:, :, iyxf)
          end do
          !$OMP END PARALLEL DO
       else
          !$OMP PARALLEL DO DEFAULT(none) &
          !$OMP PRIVATE(iyxf) &
          !$OMP SHARED(yxf_lo, bracket, ba, gb) &
          !$OMP SCHEDULE(static)
          do iyxf = yxf_lo%llim_proc, yxf_lo%ulim_proc
             bracket(:, iyxf) = bracket(:, iyxf) - ba(:, iyxf)*gb(:, iyxf)
          end do
          !$OMP END PARALLEL DO
       endif
    endif
  end subroutine calculate_bracket
  
  !> this subroutine constructs the GK nonlinear term from the 6 GryfX
  !! gyrofluid nonlinear terms.  we first construct in gryfx units.
  !! gryfx uses vt=sqrt(T/m) normalization, 
  !! so vt_gs2 = sqrt(2T/m) = sqrt(2) vt_gryfx.
  !! since vpa and vperp2 are normalized to vt_gs2, below we multiply by
  !! factors of sqrt(2) to get vpa and vperp2 in gryfx units, i.e.
  !! vpa_gryfx = sqrt(2) vpa
  !! vperp2_gryfx = 2 vperp2
  !!
  !! the Hermite-Laguerre construction of the distribution from the gyrofluid
  !! moments is of the form
  !! df/F_M = n + vpar/vt upar + 1/2(vpar^2/vt^2-1) tpar 
  !! + (vprp^2/2vt^2-1) tprp + 1/6(vpar^3/3vt^3 - 3 vpar/vt) qpar
  !! + vpar/vt (vprp^2/2vt^2-1) qprp
  !! where vt=vt_gryfx, and moments are normalized in gryfx units
  subroutine add_nl_gryfx (g1)
    use mp, only: max_allreduce
    use theta_grid, only: ntgrid
    use gs2_layouts, only: g_lo, ik_idx, it_idx, is_idx
    use dist_fn_arrays, only: vpa, vperp2
    use gs2_transforms, only: transform2, inverse2
    use gs2_time, only: save_dt_cfl, check_time_step_too_large
    use mp, only: broadcast
    implicit none
    complex, dimension (-ntgrid:,:,g_lo%llim_proc:), intent (out) :: g1

    integer :: iglo, ik, it, ig, iz, is, isgn, index_gryfx

    do iglo = g_lo%llim_proc, g_lo%ulim_proc
       ik = ik_idx(g_lo,iglo)
       it = it_idx(g_lo,iglo)
       is = is_idx(g_lo,iglo)
       do isgn = 1, 2
          do ig = -ntgrid, ntgrid
             iz = ig + ntgrid + 1
             if(ig==ntgrid) iz = 1 ! periodic point not included in gryfx arrays
             index_gryfx = 1 + (ik-1) + g_lo%naky*((it-1)) + &
                  g_lo%naky*g_lo%ntheta0*(iz-1) + &
                  (2*ntgrid)*g_lo%naky*g_lo%ntheta0*(is-1)
             
             g1(ig,isgn,iglo) =  gryfx_zonal%NLdens_ky0(index_gryfx) + &
                  
                  0.5*(2.*vpa(ig,isgn,iglo)*vpa(ig,isgn,iglo) - 1.)* &
                  gryfx_zonal%NLtpar_ky0(index_gryfx) + &
                  
                  (vperp2(ig,iglo) - 1.0)* &
                  gryfx_zonal%NLtprp_ky0(index_gryfx) + &

                  sqrt(2.)*vpa(ig,isgn,iglo)* &
                     gryfx_zonal%NLupar_ky0(index_gryfx) + &
                     
                     1./6.*( (sqrt(2.)*vpa(ig,isgn,iglo))**3. &
                     - 3*sqrt(2.)*vpa(ig,isgn,iglo) )* &
                     gryfx_zonal%NLqpar_ky0(index_gryfx) + &
                     
                     sqrt(2.)*vpa(ig,isgn,iglo)*(vperp2(ig,iglo) - 1.)* &
                     gryfx_zonal%NLqprp_ky0(index_gryfx) 
             
             ! now we must account for the fact that phi and g are normalized to a/rho_i,
             ! and grad ~ k is normalized to rho_i, and rho_gs2 = sqrt2 rho_gryfx.
             ! we have just calculated the GK nonlinear term in gryfx units:
             ! g1 = z_hat X grad_gryfx phi_gryfx . grad_gryfx g_gryfx =
             ! z_hat X (1/sqrt2 grad_gs2)(sqrt2 phi_gs2) . (1/sqrt2 grad_gs2)(sqrt2 g_gs2)
             ! = z_hat X grad_gs2 phi_gs2 . grad_gs2 g_gs2
             ! so it turns out that the GK nonlinear term is the same in gryfx units
             ! and gs2 units, so we don't need any additional factors.

          end do
       end do
    end do
    g1 = -g1 !left-handed / right-handed conversion    
  end subroutine add_nl_gryfx

  !> Finish nonblocking calculation of the maximum velocity
  !> and set the cfl limit.
  subroutine nb_check_time_step_too_large
    use run_parameters, only: immediate_reset
    implicit none
    if (immediate_reset) return
    call finish_nonblocking_max_vel_reduction
    call set_cfl_time_step_and_check_if_too_large
  end subroutine nb_check_time_step_too_large

  !> Finish nonblocking calculation of the maximum velocity
  !>
  !> This receives the mpi_iallreduce of the maximum velocity.
  subroutine finish_nonblocking_max_vel_reduction
    use mp, only: wait, mp_request_null
    implicit none

    ! Wait for the nonblocking maximum operation used to find
    ! the maximum velocity.
    if (cfl_req_hand == mp_request_null) return
    call wait(cfl_req_hand)
    cfl_req_hand = mp_request_null
  end subroutine finish_nonblocking_max_vel_reduction

  !> Calculates the cfl limit from the module level max_vel value
  !> saves it using [[gs2_time:save_dt_cfl]] and then sets the reset
  !> flag based on checking if the current time step is too large.
  subroutine set_cfl_time_step_and_check_if_too_large
    use run_parameters, only: reset
    use gs2_time, only: save_dt_cfl, check_time_step_too_large
    implicit none
    real :: dt_cfl

    ! Estimate the global cfl limit based on max_vel.
    if (max_vel <= 0.0) then
       ! We should only end up here in unusual cases (likely
       ! tests) as we initialise max_vel > 0 so the only way
       ! we get here is if the estimated NL velocity limit is
       ! exactly 0.
       dt_cfl = dt_cfl_default_large
    else
       dt_cfl = 1./max_vel
    end if

    call save_dt_cfl (dt_cfl)

    ! Now check to see if we've violated the cfl condition.
    call check_time_step_too_large(reset)
  end subroutine set_cfl_time_step_and_check_if_too_large

  !> FIXME : Add documentation
  subroutine reset_init
    implicit none
    initialized = .false.
    initializing = .true.
  end subroutine reset_init

  !> FIXME : Add documentation  
  subroutine finish_init
    implicit none
    initializing = .false.
  end subroutine finish_init

  !> FIXME : Add documentation  
  subroutine finish_nonlinear_terms
#ifndef SHMEM    
    use gs2_transforms, only: finish_transforms
#else
    use shm_mpi3, only : shm_free
#endif
    implicit none

#ifndef SHMEM
    if (allocated(aba)) deallocate (aba, agb, abracket)
    if (allocated(ba)) deallocate (ba, gb, bracket)
#else
    if (associated(aba)) call shm_free(aba)
    if (associated(agb)) call shm_free(agb)
    if (associated(abracket)) call shm_free(abracket)
    if (associated(ba)) then
       call shm_free(ba)
    endif
    if (associated(gb)) then
       call shm_free(gb)
    endif
    if (associated(bracket)) then
       call shm_free(bracket)
    endif
#endif
    nonlin = .false. ; alloc = .true. ; zip = .false. ; accelerated = .false.
    initialized = .false. ; initializing = .true. ; exist = .false.

#ifndef SHMEM
    call finish_transforms
#endif

    call nonlinear_terms_config%reset()
  end subroutine finish_nonlinear_terms

  !> Set the module level config type
  !> Will abort if the module has already been initialised to avoid
  !> inconsistencies.
  subroutine set_nonlinear_terms_config(nonlinear_terms_config_in)
    use mp, only: mp_abort
    type(nonlinear_terms_config_type), intent(in), optional :: nonlinear_terms_config_in
    if (initialized) then
       call mp_abort("Trying to set nonlinear_terms_config when already initialized.", to_screen = .true.)
    end if
    if (present(nonlinear_terms_config_in)) then
       nonlinear_terms_config = nonlinear_terms_config_in
    end if
  end subroutine set_nonlinear_terms_config

  !> Get the module level config instance
  function get_nonlinear_terms_config()
    type(nonlinear_terms_config_type) :: get_nonlinear_terms_config
    get_nonlinear_terms_config = nonlinear_terms_config
  end function get_nonlinear_terms_config

  !---------------------------------------
  ! Following is for the config_type
  !---------------------------------------

  !> Reads in the nonlinear_terms_knobs namelist and populates the member variables
  subroutine read_nonlinear_terms_config(self)
    use file_utils, only: input_unit_exist, get_indexed_namelist_unit
    use mp, only: proc0
    implicit none
    class(nonlinear_terms_config_type), intent(in out) :: self
    logical :: exist
    integer :: in_file

    ! Note: When this routine is in the module where these variables live
    ! we shadow the module level variables here. This is intentional to provide
    ! isolation and ensure we can move this routine to another module easily.
    character(len = 20) :: nonlinear_mode
    integer :: istep_error_start
    logical :: include_apar, include_bpar, include_phi, nl_forbid_force_zero, split_nonlinear, use_cfl_limit, use_order_based_error, zip
    real :: cfl, densfac, error_target, qparfac, qprpfac, tparfac, tprpfac, uparfac

    namelist /nonlinear_terms_knobs/ cfl, densfac, error_target, include_apar, include_bpar, include_phi, istep_error_start, nl_forbid_force_zero, nonlinear_mode, qparfac, qprpfac, split_nonlinear, tparfac, tprpfac, uparfac, &
         use_cfl_limit, use_order_based_error, zip

    ! Only proc0 reads from file
    if (.not. proc0) return

    ! First set local variables from current values
    cfl = self%cfl
    densfac = self%densfac
    error_target = self%error_target
    include_apar = self%include_apar
    include_bpar = self%include_bpar
    include_phi = self%include_phi
    istep_error_start = self%istep_error_start
    nl_forbid_force_zero = self%nl_forbid_force_zero
    nonlinear_mode = self%nonlinear_mode
    qparfac = self%qparfac
    qprpfac = self%qprpfac
    split_nonlinear = self%split_nonlinear
    tparfac = self%tparfac
    tprpfac = self%tprpfac
    uparfac = self%uparfac
    use_cfl_limit = self%use_cfl_limit
    use_order_based_error = self%use_order_based_error
    zip = self%zip

    ! Now read in the main namelist
    in_file = input_unit_exist(self%get_name(), exist)
    if (exist) read(in_file, nml = nonlinear_terms_knobs)

    ! Now copy from local variables into type members
    self%cfl = cfl
    self%densfac = densfac
    self%error_target = error_target
    self%include_apar = include_apar
    self%include_bpar = include_bpar
    self%include_phi = include_phi
    self%istep_error_start = istep_error_start
    self%nl_forbid_force_zero = nl_forbid_force_zero
    self%nonlinear_mode = nonlinear_mode
    self%qparfac = qparfac
    self%qprpfac = qprpfac
    self%split_nonlinear = split_nonlinear
    self%tparfac = tparfac
    self%tprpfac = tprpfac
    self%uparfac = uparfac
    self%use_cfl_limit = use_cfl_limit
    self%use_order_based_error = use_order_based_error
    self%zip = zip

    self%exist = exist
  end subroutine read_nonlinear_terms_config

  !> Writes out a namelist representing the current state of the config object
  subroutine write_nonlinear_terms_config(self, unit)
    implicit none
    class(nonlinear_terms_config_type), intent(in) :: self
    integer, intent(in) , optional:: unit
    integer :: unit_internal

    unit_internal = 6 ! @todo -- get stdout from file_utils
    if (present(unit)) then
       unit_internal = unit
    endif

    call self%write_namelist_header(unit_internal)
    call self%write_key_val("cfl", self%cfl, unit_internal)
    call self%write_key_val("densfac", self%densfac, unit_internal)
    call self%write_key_val("error_target", self%error_target, unit_internal)
    call self%write_key_val("include_apar", self%include_apar, unit_internal)
    call self%write_key_val("include_bpar", self%include_bpar, unit_internal)
    call self%write_key_val("include_phi", self%include_phi, unit_internal)
    call self%write_key_val("istep_error_start", self%istep_error_start, unit_internal)
    call self%write_key_val("nl_forbid_force_zero", self%nl_forbid_force_zero, unit_internal)
    call self%write_key_val("nonlinear_mode", self%nonlinear_mode, unit_internal)
    call self%write_key_val("qparfac", self%qparfac, unit_internal)
    call self%write_key_val("qprpfac", self%qprpfac, unit_internal)
    call self%write_key_val("split_nonlinear", self%split_nonlinear, unit_internal)
    call self%write_key_val("tparfac", self%tparfac, unit_internal)
    call self%write_key_val("tprpfac", self%tprpfac, unit_internal)
    call self%write_key_val("uparfac", self%uparfac, unit_internal)
    call self%write_key_val("use_cfl_limit", self%use_cfl_limit, unit_internal)
    call self%write_key_val("use_order_based_error", self%use_order_based_error, unit_internal)
    call self%write_key_val("zip", self%zip, unit_internal)
    call self%write_namelist_footer(unit_internal)
  end subroutine write_nonlinear_terms_config

  !> Resets the config object to the initial empty state
  subroutine reset_nonlinear_terms_config(self)
    class(nonlinear_terms_config_type), intent(in out) :: self
    type(nonlinear_terms_config_type) :: empty
    select type (self)
    type is (nonlinear_terms_config_type)
       self = empty
    end select
  end subroutine reset_nonlinear_terms_config

  !> Broadcasts all config parameters so object is populated identically on
  !! all processors
  subroutine broadcast_nonlinear_terms_config(self)
    use mp, only: broadcast
    implicit none
    class(nonlinear_terms_config_type), intent(in out) :: self
    call broadcast(self%cfl)
    call broadcast(self%densfac)
    call broadcast(self%error_target)
    call broadcast(self%include_apar)
    call broadcast(self%include_bpar)
    call broadcast(self%include_phi)
    call broadcast(self%istep_error_start)
    call broadcast(self%nl_forbid_force_zero)
    call broadcast(self%nonlinear_mode)
    call broadcast(self%qparfac)
    call broadcast(self%qprpfac)
    call broadcast(self%split_nonlinear)
    call broadcast(self%tparfac)
    call broadcast(self%tprpfac)
    call broadcast(self%uparfac)
    call broadcast(self%use_cfl_limit)
    call broadcast(self%use_order_based_error)
    call broadcast(self%zip)

    call broadcast(self%exist)
  end subroutine broadcast_nonlinear_terms_config

  !> Gets the default name for this namelist
  function get_default_name_nonlinear_terms_config()
    implicit none
    character(len = CONFIG_MAX_NAME_LEN) :: get_default_name_nonlinear_terms_config
    get_default_name_nonlinear_terms_config = "nonlinear_terms_knobs"
  end function get_default_name_nonlinear_terms_config

  !> Gets the default requires index for this namelist
  function get_default_requires_index_nonlinear_terms_config()
    implicit none
    logical :: get_default_requires_index_nonlinear_terms_config
    get_default_requires_index_nonlinear_terms_config = .false.
  end function get_default_requires_index_nonlinear_terms_config

end module nonlinear_terms