redistribute.f90 Source File


Contents

Source Code


Source Code

!> Redistribute distributed (integer, real, complex or logical) 
!! (1, 2, 3, or 4) dimensional arrays into two dimensional arrays with 
!! first index on local processor, and vice versa.
!!
!! The first operation is called 'gather' and the second is called 'scatter.'
!!  
!! One can also do a 'fill' operation.  This consists of copying 
!! values from a (2, 3, or 4) dimensional array of 
!! (integer, real, complex, or logical ) values into 
!! another array with the same number of dimensions.
!!
!! One can also do a three index to four index redistribution for complex numbers.
module redistribute

! Modifications for optimised local copy in c_redist_22 and c_redist_32 
! (and their inverse routines):
! (c) The Numerical Algorithms Group (NAG) Ltd, 2012
! on behalf of EPSRC for the HECToR project

  implicit none
  public :: opt_redist_nbk, using_measure_scatter, opt_redist_persist, opt_redist_persist_overlap
  public :: write_connection_matrix, get_redist_times, reset_redist_times
  private

  logical :: opt_local_copy = .false.
  logical :: opt_redist_nbk = .true. !< Do we use non-blocking sends?
  logical :: opt_redist_persist = .false. !< Do we use persistent sends?
  logical :: opt_redist_persist_overlap = .false. !< Do we try to do a bit of overlapping with persistent sends?
  logical :: using_measure_scatter=.true. !< If true then don't time in gather/scatter directly
  integer :: naky = 0
  integer :: ntgrid = 0
  integer :: nlambda = 0
  integer :: ntheta0 = 0
  integer :: negrid = 0
  integer :: nx = 0
  integer :: xxf_lo_ulim_proc = 0
  integer :: yxf_lo_ulim_proc = 0
  integer :: g_lo_ulim = 0
  character (len=5) :: layout = ''

  public :: index_list_type, delete_list
  public :: redist_type, delete_redist
  public :: report_map_property, measure_gather, measure_scatter
  public :: gather_count, scatter_count, time_redist

  public :: init_redist, gather, scatter
  public :: init_fill, fill
  public :: set_redist_character_type
  public :: set_xxf_optimised_variables
  public :: set_yxf_optimised_variables
  public :: set_optimised_choice
  public :: get_redistname

  interface gather
     module procedure c_redist_22, r_redist_22, i_redist_22, l_redist_22
     module procedure c_redist_32, r_redist_32, i_redist_32, l_redist_32
     module procedure c_redist_42, r_redist_42, i_redist_42, l_redist_42
     module procedure c_redist_23
     module procedure c_redist_34, c_redist_36
     module procedure r_redist_62 ! MRH I have left this subroutine in place for now, although we may consider removing it since it is redundant (only r_redist_26 is required)
     module procedure r_redist_26
     module procedure c_redist_33, r_redist_33
  end interface

  interface scatter
     module procedure c_redist_12,     r_redist_12,     i_redist_12,     l_redist_12
     module procedure c_redist_22_inv, r_redist_22_inv, i_redist_22_inv, l_redist_22_inv
     module procedure c_redist_32_inv, r_redist_32_inv, i_redist_32_inv, l_redist_32_inv
     module procedure c_redist_42_inv, r_redist_42_inv, i_redist_42_inv, l_redist_42_inv
     module procedure c_redist_36_inv
     module procedure r_redist_62_inv ! MRH I have left this subroutine in place for now, although we may consider removing it since it is redundant (only r_redist_26_inv is required)
     module procedure r_redist_26_inv
     module procedure c_redist_33_inv, r_redist_33_inv
  end interface

  interface measure_gather
     module procedure measure_gather_32, measure_gather_33
     module procedure measure_gather_22
  end interface

  interface measure_scatter
     module procedure measure_scatter_23, measure_scatter_33
     module procedure measure_scatter_22
  end interface

   interface init_redist
      module procedure init_redist_gs2, init_redist_agk
   end interface

  integer :: gather_count=0, scatter_count=0
  real :: time_redist(2)=0.
  real :: time_redist_mpi = 0.

  interface fill
     module procedure c_fill_2, c_fill_3, c_fill_4
     module procedure r_fill_2, r_fill_3, r_fill_4
     module procedure i_fill_2, i_fill_3, i_fill_4
     module procedure l_fill_2, l_fill_3, l_fill_4
  end interface

  !> FIXME : Add documentation
  type :: index_map
     integer :: nn
     integer, dimension (:), allocatable :: k
     integer, dimension (:), allocatable :: l
     integer, dimension (:), allocatable :: m
     integer, dimension (:), allocatable :: n
     integer, dimension (:), allocatable :: o
     integer, dimension (:), allocatable :: p
  end type index_map

  !> Small container type simply providing a 1D allocatable
  !> buffer. Used to allow array of buffers of different sizes.
  type :: redist_buffer
     complex, dimension(:), allocatable :: complex_buffer
     real, dimension(:), allocatable :: real_buffer
  end type redist_buffer

  !> FIXME : Add documentation
  type :: redist_type
     private
     integer, dimension (6) :: to_low, from_low, to_high, from_high
     type (index_map), dimension (:), allocatable :: to
     type (index_map), dimension (:), allocatable :: from
     complex, dimension (:), allocatable :: complex_buff
     real,    dimension (:), allocatable :: real_buff
     integer, dimension (:), allocatable :: integer_buff
     logical, dimension (:), allocatable :: logical_buff
     integer :: nsend, nrecv !Note these swap meaning when doing inverse communication
     type(redist_buffer), dimension(:), allocatable :: buff_send, buff_recv
     integer, dimension(:), allocatable :: send_hand, recv_hand, send_inv_hand, recv_inv_hand
     integer, dimension(:), allocatable :: recv_ip, recv_inv_ip
     ! optimised choice and optimised_choice_inv set the algorithm used for
     ! gather/scatters. With default zero, different algorithms are timed
     ! during initialization. Putting these inside the redist object allows
     ! this behaviour to be overridden. This is useful, for example, when using
     ! "split domains" layouts, where one of the optimized algorithms cannot be
     ! used as it makes assumptions about the form of layouts.
     integer :: optimised_choice = 0
     integer :: optimised_choice_inv = 0
     character (len=3) :: redistname = ""
     character (len=1) :: char = 'X'
  end type redist_type

  !> FIXME : Add documentation
  type :: index_list_type
     integer, dimension (:), allocatable :: first
     integer, dimension (:), allocatable :: second
     integer, dimension (:), allocatable :: third
     integer, dimension (:), allocatable :: fourth
     integer, dimension (:), allocatable :: fifth
     integer, dimension (:), allocatable :: sixth
  end type index_list_type

contains
  !> Returns current requested timer values
  subroutine get_redist_times(total_time, mpi_time, copy_time)
    implicit none
    real, intent(out), optional :: total_time, mpi_time, copy_time

    if (present(total_time)) total_time = time_redist(1)
    if (present(mpi_time)) mpi_time = time_redist_mpi
    if (present(copy_time)) copy_time = time_redist(1) - time_redist_mpi
  end subroutine get_redist_times

  !> Resets redist timers to zero
  subroutine reset_redist_times
    implicit none
    time_redist = 0.
    time_redist_mpi = 0.
  end subroutine reset_redist_times

  !> FIXME : Add documentation  
  subroutine set_redist_character_type(r, chartype)
    implicit none
    type (redist_type), intent (inout) :: r
    character (3), intent (in) :: chartype

    r%redistname = chartype

  end subroutine set_redist_character_type

  !> FIXME : Add documentation  
  subroutine set_yxf_optimised_variables(lyxf_lo_ulim_proc)
    implicit none
    integer, intent (in) :: lyxf_lo_ulim_proc

    yxf_lo_ulim_proc = lyxf_lo_ulim_proc

  end subroutine set_yxf_optimised_variables

  !> FIXME : Add documentation  
  subroutine set_xxf_optimised_variables(lopt_local_copy, lnaky, lntgrid, lntheta0, lnlambda, lnegrid, lnx, &
       lxxf_lo_ulim_proc, lg_lo_ulim, llayout)
    implicit none
    logical, intent (in) :: lopt_local_copy
    integer, intent (in) :: lnaky, lntgrid, lxxf_lo_ulim_proc, lg_lo_ulim, lntheta0, lnlambda, lnegrid, lnx
    character (len=5), intent (in) :: llayout

    opt_local_copy = lopt_local_copy
    naky = lnaky
    ntgrid = lntgrid
    xxf_lo_ulim_proc = lxxf_lo_ulim_proc
    ntheta0 = lntheta0
    nlambda = lnlambda
    negrid = lnegrid
    nx = lnx
    g_lo_ulim = lg_lo_ulim
    layout = llayout

  end subroutine set_xxf_optimised_variables

  !> Set choice of redist algorithm based on properties of g_lo
  subroutine set_optimised_choice(r,val)
    implicit none
    type (redist_type), intent (inout) :: r
    integer, intent (in) :: val

    r%optimised_choice = val
    r%optimised_choice_inv = val

  end subroutine set_optimised_choice

  !> Get redistname
  function get_redistname(r)
    implicit none
    type (redist_type), intent (inout) :: r
    character (len=3) :: get_redistname

    get_redistname = r%redistname

  end function get_redistname

  !> FIXME : Add documentation
  subroutine setup_persistent(r)
    use mp, only: iproc, nproc, send_init, recv_init, mp_request_null
    implicit none
    type (redist_type), intent (inout) :: r
    integer :: ip, count_r, count_s

    !Count messages
    r%nrecv = 0
    do ip = 0, nproc - 1
       if (r%to(ip)%nn > 0 .and. ip /= iproc) r%nrecv = r%nrecv + 1
    end do
    r%nsend = 0
    do ip = 0, nproc - 1
       if (r%from(ip)%nn > 0 .and. ip /= iproc) r%nsend = r%nsend + 1
    end do

    if(r%nrecv>0)then
       allocate(r%recv_ip(r%nrecv))
    end if
    if(r%nsend>0)then
       allocate(r%recv_inv_ip(r%nsend))
    end if

    !Setup processor lookups
    count_s = 0 ; count_r = 0
    do ip = 0, nproc - 1
       if (r%from(ip)%nn > 0 .and. ip /= iproc) then
          count_s = count_s + 1
          r%recv_inv_ip(count_s) = ip
       endif
       if (r%to(ip)%nn > 0 .and. ip /= iproc) then
          count_r = count_r + 1
          r%recv_ip(count_r) = ip
       end if
    end do

    !//Persistent comms -- Note could use these buffers in nonblocking case as well
    if (opt_redist_persist) then
       !Now allocate some buffers
       !Only implemented complex and real so far
       if (r%char == 'c' .or. r%char == 'r') then
          if (r%nrecv>0) then
             allocate(r%buff_recv(r%nrecv))
             count_r = 0
             do ip = 0, nproc - 1
                if (r%to(ip)%nn > 0 .and. ip /= iproc) then
                   count_r=count_r+1
                   if (r%char == 'c') then
                      allocate(r%buff_recv(count_r)%complex_buffer(r%to(ip)%nn))
                   else if (r%char == 'r') then
                      allocate(r%buff_recv(count_r)%real_buffer(r%to(ip)%nn))
                   end if
                end if
             end do
             allocate(r%recv_hand(r%nrecv))
             r%recv_hand = mp_request_null
             allocate(r%send_inv_hand(r%nrecv))
             r%send_inv_hand = mp_request_null
          end if

          if (r%nsend>0) then
             allocate(r%buff_send(r%nsend))
             count_s = 0
             do ip = 0, nproc - 1
                if (r%from(ip)%nn > 0 .and. ip /= iproc) then
                   count_s=count_s+1
                   if (r%char == 'c') then
                      allocate(r%buff_send(count_s)%complex_buffer(r%from(ip)%nn))
                   else if (r%char == 'r') then
                      allocate(r%buff_send(count_s)%real_buffer(r%from(ip)%nn))
                   end if
                end if
             end do
             allocate(r%send_hand(r%nsend))
             r%send_hand = mp_request_null
             allocate(r%recv_inv_hand(r%nsend))
             r%recv_inv_hand = mp_request_null
          end if

          !Setup communications
          count_s = 0 ;  count_r = 0
          do ip = 0, nproc - 1
             if (r%from(ip)%nn > 0 .and. ip /= iproc) then
                count_s = count_s+1
                if (r%char == 'c') then
                   call send_init(r%buff_send(count_s)%complex_buffer, &
                        ip, iproc, r%send_hand(count_s))
                   call recv_init(r%buff_send(count_s)%complex_buffer, &
                        ip, ip, r%recv_inv_hand(count_s))
                else if (r%char == 'r') then
                   call send_init(r%buff_send(count_s)%real_buffer, &
                        ip, iproc, r%send_hand(count_s))
                   call recv_init(r%buff_send(count_s)%real_buffer, &
                        ip, ip, r%recv_inv_hand(count_s))
                end if
             end if

             if (r%to(ip)%nn > 0 .and. ip /= iproc) then
                count_r = count_r+1
                if (r%char == 'c') then
                   call recv_init(r%buff_recv(count_r)%complex_buffer, &
                        ip, ip, r%recv_hand(count_r))
                   call send_init(r%buff_recv(count_r)%complex_buffer, &
                        ip, iproc, r%send_inv_hand(count_r))
                else if (r%char == 'r') then
                   call recv_init(r%buff_recv(count_r)%real_buffer, &
                        ip, ip, r%recv_hand(count_r))
                   call send_init(r%buff_recv(count_r)%real_buffer, &
                        ip, iproc, r%send_inv_hand(count_r))
                end if
             end if
          end do
       end if
    else
       ! Disable opt_redist_persist if we don't support the type
       opt_redist_persist = .false.
    end if

  end subroutine setup_persistent

  !> FIXME : Add documentation
  subroutine init_redist_gs2 (r, char, to_low, to_high, to_list, &
       from_low, from_high, from_list, ierr)

    use mp, only: iproc, nproc, proc0, mp_abort, sum_allreduce
    implicit none
    type (redist_type), intent (inout) :: r
    character(1), intent (in) :: char
    type (index_list_type), dimension (0:nproc-1), intent (in) :: to_list, from_list
    integer, dimension(:), intent (in) :: to_low, from_low, to_high, from_high
    integer, optional, intent (out) :: ierr
    integer :: ip, n_to, n_from, buff_size
    integer :: ndim_to, ndim_from, idim
    logical, parameter :: debug = .false.
    integer, dimension(2) :: nn_total

    ! Check consistency of inputs
    if (size(to_low) /= size(to_high)) then
       call mp_abort("Inconisistent sizes of to_low/to_high in init_redist_gs2.", .true.)
    end if
    if (size(from_low) /= size(from_high)) then
       call mp_abort("Inconisistent sizes of from_low/from_high in init_redist_gs2.", .true.)
    end if

    ndim_to = size(to_low)
    ndim_from = size(from_low)

    if (ndim_to > 6) then
       call mp_abort("Too many to_low values passed in init_redist_gs2. Maximum of 6 supported.", .true.)
    end if
    if (ndim_from > 6) then
       call mp_abort("Too many from_low values passed in init_redist_gs2. Maximum of 6 supported.", .true.)
    end if

    allocate (r%to(0:nproc-1), r%from(0:nproc-1))
    r%char=char

    if (present(ierr)) ierr = 0
    buff_size = 0

    ! Initialise limits
    r%from_low = 1
    r%to_low = 1
    r%from_high = 0
    r%to_high = 0

    ! Copy the passed in limits
    r%from_low(1:ndim_from) = from_low
    r%to_low(1:ndim_to) = to_low
    r%from_high(1:ndim_from) = from_high
    r%to_high(1:ndim_to) = to_high

    do ip = 0, nproc - 1
       if (allocated(to_list(ip)%first)) then
          n_to = size(to_list(ip)%first)
          r%to(ip)%nn = n_to
          if (ip /= iproc) buff_size = max(buff_size, n_to)

          idim = 0

          ! Copy indexing arrays
          call copy_indices_and_sanity_check(r%to(ip)%k, to_list(ip)%first, &
               idim, r%to_low, r%to_high)
          call copy_indices_and_sanity_check(r%to(ip)%l, to_list(ip)%second, &
               idim, r%to_low, r%to_high)
          call copy_indices_and_sanity_check(r%to(ip)%m, to_list(ip)%third, &
               idim, r%to_low, r%to_high)
          call copy_indices_and_sanity_check(r%to(ip)%n, to_list(ip)%fourth, &
               idim, r%to_low, r%to_high)
          call copy_indices_and_sanity_check(r%to(ip)%o, to_list(ip)%fifth, &
               idim, r%to_low, r%to_high)
          call copy_indices_and_sanity_check(r%to(ip)%p, to_list(ip)%sixth, &
               idim, r%to_low, r%to_high)

          if (idim /= ndim_to) then
             call mp_abort("Passed to_list has different number of&
                  & allocated members than passed to_low has&
                  & dimensions.", .true.)
          end if

       else
          r%to(ip)%nn = 0
       endif
    enddo

    do ip = 0, nproc - 1
       if (allocated(from_list(ip)%first)) then
          n_from = size(from_list(ip)%first)
          if (ip /= iproc) buff_size = max(buff_size, n_from)

          r%from(ip)%nn = n_from

          idim = 0

          ! Copy indexing arrays
          call copy_indices_and_sanity_check(r%from(ip)%k, from_list(ip)%first, &
               idim, r%from_low, r%from_high)
          call copy_indices_and_sanity_check(r%from(ip)%l, from_list(ip)%second, &
               idim, r%from_low, r%from_high)
          call copy_indices_and_sanity_check(r%from(ip)%m, from_list(ip)%third, &
               idim, r%from_low, r%from_high)
          call copy_indices_and_sanity_check(r%from(ip)%n, from_list(ip)%fourth, &
               idim, r%from_low, r%from_high)
          call copy_indices_and_sanity_check(r%from(ip)%o, from_list(ip)%fifth, &
               idim, r%from_low, r%from_high)
          call copy_indices_and_sanity_check(r%from(ip)%p, from_list(ip)%sixth, &
               idim, r%from_low, r%from_high)

          if (idim /= ndim_from) then
             call mp_abort("Passed from_list has different number of&
                  & allocated members than passed from_low has&
                  & dimensions.", .true.)
          end if
       else
          r%from(ip)%nn = 0
       endif
    enddo

    ! Minor sanity check for the local copy part of the redistribute
    if (r%to(iproc)%nn /= r%from(iproc)%nn) then
       call mp_abort('Inconsistent amount of data to copy locally.', .true.)
    end if

    ! Additional checks to aid with debugging
    if (debug) then
       ! Check total amount of data to be sent matches total amount to receive
       nn_total = 0
       nn_total(1) = sum(r%to%nn)
       nn_total(2) = sum(r%from%nn)
       call sum_allreduce(nn_total)
       if (nn_total(1) /= nn_total(2)) then
          call mp_abort('Total amount of data to be sent and received in disagreement.', .true.)
       end if
    end if

    if (buff_size > 0) then
       select case (r%char)
       case ('c')
          allocate (r%complex_buff(buff_size))
       case ('r')
          allocate (r%real_buff(buff_size))
       case ('i')
          allocate (r%integer_buff(buff_size))
       case ('l')
          allocate (r%logical_buff(buff_size))
       case default
          if (proc0) then
             write(*,*) 'Type to be redistributed invalid.  Must stop.'
             write(*,*) r%char
          endif
          call mp_abort('Type to be redistributed invalid.  Must stop.')
       end select
    end if

    !Setup buffers etc.
    call setup_persistent(r)

  contains

    subroutine copy_indices_and_sanity_check(destination, source, dim_counter, llims, ulims)
      implicit none
      integer, dimension(:), allocatable, intent(in out) :: destination
      integer, dimension(:), allocatable, intent(in) :: source
      integer, intent(in out) :: dim_counter
      integer, dimension(:), intent(in) :: llims, ulims

      if (.not. allocated(source)) return

      ! Update the dimension counter
      dim_counter = dim_counter + 1

      ! Copy into destination
      destination = source

      ! Sanity check values -- ensure indices are not outside the lower/upper
      ! limits for the array(s) we will be indexing
      if (any(destination < llims(dim_counter) .or. destination > ulims(dim_counter))) then
         call mp_abort("Passed indices out of bounds in init_redist_gs2", .true.)
      end if
    end subroutine copy_indices_and_sanity_check
  end subroutine init_redist_gs2

  !> FIXME : Add documentation
  subroutine init_redist_agk (r, char, to_low, to_high, to_list, &
       from_low, from_high, from_list, ierr)
    use mp, only: nproc
    implicit none
    type (redist_type), intent (inout) :: r
    character(1), intent (in) :: char
    type (index_list_type), dimension (0:nproc-1), intent (in) :: to_list, from_list
    integer, intent (in) :: to_low
    integer, dimension(:), intent (in) :: from_low, to_high, from_high
    integer, optional, intent (out) :: ierr
    integer, dimension(:), allocatable :: full_to_low
    allocate(full_to_low, mold = to_high)
    full_to_low = 1
    full_to_low(size(to_high)) = to_low
    call init_redist(r, char, full_to_low, to_high, to_list, &
         from_low, from_high, from_list, ierr)
  end subroutine init_redist_agk

  !> FIXME : Add documentation  
  subroutine init_fill (f, char, to_low, to_high, to_list, &
       from_low, from_high, from_list, ierr)
    use mp, only: nproc
    implicit none
    type (redist_type), intent (out) :: f
    character(1), intent (in) :: char
    type (index_list_type), dimension (0:nproc-1), intent (in) :: to_list, from_list
    integer, dimension(:), intent (in) :: to_low, from_low, to_high, from_high
    integer, optional, intent (out) :: ierr

    call init_redist(f, char, to_low, to_high, to_list, &
         from_low, from_high, from_list, ierr)
  end subroutine init_fill

  !> FIXME : Add documentation  
  subroutine delete_redist(r)
    use mp, only: free_request
    implicit none
    type (redist_type), intent (in out) :: r

    if (allocated(r%to)) deallocate (r%to)
    if (allocated(r%from)) deallocate (r%from)

    if (allocated (r%complex_buff)) deallocate (r%complex_buff)
    if (allocated (r%real_buff))    deallocate (r%real_buff)
    if (allocated (r%integer_buff)) deallocate (r%integer_buff)
    if (allocated (r%logical_buff)) deallocate (r%logical_buff)

    !Persistent related
    if(allocated(r%recv_hand))then
       call free_request(r%recv_hand)
       deallocate(r%recv_hand)
    endif
    if(allocated(r%send_hand))then
       call free_request(r%send_hand)
       deallocate(r%send_hand)
    endif
    if(allocated(r%recv_inv_hand))then
       call free_request(r%recv_inv_hand)
       deallocate(r%recv_inv_hand)
    endif
    if(allocated(r%send_inv_hand))then
       call free_request(r%send_inv_hand)
       deallocate(r%send_inv_hand)
    endif
    if(allocated(r%recv_ip)) deallocate(r%recv_ip)
    if(allocated(r%recv_inv_ip)) deallocate(r%recv_inv_ip)
    if(allocated(r%buff_send)) deallocate(r%buff_send)
    if(allocated(r%buff_recv)) deallocate(r%buff_recv)
  end subroutine delete_redist

  !> FIXME : Add documentation  
  subroutine delete_list(list)
    use mp, only: nproc
    implicit none
    type (index_list_type), dimension(0:nproc-1), intent (inout) :: list
    integer :: ip

    do ip = 0, nproc-1
       if(allocated(list(ip)%first)) deallocate(list(ip)%first)
       if(allocated(list(ip)%second)) deallocate(list(ip)%second)
       if(allocated(list(ip)%third)) deallocate(list(ip)%third)
       if(allocated(list(ip)%fourth)) deallocate(list(ip)%fourth)
       if(allocated(list(ip)%fifth)) deallocate(list(ip)%fifth)
       if(allocated(list(ip)%sixth)) deallocate(list(ip)%sixth)
    enddo
  end subroutine delete_list

  !> FIXME : Add documentation  
  subroutine c_redist_12 (r, from_here, to_here)
    use job_manage, only: time_message
    use mp, only: iproc, nproc, send, receive, get_mp_times
    implicit none
    type (redist_type), intent (in out) :: r
    complex, dimension (r%from_low(1):), intent (in) :: from_here
    complex, dimension (r%to_low(1):,r%to_low(2):), intent (in out) :: to_here
    real :: mp_total, mp_total_after
    integer :: i, idp, ipto, ipfrom, iadp

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total)
    end if

    ! redistribute from local processor to local processor
    do i = 1, r%from(iproc)%nn
       to_here(r%to(iproc)%k(i), r%to(iproc)%l(i)) &
            = from_here(r%from(iproc)%k(i))
    end do

    ! redistribute to idpth next processor from idpth preceding processor
    ! or redistribute from idpth preceding processor to idpth next processor
    ! to avoid deadlocks
    do idp = 1, nproc-1
       ipto = mod(iproc + idp, nproc)
       ipfrom = mod(iproc + nproc - idp, nproc)
       iadp = min(idp, nproc - idp)
       ! avoid deadlock AND ensure mostly parallel resolution
       if (mod(iproc/iadp,2) == 0) then

          ! send to idpth next processor
          if (r%from(ipto)%nn > 0) then
             do i = 1, r%from(ipto)%nn
                r%complex_buff(i) = from_here(r%from(ipto)%k(i)) 
             end do
             call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp)
          end if

          ! receive from idpth preceding processor
          if (r%to(ipfrom)%nn > 0) then
             call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%to(ipfrom)%nn
                to_here(r%to(ipfrom)%k(i), &
                        r%to(ipfrom)%l(i)) &
                        = r%complex_buff(i)
             end do
          end if
       else
          ! receive from idpth preceding processor
          if (r%to(ipfrom)%nn > 0) then
             call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%to(ipfrom)%nn
                to_here(r%to(ipfrom)%k(i), &
                        r%to(ipfrom)%l(i)) &
                        = r%complex_buff(i)
             end do
          end if

          ! send to idpth next processor
          if (r%from(ipto)%nn > 0) then
             do i = 1, r%from(ipto)%nn
                r%complex_buff(i) = from_here(r%from(ipto)%k(i))
             end do
             call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp)
          end if
       end if
    end do

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total_after)
       time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total)
    end if

  end subroutine c_redist_12

  !> FIXME : Add documentation  
  subroutine c_redist_22 (r, from_here, to_here)
    use job_manage, only: time_message
    use mp, only: get_mp_times
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):), intent (in) :: from_here

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in out) :: to_here
    real :: mp_total, mp_total_after
    
    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total)
    end if

    !If overlapping then start comms now 
    if(opt_redist_persist_overlap) call c_redist_22_mpi_copy_persist_start(r, from_here)

    ! redistribute from local processor to local processor
    ! The flag opt_local_copy is set by the user in the input 
    ! file to specify whether optimised local copy routines are used.
    ! These c_redist_22_new_copy routine is the new local copy 
    ! functionality where indirect addressing has largely been removed.
    ! c_redist_22_old_copy is the original local copy code.
    if(opt_local_copy .and. (r%redistname .eq. 'x2y')) then
       ! c_redist_22_new_copy is the new local copy functionality where 
       ! indirect addressing has largely been removed
       call c_redist_22_new_copy(r, from_here, to_here)
    else
       ! c_redist_22_old_copy is the original local copy functionality
       call c_redist_22_old_copy(r, from_here, to_here)
    end if
    
    ! c_redist_22_mpi_copy contains all the remote to local 
    ! copy functionality
    !<DD> Pick from non-blocking and blocking approach
    if(opt_redist_nbk)then
       if(opt_redist_persist)then
          if(.not.opt_redist_persist_overlap) call c_redist_22_mpi_copy_persist_start(r, from_here)
          call c_redist_22_mpi_copy_persist_end(r, to_here)
       else
          call c_redist_22_mpi_copy_nonblock(r, from_here, to_here)
       endif
    else
       call c_redist_22_mpi_copy(r, from_here, to_here)
    endif

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total_after)
       time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total)
    end if

  end subroutine c_redist_22

  !> FIXME : Add documentation  
  subroutine c_redist_22_old_copy (r, from_here, to_here)
    use mp, only: iproc
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):), intent (in) :: from_here

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in out) :: to_here

    integer :: i, nn

!CMR 
! In the GS2 standard FFT situation this routine maps 
!         xxf(it,ixxf) to yxf(ik,iyxf) data type 
!         where it is kx (or x) index, ik is ky (or y) index, 
!         ixxf is (y,ig,isgn,"les") and iyxf is (x,ig,isgn,"les") 
!
    nn = r%from(iproc)%nn

    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP PRIVATE(i) &
    !$OMP SHARED(nn, to_here, from_here, iproc, r) &
    !$OMP SCHEDULE(static)
    do i = 1, nn
!
! redistribute from local processor to local processor
! NB r%from(iproc)%nn is #elements sent by THIS processor to THIS processor
!    In this situation the data at (r%to(iproc)%k(i),r%to(iproc)%l(i)) 
!    should come from (r%from(iproc)%k(i), r%from(iproc)%l(i)). 
!
! This do loop, in GS2 standard FFT situation, corresponds to:
!    to_here(ik,iyxf)=from_here(it,ixxf)
!
       to_here(r%to(iproc)%k(i), &
               r%to(iproc)%l(i)) &
               = from_here(r%from(iproc)%k(i), &
                           r%from(iproc)%l(i))
    end do
    !$OMP END PARALLEL DO
  end subroutine c_redist_22_old_copy

  !> FIXME : Add documentation  
  subroutine c_redist_22_new_copy (r, from_here, to_here)
!=====================================================================
!AJ, June 2011: New code from DCSE project on GS2 Indirect Addressing
!=====================================================================
!
! AJ, June 2011:
!  Modified LOCAL COPY part of c_redist_22 routine, as used by GS2 to 
!  transform xxf data type to yxf data type.
!  Here we REDUCE indirect addressing and cache load by EXPLOITING 
!  understanding of xxf and yxf data types in GS2.
! 
    use mp, only: iproc
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):), intent (in) :: from_here

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in out) :: to_here

    integer :: i,ik,it,itmin,itmax,it_nlocal,ixxf,iyxf

    i = 1
!AJ Loop over all local copies from THIS proc (iproc) to THIS proc
    do while (i .le. r%from(iproc)%nn)
!AJ Initialise the inner loop indices from the existing indirect addresses
       itmin = r%from(iproc)%k(i)
       ixxf = r%from(iproc)%l(i)
       ik = r%to(iproc)%k(i)
       iyxf = r%to(iproc)%l(i)
!AJ it_nlocal is max #it-indices that can be accommodated on iproc
       it_nlocal = (yxf_lo_ulim_proc+1) - iyxf
!AJ itmax selects either the it_nlocal computed above or the maximum
!AJ yxf_lo%nx space available to this process.
       itmax = min((itmin-1)+it_nlocal,nx)
       do it = itmin,itmax
          to_here(ik,iyxf) = from_here(it,ixxf)
          iyxf = iyxf + 1
          i = i + 1
       end do
    end do

  end subroutine c_redist_22_new_copy

  !> FIXME : Add documentation  
  subroutine c_redist_22_mpi_copy (r, from_here, to_here)
    use mp, only: iproc, nproc, send, receive
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):), intent (in) :: from_here

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in out) :: to_here

    integer :: i, idp, ipto, ipfrom, iadp

    ! redistribute to idpth next processor from idpth preceding processor
    ! or redistribute from idpth preceding processor to idpth next processor
    ! to avoid deadlocks
    do idp = 1, nproc-1
       ipto = mod(iproc + idp, nproc)
       ipfrom = mod(iproc + nproc - idp, nproc)
       iadp = min(idp, nproc - idp)
       ! avoid deadlock AND ensure mostly parallel resolution
       if (mod(iproc/iadp,2) == 0) then

          ! send to idpth next processor
          if (r%from(ipto)%nn > 0) then
             do i = 1, r%from(ipto)%nn
                r%complex_buff(i) = from_here(r%from(ipto)%k(i), &
                                              r%from(ipto)%l(i))
             end do
             call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp)
          end if

          ! receive from idpth preceding processor
          if (r%to(ipfrom)%nn > 0) then
             call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%to(ipfrom)%nn
                to_here(r%to(ipfrom)%k(i), &
                        r%to(ipfrom)%l(i)) &
                        = r%complex_buff(i)
             end do
          end if
       else
          ! receive from idpth preceding processor
          if (r%to(ipfrom)%nn > 0) then
             call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%to(ipfrom)%nn
                to_here(r%to(ipfrom)%k(i), &
                        r%to(ipfrom)%l(i)) &
                        = r%complex_buff(i)
             end do
          end if

          ! send to idpth next processor
          if (r%from(ipto)%nn > 0) then
             do i = 1, r%from(ipto)%nn
                r%complex_buff(i) = from_here(r%from(ipto)%k(i), &
                                              r%from(ipto)%l(i))
             end do
             call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp)
          end if
       end if
    end do

  end subroutine c_redist_22_mpi_copy

  !> This routine does redistributes using entirely non-blocking
  !> comms.
  !>
  !> @note This routine may use more memory than blocking approach
  !> --> Can perform slower than blocking routine when redistribute
  !> is particularly large, else typically gives better performance
  subroutine c_redist_22_mpi_copy_nonblock(r, from_here, to_here)
    use mp, only: iproc, send, receive
    use mp, only: barrier,waitall,nbsend,nbrecv
    implicit none
    type (redist_type), intent (in out) :: r
    complex, dimension ( &
         r%from_low(1):, &
         r%from_low(2):  &
         ), intent (in) :: from_here
    complex, dimension ( &
         r%to_low(1):,   &
         r%to_low(2):    &
         ), intent (in out) :: to_here
    complex, dimension(:, :), allocatable :: r_buff, s_buff
    integer, dimension(:), allocatable :: recvhand, sendhand
    integer :: i, ip, nn, nrecv, nsend, j

    !Work out how big recv buffer has to be and allocate it
    nrecv = r%nrecv
    if ( nrecv > 0) then
       allocate( r_buff(size(r%complex_buff), nrecv))
       allocate( recvhand(nrecv))
    end if

    !Now work out how big send buffer has to be, allocate it and fill it
    nsend = r%nsend
    if (nsend > 0) then
       allocate( s_buff(size(r%complex_buff),nsend))
       allocate( sendhand(nsend))
       !$OMP PARALLEL DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn, j) &
       !$OMP SHARED(nsend, r, s_buff, from_here, sendhand, iproc)
       do i = 1, nsend
          ip = r%recv_inv_ip(i)
          nn = r%from(ip)%nn
          !Any benefit in making this a pointer assignment instead?
          !Avoids copying data here but could lead to questionable
          !memory access during the actual send?

          !$OMP DO &
          !$OMP SCHEDULE(static)
          do j = 1, nn
             s_buff(j, i) = from_here( &
                  r%from(ip)%k(j),     &
                  r%from(ip)%l(j)      &
                  )
          end do
          !$OMP END DO

          !$OMP MASTER
          call nbsend(s_buff(1:nn, i), nn, ip, iproc, sendhand(i))
          !$OMP END MASTER
       end do
       !$OMP END PARALLEL
    end if

    !Post receives
    !Probably not much advantage to doing these ASAP as not going to do anything
    !until data is on its way --> Could post sends first so chance of data being
    !available as soon as receive posted.
    if (nrecv > 0) then
       ! Loops of this form require us to be able to call MPI routines
       ! from any thread. Given there is little work in this loop
       ! we may consider removing OpenMP pragmas if it allows us to
       ! drop to MPI_THREAD_FUNNELED

       !$OMP PARALLEL DO DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn) &
       !$OMP SHARED(nrecv, r, r_buff, recvhand) &
       !$OMP SCHEDULE(static)
       do i = 1, nrecv
          ip = r%recv_ip(i)
          nn = r%to(ip)%nn
          call nbrecv(r_buff(1:nn, i), nn, ip, ip, recvhand(i))
       end do
       !$OMP END PARALLEL DO
    end if

    if (nrecv > 0) then
       !Now we don't have anything to do until all the data has arrived (so should we post sends first?)
       !We could do a wait all but may be better to keep on checking each message seperately so that when
       !it arrives we can copy out whilst waiting for others to arrive. 
       !Start with a wait all
       call waitall(nrecv, recvhand)

       !Now unpack data.
       ! We choose to distribute the inner loop, unpacking data, rather
       ! than the outer loop (distributing messages) as we expect
       ! the message size nn to be rather variable.
       !$OMP PARALLEL DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn, j) &
       !$OMP SHARED(nrecv, to_here, r, r_buff)
       do i = 1, nrecv
          !Which processor is this data from? Would we be better looping over all procs and just checking
          !if that's one to unpack? Removes memory lookup of array but requires branching and incrementing
          !a counter to work out the equivalent of i here (i.e. which receive we're looking at)         
          ip = r%recv_ip(i)
          nn = r%to(ip)%nn
          !Could this be vectorised and is it useful to do so?
          !Could we make r_buff a pointer array so that MPI put data directly into
          !to_here array? Would this be beneficial? Would need to be done before RECVs posted

          !$OMP DO &
          !$OMP SCHEDULE(static)
          do j = 1, nn
             to_here(            &
                  r%to(ip)%k(j), &
                  r%to(ip)%l(j)  &
                  ) = r_buff(j, i)
          enddo
          !$OMP END DO NOWAIT
       enddo
       !$OMP END PARALLEL

       deallocate(r_buff)
       deallocate(recvhand)
    endif

    if (nsend > 0) then
       call waitall(nsend, sendhand)
       deallocate(s_buff)
       deallocate(sendhand)
    endif

  end subroutine c_redist_22_mpi_copy_nonblock

  !> Launch persistent communications associated with redistribute.
  !> Packs send buffer.
  subroutine c_redist_22_mpi_copy_persist_start(r, from_here)
    use mp, only: start_comm
    implicit none
    type (redist_type), intent (in out) :: r
    complex, dimension ( &
         r%from_low(1):, &
         r%from_low(2):  &
         ), intent (in) :: from_here
    integer :: i, ip, nn, nrecv, nsend, j

    !Start recv comms
    nrecv = r%nrecv
    if (nrecv > 0) then
       call start_comm(r%recv_hand)
    end if

    !Pack buffer and start communication
    nsend = r%nsend
    if (nsend > 0) then
       !$OMP PARALLEL DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn, j) &
       !$OMP SHARED(nsend, r, from_here)
       do i = 1, nsend
          ip = r%recv_inv_ip(i)
          nn = r%from(ip)%nn
          !Any benefit in making this a pointer assignment instead?
          !Avoids copying data here but could lead to questionable
          !memory access during the actual send?
          !$OMP DO &
          !$OMP SCHEDULE(static)
          do j = 1, nn
             r%buff_send(i)%complex_buffer(j) = from_here( &
                  r%from(ip)%k(j),           &
                  r%from(ip)%l(j)            &
                  )
          end do
          !$OMP END DO

          !$OMP MASTER
          call start_comm(r%send_hand(i))
          !$OMP END MASTER
       end do
       !$OMP END PARALLEL
    end if
  end subroutine c_redist_22_mpi_copy_persist_start

  !> Finish persistent communications associated with redistribute.
  !> Unpacks receive buffer.
  subroutine c_redist_22_mpi_copy_persist_end(r, to_here)
    use mp, only: waitall
    implicit none
    type (redist_type), intent (in out) :: r
    complex, dimension ( &
         r%to_low(1):,   &
         r%to_low(2):    &
         ), intent (in out) :: to_here
    integer :: i, ip, nn, nrecv, nsend, j

    !Wait for recv communications to complete and unpack buffer
    nrecv = r%nrecv
    if (nrecv > 0) then
       !Now we don't have anything to do until all the data has arrived (so should we post sends first?)
       !We could do a wait all but may be better to keep on checking each message seperately so that when
       !it arrives we can copy out whilst waiting for others to arrive. 
       !Start with a wait all
       call waitall(nrecv, r%recv_hand)

       !Now unpack data
       !$OMP PARALLEL DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn, j) &
       !$OMP SHARED(nrecv, to_here, r)
       do i = 1, nrecv
          !Which processor is this data from?
          ip = r%recv_ip(i)
          nn = r%to(ip)%nn
          !$OMP DO &
          !$OMP SCHEDULE(static)
          do j = 1, nn
             to_here(            &
                  r%to(ip)%k(j), &
                  r%to(ip)%l(j)  &
                  ) = r%buff_recv(i)%complex_buffer(j)
          enddo
          !$OMP END DO NOWAIT
       enddo
       !$OMP END PARALLEL
    endif

    !Wait for all sends to complete
    nsend = r%nsend
    if (nsend > 0) then
       call waitall(nsend, r%send_hand)
    endif
  end subroutine c_redist_22_mpi_copy_persist_end

  !> FIXME : Add documentation  
  subroutine c_redist_22_inv (r, from_here, to_here)
    use job_manage, only: time_message
    use mp, only: get_mp_times
    type (redist_type), intent (in out) :: r

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in) :: from_here

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):), intent (in out) :: to_here
    real :: mp_total, mp_total_after

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total)
    end if

    !If overlapping then start comms now
    if(opt_redist_persist_overlap) call c_redist_22_inv_mpi_copy_persist_start(r, from_here)
    ! redistribute from local processor to local processor
    if(opt_local_copy .and. (r%redistname .eq. 'x2y')) then
       ! c_redist_22_inv_new_copy is the new local copy functionality where 
       ! indirect addressing has largely been removed
       call c_redist_22_inv_new_copy(r, from_here, to_here)
    else
       ! c_redist_22_inv_old_copy is the original local copy functionality
       call c_redist_22_inv_old_copy(r, from_here, to_here)
    end if

    ! c_redist_22_inv_mpi_copy contains all the remote to local 
    ! copy functionality
    !<DD>
    if(opt_redist_nbk)then
       if(opt_redist_persist)then
          if(.not.opt_redist_persist_overlap) call c_redist_22_inv_mpi_copy_persist_start(r, from_here)
          call c_redist_22_inv_mpi_copy_persist_end(r, to_here)
       else
          call c_redist_22_inv_mpi_copy_nonblock(r, from_here, to_here)
       endif
    else
       call c_redist_22_inv_mpi_copy(r, from_here, to_here)
    endif

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total_after)
       time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total)
    end if

  end subroutine c_redist_22_inv

  !> FIXME : Add documentation  
  subroutine c_redist_22_inv_old_copy (r, from_here, to_here)
    use mp, only: iproc
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in) :: from_here

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):), intent (in out) :: to_here

    integer :: i, nn

!CMR 
! In the GS2 standard FFT situation this routine maps 
!         yxf(ik,iyxf) to xxf(it,ixxf) data type 
!         where it is kx (or x) index, ik is ky (or y) index, 
!         ixxf is (y,ig,isgn,"les") and iyxf is (x,ig,isgn,"les") 
!

    nn = r%to(iproc)%nn

    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP SHARED(nn, to_here, from_here, r, iproc) &
    !$OMP SCHEDULE(static)
    do i = 1, nn
!
! redistribute from local processor to local processor
! NB r%from(iproc)%nn is #elements sent by THIS processor to THIS processor
!    In this situation the data at (r%from(iproc)%k(i),r%from(iproc)%l(i)) 
!    should come from (r%to(iproc)%k(i), r%to(iproc)%l(i)). 
!
! This do loop, in GS2 standard FFT situation, corresponds to:
!    to_here(it,ixxf)=from_here(ik,iyxf)
!
       to_here(r%from(iproc)%k(i), &
               r%from(iproc)%l(i)) &
               = from_here(r%to(iproc)%k(i), &
                           r%to(iproc)%l(i))
    end do
    !$OMP END PARALLEL DO
  end subroutine c_redist_22_inv_old_copy

  !> FIXME : Add documentation
  subroutine c_redist_22_inv_new_copy (r, from_here, to_here)
!=====================================================================
!AJ, June 2011: New code from DCSE project on GS2 Indirect Addressing
!=====================================================================
!
! AJ, June 2011:
!  Modified LOCAL COPY part of c_redist_22_inv routine, used by GS2 to 
!  transform yxf data type back to xxf data type.
!  Here we REDUCE indirect addressing and cache load by EXPLOITING 
!  understanding of xxf and yxf data types in GS2.
! 
    use mp, only: iproc
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in) :: from_here

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):), intent (in out) :: to_here

    integer :: i,ik,it,itmin,itmax,it_nlocal,ixxf,iyxf

    i = 1
!AJ Loop over all local copies from THIS proc (iproc) to THIS proc
    do while (i .le. r%to(iproc)%nn)
!AJ Initialise the inner loop indices from the existing indirect addresses
       itmin = r%from(iproc)%k(i)
       ixxf = r%from(iproc)%l(i)
       ik = r%to(iproc)%k(i)
       iyxf = r%to(iproc)%l(i)
!AJ it_nlocal is max #it-indices that can be accommodated on iproc
       it_nlocal = (yxf_lo_ulim_proc+1) - iyxf
!AJ itmax selects either the it_nlocal computed above or the maximum
!AJ yxf_lo%nx space available to this process.
       itmax = min((itmin-1)+it_nlocal,nx)
       do it = itmin,itmax
          to_here(it,ixxf) = from_here(ik,iyxf)
          iyxf = iyxf + 1
          i = i + 1
       end do
    end do

  end subroutine c_redist_22_inv_new_copy

  !> FIXME : Add documentation
  subroutine c_redist_22_inv_mpi_copy (r, from_here, to_here)
    use mp, only: iproc, nproc, send, receive
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in) :: from_here

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):), intent (in out) :: to_here


    integer :: i, idp, ipto, ipfrom, iadp

    ! redistribute to idpth next processor from idpth preceding processor
    ! or redistribute from idpth preceding processor to idpth next processor
    ! to avoid deadlocks
    do idp = 1, nproc-1
       ipto = mod(iproc + idp, nproc)
       ipfrom = mod(iproc + nproc - idp, nproc)
       iadp = min(idp, nproc - idp)
       ! avoid deadlock AND ensure mostly parallel resolution
       if (mod(iproc/iadp,2) == 0) then

          ! send to idpth next processor
          if (r%to(ipto)%nn > 0) then
             do i = 1, r%to(ipto)%nn
                r%complex_buff(i) = from_here(r%to(ipto)%k(i), &
                                              r%to(ipto)%l(i))
             end do
             call send (r%complex_buff(1:r%to(ipto)%nn), ipto, idp)
          end if

          ! receive from idpth preceding processor
          if (r%from(ipfrom)%nn > 0) then
             call receive (r%complex_buff(1:r%from(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%from(ipfrom)%nn
                to_here(r%from(ipfrom)%k(i), &
                        r%from(ipfrom)%l(i)) &
                        = r%complex_buff(i)
             end do
          end if
       else
          ! receive from idpth preceding processor
          if (r%from(ipfrom)%nn > 0) then
             call receive (r%complex_buff(1:r%from(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%from(ipfrom)%nn
                to_here(r%from(ipfrom)%k(i), &
                        r%from(ipfrom)%l(i)) &
                        = r%complex_buff(i)
             end do
          end if

          ! send to idpth next processor
          if (r%to(ipto)%nn > 0) then
             do i = 1, r%to(ipto)%nn
                r%complex_buff(i) = from_here(r%to(ipto)%k(i), &
                                              r%to(ipto)%l(i))
             end do
             call send (r%complex_buff(1:r%to(ipto)%nn), ipto, idp)
          end if

       end if
    end do

  end subroutine c_redist_22_inv_mpi_copy

  !> This routine does redistributes using entirely non-blocking
  !> comms.
  !>
  !> @note This routine may use more memory than blocking approach
  !> --> Can perform slower than blocking routine when redistribute
  !> is particularly large, else typically gives better performance
  subroutine c_redist_22_inv_mpi_copy_nonblock(r, from_here, to_here)
    use mp, only: iproc, send, receive
    use mp, only: barrier,waitall,nbsend,nbrecv
    implicit none
    type (redist_type), intent (in out) :: r
    complex, dimension ( &
         r%from_low(1):, &
         r%from_low(2):  &
         ), intent (in out) :: to_here
    complex, dimension (&
         r%to_low(1):,  &
         r%to_low(2):   &
         ), intent (in) :: from_here
    complex, dimension(:, :), allocatable :: r_buff, s_buff
    integer, dimension(:), allocatable :: recvhand, sendhand
    integer :: i, ip, nn, nrecv, nsend, j

    !Work out how big recv buffer has to be and allocate it
    ! Note that we use redistribute type's nsend (not nrecv) because we are
    ! doing the inverse transform.
    nrecv = r%nsend
    if (nrecv > 0) then
       allocate( r_buff(size(r%complex_buff), nrecv))
       allocate( recvhand(nrecv))
    end if

    !Now work out how big send buffer has to be, allocate it and fill it
    ! Note that we use redistribute type's nrecv (not nsend) because we are
    ! doing the inverse transform.
    nsend = r%nrecv
    if (nsend > 0) then
       allocate( s_buff(size(r%complex_buff), nsend))
       allocate( sendhand(nsend))
       !$OMP PARALLEL DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn, j) &
       !$OMP SHARED(nsend, s_buff, from_here, r, iproc, sendhand)
       do i = 1, nsend
          ip = r%recv_ip(i)
          nn = r%to(ip)%nn
          !Any benefit in making this a pointer assignment instead?
          !Avoids copying data here but could lead to questionable
          !memory access during the actual send?
          !$OMP DO &
          !$OMP SCHEDULE(static)
          do j = 1, nn
             s_buff(j, i) = from_here( &
                  r%to(ip)%k(j),       &
                  r%to(ip)%l(j)        &
                  )
          end do
          !$OMP END DO

          !$OMP MASTER
          call nbsend(s_buff(1:nn, i), nn, ip, iproc, sendhand(i))
          !$OMP END MASTER
       end do
       !$OMP END PARALLEL
    end if

    !Post receives
    if (nrecv > 0) then
       !$OMP PARALLEL DO DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn) &
       !$OMP SHARED(nrecv, r, r_buff, recvhand) &
       !$OMP SCHEDULE(static)
       do i = 1, nrecv
          ip = r%recv_inv_ip(i)
          nn = r%from(ip)%nn
          call nbrecv(r_buff(1:nn, i), nn, ip, ip, recvhand(i))
       end do
       !$OMP END PARALLEL DO
    end if

    if (nrecv > 0) then
       !Now we don't have anything to do until all the data has arrived (so should we post sends first?)
       !We could do a wait all but may be better to keep on checking each message seperately so that when
       !it arrives we can copy out whilst waiting for others to arrive. 
       !Start with a wait all
       call waitall(nrecv, recvhand)

       !Now unpack data
       !$OMP PARALLEL DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn, j) &
       !$OMP SHARED(nrecv, r, to_here, r_buff)
       do i = 1, nrecv
          !Which processor is this data from
          ip = r%recv_inv_ip(i)
          nn = r%from(ip)%nn
          !$OMP DO &
          !$OMP SCHEDULE(static)
          do j = 1, nn
             to_here(              &
                  r%from(ip)%k(j), &
                  r%from(ip)%l(j)  &
                  ) = r_buff(j, i)
          enddo
          !$OMP END DO NOWAIT
       enddo
       !$OMP END PARALLEL

       deallocate(r_buff)
       deallocate(recvhand)
    endif

    if (nsend > 0) then
       call waitall(nsend, sendhand)
       deallocate(s_buff)
       deallocate(sendhand)
    endif

  end subroutine c_redist_22_inv_mpi_copy_nonblock

  !> Launch persistent communications associated with redistribute.
  !> Packs send buffer.
  subroutine c_redist_22_inv_mpi_copy_persist_start(r, from_here)
    use mp, only: start_comm
    implicit none
    type (redist_type), intent (in out) :: r
    complex, dimension ( &
         r%to_low(1):,   &
         r%to_low(2):    &
         ), intent (in) :: from_here
    integer :: i, ip, nn, nrecv, nsend, j
    !For inverse transform SEND actually refers to received data and
    !RECV refers to data to be sent

    !Post receives
    nrecv = r%nsend
    if (nrecv > 0) then
       call start_comm(r%recv_inv_hand)
    endif

    !Pack send buffer and start sends
    nsend = r%nrecv
    if (nsend > 0) then
       !$OMP PARALLEL DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn, j) &
       !$OMP SHARED(nsend, r, from_here)
       do i = 1, nsend
          ip = r%recv_ip(i)
          nn = r%to(ip)%nn
          !Any benefit in making this a pointer assignment instead?
          !Avoids copying data here but could lead to questionable
          !memory access during the actual send?
          !$OMP DO &
          !$OMP SCHEDULE(static)
          do j = 1, nn
             r%buff_recv(i)%complex_buffer(j) = from_here( &
                  r%to(ip)%k(j),             &
                  r%to(ip)%l(j)              &
                  )
          end do
          !$OMP END DO

          !$OMP MASTER
          call start_comm(r%send_inv_hand(i))
          !$OMP END MASTER
       end do
       !$OMP END PARALLEL
    end if
  end subroutine c_redist_22_inv_mpi_copy_persist_start

  !> Finish persistent communications associated with redistribute.
  !> Unpacks receive buffer. 
  subroutine c_redist_22_inv_mpi_copy_persist_end(r, to_here)
    use mp, only: waitall
    implicit none
    type (redist_type), intent (in out) :: r
    complex, dimension ( &
         r%from_low(1):, &
         r%from_low(2):  &
         ), intent (in out) :: to_here
    integer :: i, ip, nn, nrecv, nsend, j
    !For inverse transform SEND actually refers to received data and
    !RECV refers to data to be sent

    !Wait for receives to complete and unpack data
    nrecv = r%nsend
    if (nrecv > 0) then
       call waitall(nrecv, r%recv_inv_hand)

       !Now unpack data
       !$OMP PARALLEL DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn, j) &
       !$OMP SHARED(nrecv, to_here, r)
       do i = 1, nrecv
          !Which processor is this data from
          ip = r%recv_inv_ip(i)
          nn = r%from(ip)%nn
          !$OMP DO &
          !$OMP SCHEDULE(static)
          do j = 1, nn
             to_here(              &
                  r%from(ip)%k(j), &
                  r%from(ip)%l(j)  &
                  ) = r%buff_send(i)%complex_buffer(j)
          end do
          !$OMP END DO NOWAIT
       end do
       !$OMP END PARALLEL
    end if

    !Wait for sends to complete
    nsend = r%nrecv
    if (nsend > 0) then
       call waitall(nsend, r%send_inv_hand)
    end if
  end subroutine c_redist_22_inv_mpi_copy_persist_end

  !> FIXME : Add documentation  
  subroutine c_redist_32 (r, from_here, to_here)
    use job_manage, only: time_message
    use mp, only: get_mp_times
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):, &
                        r%from_low(3):), intent (in) :: from_here

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in out) :: to_here

    real :: time_optimised_loop_1(2), time_optimised_loop_2(2)
    real :: mp_total, mp_total_after
    integer :: i
    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total)
    end if

    !If overlapping then start comms now
    if(opt_redist_persist_overlap) call c_redist_32_mpi_copy_persist_start(r, from_here)

!AJ redistribute from local processor to local processor
!AJ The flag opt_local_copy is set by the user in the input 
!AJ file to specify whether optimised local copy routines are used.
!AJ These c_redist_32_*_copy routines are the new local copy 
!AJ functionality where indirect addressing has largely been removed
!AJ There are two different versions of the optimised local copy 
!AJ functionality which are selected at runtime (providing 
!AJ opt_local_copy is true) through a performance measurement process 
!AJ documented below.
    if(opt_local_copy .and. (r%redistname .eq.  'g2x')) then
!AJ Because there are two different optimised local copy routines 
!AJ which provide benefits for different process counts and use cases 
!AJ we use an auto-tuning method to select which routine to use.  This 
!AJ works by timing both of the optimised routines on the first run of 
!AJ this functionality and then using the quickest for the rests of the 
!AJ times this routine is called.  The optimised_choice varible is used 
!AJ to record the choice of routine (it is a SAVE variable so will maintain 
!AJ a value between calls to the routine), it is initialise to 0 when the 
!AJ code first runs.  We run each routine 4 times, only timing the last 
!AJ three runs to avoid any potential initialisation penalties.  We time 
!AJ three rather than one to deal ensure that we collect enough timing 
!AJ data.
       if(r%optimised_choice .eq. 0) then
          time_optimised_loop_1 = 0.
          time_optimised_loop_2 = 0.
          call c_redist_32_new_copy(r, from_here, to_here)
          call time_message(.false.,time_optimised_loop_1,' Optimised Loop 1')
          do i= 1,3 
             call c_redist_32_new_copy(r, from_here, to_here)
          end do
          call time_message(.false.,time_optimised_loop_1,' Optimised Loop 1')

          call c_redist_32_new_opt_copy(r, from_here, to_here)
          call time_message(.false.,time_optimised_loop_2,' Optimised Loop 2')
          do i = 1,3
             call c_redist_32_new_opt_copy(r, from_here, to_here)
          end do
          call time_message(.false.,time_optimised_loop_2,' Optimised Loop 2')
          
          if(time_optimised_loop_1(1) .gt. time_optimised_loop_2(1)) then
             r%optimised_choice = 2
          else
             r%optimised_choice = 1
          end if

!AJ This else is encountered once the optimised auto-tuning choice has 
!AJ been calculated above.
       else

          if(r%optimised_choice .eq. 1) then
             call c_redist_32_new_copy(r, from_here, to_here)
          else
             call c_redist_32_new_opt_copy(r, from_here, to_here)
          end if

       end if

!AJ This else is encountered when the optimised local copy functionality is 
!AJ not enabled.
    else
!AJ c_redist_32_old_copy is the original local copy functionality
       call c_redist_32_old_copy(r, from_here, to_here)
    end if
    
!AJ c_redist_32_mpi_copy contains all the remote to local 
!AJ copy functionality
    !<DD>
    if(opt_redist_nbk)then
       if(opt_redist_persist) then
          if(.not.opt_redist_persist_overlap) call c_redist_32_mpi_copy_persist_start(r, from_here) !DD
          call c_redist_32_mpi_copy_persist_end(r, to_here) !DD
       else
          call c_redist_32_mpi_copy_nonblock(r, from_here, to_here) !DD
       endif
    else
       call c_redist_32_mpi_copy(r, from_here, to_here)
    endif

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total_after)
       time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total)
    end if

  end subroutine c_redist_32

  !> FIXME : Add documentation
  subroutine c_redist_32_old_copy(r, from_here, to_here)
    use mp, only: iproc
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):, &
                        r%from_low(3):), intent (in) :: from_here

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in out) :: to_here

    integer :: i, nn

!CMR 
! In the GS2 standard FFT situation this routine maps 
!         g(ig, isgn, iglo) to xxf(it,ixxf) data type 
!         where it is kx (or x) index, ixxf is (y,ig,isgn,"les") 
!         and iglo is ("xyles")

    nn = r%from(iproc)%nn

    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP SHARED(nn, to_here, r, iproc, from_here) &
    !$OMP SCHEDULE(static)
    do i = 1, nn
!
! redistribute from local processor to local processor
! NB r%from(iproc)%nn is #elements sent by THIS processor to THIS processor
!    In this situation the data at (r%to(iproc)%k(i),r%to(iproc)%l(i)) 
!    should come from (r%from(iproc)%k(i),r%from(iproc)%l(i),r%from(iproc)%m(i)). 
!
! This do loop, in GS2 standard FFT situation, corresponds to:
!    to_here(it,ixxf)=from_here(ig,isgn,iglo)
!
       to_here(r%to(iproc)%k(i),&
               r%to(iproc)%l(i)) &
               = from_here(r%from(iproc)%k(i), &
                           r%from(iproc)%l(i), &
                           r%from(iproc)%m(i))
    end do
    !$OMP END PARALLEL DO
  end subroutine c_redist_32_old_copy

  !> FIXME : Add documentation
  subroutine c_redist_32_new_copy(r, from_here, to_here)
!=====================================================================
!AJ, June 2011: New code from DCSE project on GS2 Indirect Addressing
!=====================================================================
!
! AJ, June 2011:
!  Modified LOCAL COPY part of c_redist_32 routine, as used by GS2 to 
!  transform g data type to xxf data type.
!  Here we REDUCE indirect addressing and cache load by EXPLOITING 
!  understanding of g and xxf data types in GS2.
!
    use mp, only: iproc
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):, &
                        r%from_low(3):), intent (in) :: from_here

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in out) :: to_here

    integer :: i,k,t2,t1,f3,f2,f1,fhigh,thigh,f2max
    real :: nakyrecip

    i = 1
!AJ We want to be able to divide by naky with floating point 
!AJ arithmetic so we need to convert naky to a float (hence
!AJ the need for nakyrecip).  Also, to remove the necessity 
!AJ to have a division in the body of the loop below we are 
!AJ calculating the reciprocal of naky so that instead of 
!AJ dividing by naky we can multiply  by 1/naky.
    nakyrecip = naky
    nakyrecip = 1/nakyrecip
    f2max = r%from_high(2)
!AJ Loop over all local copies from THIS proc (iproc) to THIS proc
    do while(i .le. r%from(iproc)%nn)
!AJ Initialise the inner loop indices.  As the from_here array has 
!AJ 3 indices (c_redist_32 goes from 3 indices to 2 indices in the
!AJ copy) we need two inner loops to be able to move through all 
!AJ the to_here and from_here indices.
!AJ t1 (which equates to the it index) and f3 (which equates to iglo)
!AJ do not change for the iterations of the inner loops.  f2 (isgn) 
!AJ can only be 1 or 2 so the first inner loop is restricted to at 
!AJ most 2 iterations.
       f2 = r%from(iproc)%l(i)
       f3 = r%from(iproc)%m(i)
       t1 = r%to(iproc)%k(i)
!AJ Ensure that isgn (f2) is either 1 or 2.
       do while (f2 .le. f2max)
!AJ Get initial value of f1 (which equates to ig) and t2 (which equates
!AJ to ixxf).
          f1 = r%from(iproc)%k(i)
          t2 = r%to(iproc)%l(i)
!AJ Work out the maximum value ixxf (t2) can have by calculating the range 
!AJ of ixxf that this process owns.  We step through t2 by naky each iteration
!AJ so the line below calculates thigh as the number of ixxf steps for these 
!AJ inner loops of the operation owned by this process.
          thigh = ceiling(((xxf_lo_ulim_proc+1) - t2)*nakyrecip)
!AJ from_here and to_here are calculated for c_redist_32 using ig, isgn, 
!AJ and iglo.  The calculation of the maximum number if ixxf steps above can, 
!AJ therefore, be used to also calculate the maximum number of ig setsp.  THis 
!AJ is what the following line does.
          thigh = thigh + (f1-1)
!AJ Finally check the actual number of inner loop steps by ensuring that the 
!AJ computed maximum bound of ig is not beyond the actual allowed maximum value 
!AJ in r%from_high(1).
          fhigh = min(thigh,r%from_high(1))
          do k = f1,fhigh
             to_here(t1,t2) = from_here(k,f2,f3)
             t2 = t2 + naky
             i = i + 1
          end do
!AJ If at the end of the inner loop we still have theoretical iterations 
!AJ left on ixxf (i.e. the calculated thigh is higher than the allowed 
!AJ maximum of ig) then move to the next isgn.  If there aren't any 
!AJ iterations left then exit the inner loops (the f2 loop).
          if(thigh .gt. r%from_high(1)) then
             f2 = f2 + 1
          else
             f2 = f2max + 1
          end if
       end do
    end do

  end subroutine c_redist_32_new_copy

  !> FIXME : Add documentation
  subroutine c_redist_32_new_opt_copy(r, from_here, to_here)
!=====================================================================
!AJ, June 2011: New code from DCSE project on GS2 Indirect Addressing
!=====================================================================
!
! AJ, June 2011:
!  Modified LOCAL COPY part of c_redist_32 routine, as used by GS2 to 
!  transform g data type to xxf data type.
!  Here we REDUCE indirect addressing and cache load by EXPLOITING 
!  understanding of g and xxf data types in GS2.
!  This functionality is different to c_redist_32_new_copy because 
!  that routine does not always provide optimsed performance 
!  (due to the way memory reads and writes are performed, particularly 
!  the use of a strided write which triggers a write allocate cache 
!  miss).
!  This new functionality moves through the loop indices in a different 
!  order to the other optimised local copy, so instead of following the 
!  original functionality by moving through the to_here and from_here loops
!  using the i variable from i = 1 to from(iproc)%nn we start at i = 1 and 
!  then skip through i's for a certain range, then move back to i = 2 and 
!  follow the same process until all the i range has been processed.
!
    use mp, only: iproc
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):, &
                        r%from_low(3):), intent (in) :: from_here

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in out) :: to_here

    integer :: i,t2,t1,f3,f2,f1
    integer :: f3max,f3maxmultiple,f3incr,innermax,iincrem,iglomax,t1test
    integer :: innermaxmultiplier,outerf3limit,startf3
    real :: innermaxrealvalue,tempnaky

!AJ tempnaky is a real version (rather than integer version) of naky
!AJ It is used to enable real arthimetic rather than integer arthimetic.
    tempnaky = naky
!AJ innermaxmultiplier is the upper limit of the xxf data range owned by this process
    innermaxmultiplier = xxf_lo_ulim_proc+1

!AJ This optimised new local copy functionality is reliant on the particular 
!AJ layout used to work out how to increment through the f3 (iglo) indices.
!AJ f3maxmultiple sets the upper bound of the f3 blocks we step through.
!AJ f3incr sets the amount added to f3 at each step of the inner loops.
    select case (layout)
    case ('yxels')
       f3maxmultiple = naky*ntheta0
       f3incr = naky
    case ('yxles')
       f3maxmultiple = naky*ntheta0
       f3incr = naky
    case ('lexys')
       f3maxmultiple = nlambda*negrid*ntheta0
       f3incr = nlambda*negrid
    case ('lxyes')
       f3maxmultiple = nlambda*ntheta0
       f3incr = nlambda
    case ('lyxes')
       f3maxmultiple = nlambda*naky*ntheta0
       f3incr = nlambda*naky
    case('xyles')
       f3maxmultiple = ntheta0
       f3incr = 1
    end select
          

    i = 1
!AJ iglomax is the maximum iglo (f3) owned by this process (actually one more than
!   this as g_lo is zero indexed)
    iglomax = 1 + g_lo_ulim
!AJ t1test is used handle the it (t1) index which has a 
!AJ range 1 -> (xxf_lo%ntheta0+1)/2) and then 
!AJ (it - xxf_lo%ntheta0 + xxf_lo%nx) -> xxf_lo%nx.  t1test is used to 
!AJ identify when this change in range happens and enable the code to 
!AJ compensate for it.
    t1test  = ((ntheta0+1)/2)+1
!AJ Loop over all local copies from THIS proc (iproc) to THIS proc
    do while(i .le. r%from(iproc)%nn)

       
!AJ Initial look up of values needed to calculate innermax
!AJ and setup the first iteration of the loop below (do while i .le. innermax)
       f1 = r%from(iproc)%k(i)
       f2 = r%from(iproc)%l(i)
       f3 = r%from(iproc)%m(i)
       t1 = r%to(iproc)%k(i)
       t2 = r%to(iproc)%l(i)

!AJ outerf3limit calculates the number of f3 iterations that can be undertaken 
!AJ before reaching the next f3 point.
       outerf3limit = f3 + f3incr
       do while(f3 .lt. outerf3limit)
          
!AJ iincrem counts how much i needs to be skipped at the end of the final inner loop
!AJ The i loop below moves through the starting i's for these inner loop iterations
!AJ but the innermost loop increments through further i's without actually moving i
!AJ so once we've finished the loop below we need to move i forward to skip all the 
!AJ elements we have just processed.
          iincrem = i
          
!AJ work out the size of the inner loops body 
!AJ innermax is the range of the t2 index (ixxf) this process 
!AJ owns.
          innermaxrealvalue = (innermaxmultiplier-t2)/tempnaky
          innermax = ceiling(innermaxrealvalue)-1
!AJ If the isgn (f2) index is at maximum (2) and the ig (f1) index 
!AJ will also be at maximum if we use the innermax value calculated above 
!AJ (ig maximum is r%from_high(1) which currently is ntgrid in the code) 
!AJ then limit innermax to the difference between ig and ntgrid (which is 
!AJ ig max).
          if(f2 .eq. 2 .and. (f1+innermax) .gt. r%from_high(1)) then
             innermax = i + (ntgrid - f1)
!AJ If the isgn (f2) index is not at maximum, but the calculated innermax would 
!AJ take the indexes beyond whaty is allowed for ig (which is the maximum of ig 
!AJ multipled by each increment of ig) then restrict innermax to the maximum 
!AJ number of allowed ig increments.
          else if((f2 .eq. 1 .and. innermax .gt. (((2*ntgrid)+1)+(ntgrid-f1)))) then
             innermax = i + ((2*ntgrid)+1) + (ntgrid - f1)
!AJ If we have reached this else then the calculated innermax does not breach 
!AJ any of the data limits it can be used for the number of iterations of the 
!AJ inner loop. 
          else
             innermax = i + innermax
          end if
          
          do while(i .le. innermax) 
             
!AJ Get the initial address variables.  There are potentially some extra data lookups 
!AJ here so there may be scope to optimise this, although this has not been checked.
             f1 = r%from(iproc)%k(i)
             f2 = r%from(iproc)%l(i)
             f3 = r%from(iproc)%m(i)
             t1 = r%to(iproc)%k(i)
             t2 = r%to(iproc)%l(i)

!AJ Record the initial f3 variable value for the inner loops for use in the bounds checking 
!AJ at the end of the inner loops.             
             startf3 = f3
!AJ Calculate the size of the innermost loop by working out the maximum that f3 (iglo) 
!AJ can go to for this process (using the f3maxmultiple as calculated previously that 
!AJ effectively defines the f3 blocksize of this process).
             f3max = ((f3/f3maxmultiple)+1)*f3maxmultiple
!AJ Ensure that the calculate f3max is not larger than the total iglo space this process
!AJ owns.
             f3max = min(f3max,iglomax)
             do while (f3 .lt. f3max)
                to_here(t1,t2) = from_here(f1,f2,f3)                   
                f3 = f3 + f3incr
                t1 = t1 + 1
!AJ Deal with the issues with t1 (it) being a split range variable as described in the 
!AJ comment for t1test.
                if(t1 .eq. t1test) then
                   t1 = t1 - ntheta0 + nx
                end if
                iincrem = iincrem + 1
             end do
             
!AJ Increment the outermost inner loop (move forwards in the i loop one step)
             i = i + 1
                          
          end do

!AJ Ensure that we have not finished the i loop altogether
          if(i .lt. r%from(iproc)%nn .and. iincrem .lt. r%from(iproc)%nn) then
!AJ The f1, f2 and t2 lookups are required to ensure that
!AJ the calculation of innermax the next time round this loop
!AJ are correctly performed as we are now moving forward in the loop
!AJ It may be that it is better to re-order the initialisation of values 
!AJ so this happens at the beginning of the loop rather than at this point.
             f1 = r%from(iproc)%k(i)
             f2 = r%from(iproc)%l(i)
             t2 = r%to(iproc)%l(i)          
!AJ Updated to ensure the f3 loop we are in is correctly setup as with the code 
!AJ above we are moving to a new i so we need to get the correct value of f3 for this i
             f3 =  startf3 + 1
          else
!AJ If we have gone beyond the end of the i loop then force the exit of the inner loops.
!AJ This should never happen as the loop functionality is designed to always perform the 
!AJ correct number of iterations, so this check could be removed from the code.
             f3 = outerf3limit
          end if

       end do

!AJ Once all the inner loops have been iterated move the i variable on to the next point 
!AJ point to be considered.
       i = iincrem
       
    end do
    
  end subroutine c_redist_32_new_opt_copy

  !> FIXME : Add documentation  
  subroutine c_redist_32_mpi_copy(r, from_here, to_here)
    use mp, only: iproc, nproc, send, receive
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):, &
                        r%from_low(3):), intent (in) :: from_here

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in out) :: to_here

    integer :: i, idp, ipto, ipfrom, iadp

    ! redistribute to idpth next processor from idpth preceding processor
    ! or redistribute from idpth preceding processor to idpth next processor
    ! to avoid deadlocks
    do idp = 1, nproc-1
       ipto = mod(iproc + idp, nproc)
       ipfrom = mod(iproc + nproc - idp, nproc)
       iadp = min(idp, nproc - idp)
       ! avoid deadlock AND ensure mostly parallel resolution
       if (mod(iproc/iadp,2) == 0) then

          ! send to idpth next processor
          if (r%from(ipto)%nn > 0) then
             do i = 1, r%from(ipto)%nn
                r%complex_buff(i) = from_here(r%from(ipto)%k(i), &
                                              r%from(ipto)%l(i), &
                                              r%from(ipto)%m(i))
             end do
             call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp)
          end if

          ! receive from idpth preceding processor
          if (r%to(ipfrom)%nn > 0) then
             call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%to(ipfrom)%nn
                to_here(r%to(ipfrom)%k(i), &
                        r%to(ipfrom)%l(i)) &
                        = r%complex_buff(i)
             end do
          end if
       else
          ! receive from idpth preceding processor
          if (r%to(ipfrom)%nn > 0) then
             call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%to(ipfrom)%nn
                to_here(r%to(ipfrom)%k(i), &
                        r%to(ipfrom)%l(i)) &
                        = r%complex_buff(i)
             end do
          end if

          ! send to idpth next processor
          if (r%from(ipto)%nn > 0) then
             do i = 1, r%from(ipto)%nn
                r%complex_buff(i) = from_here(r%from(ipto)%k(i), &
                                              r%from(ipto)%l(i), &
                                              r%from(ipto)%m(i))
             end do
             call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp)
          end if
       end if
    end do

  end subroutine c_redist_32_mpi_copy

  !> This routine does redistributes using entirely non-blocking
  !> comms.
  !>
  !> @note This routine may use more memory than blocking approach
  !> --> Can perform slower than blocking routine when redistribute
  !> is particularly large, else typically gives better performance
  subroutine c_redist_32_mpi_copy_nonblock(r, from_here, to_here)
    use mp, only: iproc, send, receive
    use mp, only: barrier,waitall,nbsend,nbrecv
    implicit none
    type (redist_type), intent (in out) :: r
    complex, dimension ( &
         r%from_low(1):, &
         r%from_low(2):, &
         r%from_low(3):  &
         ), intent (in) :: from_here
    complex, dimension ( &
         r%to_low(1):,   &
         r%to_low(2):    &
         ), intent (in out) :: to_here
    complex, dimension(:, :), allocatable :: r_buff, s_buff
    integer, dimension(:), allocatable :: recvhand, sendhand
    integer :: i, ip, nn, nrecv, nsend, j

    !Work out how big recv buffer has to be and allocate it
    nrecv = r%nrecv
    if ( nrecv > 0) then
       allocate( r_buff(size(r%complex_buff), nrecv))
       allocate( recvhand(nrecv))
    end if

    !Now work out how big send buffer has to be, allocate it and fill it
    nsend = r%nsend
    if (nsend > 0) then
       allocate( s_buff(size(r%complex_buff),nsend))
       allocate( sendhand(nsend))
       !$OMP PARALLEL DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn, j) &
       !$OMP SHARED(nsend, s_buff, from_here, r, iproc, sendhand)
       do i = 1, nsend
          ip = r%recv_inv_ip(i)
          nn = r%from(ip)%nn
          !Any benefit in making this a pointer assignment instead?
          !Avoids copying data here but could lead to questionable
          !memory access during the actual send?
          !$OMP DO &
          !$OMP SCHEDULE(static)
          do j = 1, nn
             s_buff(j, i) = from_here( &
                  r%from(ip)%k(j),     &
                  r%from(ip)%l(j),     &
                  r%from(ip)%m(j)     &                  
                  )
          end do
          !$OMP END DO

          !$OMP MASTER
          call nbsend(s_buff(1:nn, i), nn, ip, iproc, sendhand(i))
          !$OMP END MASTER
       end do
       !$OMP END PARALLEL
    end if

    !Post receives
    !Probably not much advantage to doing these ASAP as not going to do anything
    !until data is on its way --> Could post sends first so chance of data being
    !available as soon as receive posted.
    if (nrecv > 0) then
       !$OMP PARALLEL DO DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn) &
       !$OMP SHARED(nrecv, r, r_buff, recvhand) &
       !$OMP SCHEDULE(static)
       do i = 1, nrecv
          ip = r%recv_ip(i)
          nn = r%to(ip)%nn
          call nbrecv(r_buff(1:nn, i), nn, ip, ip, recvhand(i))
       end do
       !$OMP END PARALLEL DO
    end if

    if (nrecv > 0) then
       !Now we don't have anything to do until all the data has arrived (so should we post sends first?)
       !We could do a wait all but may be better to keep on checking each message seperately so that when
       !it arrives we can copy out whilst waiting for others to arrive. 
       !Start with a wait all
       call waitall(nrecv, recvhand)

       !Now unpack data
       !$OMP PARALLEL DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn, j) &
       !$OMP SHARED(nrecv, to_here, r, r_buff)
       do i = 1, nrecv
          !Which processor is this data from? Would we be better looping over all procs and just checking
          !if that's one to unpack? Removes memory lookup of array but requires branching and incrementing
          !a counter to work out the equivalent of i here (i.e. which receive we're looking at)         
          ip = r%recv_ip(i)
          nn = r%to(ip)%nn
          !Could this be vectorised and is it useful to do so?
          !Could we make r_buff a pointer array so that MPI put data directly into
          !to_here array? Would this be beneficial? Would need to be done before RECVs posted
          !$OMP DO &
          !$OMP SCHEDULE(static)
          do j = 1, nn
             to_here(            &
                  r%to(ip)%k(j), &
                  r%to(ip)%l(j)  &
                  ) = r_buff(j, i)
          enddo
          !$OMP END DO NOWAIT
       enddo
       !$OMP END PARALLEL

       deallocate(r_buff)
       deallocate(recvhand)
    endif

    if (nsend > 0) then
       call waitall(nsend, sendhand)
       deallocate(s_buff)
       deallocate(sendhand)
    endif

  end subroutine c_redist_32_mpi_copy_nonblock

  !> Launch persistent communications associated with redistribute.
  !> Packs send buffer.
  subroutine c_redist_32_mpi_copy_persist_start(r, from_here)
    use mp, only: start_comm
    implicit none
    type (redist_type), intent (in out) :: r
    complex, dimension ( &
         r%from_low(1):, &
         r%from_low(2):, &         
         r%from_low(3):  &
         ), intent (in) :: from_here
    integer :: i, ip, nn, nrecv, nsend, j

    !Start recv comms
    nrecv = r%nrecv
    if (nrecv > 0) then
       call start_comm(r%recv_hand)
    end if

    !Pack buffer and start communication
    nsend = r%nsend
    if (nsend > 0) then
       !$OMP PARALLEL DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn, j) &
       !$OMP SHARED(nsend, r, from_here)
       do i = 1, nsend
          ip = r%recv_inv_ip(i)
          nn = r%from(ip)%nn
          !Any benefit in making this a pointer assignment instead?
          !Avoids copying data here but could lead to questionable
          !memory access during the actual send?
          !$OMP DO &
          !$OMP SCHEDULE(static)
          do j = 1, nn
             r%buff_send(i)%complex_buffer(j) = from_here( &
                  r%from(ip)%k(j),           &
                  r%from(ip)%l(j),           &                  
                  r%from(ip)%m(j)            &
                  )
          end do
          !$OMP END DO

          !$OMP MASTER
          call start_comm(r%send_hand(i))
          !$OMP END MASTER
       end do
       !$OMP END PARALLEL
    end if
  end subroutine c_redist_32_mpi_copy_persist_start

  !> Finish persistent communications associated with redistribute.
  !> Unpacks receive buffer.
  subroutine c_redist_32_mpi_copy_persist_end(r, to_here)
    use mp, only: waitall
    implicit none
    type (redist_type), intent (in out) :: r
    complex, dimension ( &
         r%to_low(1):,   &
         r%to_low(2):    &
         ), intent (in out) :: to_here
    integer :: i, ip, nn, nrecv, nsend, j

    !Wait for recv communications to complete and unpack buffer
    nrecv = r%nrecv
    if (nrecv > 0) then
       !Now we don't have anything to do until all the data has arrived (so should we post sends first?)
       !We could do a wait all but may be better to keep on checking each message seperately so that when
       !it arrives we can copy out whilst waiting for others to arrive. 
       !Start with a wait all
       call waitall(nrecv, r%recv_hand)

       !Now unpack data
       !$OMP PARALLEL DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn, j) &
       !$OMP SHARED(nrecv, to_here, r)
       do i = 1, nrecv
          !Which processor is this data from?
          ip = r%recv_ip(i)
          nn = r%to(ip)%nn
          !$OMP DO &
          !$OMP SCHEDULE(static)
          do j = 1, nn
             to_here(            &
                  r%to(ip)%k(j), &
                  r%to(ip)%l(j)  &
                  ) = r%buff_recv(i)%complex_buffer(j)
          enddo
          !$OMP END DO NOWAIT
       enddo
       !$OMP END PARALLEL
    endif

    !Wait for all sends to complete
    nsend = r%nsend
    if (nsend > 0) then
       call waitall(nsend, r%send_hand)
    endif
  end subroutine c_redist_32_mpi_copy_persist_end

  !> FIXME : Add documentation  
  subroutine c_redist_32_inv (r, from_here, to_here)
    use job_manage, only: time_message
    use mp, only: get_mp_times
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in) :: from_here

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):, &
                        r%from_low(3):), intent (in out) :: to_here

    real :: time_optimised_loop_1(2), time_optimised_loop_2(2)
    real :: mp_total, mp_total_after
    integer :: i

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total)
    end if

    !If overlapping then start comms now
    if(opt_redist_persist_overlap) call c_redist_32_inv_mpi_copy_persist_start(r, from_here)

!AJ redistribute from local processor to local processor
!AJ The flag opt_local_copy is set by the user in the input 
!AJ file to specify whether optimised local copy routines are used.
!AJ These c_redist_32_inv_*_copy routines are the new local copy 
!AJ functionality where indirect addressing has largely been removed
!AJ There are two different versions of the optimised local copy 
!AJ functionality which are selected at runtime (providing 
!AJ opt_local_copy is true) through a performance measurement process 
!AJ documented below.
    if(opt_local_copy .and. (r%redistname .eq. 'g2x')) then
!AJ Because there are two different optimised local copy routines 
!AJ which provide benefits for different process counts and use cases 
!AJ we use an auto-tuning method to select which routine to use.  This 
!AJ works by timing both of the optimised routines on the first run of 
!AJ this functionality and then using the quickest for the rests of the 
!AJ times this routine is called.  The optimised_choice varible is used 
!AJ to record the choice of routine (it is a SAVE variable so will maintain 
!AJ a value between calls to the routine), it is initialise to 0 when the 
!AJ code first runs.  We run each routine 4 times, only timing the last 
!AJ three runs to avoid any potential initialisation penalties.  We time 
!AJ three rather than one to deal ensure that we collect enough timing 
!AJ data.
       if(r%optimised_choice_inv .eq. 0) then
          time_optimised_loop_1 = 0.
          time_optimised_loop_2 = 0.
          call c_redist_32_inv_new_copy(r, from_here, to_here)

          call time_message(.false.,time_optimised_loop_1,' Optimised Loop 1')
          do i= 1,3
             call c_redist_32_inv_new_copy(r, from_here, to_here)
          end do
          call time_message(.false.,time_optimised_loop_1,' Optimised Loop 1')

          call c_redist_32_inv_new_opt_copy(r, from_here, to_here)

          call time_message(.false.,time_optimised_loop_2,' Optimised Loop 2')
          do i = 1,3
             call c_redist_32_inv_new_opt_copy(r, from_here, to_here)
          end do
          call time_message(.false.,time_optimised_loop_2,' Optimised Loop 2')

          if(time_optimised_loop_1(1) .gt. time_optimised_loop_2(1)) then
             r%optimised_choice_inv = 2
          else
             r%optimised_choice_inv = 1
          end if

!AJ This else is encountered once the optimised auto-tuning choice has 
!AJ been calculated above.
       else
          if(r%optimised_choice_inv .eq. 1) then
!AJ c_redist_32_inv_new_copy is the new local copy functionality where 
!AJ indirect addressing has largely been removed
             call c_redist_32_inv_new_copy(r, from_here, to_here)       
          else
             call c_redist_32_inv_new_opt_copy(r, from_here, to_here)       
          end if
       end if

!AJ This else is encountered when the optimised local copy functionality is 
!AJ not enabled.
    else
!AJ c_redist_32_inv_old_copy is the original local copy functionality
       call c_redist_32_inv_old_copy(r, from_here, to_here)
    end if
       
!AJ c_redist_32_inv_mpi_copy contains all the remote to local 
!AJ copy functionality
    !<DD>
    if(opt_redist_nbk)then
       if(opt_redist_persist)then
          if(.not.opt_redist_persist_overlap) call c_redist_32_inv_mpi_copy_persist_start(r,from_here)
          call c_redist_32_inv_mpi_copy_persist_end(r,to_here)
       else
          call c_redist_32_inv_mpi_copy_nonblock(r,from_here,to_here)
       endif
    else
       call c_redist_32_inv_mpi_copy(r, from_here, to_here)
    endif

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total_after)
       time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total)
    end if

  end subroutine c_redist_32_inv

  !> FIXME : Add documentation
  subroutine c_redist_32_inv_old_copy(r, from_here, to_here)
    use mp, only: iproc
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in) :: from_here

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):, &
                        r%from_low(3):), intent (in out) :: to_here

    integer :: i, nn

!CMR 
! In the GS2 standard FFT situation this routine maps 
!         xxf(it,ixxf) to g(ig, isgn, iglo) data type 
!         where it is kx (or x) index, ixxf is (y,ig,isgn,"les") 
!         and iglo is ("xyles") 

    nn = r%to(iproc)%nn
    !$OMP PARALLEL DO DEFAULT(none) &
    !$OMP SHARED(nn, to_here, r, iproc, from_here) &
    !$OMP SCHEDULE(static)
    do i = 1, nn
!
! redistribute from local processor to local processor
! NB r%to(iproc)%nn is #elements sent by THIS processor to THIS processor
!    In this situation the data at (r%from(iproc)%k(i),r%from(iproc)%l(i),r%from(iproc)%m(i)) 
!    should come from (r%to(iproc)%k(i),r%to(iproc)%l(i)). 
!
! This do loop, in GS2 standard FFT situation, corresponds to:
!    to_here(ig,isgn,iglo)=from_here(it,ixxf)
!
       to_here(r%from(iproc)%k(i), &
               r%from(iproc)%l(i), &
               r%from(iproc)%m(i)) &
               = from_here(r%to(iproc)%k(i), &
                           r%to(iproc)%l(i))
    end do
    !$OMP END PARALLEL DO
  end subroutine c_redist_32_inv_old_copy

  !> FIXME : Add documentation  
  subroutine c_redist_32_inv_new_copy(r, from_here, to_here)
!=====================================================================
!AJ, June 2011: New code from DCSE project on GS2 Indirect Addressing
!=====================================================================
!
! AJ, June 2011:
!  Modified LOCAL COPY part of c_redist_32_inv routine, as used by GS2 to 
!  transform xxf data type to g data type.
!  Here we REDUCE indirect addressing and cache load by EXPLOITING 
!  understanding of g and xxf data types in GS2.
!  These c_redist_32_inv_* routines use exactly the same code created for 
!  the c_redist_32_* functionality, the only difference is in the actual 
!  data copy the to_here_ and from_here indices are swapped.
!

    use mp, only: iproc
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in) :: from_here

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):, &
                        r%from_low(3):), intent (in out) :: to_here

    integer :: i,k,t2,t1,f3,f2,f1,fhigh,thigh,f2max
    real :: nakyrecip

    i = 1
!AJ We want to be able to divide by naky with floating point 
!AJ arithmetic so we need to convert naky to a float (hence
!AJ the need for nakyrecip).  Also, to remove the necessity 
!AJ to have a division in the body of the loop below we are 
!AJ calculating the reciprocal of naky so that instead of 
!AJ dividing by naky we can multiply  by 1/naky.
    nakyrecip = naky
    nakyrecip = 1/nakyrecip
    f2max = r%from_high(2)
!AJ Loop over all local copies from THIS proc (iproc) to THIS proc
    do while(i .le. r%to(iproc)%nn)
!AJ Initialise the inner loop indices.  As the to_here array has 
!AJ 3 indices (c_redist_32_inv goes from 2 indices to 3 indices in the
!AJ copy) we need two inner loops to be able to move through all 
!AJ the to_here and from_here indices.
!AJ t1 (which equates to the it index) and f3 (which equates to iglo)
!AJ do not change for the iterations of the inner loops.  f2 (isgn) 
!AJ can only be 1 or 2 so the first inner loop is restricted to at 
!AJ most 2 iterations.
       f2 = r%from(iproc)%l(i)
       f3 = r%from(iproc)%m(i)
       t1 = r%to(iproc)%k(i)
!AJ Ensure that isgn (f2) is either 1 or 2.
       do while (f2 .le. f2max)
!AJ Get initial value of f1 (which equates to ig) and t2 (which equates
!AJ to ixxf).
          f1 = r%from(iproc)%k(i)
          t2 = r%to(iproc)%l(i)
!AJ Work out the maximum value ixxf (t2) can have by calculating the range 
!AJ of ixxf that this process owns.  We step through t2 by naky each iteration
!AJ so the line below calculates thigh as the number of ixxf steps for these 
!AJ inner loops of the operation owned by this process.
          thigh = ceiling(((xxf_lo_ulim_proc+1) - t2)*nakyrecip)
!AJ from_here and to_here are calculated for c_redist_32 using ig, isgn, 
!AJ and iglo.  The calculation of the maximum number if ixxf steps above can, 
!AJ therefore, be used to also calculate the maximum number of ig setsp.  THis 
!AJ is what the following line does.
          thigh = thigh + (f1-1)
!AJ Finally check the actual number of inner loop steps by ensuring that the 
!AJ computed maximum bound of ig is not beyond the actual allowed maximum value 
!AJ in r%from_high(1).
          fhigh = min(thigh,r%from_high(1))
          do k = f1,fhigh
             i = i + 1
             to_here(k,f2,f3) = from_here(t1,t2)
             t2 = t2 + naky
          end do
!AJ If at the end of the inner loop we still have theoretical iterations 
!AJ left on ixxf (i.e. the calculated thigh is higher than the allowed 
!AJ maximum of ig) then move to the next isgn.  If there aren't any 
!AJ iterations left then exit the inner loops (the f2 loop).
          if(thigh .gt. r%from_high(1)) then
             f2 = f2 + 1
          else
             f2 = f2max + 1
          end if
       end do
    end do

  end subroutine c_redist_32_inv_new_copy

  !> FIXME : Add documentation
  subroutine c_redist_32_inv_new_opt_copy(r, from_here, to_here)
!=====================================================================
!AJ, June 2011: New code from DCSE project on GS2 Indirect Addressing
!=====================================================================
!
! AJ, June 2011:
!  Modified LOCAL COPY part of c_redist_32_inv routine, as used by GS2 to 
!  transform xxf data type to g data type.
!  Here we REDUCE indirect addressing and cache load by EXPLOITING 
!  understanding of g and xxf data types in GS2.
!  This functionality is different to c_redist_32_inv_new_copy because 
!  that routine does not always provide optimsed performance 
!  (due to the way memory reads and writes are performed, particularly 
!  the use of a strided write which triggers a write allocate cache 
!  miss).
!  This new functionality moves through the loop indices in a different 
!  order to the other optimised local copy, so instead of following the 
!  original functionality by moving through the to_here and from_here loops
!  using the i variable from i = 1 to to(iproc)%nn we start at i = 1 and 
!  then skip through i's for a certain range, then move back to i = 2 and 
!  follow the same process until all the i range has been processed.
!  This routine uses the same functionality as teh c_redist_32_new_opt_copy
!  routine, the only difference is in the actual copy the indices of the 
!  from_here and to_here loops are swapped.
!

    use mp, only: iproc
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in) :: from_here

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):, &
                        r%from_low(3):), intent (in out) :: to_here

    integer :: i,t2,t1,f3,f2,f1
    integer :: f3max,f3maxmultiple,f3incr,innermax,iincrem,iglomax,t1test
    integer :: innermaxmultiplier,outerf3limit,startf3
    real :: innermaxrealvalue,tempnaky

!AJ tempnaky is a real version (rather than integer version) of naky
!AJ It is used to enable real arthimetic rather than integer arthimetic.
    tempnaky = naky
!AJ innermaxmultiplier is the upper limit of the xxf data range owned by this process
    innermaxmultiplier = xxf_lo_ulim_proc+1

!AJ This optimised new local copy functionality is reliant on the particular 
!AJ layout used to work out how to increment through the f3 (iglo) indices.
!AJ f3maxmultiple sets the upper bound of the f3 blocks we step through.
!AJ f3incr sets the amount added to f3 at each step of the inner loops.
    select case (layout)
    case ('yxels')
       f3maxmultiple = naky*ntheta0
       f3incr = naky
    case ('yxles')
       f3maxmultiple = naky*ntheta0
       f3incr = naky
    case ('lexys')
       f3maxmultiple = nlambda*negrid*ntheta0
       f3incr = nlambda*negrid
    case ('lxyes')
       f3maxmultiple = nlambda*ntheta0
       f3incr = nlambda
    case ('lyxes')
       f3maxmultiple = nlambda*naky*ntheta0
       f3incr = nlambda*naky
    case('xyles')
       f3maxmultiple = ntheta0
       f3incr = 1
    end select          
    
    i = 1
!AJ iglomax is the maximum iglo (f3) owned by this process (actually one more than
!   this as g_lo is zero indexed)
    iglomax = 1 + g_lo_ulim
!AJ t1test is used handle the it (t1) index which has a 
!AJ range 1 -> (xxf_lo%ntheta0+1)/2) and then 
!AJ (it - xxf_lo%ntheta0 + xxf_lo%nx) -> xxf_lo%nx.  t1test is used to 
!AJ identify when this change in range happens and enable the code to 
!AJ compensate for it.
    t1test  = ((ntheta0+1)/2)+1
!AJ Loop over all local copies from THIS proc (iproc) to THIS proc
    do while(i .le. r%to(iproc)%nn)

!AJ Initial look up of values needed to calculate innermax
!AJ and setup the first iteration of the loop below (do while i .le. innermax)
       f1 = r%from(iproc)%k(i)
       f2 = r%from(iproc)%l(i)
       f3 = r%from(iproc)%m(i)
       t1 = r%to(iproc)%k(i)
       t2 = r%to(iproc)%l(i)

!AJ outerf3limit calculates the number of f3 iterations that can be undertaken 
!AJ before reaching the next f3 point.
       outerf3limit = f3 + f3incr
       do while(f3 .lt. outerf3limit)
          
!AJ iincrem counts how much i needs to be skipped at the end of the final inner loop
!AJ The i loop below moves through the starting i's for these inner loop iterations
!AJ but the innermost loop increments through further i's without actually moving i
!AJ so once we've finished the loop below we need to move i forward to skip all the 
!AJ elements we have just processed.
          iincrem = i
          
!AJ work out the size of the inner loops body 
!AJ innermax is the range of the t2 index (ixxf_ this process 
!AJ owns. 
          innermaxrealvalue = (innermaxmultiplier-t2)/tempnaky
          innermax = ceiling(innermaxrealvalue)-1
!AJ If the isgn (f2) index is at maximum (2) and the ig (f1) index 
!AJ will also be at maximum if we use the innermax value calculated above 
!AJ (ig maximum is r%from_high(1) which currently is ntgrid in the code) 
!AJ then limit innermax to the difference between ig and ntgrid (which is 
!AJ ig max).
          if(f2 .eq. 2 .and. (f1+innermax) .gt. r%from_high(1)) then
             innermax = i + (ntgrid - f1)
!AJ If the isgn (f2) index is not at maximum, but the calculated innermax would 
!AJ take the indexes beyond whaty is allowed for ig (which is the maximum of ig 
!AJ multipled by each increment of ig) then restrict innermax to the maximum 
!AJ number of allowed ig increments.
          else if((f2 .eq. 1 .and. innermax .gt. (((2*ntgrid)+1)+(ntgrid-f1)))) then
             innermax = i + ((2*ntgrid)+1) + (ntgrid - f1)
!AJ If we have reached this else then the calculated innermax does not breach 
!AJ any of the data limits it can be used for the number of iterations of the 
!AJ inner loop. 
          else
             innermax = i + innermax
          end if
          
          do while(i .le. innermax) 
             
!AJ Get the initial address variables.  There are potentially some extra data lookups 
!AJ here so there may be scope to optimise this, although this has not been checked.
             f1 = r%from(iproc)%k(i)
             f2 = r%from(iproc)%l(i)
             f3 = r%from(iproc)%m(i)
             t1 = r%to(iproc)%k(i)
             t2 = r%to(iproc)%l(i)
             
!AJ Record the initial f3 variable value for the inner loops for use in the bounds checking 
!AJ at the end of the inner loops.             
             startf3 = f3
!AJ Calculate the size of the innermost loop by working out the maximum that f3 (iglo) 
!AJ can go to for this process (using the f3maxmultiple as calculated previously that 
!AJ effectively defines the f3 blocksize of this process).
             f3max = ((f3/f3maxmultiple)+1)*f3maxmultiple
!AJ Ensure that the calculate f3max is not larger than the total iglo space this process
!AJ owns.
             f3max = min(f3max,iglomax)
             do while (f3 .lt. f3max)
                to_here(f1,f2,f3) = from_here(t1,t2)                
                f3 = f3 + f3incr
                t1 = t1 + 1
!AJ Deal with the issues with t1 (it) being a split range variable as described in the 
!AJ comment for t1test.
                if(t1 .eq. t1test) then
                   t1 = t1 - ntheta0 + nx
                end if
                iincrem = iincrem + 1
             end do
             
!AJ Increment the outermost inner loop (move forwards in the i loop one step)
             i = i + 1
                          
          end do

!AJ Ensure that we have not finished the i loop altogether
          if(i .lt. r%from(iproc)%nn .and. iincrem .lt. r%from(iproc)%nn) then
!AJ The f1, f2 and t2 lookups are required to ensure that
!AJ the calculation of innermax the next time round this loop
!AJ are correctly performed as we are now moving forward in the loop
!AJ It may be that it is better to re-order the initialisation of values 
!AJ so this happens at the beginning of the loop rather than at this point.
             f1 = r%from(iproc)%k(i)
             f2 = r%from(iproc)%l(i)
             t2 = r%to(iproc)%l(i)     
!AJ Updated to ensure the f3 loop we are in is correctly setup as with the code 
!AJ above we are moving to a new i so we need to get the correct value of f3 for this i     
             f3 =  startf3 + 1
          else
!AJ If we have gone beyond the end of the i loop then force the exit of the inner loops.
!AJ This should never happen as the loop functionality is designed to always perform the 
!AJ correct number of iterations, so this check could be removed from the code.
             f3 = outerf3limit
          end if

       end do
   
!AJ Once all the inner loops have been iterated move the i variable on to the next point 
!AJ point to be considered.    
       i = iincrem
       
    end do

  end subroutine c_redist_32_inv_new_opt_copy

  !> FIXME : Add documentation  
  subroutine c_redist_32_inv_mpi_copy(r, from_here, to_here)
    use mp, only: iproc, nproc, send, receive
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in) :: from_here

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):, &
                        r%from_low(3):), intent (in out) :: to_here


    integer :: i, idp, ipto, ipfrom, iadp

    ! redistribute to idpth next processor from idpth preceding processor
    ! or redistribute from idpth preceding processor to idpth next processor
    ! to avoid deadlocks
    do idp = 1, nproc-1
       ipto = mod(iproc + idp, nproc)
       ipfrom = mod(iproc + nproc - idp, nproc)
       iadp = min(idp, nproc - idp)
       ! avoid deadlock AND ensure mostly parallel resolution
       if (mod(iproc/iadp,2) == 0) then

          ! send to idpth next processor
          if (r%to(ipto)%nn > 0) then
             do i = 1, r%to(ipto)%nn
                r%complex_buff(i) = from_here(r%to(ipto)%k(i), &
                                              r%to(ipto)%l(i))
             end do
             call send (r%complex_buff(1:r%to(ipto)%nn), ipto, idp)
          end if

          ! receive from idpth preceding processor
          if (r%from(ipfrom)%nn > 0) then
             call receive (r%complex_buff(1:r%from(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%from(ipfrom)%nn
                to_here(r%from(ipfrom)%k(i), &
                        r%from(ipfrom)%l(i), &
                        r%from(ipfrom)%m(i)) &
                        = r%complex_buff(i)
             end do
          end if
       else
          ! receive from idpth preceding processor
          if (r%from(ipfrom)%nn > 0) then
             call receive (r%complex_buff(1:r%from(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%from(ipfrom)%nn
                to_here(r%from(ipfrom)%k(i), &
                        r%from(ipfrom)%l(i), &
                        r%from(ipfrom)%m(i)) &
                        = r%complex_buff(i)
             end do
          end if

          ! send to idpth next processor
          if (r%to(ipto)%nn > 0) then
             do i = 1, r%to(ipto)%nn
                r%complex_buff(i) = from_here(r%to(ipto)%k(i), &
                                              r%to(ipto)%l(i))
             end do
             call send (r%complex_buff(1:r%to(ipto)%nn), ipto, idp)
          end if

       end if
    end do

  end subroutine c_redist_32_inv_mpi_copy

  !> This routine does redistributes using entirely non-blocking
  !> comms.
  !>
  !> @note This routine may use more memory than blocking approach
  !> --> Can perform slower than blocking routine when redistribute
  !> is particularly large, else typically gives better performance
  subroutine c_redist_32_inv_mpi_copy_nonblock(r, from_here, to_here)
    use mp, only: iproc, send, receive
    use mp, only: barrier,waitall,nbsend,nbrecv
    implicit none
    type (redist_type), intent (in out) :: r
    complex, dimension ( &
         r%from_low(1):, &
         r%from_low(2):, &
         r%from_low(3):  &
         ), intent (in out) :: to_here
    complex, dimension (&
         r%to_low(1):,  &
         r%to_low(2):   &
         ), intent (in) :: from_here
    complex, dimension(:, :), allocatable :: r_buff, s_buff
    integer, dimension(:), allocatable :: recvhand, sendhand
    integer :: i, ip, nn, nrecv, nsend, j

    !Work out how big recv buffer has to be and allocate it
    ! Note that we use redistribute type's nsend (not nrecv) because we are
    ! doing the inverse transform.
    nrecv = r%nsend
    if (nrecv > 0) then
       allocate( r_buff(size(r%complex_buff), nrecv))
       allocate( recvhand(nrecv))
    end if

    !Now work out how big send buffer has to be, allocate it and fill it
    ! Note that we use redistribute type's nrecv (not nsend) because we are
    ! doing the inverse transform.
    nsend = r%nrecv
    if (nsend > 0) then
       allocate( s_buff(size(r%complex_buff), nsend))
       allocate( sendhand(nsend))
       !$OMP PARALLEL DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn, j) &
       !$OMP SHARED(nsend, r, s_buff, from_here, iproc, sendhand)
       do i = 1, nsend
          ip = r%recv_ip(i)
          nn = r%to(ip)%nn
          !Any benefit in making this a pointer assignment instead?
          !Avoids copying data here but could lead to questionable
          !memory access during the actual send?
          !$OMP DO &
          !$OMP SCHEDULE(static)
          do j = 1, nn
             s_buff(j, i) = from_here( &
                  r%to(ip)%k(j),       &
                  r%to(ip)%l(j)        &
                  )
          end do
          !$OMP END DO

          !$OMP MASTER
          call nbsend(s_buff(1:nn, i), nn, ip, iproc, sendhand(i))
          !$OMP END MASTER
       end do
       !$OMP END PARALLEL
    end if

    !Post receives
    if (nrecv > 0) then
       !$OMP PARALLEL DO DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn) &
       !$OMP SHARED(nrecv, r, r_buff, recvhand) &
       !$OMP SCHEDULE(static)
       do i = 1, nrecv
          ip = r%recv_inv_ip(i)
          nn = r%from(ip)%nn
          call nbrecv(r_buff(1:nn, i), nn, ip, ip, recvhand(i))
       end do
       !$OMP END PARALLEL DO
    end if

    if (nrecv > 0) then
       !Now we don't have anything to do until all the data has arrived (so should we post sends first?)
       !We could do a wait all but may be better to keep on checking each message seperately so that when
       !it arrives we can copy out whilst waiting for others to arrive. 
       !Start with a wait all
       call waitall(nrecv, recvhand)

       !Now unpack data
       !$OMP PARALLEL DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn, j) &
       !$OMP SHARED(nrecv, r, to_here, r_buff)
       do i = 1, nrecv
          !Which processor is this data from
          ip = r%recv_inv_ip(i)
          nn = r%from(ip)%nn
          !$OMP DO &
          !$OMP SCHEDULE(static)
          do j = 1, nn
             to_here(              &
                  r%from(ip)%k(j), &
                  r%from(ip)%l(j), &
                  r%from(ip)%m(j)  &
                  ) = r_buff(j, i)
          enddo
          !$OMP END DO NOWAIT
       enddo
       !$OMP END PARALLEL

       deallocate(r_buff)
       deallocate(recvhand)
    endif

    if (nsend > 0) then
       call waitall(nsend, sendhand)
       deallocate(s_buff)
       deallocate(sendhand)
    endif

  end subroutine c_redist_32_inv_mpi_copy_nonblock

  !> Launch persistent communications associated with redistribute.
  !> Packs send buffer.
  subroutine c_redist_32_inv_mpi_copy_persist_start(r, from_here)
    use mp, only: start_comm
    implicit none
    type (redist_type), intent (in out) :: r
    complex, dimension ( &
         r%to_low(1):,   &
         r%to_low(2):    &
         ), intent (in) :: from_here
    integer :: i, ip, nn, nrecv, nsend, j
    !For inverse transform SEND actually refers to received data and
    !RECV refers to data to be sent

    !Post receives
    nrecv = r%nsend
    if (nrecv > 0) then
       call start_comm(r%recv_inv_hand)
    endif

    !Pack send buffer and start sends
    nsend = r%nrecv
    if (nsend > 0) then
       !$OMP PARALLEL DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn, j) &
       !$OMP SHARED(nsend, r, from_here)
       do i = 1, nsend
          ip = r%recv_ip(i)
          nn = r%to(ip)%nn
          !Any benefit in making this a pointer assignment instead?
          !Avoids copying data here but could lead to questionable
          !memory access during the actual send?
          !$OMP DO &
          !$OMP SCHEDULE(static)
          do j = 1, nn
             r%buff_recv(i)%complex_buffer(j) = from_here( &
                  r%to(ip)%k(j),             &
                  r%to(ip)%l(j)              &
                  )
          end do
          !$OMP END DO

          !$OMP MASTER
          call start_comm(r%send_inv_hand(i))
          !$OMP END MASTER
       end do
       !$OMP END PARALLEL
    end if
  end subroutine c_redist_32_inv_mpi_copy_persist_start

  !> Finish persistent communications associated with redistribute.
  !> Unpacks receive buffer.
  subroutine c_redist_32_inv_mpi_copy_persist_end(r, to_here)
    use mp, only: waitall
    implicit none
    type (redist_type), intent (in out) :: r
    complex, dimension ( &
         r%from_low(1):, &
         r%from_low(2):, &
         r%from_low(3):  &
         ), intent (in out) :: to_here
    integer :: i, ip, nn, nrecv, nsend, j
    !For inverse transform SEND actually refers to received data and
    !RECV refers to data to be sent

    !Wait for receives to complete and unpack data
    nrecv = r%nsend
    if (nrecv > 0) then
       call waitall(nrecv, r%recv_inv_hand)

       !Now unpack data
       !$OMP PARALLEL DEFAULT(none) &
       !$OMP PRIVATE(i, ip, nn, j) &
       !$OMP SHARED(nrecv, r, to_here)
       do i = 1, nrecv
          !Which processor is this data from
          ip = r%recv_inv_ip(i)
          nn = r%from(ip)%nn
          !$OMP DO &
          !$OMP SCHEDULE(static)
          do j = 1, nn
             to_here(              &
                  r%from(ip)%k(j), &
                  r%from(ip)%l(j), &
                  r%from(ip)%m(j)  &
                  ) = r%buff_send(i)%complex_buffer(j)
          end do
          !$OMP END DO NOWAIT
       end do
       !$OMP END PARALLEL
    end if

    !Wait for sends to complete
    nsend = r%nrecv
    if (nsend > 0) then
       call waitall(nsend, r%send_inv_hand)
    end if
  end subroutine c_redist_32_inv_mpi_copy_persist_end

  !> FIXME : Add documentation  
  subroutine c_redist_42 (r, from_here, to_here)
    use job_manage, only: time_message
    use mp, only: iproc, nproc, send, receive, get_mp_times
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):, &
                        r%from_low(3):, &
                        r%from_low(4):), intent (in) :: from_here

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in out) :: to_here

    integer :: i, idp, ipto, ipfrom, iadp
    real :: mp_total, mp_total_after
    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total)
    end if

    ! redistribute from local processor to local processor
    do i = 1, r%from(iproc)%nn
       to_here(r%to(iproc)%k(i),&
               r%to(iproc)%l(i)) &
               = from_here(r%from(iproc)%k(i), &
                           r%from(iproc)%l(i), &
                           r%from(iproc)%m(i), &
                           r%from(iproc)%n(i))
    end do

    ! redistribute to idpth next processor from idpth preceding processor
    ! or redistribute from idpth preceding processor to idpth next processor
    ! to avoid deadlocks
    do idp = 1, nproc-1
       ipto = mod(iproc + idp, nproc)
       ipfrom = mod(iproc + nproc - idp, nproc)
       iadp = min(idp, nproc - idp)
       ! avoid deadlock AND ensure mostly parallel resolution
       if (mod(iproc/iadp,2) == 0) then

          ! send to idpth next processor
          if (r%from(ipto)%nn > 0) then
             do i = 1, r%from(ipto)%nn
                r%complex_buff(i) = from_here(r%from(ipto)%k(i), &
                                              r%from(ipto)%l(i), &
                                              r%from(ipto)%m(i), &
                                              r%from(ipto)%n(i))
             end do
             call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp)
          end if

          ! receive from idpth preceding processor
          if (r%to(ipfrom)%nn > 0) then
             call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%to(ipfrom)%nn
                to_here(r%to(ipfrom)%k(i), &
                        r%to(ipfrom)%l(i)) &
                        = r%complex_buff(i)
             end do
          end if
       else
          ! receive from idpth preceding processor
          if (r%to(ipfrom)%nn > 0) then
             call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%to(ipfrom)%nn
                to_here(r%to(ipfrom)%k(i), &
                        r%to(ipfrom)%l(i)) &
                        = r%complex_buff(i)
             end do
          end if

          ! send to idpth next processor
          if (r%from(ipto)%nn > 0) then
             do i = 1, r%from(ipto)%nn
                r%complex_buff(i) = from_here(r%from(ipto)%k(i), &
                                              r%from(ipto)%l(i), &
                                              r%from(ipto)%m(i), &
                                              r%from(ipto)%n(i))
             end do
             call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp)
          end if
       end if
    end do

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total_after)
       time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total)
    end if

  end subroutine c_redist_42

  !> FIXME : Add documentation  
  subroutine c_redist_42_inv (r, from_here, to_here)
    use job_manage, only: time_message
    use mp, only: iproc, nproc, send, receive, get_mp_times
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):), intent (in) :: from_here

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):, &
                        r%from_low(3):, &
                        r%from_low(4):), intent (in out) :: to_here

    integer :: i, idp, ipto, ipfrom, iadp
    real :: mp_total, mp_total_after
    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total)
    end if

    ! redistribute from local processor to local processor
    do i = 1, r%to(iproc)%nn
       to_here(r%from(iproc)%k(i), &
               r%from(iproc)%l(i), &
               r%from(iproc)%m(i), &
               r%from(iproc)%n(i)) &
               = from_here(r%to(iproc)%k(i), &
                           r%to(iproc)%l(i))
    end do

    ! redistribute to idpth next processor from idpth preceding processor
    ! or redistribute from idpth preceding processor to idpth next processor
    ! to avoid deadlocks
    do idp = 1, nproc-1
       ipto = mod(iproc + idp, nproc)
       ipfrom = mod(iproc + nproc - idp, nproc)
       iadp = min(idp, nproc - idp)
       ! avoid deadlock AND ensure mostly parallel resolution
       if (mod(iproc/iadp,2) == 0) then

          ! send to idpth next processor
          if (r%to(ipto)%nn > 0) then
             do i = 1, r%to(ipto)%nn
                r%complex_buff(i) = from_here(r%to(ipto)%k(i), &
                                              r%to(ipto)%l(i))
             end do
             call send (r%complex_buff(1:r%to(ipto)%nn), ipto, idp)
          end if

          ! receive from idpth preceding processor
          if (r%from(ipfrom)%nn > 0) then
             call receive (r%complex_buff(1:r%from(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%from(ipfrom)%nn
                to_here(r%from(ipfrom)%k(i), &
                        r%from(ipfrom)%l(i), &
                        r%from(ipfrom)%m(i), &
                        r%from(ipfrom)%n(i)) &
                        = r%complex_buff(i)
             end do
          end if
       else
          ! receive from idpth preceding processor
          if (r%from(ipfrom)%nn > 0) then
             call receive (r%complex_buff(1:r%from(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%from(ipfrom)%nn
                to_here(r%from(ipfrom)%k(i), &
                        r%from(ipfrom)%l(i), &
                        r%from(ipfrom)%m(i), &
                        r%from(ipfrom)%n(i)) &
                        = r%complex_buff(i)
             end do
          end if

          ! send to idpth next processor
          if (r%to(ipto)%nn > 0) then
             do i = 1, r%to(ipto)%nn
                r%complex_buff(i) = from_here(r%to(ipto)%k(i), &
                                              r%to(ipto)%l(i))
             end do
             call send (r%complex_buff(1:r%to(ipto)%nn), ipto, idp)
          end if

       end if
    end do

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total_after)
       time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total)
    end if

  end subroutine c_redist_42_inv

  !> FIXME : Add documentation
  subroutine c_redist_23 (r, from_here, to_here)
    use job_manage, only: time_message
    use mp, only: iproc, nproc, send, receive, get_mp_times
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):), intent (in) :: from_here

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):, &
                        r%to_low(3):), intent (in out) :: to_here

    integer :: i, idp, ipto, ipfrom, iadp
    real :: mp_total, mp_total_after

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total)
    end if

    ! redistribute from local processor to local processor
    do i = 1, r%from(iproc)%nn
       to_here(r%to(iproc)%k(i),&
               r%to(iproc)%l(i),&
               r%to(iproc)%m(i)) &
               = from_here(r%from(iproc)%k(i), &
                           r%from(iproc)%l(i))
    end do

    ! redistribute to idpth next processor from idpth preceding processor
    ! or redistribute from idpth preceding processor to idpth next processor
    ! to avoid deadlocks
    do idp = 1, nproc-1
       ipto = mod(iproc + idp, nproc)
       ipfrom = mod(iproc + nproc - idp, nproc)
       iadp = min(idp, nproc - idp)
       ! avoid deadlock AND ensure mostly parallel resolution
       if (mod(iproc/iadp,2) == 0) then

          ! send to idpth next processor
          if (r%from(ipto)%nn > 0) then
             do i = 1, r%from(ipto)%nn
                r%complex_buff(i) = from_here(r%from(ipto)%k(i), &
                                              r%from(ipto)%l(i))
             end do
             call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp)
          end if

          ! receive from idpth preceding processor
          if (r%to(ipfrom)%nn > 0) then
             call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%to(ipfrom)%nn
                to_here(r%to(ipfrom)%k(i), &
                        r%to(ipfrom)%l(i), &
                        r%to(ipfrom)%m(i)) &
                        = r%complex_buff(i)
             end do
          end if
       else
          ! receive from idpth preceding processor
          if (r%to(ipfrom)%nn > 0) then
             call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%to(ipfrom)%nn
                to_here(r%to(ipfrom)%k(i), &
                        r%to(ipfrom)%l(i), &
                        r%to(ipfrom)%m(i)) &
                        = r%complex_buff(i)
             end do
          end if

          ! send to idpth next processor
          if (r%from(ipto)%nn > 0) then
             do i = 1, r%from(ipto)%nn
                r%complex_buff(i) = from_here(r%from(ipto)%k(i), &
                                              r%from(ipto)%l(i))
             end do
             call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp)
          end if
       end if
    end do

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total_after)
       time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total)
    end if

  end subroutine c_redist_23

  !> FIXME : Add documentation
  subroutine c_redist_34 (r, from_here, to_here)
    use job_manage, only: time_message
    use mp, only: iproc, nproc, send, receive, get_mp_times
    implicit none
    type (redist_type), intent (in out) :: r

    complex, dimension (r%from_low(1):, &
                        r%from_low(2):, &
                        r%from_low(3):), intent (in) :: from_here

    complex, dimension (r%to_low(1):, &
                        r%to_low(2):, &
                        r%to_low(3):, &
                        r%to_low(4):), intent (in out) :: to_here

    integer :: i, idp, ipto, ipfrom, iadp
    real :: mp_total, mp_total_after

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total)
    end if

    ! redistribute from local processor to local processor
    do i = 1, r%from(iproc)%nn
       to_here(r%to(iproc)%k(i),&
               r%to(iproc)%l(i),&
               r%to(iproc)%m(i),&
               r%to(iproc)%n(i)) &
               = from_here(r%from(iproc)%k(i), &
                           r%from(iproc)%l(i), &
                           r%from(iproc)%m(i))
    end do

    ! redistribute to idpth next processor from idpth preceding processor
    ! or redistribute from idpth preceding processor to idpth next processor
    ! to avoid deadlocks
    do idp = 1, nproc-1
       ipto = mod(iproc + idp, nproc)
       ipfrom = mod(iproc + nproc - idp, nproc)
       iadp = min(idp, nproc - idp)
       ! avoid deadlock AND ensure mostly parallel resolution
       if (mod(iproc/iadp,2) == 0) then

          ! send to idpth next processor
          if (r%from(ipto)%nn > 0) then
             do i = 1, r%from(ipto)%nn
                r%complex_buff(i) = from_here(r%from(ipto)%k(i), &
                                              r%from(ipto)%l(i), &
                                              r%from(ipto)%m(i))
             end do
             call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp)
          end if

          ! receive from idpth preceding processor
          if (r%to(ipfrom)%nn > 0) then
             call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%to(ipfrom)%nn
                to_here(r%to(ipfrom)%k(i), &
                        r%to(ipfrom)%l(i), &
                        r%to(ipfrom)%m(i), &
                        r%to(ipfrom)%n(i)) &
                        = r%complex_buff(i)
             end do
          end if
       else
          ! receive from idpth preceding processor
          if (r%to(ipfrom)%nn > 0) then
             call receive (r%complex_buff(1:r%to(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%to(ipfrom)%nn
                to_here(r%to(ipfrom)%k(i), &
                        r%to(ipfrom)%l(i), &
                        r%to(ipfrom)%m(i), &
                        r%to(ipfrom)%n(i)) &
                        = r%complex_buff(i)
             end do
          end if

          ! send to idpth next processor
          if (r%from(ipto)%nn > 0) then
             do i = 1, r%from(ipto)%nn
                r%complex_buff(i) = from_here(r%from(ipto)%k(i), &
                                              r%from(ipto)%l(i), &
                                              r%from(ipto)%m(i))
             end do
             call send (r%complex_buff(1:r%from(ipto)%nn), ipto, idp)
          end if
       end if
    end do

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total_after)
       time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total)
    end if
    
  end subroutine c_redist_34

  !> FIXME : Add documentation  
  subroutine r_redist_12 (r, from_here, to_here)
    use job_manage, only: time_message
    use mp, only: iproc, nproc, send, receive, get_mp_times
    implicit none
    type (redist_type), intent (in out) :: r

    real, dimension (r%from_low(1):), intent (in) :: from_here

    real, dimension (r%to_low(1):,r%to_low(2):), intent (in out) :: to_here

    integer :: i, idp, ipto, ipfrom, iadp
    real :: mp_total, mp_total_after

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total)
    end if

    ! redistribute from local processor to local processor
    do i = 1, r%from(iproc)%nn
       to_here(r%to(iproc)%k(i), r%to(iproc)%l(i)) &
            = from_here(r%from(iproc)%k(i))
    end do

    ! redistribute to idpth next processor from idpth preceding processor
    ! or redistribute from idpth preceding processor to idpth next processor
    ! to avoid deadlocks
    do idp = 1, nproc-1
       ipto = mod(iproc + idp, nproc)
       ipfrom = mod(iproc + nproc - idp, nproc)
       iadp = min(idp, nproc - idp)
       ! avoid deadlock AND ensure mostly parallel resolution
       if (mod(iproc/iadp,2) == 0) then

          ! send to idpth next processor
          if (r%from(ipto)%nn > 0) then
             do i = 1, r%from(ipto)%nn
                r%real_buff(i) = from_here(r%from(ipto)%k(i)) 
             end do
             call send (r%real_buff(1:r%from(ipto)%nn), ipto, idp)
          end if

          ! receive from idpth preceding processor
          if (r%to(ipfrom)%nn > 0) then
             call receive (r%real_buff(1:r%to(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%to(ipfrom)%nn
                to_here(r%to(ipfrom)%k(i), &
                        r%to(ipfrom)%l(i)) &
                        = r%real_buff(i)
             end do
          end if
       else
          ! receive from idpth preceding processor
          if (r%to(ipfrom)%nn > 0) then
             call receive (r%real_buff(1:r%to(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%to(ipfrom)%nn
                to_here(r%to(ipfrom)%k(i), &
                        r%to(ipfrom)%l(i)) &
                        = r%real_buff(i)
             end do
          end if

          ! send to idpth next processor
          if (r%from(ipto)%nn > 0) then
             do i = 1, r%from(ipto)%nn
                r%real_buff(i) = from_here(r%from(ipto)%k(i))
             end do
             call send (r%real_buff(1:r%from(ipto)%nn), ipto, idp)
          end if
       end if
    end do

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total_after)
       time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total)
    end if

  end subroutine r_redist_12

  !> FIXME : Add documentation  
  subroutine r_redist_22 (r, from_here, to_here)
    use job_manage, only: time_message
    use mp, only: iproc, nproc, send, receive, get_mp_times
    implicit none
    type (redist_type), intent (in out) :: r

    real, dimension (r%from_low(1):, &
                     r%from_low(2):), intent (in) :: from_here

    real, dimension (r%to_low(1):, &
                     r%to_low(2):), intent (in out) :: to_here

    integer :: i, idp, ipto, ipfrom, iadp
    real :: mp_total, mp_total_after

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total)
    end if

    ! redistribute from local processor to local processor
    do i = 1, r%from(iproc)%nn
       to_here(r%to(iproc)%k(i),&
               r%to(iproc)%l(i)) &
               = from_here(r%from(iproc)%k(i), &
                           r%from(iproc)%l(i))
    end do

    ! redistribute to idpth next processor from idpth preceding processor
    ! or redistribute from idpth preceding processor to idpth next processor
    ! to avoid deadlocks
    do idp = 1, nproc-1
       ipto = mod(iproc + idp, nproc)
       ipfrom = mod(iproc + nproc - idp, nproc)
       iadp = min(idp, nproc - idp)
       ! avoid deadlock AND ensure mostly parallel resolution
       if (mod(iproc/iadp,2) == 0) then

          ! send to idpth next processor
          if (r%from(ipto)%nn > 0) then
             do i = 1, r%from(ipto)%nn
                r%real_buff(i) = from_here(r%from(ipto)%k(i), &
                                              r%from(ipto)%l(i))
             end do
             call send (r%real_buff(1:r%from(ipto)%nn), ipto, idp)
          end if

          ! receive from idpth preceding processor
          if (r%to(ipfrom)%nn > 0) then
             call receive (r%real_buff(1:r%to(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%to(ipfrom)%nn
                to_here(r%to(ipfrom)%k(i), &
                        r%to(ipfrom)%l(i)) &
                        = r%real_buff(i)
             end do
          end if
       else
          ! receive from idpth preceding processor
          if (r%to(ipfrom)%nn > 0) then
             call receive (r%real_buff(1:r%to(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%to(ipfrom)%nn
                to_here(r%to(ipfrom)%k(i), &
                        r%to(ipfrom)%l(i)) &
                        = r%real_buff(i)
             end do
          end if

          ! send to idpth next processor
          if (r%from(ipto)%nn > 0) then
             do i = 1, r%from(ipto)%nn
                r%real_buff(i) = from_here(r%from(ipto)%k(i), &
                                              r%from(ipto)%l(i))
             end do
             call send (r%real_buff(1:r%from(ipto)%nn), ipto, idp)
          end if
       end if
    end do

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total_after)
       time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total)
    end if

  end subroutine r_redist_22

  !> FIXME : Add documentation  
  subroutine r_redist_22_inv (r, from_here, to_here)
    use job_manage, only: time_message
    use mp, only: iproc, nproc, send, receive, get_mp_times
    implicit none
    type (redist_type), intent (in out) :: r

    real, dimension (r%to_low(1):, &
                     r%to_low(2):), intent (in) :: from_here

    real, dimension (r%from_low(1):, &
                        r%from_low(2):), intent (in out) :: to_here

    integer :: i, idp, ipto, ipfrom, iadp
    real :: mp_total, mp_total_after
    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total)
    end if

    ! redistribute from local processor to local processor
    do i = 1, r%to(iproc)%nn
       to_here(r%from(iproc)%k(i), &
               r%from(iproc)%l(i)) &
               = from_here(r%to(iproc)%k(i), &
                           r%to(iproc)%l(i))
    end do

    ! redistribute to idpth next processor from idpth preceding processor
    ! or redistribute from idpth preceding processor to idpth next processor
    ! to avoid deadlocks
    do idp = 1, nproc-1
       ipto = mod(iproc + idp, nproc)
       ipfrom = mod(iproc + nproc - idp, nproc)
       iadp = min(idp, nproc - idp)
       ! avoid deadlock AND ensure mostly parallel resolution
       if (mod(iproc/iadp,2) == 0) then

          ! send to idpth next processor
          if (r%to(ipto)%nn > 0) then
             do i = 1, r%to(ipto)%nn
                r%real_buff(i) = from_here(r%to(ipto)%k(i), &
                                              r%to(ipto)%l(i))
             end do
             call send (r%real_buff(1:r%to(ipto)%nn), ipto, idp)
          end if

          ! receive from idpth preceding processor
          if (r%from(ipfrom)%nn > 0) then
             call receive (r%real_buff(1:r%from(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%from(ipfrom)%nn
                to_here(r%from(ipfrom)%k(i), &
                        r%from(ipfrom)%l(i)) &
                        = r%real_buff(i)
             end do
          end if
       else
          ! receive from idpth preceding processor
          if (r%from(ipfrom)%nn > 0) then
             call receive (r%real_buff(1:r%from(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%from(ipfrom)%nn
                to_here(r%from(ipfrom)%k(i), &
                        r%from(ipfrom)%l(i)) &
                        = r%real_buff(i)
             end do
          end if

          ! send to idpth next processor
          if (r%to(ipto)%nn > 0) then
             do i = 1, r%to(ipto)%nn
                r%real_buff(i) = from_here(r%to(ipto)%k(i), &
                                              r%to(ipto)%l(i))
             end do
             call send (r%real_buff(1:r%to(ipto)%nn), ipto, idp)
          end if

       end if
    end do

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total_after)
       time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total)
    end if

  end subroutine r_redist_22_inv

  !> FIXME : Add documentation
  subroutine r_redist_32 (r, from_here, to_here)
    use job_manage, only: time_message
    use mp, only: iproc, nproc, send, receive, get_mp_times
    implicit none
    type (redist_type), intent (in out) :: r

    real, dimension (r%from_low(1):, &
                        r%from_low(2):, &
                        r%from_low(3):), intent (in) :: from_here

    real, dimension (r%to_low(1):, &
                     r%to_low(2):), intent (in out) :: to_here

    integer :: i, idp, ipto, ipfrom, iadp
    real :: mp_total, mp_total_after

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total)
    end if

    ! redistribute from local processor to local processor
    do i = 1, r%from(iproc)%nn
       to_here(r%to(iproc)%k(i),&
               r%to(iproc)%l(i)) &
               = from_here(r%from(iproc)%k(i), &
                           r%from(iproc)%l(i), &
                           r%from(iproc)%m(i))
    end do

    ! redistribute to idpth next processor from idpth preceding processor
    ! or redistribute from idpth preceding processor to idpth next processor
    ! to avoid deadlocks
    do idp = 1, nproc-1
       ipto = mod(iproc + idp, nproc)
       ipfrom = mod(iproc + nproc - idp, nproc)
       iadp = min(idp, nproc - idp)
       ! avoid deadlock AND ensure mostly parallel resolution
       if (mod(iproc/iadp,2) == 0) then

          ! send to idpth next processor
          if (r%from(ipto)%nn > 0) then
             do i = 1, r%from(ipto)%nn
                r%real_buff(i) = from_here(r%from(ipto)%k(i), &
                                              r%from(ipto)%l(i), &
                                              r%from(ipto)%m(i))
             end do
             call send (r%real_buff(1:r%from(ipto)%nn), ipto, idp)
          end if

          ! receive from idpth preceding processor
          if (r%to(ipfrom)%nn > 0) then
             call receive (r%real_buff(1:r%to(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%to(ipfrom)%nn
                to_here(r%to(ipfrom)%k(i), &
                        r%to(ipfrom)%l(i)) &
                        = r%real_buff(i)
             end do
          end if
       else
          ! receive from idpth preceding processor
          if (r%to(ipfrom)%nn > 0) then
             call receive (r%real_buff(1:r%to(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%to(ipfrom)%nn
                to_here(r%to(ipfrom)%k(i), &
                        r%to(ipfrom)%l(i)) &
                        = r%real_buff(i)
             end do
          end if

          ! send to idpth next processor
          if (r%from(ipto)%nn > 0) then
             do i = 1, r%from(ipto)%nn
                r%real_buff(i) = from_here(r%from(ipto)%k(i), &
                                              r%from(ipto)%l(i), &
                                              r%from(ipto)%m(i))
             end do
             call send (r%real_buff(1:r%from(ipto)%nn), ipto, idp)
          end if
       end if
    end do

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total_after)
       time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total)
    end if

  end subroutine r_redist_32

  !> FIXME : Add documentation  
  subroutine r_redist_32_inv (r, from_here, to_here)
    use job_manage, only: time_message
    use mp, only: iproc, nproc, send, receive, get_mp_times
    implicit none
    type (redist_type), intent (in out) :: r

    real, dimension (r%to_low(1):, &
                     r%to_low(2):), intent (in) :: from_here

    real, dimension (r%from_low(1):, &
                     r%from_low(2):, &
                     r%from_low(3):), intent (in out) :: to_here

    integer :: i, idp, ipto, ipfrom, iadp
    real :: mp_total, mp_total_after
    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total)
    end if

    ! redistribute from local processor to local processor
    do i = 1, r%to(iproc)%nn
       to_here(r%from(iproc)%k(i), &
               r%from(iproc)%l(i), &
               r%from(iproc)%m(i)) &
               = from_here(r%to(iproc)%k(i), &
                           r%to(iproc)%l(i))
    end do

    ! redistribute to idpth next processor from idpth preceding processor
    ! or redistribute from idpth preceding processor to idpth next processor
    ! to avoid deadlocks
    do idp = 1, nproc-1
       ipto = mod(iproc + idp, nproc)
       ipfrom = mod(iproc + nproc - idp, nproc)
       iadp = min(idp, nproc - idp)
       ! avoid deadlock AND ensure mostly parallel resolution
       if (mod(iproc/iadp,2) == 0) then

          ! send to idpth next processor
          if (r%to(ipto)%nn > 0) then
             do i = 1, r%to(ipto)%nn
                r%real_buff(i) = from_here(r%to(ipto)%k(i), &
                                              r%to(ipto)%l(i))
             end do
             call send (r%real_buff(1:r%to(ipto)%nn), ipto, idp)
          end if

          ! receive from idpth preceding processor
          if (r%from(ipfrom)%nn > 0) then
             call receive (r%real_buff(1:r%from(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%from(ipfrom)%nn
                to_here(r%from(ipfrom)%k(i), &
                        r%from(ipfrom)%l(i), &
                        r%from(ipfrom)%m(i)) &
                        = r%real_buff(i)
             end do
          end if
       else
          ! receive from idpth preceding processor
          if (r%from(ipfrom)%nn > 0) then
             call receive (r%real_buff(1:r%from(ipfrom)%nn), ipfrom, idp)
             do i = 1, r%from(ipfrom)%nn
                to_here(r%from(ipfrom)%k(i), &
                        r%from(ipfrom)%l(i), &
                        r%from(ipfrom)%m(i)) &
                        = r%real_buff(i)
             end do
          end if

          ! send to idpth next processor
          if (r%to(ipto)%nn > 0) then
             do i = 1, r%to(ipto)%nn
                r%real_buff(i) = from_here(r%to(ipto)%k(i), &
                                              r%to(ipto)%l(i))
             end do
             call send (r%real_buff(1:r%to(ipto)%nn), ipto, idp)
          end if

       end if
    end do

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total_after)
       time_redist_mpi = time_redist_mpi + (mp_total_after - mp_total)
    end if

  end subroutine r_redist_32_inv

  !> FIXME : Add documentation
  subroutine r_redist_42 (r, from_here, to_here)
    use job_manage, only: time_message
    use mp, only: iproc, nproc, send, receive, get_mp_times
    implicit none
    type (redist_type), intent (in out) :: r

    real, dimension (r%from_low(1):, &
                        r%from_low(2):, &
                        r%from_low(3):, &
                        r%from_low(4):), intent (in) :: from_here

    real, dimension (r%to_low(1):, &
                     r%to_low(2):), intent (in out) :: to_here

    integer :: i, idp, ipto, ipfrom, iadp
    real :: mp_total, mp_total_after

    if (.not. using_measure_scatter) then
       call time_message(.false.,time_redist,' Redistribute')
       call get_mp_times(total_time = mp_total)
    end if

    ! redistribute from local processor to local processor
    do i = 1, r%from(iproc)%nn
       to_here(r%to(iproc)%k(i),&
               r%to(iproc)%l(i)) &
               = from_here(r%from(iproc)%k(i), &
                           r%from(iproc)%l(i), &
                           r%from(iproc)%m(i), &
                           r%from(iproc)%n(i))
    end do

    ! redistribute to idpth next processor from idpth preceding processor
    ! or redistribute from idpth preceding processor to idpth next processor
    ! to avoid deadlocks
    do idp = 1, nproc-1
       ipto = mod(iproc + idp, nproc)
       ipfrom = mod(iproc + nproc - idp, nproc)
       iadp = min(idp, nproc - idp)
       </