subroutine calculate_kinetic_energy_transfer
use theta_grid, only: ntgrid
use kt_grids, only: ntheta0
use mp, only: iproc, nproc, sum_reduce
integer :: theta_index
real, dimension(:, :, :), allocatable :: T_result_3D
allocate (T_result_3D(nkynew, ntheta0, ntheta0))
T_result_theta = 0.
T_result_theta_kxsource = 0.
T_result_theta_kxtarget = 0.
T_result_theta_kysource = 0.
do theta_index = -ntgrid, ntgrid
if (.not. is_responsible_for_this_theta_grid_index(&
iproc, theta_index, nproc, ntgrid)) cycle
call calculate_kinetic_energy_transfer_this_theta(theta_index, T_result_3D)
! Should these sums really use volume average or similar?
! Should we just write T_result_3D for each theta? Might get large!
! sum over (kxs, kys, kxt). shape (kys, kxs, kxt) -> scalar
T_result_theta(theta_index) = sum(T_result_3D)
! sum over (kxt, kys) for kyt = 0. shape (kys, kxs, kxt) -> (kxs)
T_result_theta_kxsource(:,theta_index) = sum(sum(T_result_3D, dim=1), dim=2)
! sum over (kxs, kys) for kyt = 0. shape (kys, kxs, kxt) -> (kxt)
T_result_theta_kxtarget(:,theta_index) = sum(sum(T_result_3D, dim=1), dim=1)
! sum over (kxs, kxt) for kyt = 0. shape (kys, kxs, kxt) -> (kys)
T_result_theta_kysource(:,theta_index) = sum(sum(T_result_3D, dim=2), dim=2)
end do
! only proc0 needs to know the result
if (distribute_work) then
call sum_reduce (T_result_theta, 0)
call sum_reduce (T_result_theta_kxsource, 0)
call sum_reduce (T_result_theta_kxtarget, 0)
call sum_reduce (T_result_theta_kysource, 0)
end if
end subroutine calculate_kinetic_energy_transfer