job_manage.fpp Source File


Contents

Source Code


Source Code

!> FIXME : Add documentation
module job_manage

  implicit none

  private

  public :: timer_local, time_message, init_checktime, checktime, checkstop, njobs, job_fork

  integer :: njobs
  logical, parameter :: debug = .false.
  real :: wall_clock_initial_time = 0.
  logical :: checktime_initialized = .false.

contains

  !> Returns CPU time in seconds. Currently just a thin wrapper to
  !> method of the same name in mp. In the future we probably want to
  !> remove the method here and change the relevant use statements.
  real function timer_local()
    use mp, only: timer_local_mp => timer_local
    implicit none
    timer_local = timer_local_mp()
  end function timer_local

  !> This routine counts elapsed time between two calls. Currently
  !> just a thin wrapper to method of the same name in mp. In the
  !> future we probably want to remove the method here and change the
  !> relevant use statements.
  subroutine time_message(lprint,targ,chmessage)
    use mp, only: time_message_mp => time_message
    implicit none
    character (len=*), intent(in) :: chmessage
    logical, intent(in) :: lprint
    real, intent(in out) :: targ(2) ! tsum and told
    call time_message_mp(lprint, targ, chmessage)
  end subroutine time_message

  !> FIXME : Add documentation
  subroutine job_fork (n_ensembles)
    use file_utils, only: list_name, run_name, init_job_name
    use file_utils, only: init_error_unit, init_input_unit, list_name
    use file_utils, only: futils_initialized => initialized
    use constants, only: run_name_size
    use mp, only: job, proc0, nproc, init_jobs, broadcast, finish_mp
    implicit none
    integer, intent (in), optional :: n_ensembles
    integer, dimension(:), allocatable :: group0
    integer :: i, l, list_unit, ierr
    character(run_name_size), dimension(:), allocatable :: job_list
    logical :: list_mode
    list_mode = .not. present(n_ensembles)

    ! open file containing list of input files to run and read total
    ! number of input files from first line
    if (list_mode) then
       if (proc0) then
          open (newunit = list_unit, file = trim(list_name))
          read (list_unit, *) njobs
       end if
    else
       njobs = n_ensembles
    end if
    call broadcast (njobs)

    if (nproc < njobs) then
       if (proc0) then
          write (*,*) 
          write (*,*) 'Number of jobs = ',njobs,' and number of processors = ',nproc
          write (*,*) 'Number of processors must not be less than the number of jobs'
          write (*,*) 'Stopping'
          write (*,*) 
       end if
       call finish_mp !Ok as all procs call this routine
       stop 
    end if
       
    if (mod(nproc, njobs) /= 0) then
       if (proc0) then
          write (*,*) 
          write (*,*) 'Number of jobs = ',njobs,' and number of processors = ',nproc
          write (*,*) 'Number of jobs must evenly divide the number of processors.'
          write (*,*) 'Stopping'
          write (*,*) 
       end if
       call finish_mp !Ok as all procs call this routine
       stop
    end if
       
    allocate (job_list(0:njobs-1))
       
    if (proc0) then
       if (list_mode) then
          do i = 0, njobs - 1
             read (list_unit, fmt="(a)") job_list(i)
          end do
          close (list_unit)
       else
          l = len_trim(list_name)
          do i = 0, njobs - 1
             write(job_list(i),'(A,"_",I0)') list_name(1:l-3), i + 1
          end do
       end if
    end if
    
    call broadcast(job_list)
    
    allocate (group0(0:njobs-1))
    
    call init_jobs (njobs, group0, ierr)
    call init_job_name (job_list(job))
    
    if (proc0) then
       call init_error_unit(.true.)
       call init_input_unit(.true.)
    end if

    if (nproc > 1 .and. proc0) &
         & write(*,*) 'Job ',job,' is called ',trim(run_name),&
         & ' and is running on ',nproc,' processors'
    if (nproc == 1) write(*,*) 'Job ',job,' is called ',trim(run_name),&
         & ' and is running on ',nproc,' processor'
    
    deallocate (group0, job_list)

    !> We need to set this for the group proc0, as it is previously
    !> only set for the global proc0
    if (proc0) futils_initialized = .true.
  end subroutine job_fork

  !> Check whether the stop file <run_name>.stop is present, and if so trigger exit.
  subroutine checkstop(exit, exit_reason, list)
    use mp, only: proc0, broadcast
    use file_utils, only: run_name, list_name
    use constants, only: run_name_size
    use exit_codes, only: EXIT_STOP_FILE, exit_code
    use optionals, only: get_option_with_default
    implicit none
    logical, intent (in), optional :: list
    logical, intent (in out) :: exit
    type(exit_code), intent (in out) :: exit_reason
    character(run_name_size) :: filename
    logical :: exit_local, is_list

    ! If .stop file has appeared, set exit flag
    if (proc0) then
       is_list = get_option_with_default(list, .false.)
       if (is_list) then
          filename = list_name(:len_trim(list_name)-5)//".stop"
       else
          filename = trim(run_name)//".stop"
       end if

       inquire(file = filename, exist = exit_local)
       if (exit_local) then
          exit_reason = EXIT_STOP_FILE
          call exit_reason%write_exit_file()
       end if
       exit = exit .or. exit_local
    end if

    call broadcast (exit)
  end subroutine checkstop

  !> FIXME : Add documentation  
  subroutine init_checktime
    if (checktime_initialized) return
    wall_clock_initial_time = timer_local()
    checktime_initialized = .true.
  end subroutine init_checktime

  !> Check whether elapsed run time is within a 5 minutes of exceeding the available CPU
  !> time, and if so trigger the code exit.
  subroutine checktime(avail_time, exit, margin_in)
    use mp, only: proc0, broadcast
    use file_utils, only: error_unit
    use exit_codes, only: EXIT_OUT_OF_TIME
    use optionals, only: get_option_with_default
    implicit none
    ! available time in second
    real, intent(in) :: avail_time
    ! margin
    real, intent(in), optional :: margin_in
    ! true if elapse time exceed available time
    logical, intent(in out) :: exit
    real :: elapse_time
    real, save :: margin = 300. ! 5 minutes
    margin = get_option_with_default(margin_in, margin)

    if (.not. checktime_initialized) then
       call init_checktime
    else
      elapse_time = timer_local() - wall_clock_initial_time

      if (proc0) then
         if (elapse_time >= avail_time - margin) then
            write(error_unit(),'(a,f12.4,a,f12.4)') &
                 & 'Elapse time ',elapse_time, &
                 & ' exceeds available time',avail_time-margin
            write(error_unit(),'(a,f12.4,a,f12.4,a)') &
                 & '  (Given CPU time: ',avail_time, &
                 & '  Margin: ',margin,')'
            exit = .true.
            call EXIT_OUT_OF_TIME%write_exit_file()
         end if
      end if
    end if

    call broadcast(exit)
  end subroutine checktime
end module job_manage