diff options
author | Thomas White <taw@physics.org> | 2021-03-04 09:14:23 +0100 |
---|---|---|
committer | Thomas White <taw@physics.org> | 2021-03-04 12:21:36 +0100 |
commit | bc8ab2fc8af70ecd58120dd30652c5cc7a7d190a (patch) | |
tree | 9fbdd1294eb97127edff4c906d00ddfc26320b55 /src | |
parent | 65a1afdce2da5cd92f6907f6e517b9ec4280cdd5 (diff) |
SLURM: Implement indexing progress monitoring for job arrays
Diffstat (limited to 'src')
-rw-r--r-- | src/gui_backend_slurm.c | 109 |
1 files changed, 67 insertions, 42 deletions
```diff
diff --git a/src/gui_backend_slurm.c b/src/gui_backend_slurm.c
index 175a3688..24588c13 100644
--- a/src/gui_backend_slurm.c
+++ b/src/gui_backend_slurm.c
@@ -86,88 +86,113 @@ struct slurm_job
 };
 
 
-static int job_running(uint32_t job_id)
+static int job_alive(slurm_job_info_t *job)
 {
-    job_info_msg_t *job_info;
-    int running = 1;
-
-    if ( slurm_load_job(&job_info, job_id, 0) ) {
-        STATUS("Couldn't get status: %i\n",
-               slurm_strerror(slurm_get_errno()));
-        running = 0;
-        /* FIXME: Distinguish error cond from job complete */
-    }
-
-    switch ( job_info->job_array[0].job_state & JOB_STATE_BASE ) {
+    switch ( job->job_state & JOB_STATE_BASE ) {
 
         /* Only the following states are reasons to keep on watching
          * the job */
         case JOB_PENDING :
         case JOB_RUNNING :
         case JOB_SUSPENDED :
-        running = 1;
-        break;
+        return 1;
 
         default :
-        running = 0;
-        break;
+        return 0;
     }
+}
 
-    slurm_free_job_info_msg(job_info);
 
+static int job_running(uint32_t job_id)
+{
+    job_info_msg_t *job_info;
+    int running = 1;
+
+    if ( slurm_load_job(&job_info, job_id, 0) ) {
+        STATUS("Couldn't get status: %i\n",
+               slurm_strerror(slurm_get_errno()));
+        return 0;
+    }
+
+    running = job_alive(&job_info->job_array[0]);
+    slurm_free_job_info_msg(job_info);
     return running;
 }
 
 
 static double indexing_progress(struct slurm_job *job, int *running)
 {
-#if 0
-    if ( job->n_blocks > 15 ) {
+    job_info_msg_t *array_job_info;
+    int i;
+    int n_running;
+    int lowest_alive_task;
 
-        /* Fast path for larger number of sub-jobs */
+    if ( slurm_load_job(&array_job_info, job->job_id, 0) ) {
+        STATUS("Couldn't get status: %i\n",
+               slurm_strerror(slurm_get_errno()));
+        *running = 0;
+        return 0.0;
+    }
 
-        int i;
-        int n_running = 0;
+    n_running = 0;
+    lowest_alive_task = job->n_blocks;
+    for ( i=0; i<array_job_info->record_count; i++ ) {
 
-        for ( i=0; i<job->n_blocks; i++ ) {
+        slurm_job_info_t *job_info = &array_job_info->job_array[i];
 
-            if ( job->job_ids[i] == 0 ) continue;
+        /* Ignore the array job itself */
+        if ( job_info->array_job_id == 0 ) continue;
 
-            if ( !job_running(job->job_ids[i]) ) {
-                job->job_ids[i] = 0;
-            } else {
-                n_running++;
+        if ( job_alive(job_info) ) {
+            if ( job_info->array_task_id < lowest_alive_task ) {
+                lowest_alive_task = job_info->array_task_id;
             }
+            n_running++;
         }
+    }
+    slurm_free_job_info_msg(array_job_info);
 
-        *running = (n_running > 0);
-        return (double)(job->n_blocks - n_running) / job->n_blocks;
+    *running = (n_running > 0);
 
-    } else {
+    /* If there are lots of blocks, just count running jobs instead of
+     * reading loads of log files */
+    if ( job->n_blocks > 15 ) {
 
-        /* Slow path - higher accuracy for smaller number of sub-jobs */
+        /* Didn't find any alive jobs at all?
+         * Then we've either just started or just finished. */
+        if ( lowest_alive_task == job->n_blocks ) {
+            if ( n_running > 0 ) {
+                return 0.0;
+            } else {
+                return 1.0;
+            }
+        } else {
+            return (double)lowest_alive_task / job->n_blocks;
+        }
+
+    } else {
 
         int i;
         int n_proc = 0;
 
-        *running = 0;
         for ( i=0; i<job->n_blocks; i++ ) {
 
-            n_proc += read_number_processed(job->stderr_filenames[i]);
+            char tmp[128];
+            GFile *stderr_gfile;
+            char *stderr_filename;
 
-            if ( job->job_ids[i] == 0 ) continue;
+            snprintf(tmp, 127, "stderr-%i.log", i);
+            stderr_gfile = g_file_get_child(job->workdir, tmp);
+            stderr_filename = g_file_get_path(stderr_gfile);
+            g_object_unref(stderr_gfile);
+
+            n_proc += read_number_processed(stderr_filename);
+            g_free(stderr_filename);
 
-            if ( !job_running(job->job_ids[i]) ) {
-                job->job_ids[i] = 0;
-            } else {
-                *running = 1;
-            }
         }
 
         return (double)n_proc / job->n_frames;
     }
-#endif
-    return 0.5;
 }
 
 
```