about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Thomas White <taw@physics.org> 2021-03-04 09:14:23 +0100
committer Thomas White <taw@physics.org> 2021-03-04 12:21:36 +0100
commit bc8ab2fc8af70ecd58120dd30652c5cc7a7d190a (patch)
tree 9fbdd1294eb97127edff4c906d00ddfc26320b55
parent 65a1afdce2da5cd92f6907f6e517b9ec4280cdd5 (diff)
SLURM: Implement indexing progress monitoring for job arrays
-rw-r--r-- src/gui_backend_slurm.c 109
1 files changed, 67 insertions, 42 deletions
diff --git a/src/gui_backend_slurm.c b/src/gui_backend_slurm.c
index 175a3688..24588c13 100644
--- a/src/gui_backend_slurm.c
+++ b/src/gui_backend_slurm.c
@@ -86,88 +86,113 @@ struct slurm_job
};
-static int job_running(uint32_t job_id)
+static int job_alive(slurm_job_info_t *job)
{
- job_info_msg_t *job_info;
- int running = 1;
-
- if ( slurm_load_job(&job_info, job_id, 0) ) {
- STATUS("Couldn't get status: %i\n",
- slurm_strerror(slurm_get_errno()));
- running = 0;
- /* FIXME: Distinguish error cond from job complete */
- }
-
- switch ( job_info->job_array[0].job_state & JOB_STATE_BASE ) {
+ switch ( job->job_state & JOB_STATE_BASE ) {
/* Only the following states are reasons to keep on watching
* the job */
case JOB_PENDING :
case JOB_RUNNING :
case JOB_SUSPENDED :
- running = 1;
- break;
+ return 1;
default :
- running = 0;
- break;
+ return 0;
}
+}
- slurm_free_job_info_msg(job_info);
+static int job_running(uint32_t job_id)
+{
+ job_info_msg_t *job_info;
+ int running = 1;
+
+ if ( slurm_load_job(&job_info, job_id, 0) ) {
+ STATUS("Couldn't get status: %i\n",
+ slurm_strerror(slurm_get_errno()));
+ return 0;
+ }
+
+ running = job_alive(&job_info->job_array[0]);
+ slurm_free_job_info_msg(job_info);
return running;
}
static double indexing_progress(struct slurm_job *job, int *running)
{
-#if 0
- if ( job->n_blocks > 15 ) {
+ job_info_msg_t *array_job_info;
+ int i;
+ int n_running;
+ int lowest_alive_task;
- /* Fast path for larger number of sub-jobs */
+ if ( slurm_load_job(&array_job_info, job->job_id, 0) ) {
+ STATUS("Couldn't get status: %i\n",
+ slurm_strerror(slurm_get_errno()));
+ *running = 0;
+ return 0.0;
+ }
- int i;
- int n_running = 0;
+ n_running = 0;
+ lowest_alive_task = job->n_blocks;
+ for ( i=0; i<array_job_info->record_count; i++ ) {
- for ( i=0; i<job->n_blocks; i++ ) {
+ slurm_job_info_t *job_info = &array_job_info->job_array[i];
- if ( job->job_ids[i] == 0 ) continue;
+ /* Ignore the array job itself */
+ if ( job_info->array_job_id == 0 ) continue;
- if ( !job_running(job->job_ids[i]) ) {
- job->job_ids[i] = 0;
- } else {
- n_running++;
+ if ( job_alive(job_info) ) {
+ if ( job_info->array_task_id < lowest_alive_task ) {
+ lowest_alive_task = job_info->array_task_id;
}
+ n_running++;
}
+ }
+ slurm_free_job_info_msg(array_job_info);
- *running = (n_running > 0);
- return (double)(job->n_blocks - n_running) / job->n_blocks;
+ *running = (n_running > 0);
- } else {
+ /* If there are lots of blocks, just count running jobs instead of
+ * reading loads of log files */
+ if ( job->n_blocks > 15 ) {
- /* Slow path - higher accuracy for smaller number of sub-jobs */
+ /* Didn't find any alive jobs at all?
+ * Then we've either just started or just finished. */
+ if ( lowest_alive_task == job->n_blocks ) {
+ if ( n_running > 0 ) {
+ return 0.0;
+ } else {
+ return 1.0;
+ }
+ } else {
+ return (double)lowest_alive_task / job->n_blocks;
+ }
+
+ } else {
int i;
int n_proc = 0;
- *running = 0;
for ( i=0; i<job->n_blocks; i++ ) {
- n_proc += read_number_processed(job->stderr_filenames[i]);
+ char tmp[128];
+ GFile *stderr_gfile;
+ char *stderr_filename;
- if ( job->job_ids[i] == 0 ) continue;
+ snprintf(tmp, 127, "stderr-%i.log", i);
+ stderr_gfile = g_file_get_child(job->workdir, tmp);
+ stderr_filename = g_file_get_path(stderr_gfile);
+ g_object_unref(stderr_gfile);
+
+ n_proc += read_number_processed(stderr_filename);
+ g_free(stderr_filename);
- if ( !job_running(job->job_ids[i]) ) {
- job->job_ids[i] = 0;
- } else {
- *running = 1;
- }
}
return (double)n_proc / job->n_frames;
}
-#endif
- return 0.5;
}