SLURM: Implement indexing progress monitoring for job arrays

author: Thomas White <taw@physics.org> 2021-03-04 09:14:23 +0100
committer: Thomas White <taw@physics.org> 2021-03-04 12:21:36 +0100
commit: bc8ab2fc8af70ecd58120dd30652c5cc7a7d190a (patch)
tree: 9fbdd1294eb97127edff4c906d00ddfc26320b55
parent: 65a1afdce2da5cd92f6907f6e517b9ec4280cdd5 (diff)
1 file changed, 67 insertions, 42 deletions
diff --git a/src/gui_backend_slurm.c b/src/gui_backend_slurm.c
index 175a3688..24588c13 100644
--- a/src/gui_backend_slurm.c
+++ b/src/gui_backend_slurm.c
@@ -86,88 +86,113 @@ struct slurm_job
 };
 
 
-static int job_running(uint32_t job_id)
+static int job_alive(slurm_job_info_t *job)
 {
-	job_info_msg_t *job_info;
-	int running = 1;
-
-	if ( slurm_load_job(&job_info, job_id, 0) ) {
-		STATUS("Couldn't get status: %i\n",
-		       slurm_strerror(slurm_get_errno()));
-		running = 0;
-		/* FIXME: Distinguish error cond from job complete */
-	}
-
-	switch ( job_info->job_array[0].job_state & JOB_STATE_BASE ) {
+	switch ( job->job_state & JOB_STATE_BASE ) {
 
 		/* Only the following states are reasons to keep on watching
 		 * the job */
 		case JOB_PENDING :
 		case JOB_RUNNING :
 		case JOB_SUSPENDED :
-		running = 1;
-		break;
+		return 1;
 
 		default :
-		running = 0;
-		break;
+		return 0;
 	}
+}
 
-	slurm_free_job_info_msg(job_info);
 
+static int job_running(uint32_t job_id)
+{
+	job_info_msg_t *job_info;
+	int running = 1;
+
+	if ( slurm_load_job(&job_info, job_id, 0) ) {
+		STATUS("Couldn't get status: %i\n",
+		       slurm_strerror(slurm_get_errno()));
+		return 0;
+	}
+
+	running = job_alive(&job_info->job_array[0]);
+	slurm_free_job_info_msg(job_info);
 	return running;
 }
 
 
 static double indexing_progress(struct slurm_job *job, int *running)
 {
-#if 0
-	if ( job->n_blocks > 15 ) {
+	job_info_msg_t *array_job_info;
+	int i;
+	int n_running;
+	int lowest_alive_task;
 
-		/* Fast path for larger number of sub-jobs */
+	if ( slurm_load_job(&array_job_info, job->job_id, 0) ) {
+		STATUS("Couldn't get status: %i\n",
+		       slurm_strerror(slurm_get_errno()));
+		*running = 0;
+		return 0.0;
+	}
 
-		int i;
-		int n_running = 0;
+	n_running = 0;
+	lowest_alive_task = job->n_blocks;
+	for ( i=0; i<array_job_info->record_count; i++ ) {
 
-		for ( i=0; i<job->n_blocks; i++ ) {
+		slurm_job_info_t *job_info = &array_job_info->job_array[i];
 
-			if ( job->job_ids[i] == 0 ) continue;
+		/* Ignore the array job itself */
+		if ( job_info->array_job_id == 0 ) continue;
 
-			if ( !job_running(job->job_ids[i]) ) {
-				job->job_ids[i] = 0;
-			} else {
-				n_running++;
+		if ( job_alive(job_info) ) {
+			if ( job_info->array_task_id < lowest_alive_task ) {
+				lowest_alive_task = job_info->array_task_id;
 			}
+			n_running++;
 		}
+	}
+	slurm_free_job_info_msg(array_job_info);
 
-		*running = (n_running > 0);
-		return (double)(job->n_blocks - n_running) / job->n_blocks;
+	*running = (n_running > 0);
 
-	} else {
+	/* If there are lots of blocks, just count running jobs instead of
+	 * reading loads of log files */
+	if ( job->n_blocks > 15 ) {
 
-		/* Slow path - higher accuracy for smaller number of sub-jobs */
+		/* Didn't find any alive jobs at all?
+		 * Then we've either just started or just finished. */
+		if ( lowest_alive_task == job->n_blocks ) {
+			if ( n_running > 0 ) {
+				return 0.0;
+			} else {
+				return 1.0;
+			}
+		} else {
+			return (double)lowest_alive_task / job->n_blocks;
+		}
+
+	} else {
 
 		int i;
 		int n_proc = 0;
 
-		*running = 0;
 		for ( i=0; i<job->n_blocks; i++ ) {
 
-			n_proc += read_number_processed(job->stderr_filenames[i]);
+			char tmp[128];
+			GFile *stderr_gfile;
+			char *stderr_filename;
 
-			if ( job->job_ids[i] == 0 ) continue;
+			snprintf(tmp, 127, "stderr-%i.log", i);
+			stderr_gfile = g_file_get_child(job->workdir, tmp);
+			stderr_filename = g_file_get_path(stderr_gfile);
+			g_object_unref(stderr_gfile);
+
+			n_proc += read_number_processed(stderr_filename);
+			g_free(stderr_filename);
 
-			if ( !job_running(job->job_ids[i]) ) {
-				job->job_ids[i] = 0;
-			} else {
-				*running = 1;
-			}
 		}
 
 		return (double)n_proc / job->n_frames;
 	}
-#endif
-	return 0.5;
 }
author	Thomas White <taw@physics.org>	2021-03-04 09:14:23 +0100
committer	Thomas White <taw@physics.org>	2021-03-04 12:21:36 +0100
commit	bc8ab2fc8af70ecd58120dd30652c5cc7a7d190a (patch)
tree	9fbdd1294eb97127edff4c906d00ddfc26320b55
parent	65a1afdce2da5cd92f6907f6e517b9ec4280cdd5 (diff)