From c36258b5925e6cf6bf72904635100593573bfcff Mon Sep 17 00:00:00 2001 From: David Teigland Date: Thu, 27 Sep 2007 15:53:38 -0500 Subject: [DLM] block dlm_recv in recovery transition Introduce a per-lockspace rwsem that's held in read mode by dlm_recv threads while working in the dlm. This allows dlm_recv activity to be suspended when the lockspace transitions to, from and between recovery cycles. The specific bug prompting this change is one where an in-progress recovery cycle is aborted by a new recovery cycle. While dlm_recv was processing a recovery message, the recovery cycle was aborted and dlm_recoverd began cleaning up. dlm_recv decremented recover_locks_count on an rsb after dlm_recoverd had reset it to zero. This is fixed by suspending dlm_recv (taking write lock on the rwsem) before aborting the current recovery. The transitions to/from normal and recovery modes are simplified by using this new ability to block dlm_recv. The switch from normal to recovery mode means dlm_recv goes from processing locking messages, to saving them for later, and vice versa. Races are avoided by blocking dlm_recv when setting the flag that switches between modes. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/rcom.c | 36 ++++++++---------------------------- 1 file changed, 8 insertions(+), 28 deletions(-) (limited to 'fs/dlm/rcom.c') diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 188b91c027e..ae2fd97fa4a 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -386,7 +386,10 @@ static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) dlm_recover_process_copy(ls, rc_in); } -static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) +/* If the lockspace doesn't exist then still send a status message + back; it's possible that it just doesn't have its global_id yet. */ + +int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) { struct dlm_rcom *rc; struct rcom_config *rf; @@ -446,28 +449,11 @@ static int is_old_reply(struct dlm_ls *ls, struct dlm_rcom *rc) return rv; } -/* Called by dlm_recvd; corresponds to dlm_receive_message() but special +/* Called by dlm_recv; corresponds to dlm_receive_message() but special recovery-only comms are sent through here. */ -void dlm_receive_rcom(struct dlm_header *hd, int nodeid) +void dlm_receive_rcom(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid) { - struct dlm_rcom *rc = (struct dlm_rcom *) hd; - struct dlm_ls *ls; - - dlm_rcom_in(rc); - - /* If the lockspace doesn't exist then still send a status message - back; it's possible that it just doesn't have its global_id yet. */ - - ls = dlm_find_lockspace_global(hd->h_lockspace); - if (!ls) { - log_print("lockspace %x from %d type %x not found", - hd->h_lockspace, nodeid, rc->rc_type); - if (rc->rc_type == DLM_RCOM_STATUS) - send_ls_not_ready(nodeid, rc); - return; - } - if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) { log_debug(ls, "ignoring recovery message %x from %d", rc->rc_type, nodeid); @@ -477,12 +463,6 @@ void dlm_receive_rcom(struct dlm_header *hd, int nodeid) if (is_old_reply(ls, rc)) goto out; - if (nodeid != rc->rc_header.h_nodeid) { - log_error(ls, "bad rcom nodeid %d from %d", - rc->rc_header.h_nodeid, nodeid); - goto out; - } - switch (rc->rc_type) { case DLM_RCOM_STATUS: receive_rcom_status(ls, rc); @@ -520,6 +500,6 @@ void dlm_receive_rcom(struct dlm_header *hd, int nodeid) DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type);); } out: - dlm_put_lockspace(ls); + return; } -- cgit v1.2.3