From c36258b5925e6cf6bf72904635100593573bfcff Mon Sep 17 00:00:00 2001 From: David Teigland Date: Thu, 27 Sep 2007 15:53:38 -0500 Subject: [DLM] block dlm_recv in recovery transition Introduce a per-lockspace rwsem that's held in read mode by dlm_recv threads while working in the dlm. This allows dlm_recv activity to be suspended when the lockspace transitions to, from and between recovery cycles. The specific bug prompting this change is one where an in-progress recovery cycle is aborted by a new recovery cycle. While dlm_recv was processing a recovery message, the recovery cycle was aborted and dlm_recoverd began cleaning up. dlm_recv decremented recover_locks_count on an rsb after dlm_recoverd had reset it to zero. This is fixed by suspending dlm_recv (taking write lock on the rwsem) before aborting the current recovery. The transitions to/from normal and recovery modes are simplified by using this new ability to block dlm_recv. The switch from normal to recovery mode means dlm_recv goes from processing locking messages, to saving them for later, and vice versa. Races are avoided by blocking dlm_recv when setting the flag that switches between modes. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/dlm/requestqueue.c | 58 +++++++++++++++++++++------------------------------ 1 file changed, 24 insertions(+), 34 deletions(-) (limited to 'fs/dlm/requestqueue.c') diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c index 65008d79c96..0de04f17cce 100644 --- a/fs/dlm/requestqueue.c +++ b/fs/dlm/requestqueue.c @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2007 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -20,7 +20,7 @@ struct rq_entry { struct list_head list; int nodeid; - char request[1]; + char request[0]; }; /* @@ -30,42 +30,39 @@ struct rq_entry { * lockspace is enabled on some while still suspended on others. */ -int dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd) +void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd) { struct rq_entry *e; int length = hd->h_length; - int rv = 0; e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL); if (!e) { - log_print("dlm_add_requestqueue: out of memory\n"); - return 0; + log_print("dlm_add_requestqueue: out of memory len %d", length); + return; } e->nodeid = nodeid; memcpy(e->request, hd, length); - /* We need to check dlm_locking_stopped() after taking the mutex to - avoid a race where dlm_recoverd enables locking and runs - process_requestqueue between our earlier dlm_locking_stopped check - and this addition to the requestqueue. */ - mutex_lock(&ls->ls_requestqueue_mutex); - if (dlm_locking_stopped(ls)) - list_add_tail(&e->list, &ls->ls_requestqueue); - else { - log_debug(ls, "dlm_add_requestqueue skip from %d", nodeid); - kfree(e); - rv = -EAGAIN; - } + list_add_tail(&e->list, &ls->ls_requestqueue); mutex_unlock(&ls->ls_requestqueue_mutex); - return rv; } +/* + * Called by dlm_recoverd to process normal messages saved while recovery was + * happening. Normal locking has been enabled before this is called. dlm_recv + * upon receiving a message, will wait for all saved messages to be drained + * here before processing the message it got. If a new dlm_ls_stop() arrives + * while we're processing these saved messages, it may block trying to suspend + * dlm_recv if dlm_recv is waiting for us in dlm_wait_requestqueue. In that + * case, we don't abort since locking_stopped is still 0. If dlm_recv is not + * waiting for us, then this processing may be aborted due to locking_stopped. + */ + int dlm_process_requestqueue(struct dlm_ls *ls) { struct rq_entry *e; - struct dlm_header *hd; int error = 0; mutex_lock(&ls->ls_requestqueue_mutex); @@ -79,14 +76,7 @@ int dlm_process_requestqueue(struct dlm_ls *ls) e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list); mutex_unlock(&ls->ls_requestqueue_mutex); - hd = (struct dlm_header *) e->request; - error = dlm_receive_message(hd, e->nodeid, 1); - - if (error == -EINTR) { - /* entry is left on requestqueue */ - log_debug(ls, "process_requestqueue abort eintr"); - break; - } + dlm_receive_message_saved(ls, (struct dlm_message *)e->request); mutex_lock(&ls->ls_requestqueue_mutex); list_del(&e->list); @@ -106,10 +96,12 @@ int dlm_process_requestqueue(struct dlm_ls *ls) /* * After recovery is done, locking is resumed and dlm_recoverd takes all the - * saved requests and processes them as they would have been by dlm_recvd. At - * the same time, dlm_recvd will start receiving new requests from remote - * nodes. We want to delay dlm_recvd processing new requests until - * dlm_recoverd has finished processing the old saved requests. + * saved requests and processes them as they would have been by dlm_recv. At + * the same time, dlm_recv will start receiving new requests from remote nodes. + * We want to delay dlm_recv processing new requests until dlm_recoverd has + * finished processing the old saved requests. We don't check for locking + * stopped here because dlm_ls_stop won't stop locking until it's suspended us + * (dlm_recv). */ void dlm_wait_requestqueue(struct dlm_ls *ls) @@ -118,8 +110,6 @@ void dlm_wait_requestqueue(struct dlm_ls *ls) mutex_lock(&ls->ls_requestqueue_mutex); if (list_empty(&ls->ls_requestqueue)) break; - if (dlm_locking_stopped(ls)) - break; mutex_unlock(&ls->ls_requestqueue_mutex); schedule(); } -- cgit v1.2.3