aboutsummaryrefslogtreecommitdiff
path: root/fs/ocfs2/dlm
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2/dlm')
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h12
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c12
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c12
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c39
-rw-r--r--fs/ocfs2/dlm/dlmlock.c25
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c11
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c65
7 files changed, 126 insertions, 50 deletions
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 42eb53b5293..9c772583744 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,9 +37,7 @@
#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
#define DLM_THREAD_MS 200 // flush at least every 200 ms
-#define DLM_HASH_BITS 7
-#define DLM_HASH_SIZE (1 << DLM_HASH_BITS)
-#define DLM_HASH_MASK (DLM_HASH_SIZE - 1)
+#define DLM_HASH_BUCKETS (PAGE_SIZE / sizeof(struct hlist_head))
enum dlm_ast_type {
DLM_AST = 0,
@@ -87,7 +85,7 @@ enum dlm_ctxt_state {
struct dlm_ctxt
{
struct list_head list;
- struct list_head *resources;
+ struct hlist_head *lockres_hash;
struct list_head dirty_list;
struct list_head purge_list;
struct list_head pending_asts;
@@ -208,13 +206,16 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
#define DLM_LOCK_RES_IN_PROGRESS 0x00000010
#define DLM_LOCK_RES_MIGRATING 0x00000020
+/* max milliseconds to wait to sync up a network failure with a node death */
+#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
+
#define DLM_PURGE_INTERVAL_MS (8 * 1000)
struct dlm_lock_resource
{
/* WARNING: Please see the comment in dlm_init_lockres before
* adding fields here. */
- struct list_head list;
+ struct hlist_node hash_node;
struct kref refs;
/* please keep these next 3 in this order
@@ -658,6 +659,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
+int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
void dlm_put(struct dlm_ctxt *dlm);
struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 6001b22a997..f66e2d818cc 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -392,6 +392,11 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
} else {
mlog_errno(tmpret);
if (dlm_is_host_down(tmpret)) {
+ /* instead of logging the same network error over
+ * and over, sleep here and wait for the heartbeat
+ * to notice the node is dead. times out after 5s. */
+ dlm_wait_for_node_death(dlm, res->owner,
+ DLM_NODE_DEATH_WAIT_MAX);
ret = DLM_RECOVERING;
mlog(0, "node %u died so returning DLM_RECOVERING "
"from convert message!\n", res->owner);
@@ -421,7 +426,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
struct dlm_lockstatus *lksb;
enum dlm_status status = DLM_NORMAL;
u32 flags;
- int call_ast = 0, kick_thread = 0;
+ int call_ast = 0, kick_thread = 0, ast_reserved = 0;
if (!dlm_grab(dlm)) {
dlm_error(DLM_REJECTED);
@@ -490,6 +495,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
status = __dlm_lockres_state_to_status(res);
if (status == DLM_NORMAL) {
__dlm_lockres_reserve_ast(res);
+ ast_reserved = 1;
res->state |= DLM_LOCK_RES_IN_PROGRESS;
status = __dlmconvert_master(dlm, res, lock, flags,
cnv->requested_type,
@@ -512,10 +518,10 @@ leave:
else
dlm_lock_put(lock);
- /* either queue the ast or release it */
+ /* either queue the ast or release it, if reserved */
if (call_ast)
dlm_queue_ast(dlm, lock);
- else
+ else if (ast_reserved)
dlm_lockres_release_ast(dlm, res);
if (kick_thread)
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index f339fe27975..54f61b76ab5 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -117,8 +117,8 @@ EXPORT_SYMBOL_GPL(dlm_print_one_lock);
void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
{
struct dlm_lock_resource *res;
- struct list_head *iter;
- struct list_head *bucket;
+ struct hlist_node *iter;
+ struct hlist_head *bucket;
int i;
mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
@@ -129,12 +129,10 @@ void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
}
spin_lock(&dlm->spinlock);
- for (i=0; i<DLM_HASH_SIZE; i++) {
- bucket = &(dlm->resources[i]);
- list_for_each(iter, bucket) {
- res = list_entry(iter, struct dlm_lock_resource, list);
+ for (i=0; i<DLM_HASH_BUCKETS; i++) {
+ bucket = &(dlm->lockres_hash[i]);
+ hlist_for_each_entry(res, iter, bucket, hash_node)
dlm_print_one_lock_resource(res);
- }
}
spin_unlock(&dlm->spinlock);
}
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6ee30837389..8f3a9e3106f 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -77,26 +77,26 @@ static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
{
- list_del_init(&lockres->list);
+ hlist_del_init(&lockres->hash_node);
dlm_lockres_put(lockres);
}
void __dlm_insert_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
- struct list_head *bucket;
+ struct hlist_head *bucket;
struct qstr *q;
assert_spin_locked(&dlm->spinlock);
q = &res->lockname;
q->hash = full_name_hash(q->name, q->len);
- bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]);
+ bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]);
/* get a reference for our hashtable */
dlm_lockres_get(res);
- list_add_tail(&res->list, bucket);
+ hlist_add_head(&res->hash_node, bucket);
}
struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
@@ -104,9 +104,9 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
unsigned int len)
{
unsigned int hash;
- struct list_head *iter;
+ struct hlist_node *iter;
struct dlm_lock_resource *tmpres=NULL;
- struct list_head *bucket;
+ struct hlist_head *bucket;
mlog_entry("%.*s\n", len, name);
@@ -114,11 +114,11 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
hash = full_name_hash(name, len);
- bucket = &(dlm->resources[hash & DLM_HASH_MASK]);
+ bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]);
/* check for pre-existing lock */
- list_for_each(iter, bucket) {
- tmpres = list_entry(iter, struct dlm_lock_resource, list);
+ hlist_for_each(iter, bucket) {
+ tmpres = hlist_entry(iter, struct dlm_lock_resource, hash_node);
if (tmpres->lockname.len == len &&
memcmp(tmpres->lockname.name, name, len) == 0) {
dlm_lockres_get(tmpres);
@@ -193,8 +193,8 @@ static int dlm_wait_on_domain_helper(const char *domain)
static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
{
- if (dlm->resources)
- free_page((unsigned long) dlm->resources);
+ if (dlm->lockres_hash)
+ free_page((unsigned long) dlm->lockres_hash);
if (dlm->name)
kfree(dlm->name);
@@ -303,10 +303,10 @@ static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
mlog(0, "Migrating locks from domain %s\n", dlm->name);
restart:
spin_lock(&dlm->spinlock);
- for (i=0; i<DLM_HASH_SIZE; i++) {
- while (!list_empty(&dlm->resources[i])) {
- res = list_entry(dlm->resources[i].next,
- struct dlm_lock_resource, list);
+ for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+ while (!hlist_empty(&dlm->lockres_hash[i])) {
+ res = hlist_entry(dlm->lockres_hash[i].first,
+ struct dlm_lock_resource, hash_node);
/* need reference when manually grabbing lockres */
dlm_lockres_get(res);
/* this should unhash the lockres
@@ -1191,18 +1191,17 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
goto leave;
}
- dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL);
- if (!dlm->resources) {
+ dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL);
+ if (!dlm->lockres_hash) {
mlog_errno(-ENOMEM);
kfree(dlm->name);
kfree(dlm);
dlm = NULL;
goto leave;
}
- memset(dlm->resources, 0, PAGE_SIZE);
- for (i=0; i<DLM_HASH_SIZE; i++)
- INIT_LIST_HEAD(&dlm->resources[i]);
+ for (i=0; i<DLM_HASH_BUCKETS; i++)
+ INIT_HLIST_HEAD(&dlm->lockres_hash[i]);
strcpy(dlm->name, domain);
dlm->key = key;
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index d1a0038557a..671d4ff222c 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -220,6 +220,17 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
dlm_error(status);
dlm_revert_pending_lock(res, lock);
dlm_lock_put(lock);
+ } else if (dlm_is_recovery_lock(res->lockname.name,
+ res->lockname.len)) {
+ /* special case for the $RECOVERY lock.
+ * there will never be an AST delivered to put
+ * this lock on the proper secondary queue
+ * (granted), so do it manually. */
+ mlog(0, "%s: $RECOVERY lock for this node (%u) is "
+ "mastered by %u; got lock, manually granting (no ast)\n",
+ dlm->name, dlm->node_num, res->owner);
+ list_del_init(&lock->list);
+ list_add_tail(&lock->list, &res->granted);
}
spin_unlock(&res->spinlock);
@@ -646,7 +657,19 @@ retry_lock:
mlog(0, "retrying lock with migration/"
"recovery/in progress\n");
msleep(100);
- dlm_wait_for_recovery(dlm);
+ /* no waiting for dlm_reco_thread */
+ if (recovery) {
+ if (status == DLM_RECOVERING) {
+ mlog(0, "%s: got RECOVERING "
+ "for $REOCVERY lock, master "
+ "was %u\n", dlm->name,
+ res->owner);
+ dlm_wait_for_node_death(dlm, res->owner,
+ DLM_NODE_DEATH_WAIT_MAX);
+ }
+ } else {
+ dlm_wait_for_recovery(dlm);
+ }
goto retry_lock;
}
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index a3194fe173d..847dd3cc4cf 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -564,7 +564,7 @@ static void dlm_lockres_release(struct kref *kref)
/* By the time we're ready to blow this guy away, we shouldn't
* be on any lists. */
- BUG_ON(!list_empty(&res->list));
+ BUG_ON(!hlist_unhashed(&res->hash_node));
BUG_ON(!list_empty(&res->granted));
BUG_ON(!list_empty(&res->converting));
BUG_ON(!list_empty(&res->blocked));
@@ -605,7 +605,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
init_waitqueue_head(&res->wq);
spin_lock_init(&res->spinlock);
- INIT_LIST_HEAD(&res->list);
+ INIT_HLIST_NODE(&res->hash_node);
INIT_LIST_HEAD(&res->granted);
INIT_LIST_HEAD(&res->converting);
INIT_LIST_HEAD(&res->blocked);
@@ -2482,7 +2482,9 @@ top:
atomic_set(&mle->woken, 1);
spin_unlock(&mle->spinlock);
wake_up(&mle->wq);
- /* final put will take care of list removal */
+ /* do not need events any longer, so detach
+ * from heartbeat */
+ __dlm_mle_detach_hb_events(dlm, mle);
__dlm_put_mle(mle);
}
continue;
@@ -2537,6 +2539,9 @@ top:
spin_unlock(&res->spinlock);
dlm_lockres_put(res);
+ /* about to get rid of mle, detach from heartbeat */
+ __dlm_mle_detach_hb_events(dlm, mle);
+
/* dump the mle */
spin_lock(&dlm->master_lock);
__dlm_put_mle(mle);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 186e9a76aa5..1e232000f3f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -278,6 +278,24 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
return dead;
}
+int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
+{
+ if (timeout) {
+ mlog(ML_NOTICE, "%s: waiting %dms for notification of "
+ "death of node %u\n", dlm->name, timeout, node);
+ wait_event_timeout(dlm->dlm_reco_thread_wq,
+ dlm_is_node_dead(dlm, node),
+ msecs_to_jiffies(timeout));
+ } else {
+ mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
+ "of death of node %u\n", dlm->name, node);
+ wait_event(dlm->dlm_reco_thread_wq,
+ dlm_is_node_dead(dlm, node));
+ }
+ /* for now, return 0 */
+ return 0;
+}
+
/* callers of the top-level api calls (dlmlock/dlmunlock) should
* block on the dlm->reco.event when recovery is in progress.
* the dlm recovery thread will set this state when it begins
@@ -1675,7 +1693,10 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
u8 dead_node, u8 new_master)
{
int i;
- struct list_head *iter, *iter2, *bucket;
+ struct list_head *iter, *iter2;
+ struct hlist_node *hash_iter;
+ struct hlist_head *bucket;
+
struct dlm_lock_resource *res;
mlog_entry_void();
@@ -1699,10 +1720,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
* for now we need to run the whole hash, clear
* the RECOVERING state and set the owner
* if necessary */
- for (i=0; i<DLM_HASH_SIZE; i++) {
- bucket = &(dlm->resources[i]);
- list_for_each(iter, bucket) {
- res = list_entry (iter, struct dlm_lock_resource, list);
+ for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+ bucket = &(dlm->lockres_hash[i]);
+ hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
if (res->state & DLM_LOCK_RES_RECOVERING) {
if (res->owner == dead_node) {
mlog(0, "(this=%u) res %.*s owner=%u "
@@ -1834,10 +1854,10 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
{
- struct list_head *iter;
+ struct hlist_node *iter;
struct dlm_lock_resource *res;
int i;
- struct list_head *bucket;
+ struct hlist_head *bucket;
struct dlm_lock *lock;
@@ -1858,10 +1878,9 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
* can be kicked again to see if any ASTs or BASTs
* need to be fired as a result.
*/
- for (i=0; i<DLM_HASH_SIZE; i++) {
- bucket = &(dlm->resources[i]);
- list_for_each(iter, bucket) {
- res = list_entry (iter, struct dlm_lock_resource, list);
+ for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+ bucket = &(dlm->lockres_hash[i]);
+ hlist_for_each_entry(res, iter, bucket, hash_node) {
/* always prune any $RECOVERY entries for dead nodes,
* otherwise hangs can occur during later recovery */
if (dlm_is_recovery_lock(res->lockname.name,
@@ -2032,6 +2051,30 @@ again:
dlm->reco.new_master);
status = -EEXIST;
} else {
+ status = 0;
+
+ /* see if recovery was already finished elsewhere */
+ spin_lock(&dlm->spinlock);
+ if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
+ status = -EINVAL;
+ mlog(0, "%s: got reco EX lock, but "
+ "node got recovered already\n", dlm->name);
+ if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
+ mlog(ML_ERROR, "%s: new master is %u "
+ "but no dead node!\n",
+ dlm->name, dlm->reco.new_master);
+ BUG();
+ }
+ }
+ spin_unlock(&dlm->spinlock);
+ }
+
+ /* if this node has actually become the recovery master,
+ * set the master and send the messages to begin recovery */
+ if (!status) {
+ mlog(0, "%s: dead=%u, this=%u, sending "
+ "begin_reco now\n", dlm->name,
+ dlm->reco.dead_node, dlm->node_num);
status = dlm_send_begin_reco_message(dlm,
dlm->reco.dead_node);
/* this always succeeds */