xref: /linux/fs/ocfs2/dlm/dlmmaster.c (revision a44e4f3ab16bc808590763a543a93b6fbf3abcc4)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* -*- mode: c; c-basic-offset: 8; -*-
3  * vim: noexpandtab sw=8 ts=8 sts=0:
4  *
5  * dlmmod.c
6  *
7  * standalone DLM module
8  *
9  * Copyright (C) 2004 Oracle.  All rights reserved.
10  */
11 
12 
13 #include <linux/module.h>
14 #include <linux/fs.h>
15 #include <linux/types.h>
16 #include <linux/slab.h>
17 #include <linux/highmem.h>
18 #include <linux/init.h>
19 #include <linux/sysctl.h>
20 #include <linux/random.h>
21 #include <linux/blkdev.h>
22 #include <linux/socket.h>
23 #include <linux/inet.h>
24 #include <linux/spinlock.h>
25 #include <linux/delay.h>
26 
27 
28 #include "cluster/heartbeat.h"
29 #include "cluster/nodemanager.h"
30 #include "cluster/tcp.h"
31 
32 #include "dlmapi.h"
33 #include "dlmcommon.h"
34 #include "dlmdomain.h"
35 #include "dlmdebug.h"
36 
37 #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
38 #include "cluster/masklog.h"
39 
40 static void dlm_mle_node_down(struct dlm_ctxt *dlm,
41 			      struct dlm_master_list_entry *mle,
42 			      struct o2nm_node *node,
43 			      int idx);
44 static void dlm_mle_node_up(struct dlm_ctxt *dlm,
45 			    struct dlm_master_list_entry *mle,
46 			    struct o2nm_node *node,
47 			    int idx);
48 
49 static void dlm_assert_master_worker(struct dlm_work_item *item, void *data);
50 static int dlm_do_assert_master(struct dlm_ctxt *dlm,
51 				struct dlm_lock_resource *res,
52 				void *nodemap, u32 flags);
53 static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data);
54 
55 static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
56 				struct dlm_master_list_entry *mle,
57 				const char *name,
58 				unsigned int namelen)
59 {
60 	if (dlm != mle->dlm)
61 		return 0;
62 
63 	if (namelen != mle->mnamelen ||
64 	    memcmp(name, mle->mname, namelen) != 0)
65 		return 0;
66 
67 	return 1;
68 }
69 
70 static struct kmem_cache *dlm_lockres_cache;
71 static struct kmem_cache *dlm_lockname_cache;
72 static struct kmem_cache *dlm_mle_cache;
73 
74 static void dlm_mle_release(struct kref *kref);
75 static void dlm_init_mle(struct dlm_master_list_entry *mle,
76 			enum dlm_mle_type type,
77 			struct dlm_ctxt *dlm,
78 			struct dlm_lock_resource *res,
79 			const char *name,
80 			unsigned int namelen);
81 static void dlm_put_mle(struct dlm_master_list_entry *mle);
82 static void __dlm_put_mle(struct dlm_master_list_entry *mle);
83 static int dlm_find_mle(struct dlm_ctxt *dlm,
84 			struct dlm_master_list_entry **mle,
85 			char *name, unsigned int namelen);
86 
87 static int dlm_do_master_request(struct dlm_lock_resource *res,
88 				 struct dlm_master_list_entry *mle, int to);
89 
90 
91 static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
92 				     struct dlm_lock_resource *res,
93 				     struct dlm_master_list_entry *mle,
94 				     int *blocked);
95 static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
96 				    struct dlm_lock_resource *res,
97 				    struct dlm_master_list_entry *mle,
98 				    int blocked);
99 static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
100 				 struct dlm_lock_resource *res,
101 				 struct dlm_master_list_entry *mle,
102 				 struct dlm_master_list_entry **oldmle,
103 				 const char *name, unsigned int namelen,
104 				 u8 new_master, u8 master);
105 
106 static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
107 				    struct dlm_lock_resource *res);
108 static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
109 				      struct dlm_lock_resource *res);
110 static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
111 				       struct dlm_lock_resource *res,
112 				       u8 target);
113 static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
114 				       struct dlm_lock_resource *res);
115 
116 
/*
 * Classify an error code as "the remote host/link is down".
 *
 * @err: error value to classify; the codes matched below are the
 *       negated E* constants.
 *
 * Returns 1 if @err is one of the listed connection, host or network
 * failure codes, and 0 for every other value.
 *
 * The parameter is intentionally not named "errno": that identifier is
 * claimed by the <errno.h> macro, so using it as a parameter name breaks
 * as soon as the macro is visible in the translation unit.
 */
int dlm_is_host_down(int err)
{
	switch (err) {
	case -EBADF:
	case -ECONNREFUSED:
	case -ENOTCONN:
	case -ECONNRESET:
	case -EPIPE:
	case -EHOSTDOWN:
	case -EHOSTUNREACH:
	case -ETIMEDOUT:
	case -ECONNABORTED:
	case -ENETDOWN:
	case -ENETUNREACH:
	case -ENETRESET:
	case -ESHUTDOWN:
	case -ENOPROTOOPT:
	case -EINVAL:   /* if returned from our tcp code,
			   this means there is no socket */
		return 1;
	default:
		return 0;
	}
}
140 
141 
142 /*
143  * MASTER LIST FUNCTIONS
144  */
145 
146 
147 /*
148  * regarding master list entries and heartbeat callbacks:
149  *
150  * in order to avoid sleeping and allocation that occurs in
151  * heartbeat, master list entries are simply attached to the
152  * dlm's established heartbeat callbacks.  the mle is attached
153  * when it is created, and since the dlm->spinlock is held at
154  * that time, any heartbeat event will be properly discovered
155  * by the mle.  the mle needs to be detached from the
156  * dlm->mle_hb_events list as soon as heartbeat events are no
157  * longer useful to the mle, and before the mle is freed.
158  *
159  * as a general rule, heartbeat events are no longer needed by
160  * the mle once an "answer" regarding the lock master has been
161  * received.
162  */
163 static inline void __dlm_mle_attach_hb_events(struct dlm_ctxt *dlm,
164 					      struct dlm_master_list_entry *mle)
165 {
166 	assert_spin_locked(&dlm->spinlock);
167 
168 	list_add_tail(&mle->hb_events, &dlm->mle_hb_events);
169 }
170 
171 
172 static inline void __dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
173 					      struct dlm_master_list_entry *mle)
174 {
175 	if (!list_empty(&mle->hb_events))
176 		list_del_init(&mle->hb_events);
177 }
178 
179 
180 static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
181 					    struct dlm_master_list_entry *mle)
182 {
183 	spin_lock(&dlm->spinlock);
184 	__dlm_mle_detach_hb_events(dlm, mle);
185 	spin_unlock(&dlm->spinlock);
186 }
187 
188 static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle)
189 {
190 	struct dlm_ctxt *dlm;
191 	dlm = mle->dlm;
192 
193 	assert_spin_locked(&dlm->spinlock);
194 	assert_spin_locked(&dlm->master_lock);
195 	mle->inuse++;
196 	kref_get(&mle->mle_refs);
197 }
198 
199 static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle)
200 {
201 	struct dlm_ctxt *dlm;
202 	dlm = mle->dlm;
203 
204 	spin_lock(&dlm->spinlock);
205 	spin_lock(&dlm->master_lock);
206 	mle->inuse--;
207 	__dlm_put_mle(mle);
208 	spin_unlock(&dlm->master_lock);
209 	spin_unlock(&dlm->spinlock);
210 
211 }
212 
213 /* remove from list and free */
214 static void __dlm_put_mle(struct dlm_master_list_entry *mle)
215 {
216 	struct dlm_ctxt *dlm;
217 	dlm = mle->dlm;
218 
219 	assert_spin_locked(&dlm->spinlock);
220 	assert_spin_locked(&dlm->master_lock);
221 	if (!kref_read(&mle->mle_refs)) {
222 		/* this may or may not crash, but who cares.
223 		 * it's a BUG. */
224 		mlog(ML_ERROR, "bad mle: %p\n", mle);
225 		dlm_print_one_mle(mle);
226 		BUG();
227 	} else
228 		kref_put(&mle->mle_refs, dlm_mle_release);
229 }
230 
231 
232 /* must not have any spinlocks coming in */
233 static void dlm_put_mle(struct dlm_master_list_entry *mle)
234 {
235 	struct dlm_ctxt *dlm;
236 	dlm = mle->dlm;
237 
238 	spin_lock(&dlm->spinlock);
239 	spin_lock(&dlm->master_lock);
240 	__dlm_put_mle(mle);
241 	spin_unlock(&dlm->master_lock);
242 	spin_unlock(&dlm->spinlock);
243 }
244 
245 static inline void dlm_get_mle(struct dlm_master_list_entry *mle)
246 {
247 	kref_get(&mle->mle_refs);
248 }
249 
250 static void dlm_init_mle(struct dlm_master_list_entry *mle,
251 			enum dlm_mle_type type,
252 			struct dlm_ctxt *dlm,
253 			struct dlm_lock_resource *res,
254 			const char *name,
255 			unsigned int namelen)
256 {
257 	assert_spin_locked(&dlm->spinlock);
258 
259 	mle->dlm = dlm;
260 	mle->type = type;
261 	INIT_HLIST_NODE(&mle->master_hash_node);
262 	INIT_LIST_HEAD(&mle->hb_events);
263 	memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
264 	spin_lock_init(&mle->spinlock);
265 	init_waitqueue_head(&mle->wq);
266 	atomic_set(&mle->woken, 0);
267 	kref_init(&mle->mle_refs);
268 	memset(mle->response_map, 0, sizeof(mle->response_map));
269 	mle->master = O2NM_MAX_NODES;
270 	mle->new_master = O2NM_MAX_NODES;
271 	mle->inuse = 0;
272 
273 	BUG_ON(mle->type != DLM_MLE_BLOCK &&
274 	       mle->type != DLM_MLE_MASTER &&
275 	       mle->type != DLM_MLE_MIGRATION);
276 
277 	if (mle->type == DLM_MLE_MASTER) {
278 		BUG_ON(!res);
279 		mle->mleres = res;
280 		memcpy(mle->mname, res->lockname.name, res->lockname.len);
281 		mle->mnamelen = res->lockname.len;
282 		mle->mnamehash = res->lockname.hash;
283 	} else {
284 		BUG_ON(!name);
285 		mle->mleres = NULL;
286 		memcpy(mle->mname, name, namelen);
287 		mle->mnamelen = namelen;
288 		mle->mnamehash = dlm_lockid_hash(name, namelen);
289 	}
290 
291 	atomic_inc(&dlm->mle_tot_count[mle->type]);
292 	atomic_inc(&dlm->mle_cur_count[mle->type]);
293 
294 	/* copy off the node_map and register hb callbacks on our copy */
295 	memcpy(mle->node_map, dlm->domain_map, sizeof(mle->node_map));
296 	memcpy(mle->vote_map, dlm->domain_map, sizeof(mle->vote_map));
297 	clear_bit(dlm->node_num, mle->vote_map);
298 	clear_bit(dlm->node_num, mle->node_map);
299 
300 	/* attach the mle to the domain node up/down events */
301 	__dlm_mle_attach_hb_events(dlm, mle);
302 }
303 
304 void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
305 {
306 	assert_spin_locked(&dlm->spinlock);
307 	assert_spin_locked(&dlm->master_lock);
308 
309 	if (!hlist_unhashed(&mle->master_hash_node))
310 		hlist_del_init(&mle->master_hash_node);
311 }
312 
313 void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle)
314 {
315 	struct hlist_head *bucket;
316 
317 	assert_spin_locked(&dlm->master_lock);
318 
319 	bucket = dlm_master_hash(dlm, mle->mnamehash);
320 	hlist_add_head(&mle->master_hash_node, bucket);
321 }
322 
323 /* returns 1 if found, 0 if not */
324 static int dlm_find_mle(struct dlm_ctxt *dlm,
325 			struct dlm_master_list_entry **mle,
326 			char *name, unsigned int namelen)
327 {
328 	struct dlm_master_list_entry *tmpmle;
329 	struct hlist_head *bucket;
330 	unsigned int hash;
331 
332 	assert_spin_locked(&dlm->master_lock);
333 
334 	hash = dlm_lockid_hash(name, namelen);
335 	bucket = dlm_master_hash(dlm, hash);
336 	hlist_for_each_entry(tmpmle, bucket, master_hash_node) {
337 		if (!dlm_mle_equal(dlm, tmpmle, name, namelen))
338 			continue;
339 		dlm_get_mle(tmpmle);
340 		*mle = tmpmle;
341 		return 1;
342 	}
343 	return 0;
344 }
345 
346 void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up)
347 {
348 	struct dlm_master_list_entry *mle;
349 
350 	assert_spin_locked(&dlm->spinlock);
351 
352 	list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) {
353 		if (node_up)
354 			dlm_mle_node_up(dlm, mle, NULL, idx);
355 		else
356 			dlm_mle_node_down(dlm, mle, NULL, idx);
357 	}
358 }
359 
360 static void dlm_mle_node_down(struct dlm_ctxt *dlm,
361 			      struct dlm_master_list_entry *mle,
362 			      struct o2nm_node *node, int idx)
363 {
364 	spin_lock(&mle->spinlock);
365 
366 	if (!test_bit(idx, mle->node_map))
367 		mlog(0, "node %u already removed from nodemap!\n", idx);
368 	else
369 		clear_bit(idx, mle->node_map);
370 
371 	spin_unlock(&mle->spinlock);
372 }
373 
374 static void dlm_mle_node_up(struct dlm_ctxt *dlm,
375 			    struct dlm_master_list_entry *mle,
376 			    struct o2nm_node *node, int idx)
377 {
378 	spin_lock(&mle->spinlock);
379 
380 	if (test_bit(idx, mle->node_map))
381 		mlog(0, "node %u already in node map!\n", idx);
382 	else
383 		set_bit(idx, mle->node_map);
384 
385 	spin_unlock(&mle->spinlock);
386 }
387 
388 
389 int dlm_init_mle_cache(void)
390 {
391 	dlm_mle_cache = kmem_cache_create("o2dlm_mle",
392 					  sizeof(struct dlm_master_list_entry),
393 					  0, SLAB_HWCACHE_ALIGN,
394 					  NULL);
395 	if (dlm_mle_cache == NULL)
396 		return -ENOMEM;
397 	return 0;
398 }
399 
400 void dlm_destroy_mle_cache(void)
401 {
402 	kmem_cache_destroy(dlm_mle_cache);
403 }
404 
405 static void dlm_mle_release(struct kref *kref)
406 {
407 	struct dlm_master_list_entry *mle;
408 	struct dlm_ctxt *dlm;
409 
410 	mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
411 	dlm = mle->dlm;
412 
413 	assert_spin_locked(&dlm->spinlock);
414 	assert_spin_locked(&dlm->master_lock);
415 
416 	mlog(0, "Releasing mle for %.*s, type %d\n", mle->mnamelen, mle->mname,
417 	     mle->type);
418 
419 	/* remove from list if not already */
420 	__dlm_unlink_mle(dlm, mle);
421 
422 	/* detach the mle from the domain node up/down events */
423 	__dlm_mle_detach_hb_events(dlm, mle);
424 
425 	atomic_dec(&dlm->mle_cur_count[mle->type]);
426 
427 	/* NOTE: kfree under spinlock here.
428 	 * if this is bad, we can move this to a freelist. */
429 	kmem_cache_free(dlm_mle_cache, mle);
430 }
431 
432 
433 /*
434  * LOCK RESOURCE FUNCTIONS
435  */
436 
437 int dlm_init_master_caches(void)
438 {
439 	dlm_lockres_cache = kmem_cache_create("o2dlm_lockres",
440 					      sizeof(struct dlm_lock_resource),
441 					      0, SLAB_HWCACHE_ALIGN, NULL);
442 	if (!dlm_lockres_cache)
443 		goto bail;
444 
445 	dlm_lockname_cache = kmem_cache_create("o2dlm_lockname",
446 					       DLM_LOCKID_NAME_MAX, 0,
447 					       SLAB_HWCACHE_ALIGN, NULL);
448 	if (!dlm_lockname_cache)
449 		goto bail;
450 
451 	return 0;
452 bail:
453 	dlm_destroy_master_caches();
454 	return -ENOMEM;
455 }
456 
457 void dlm_destroy_master_caches(void)
458 {
459 	kmem_cache_destroy(dlm_lockname_cache);
460 	dlm_lockname_cache = NULL;
461 
462 	kmem_cache_destroy(dlm_lockres_cache);
463 	dlm_lockres_cache = NULL;
464 }
465 
466 static void dlm_lockres_release(struct kref *kref)
467 {
468 	struct dlm_lock_resource *res;
469 	struct dlm_ctxt *dlm;
470 
471 	res = container_of(kref, struct dlm_lock_resource, refs);
472 	dlm = res->dlm;
473 
474 	/* This should not happen -- all lockres' have a name
475 	 * associated with them at init time. */
476 	BUG_ON(!res->lockname.name);
477 
478 	mlog(0, "destroying lockres %.*s\n", res->lockname.len,
479 	     res->lockname.name);
480 
481 	atomic_dec(&dlm->res_cur_count);
482 
483 	if (!hlist_unhashed(&res->hash_node) ||
484 	    !list_empty(&res->granted) ||
485 	    !list_empty(&res->converting) ||
486 	    !list_empty(&res->blocked) ||
487 	    !list_empty(&res->dirty) ||
488 	    !list_empty(&res->recovering) ||
489 	    !list_empty(&res->purge)) {
490 		mlog(ML_ERROR,
491 		     "Going to BUG for resource %.*s."
492 		     "  We're on a list! [%c%c%c%c%c%c%c]\n",
493 		     res->lockname.len, res->lockname.name,
494 		     !hlist_unhashed(&res->hash_node) ? 'H' : ' ',
495 		     !list_empty(&res->granted) ? 'G' : ' ',
496 		     !list_empty(&res->converting) ? 'C' : ' ',
497 		     !list_empty(&res->blocked) ? 'B' : ' ',
498 		     !list_empty(&res->dirty) ? 'D' : ' ',
499 		     !list_empty(&res->recovering) ? 'R' : ' ',
500 		     !list_empty(&res->purge) ? 'P' : ' ');
501 
502 		dlm_print_one_lock_resource(res);
503 	}
504 
505 	/* By the time we're ready to blow this guy away, we shouldn't
506 	 * be on any lists. */
507 	BUG_ON(!hlist_unhashed(&res->hash_node));
508 	BUG_ON(!list_empty(&res->granted));
509 	BUG_ON(!list_empty(&res->converting));
510 	BUG_ON(!list_empty(&res->blocked));
511 	BUG_ON(!list_empty(&res->dirty));
512 	BUG_ON(!list_empty(&res->recovering));
513 	BUG_ON(!list_empty(&res->purge));
514 
515 	kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
516 
517 	kmem_cache_free(dlm_lockres_cache, res);
518 }
519 
520 void dlm_lockres_put(struct dlm_lock_resource *res)
521 {
522 	kref_put(&res->refs, dlm_lockres_release);
523 }
524 
525 static void dlm_init_lockres(struct dlm_ctxt *dlm,
526 			     struct dlm_lock_resource *res,
527 			     const char *name, unsigned int namelen)
528 {
529 	char *qname;
530 
531 	/* If we memset here, we lose our reference to the kmalloc'd
532 	 * res->lockname.name, so be sure to init every field
533 	 * correctly! */
534 
535 	qname = (char *) res->lockname.name;
536 	memcpy(qname, name, namelen);
537 
538 	res->lockname.len = namelen;
539 	res->lockname.hash = dlm_lockid_hash(name, namelen);
540 
541 	init_waitqueue_head(&res->wq);
542 	spin_lock_init(&res->spinlock);
543 	INIT_HLIST_NODE(&res->hash_node);
544 	INIT_LIST_HEAD(&res->granted);
545 	INIT_LIST_HEAD(&res->converting);
546 	INIT_LIST_HEAD(&res->blocked);
547 	INIT_LIST_HEAD(&res->dirty);
548 	INIT_LIST_HEAD(&res->recovering);
549 	INIT_LIST_HEAD(&res->purge);
550 	INIT_LIST_HEAD(&res->tracking);
551 	atomic_set(&res->asts_reserved, 0);
552 	res->migration_pending = 0;
553 	res->inflight_locks = 0;
554 	res->inflight_assert_workers = 0;
555 
556 	res->dlm = dlm;
557 
558 	kref_init(&res->refs);
559 
560 	atomic_inc(&dlm->res_tot_count);
561 	atomic_inc(&dlm->res_cur_count);
562 
563 	/* just for consistency */
564 	spin_lock(&res->spinlock);
565 	dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
566 	spin_unlock(&res->spinlock);
567 
568 	res->state = DLM_LOCK_RES_IN_PROGRESS;
569 
570 	res->last_used = 0;
571 
572 	spin_lock(&dlm->track_lock);
573 	list_add_tail(&res->tracking, &dlm->tracking_list);
574 	spin_unlock(&dlm->track_lock);
575 
576 	memset(res->lvb, 0, DLM_LVB_LEN);
577 	memset(res->refmap, 0, sizeof(res->refmap));
578 }
579 
580 struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
581 				   const char *name,
582 				   unsigned int namelen)
583 {
584 	struct dlm_lock_resource *res = NULL;
585 
586 	res = kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
587 	if (!res)
588 		goto error;
589 
590 	res->lockname.name = kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
591 	if (!res->lockname.name)
592 		goto error;
593 
594 	dlm_init_lockres(dlm, res, name, namelen);
595 	return res;
596 
597 error:
598 	if (res)
599 		kmem_cache_free(dlm_lockres_cache, res);
600 	return NULL;
601 }
602 
603 void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
604 				struct dlm_lock_resource *res, int bit)
605 {
606 	assert_spin_locked(&res->spinlock);
607 
608 	mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
609 	     res->lockname.name, bit, __builtin_return_address(0));
610 
611 	set_bit(bit, res->refmap);
612 }
613 
614 void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
615 				  struct dlm_lock_resource *res, int bit)
616 {
617 	assert_spin_locked(&res->spinlock);
618 
619 	mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
620 	     res->lockname.name, bit, __builtin_return_address(0));
621 
622 	clear_bit(bit, res->refmap);
623 }
624 
625 static void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
626 				   struct dlm_lock_resource *res)
627 {
628 	res->inflight_locks++;
629 
630 	mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
631 	     res->lockname.len, res->lockname.name, res->inflight_locks,
632 	     __builtin_return_address(0));
633 }
634 
635 void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
636 				   struct dlm_lock_resource *res)
637 {
638 	assert_spin_locked(&res->spinlock);
639 	__dlm_lockres_grab_inflight_ref(dlm, res);
640 }
641 
642 void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
643 				   struct dlm_lock_resource *res)
644 {
645 	assert_spin_locked(&res->spinlock);
646 
647 	BUG_ON(res->inflight_locks == 0);
648 
649 	res->inflight_locks--;
650 
651 	mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
652 	     res->lockname.len, res->lockname.name, res->inflight_locks,
653 	     __builtin_return_address(0));
654 
655 	wake_up(&res->wq);
656 }
657 
658 void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm,
659 		struct dlm_lock_resource *res)
660 {
661 	assert_spin_locked(&res->spinlock);
662 	res->inflight_assert_workers++;
663 	mlog(0, "%s:%.*s: inflight assert worker++: now %u\n",
664 			dlm->name, res->lockname.len, res->lockname.name,
665 			res->inflight_assert_workers);
666 }
667 
668 static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
669 		struct dlm_lock_resource *res)
670 {
671 	assert_spin_locked(&res->spinlock);
672 	BUG_ON(res->inflight_assert_workers == 0);
673 	res->inflight_assert_workers--;
674 	mlog(0, "%s:%.*s: inflight assert worker--: now %u\n",
675 			dlm->name, res->lockname.len, res->lockname.name,
676 			res->inflight_assert_workers);
677 }
678 
679 static void dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm,
680 		struct dlm_lock_resource *res)
681 {
682 	spin_lock(&res->spinlock);
683 	__dlm_lockres_drop_inflight_worker(dlm, res);
684 	spin_unlock(&res->spinlock);
685 }
686 
687 /*
688  * lookup a lock resource by name.
689  * may already exist in the hashtable.
690  * lockid is null terminated
691  *
692  * if not, allocate enough for the lockres and for
693  * the temporary structure used in doing the mastering.
694  *
695  * also, do a lookup in the dlm->master_list to see
696  * if another node has begun mastering the same lock.
697  * if so, there should be a block entry in there
698  * for this name, and we should *not* attempt to master
699  * the lock here.   need to wait around for that node
700  * to assert_master (or die).
701  *
702  */
703 struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
704 					  const char *lockid,
705 					  int namelen,
706 					  int flags)
707 {
708 	struct dlm_lock_resource *tmpres=NULL, *res=NULL;
709 	struct dlm_master_list_entry *mle = NULL;
710 	struct dlm_master_list_entry *alloc_mle = NULL;
711 	int blocked = 0;
712 	int ret, nodenum;
713 	struct dlm_node_iter iter;
714 	unsigned int hash;
715 	int tries = 0;
716 	int bit, wait_on_recovery = 0;
717 
718 	BUG_ON(!lockid);
719 
720 	hash = dlm_lockid_hash(lockid, namelen);
721 
722 	mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
723 
724 lookup:
725 	spin_lock(&dlm->spinlock);
726 	tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
727 	if (tmpres) {
728 		spin_unlock(&dlm->spinlock);
729 		spin_lock(&tmpres->spinlock);
730 
731 		/*
732 		 * Right after dlm spinlock was released, dlm_thread could have
733 		 * purged the lockres. Check if lockres got unhashed. If so
734 		 * start over.
735 		 */
736 		if (hlist_unhashed(&tmpres->hash_node)) {
737 			spin_unlock(&tmpres->spinlock);
738 			dlm_lockres_put(tmpres);
739 			tmpres = NULL;
740 			goto lookup;
741 		}
742 
743 		/* Wait on the thread that is mastering the resource */
744 		if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
745 			__dlm_wait_on_lockres(tmpres);
746 			BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
747 			spin_unlock(&tmpres->spinlock);
748 			dlm_lockres_put(tmpres);
749 			tmpres = NULL;
750 			goto lookup;
751 		}
752 
753 		/* Wait on the resource purge to complete before continuing */
754 		if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
755 			BUG_ON(tmpres->owner == dlm->node_num);
756 			__dlm_wait_on_lockres_flags(tmpres,
757 						    DLM_LOCK_RES_DROPPING_REF);
758 			spin_unlock(&tmpres->spinlock);
759 			dlm_lockres_put(tmpres);
760 			tmpres = NULL;
761 			goto lookup;
762 		}
763 
764 		/* Grab inflight ref to pin the resource */
765 		dlm_lockres_grab_inflight_ref(dlm, tmpres);
766 
767 		spin_unlock(&tmpres->spinlock);
768 		if (res) {
769 			spin_lock(&dlm->track_lock);
770 			if (!list_empty(&res->tracking))
771 				list_del_init(&res->tracking);
772 			else
773 				mlog(ML_ERROR, "Resource %.*s not "
774 						"on the Tracking list\n",
775 						res->lockname.len,
776 						res->lockname.name);
777 			spin_unlock(&dlm->track_lock);
778 			dlm_lockres_put(res);
779 		}
780 		res = tmpres;
781 		goto leave;
782 	}
783 
784 	if (!res) {
785 		spin_unlock(&dlm->spinlock);
786 		mlog(0, "allocating a new resource\n");
787 		/* nothing found and we need to allocate one. */
788 		alloc_mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
789 		if (!alloc_mle)
790 			goto leave;
791 		res = dlm_new_lockres(dlm, lockid, namelen);
792 		if (!res)
793 			goto leave;
794 		goto lookup;
795 	}
796 
797 	mlog(0, "no lockres found, allocated our own: %p\n", res);
798 
799 	if (flags & LKM_LOCAL) {
800 		/* caller knows it's safe to assume it's not mastered elsewhere
801 		 * DONE!  return right away */
802 		spin_lock(&res->spinlock);
803 		dlm_change_lockres_owner(dlm, res, dlm->node_num);
804 		__dlm_insert_lockres(dlm, res);
805 		dlm_lockres_grab_inflight_ref(dlm, res);
806 		spin_unlock(&res->spinlock);
807 		spin_unlock(&dlm->spinlock);
808 		/* lockres still marked IN_PROGRESS */
809 		goto wake_waiters;
810 	}
811 
812 	/* check master list to see if another node has started mastering it */
813 	spin_lock(&dlm->master_lock);
814 
815 	/* if we found a block, wait for lock to be mastered by another node */
816 	blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen);
817 	if (blocked) {
818 		int mig;
819 		if (mle->type == DLM_MLE_MASTER) {
820 			mlog(ML_ERROR, "master entry for nonexistent lock!\n");
821 			BUG();
822 		}
823 		mig = (mle->type == DLM_MLE_MIGRATION);
824 		/* if there is a migration in progress, let the migration
825 		 * finish before continuing.  we can wait for the absence
826 		 * of the MIGRATION mle: either the migrate finished or
827 		 * one of the nodes died and the mle was cleaned up.
828 		 * if there is a BLOCK here, but it already has a master
829 		 * set, we are too late.  the master does not have a ref
830 		 * for us in the refmap.  detach the mle and drop it.
831 		 * either way, go back to the top and start over. */
832 		if (mig || mle->master != O2NM_MAX_NODES) {
833 			BUG_ON(mig && mle->master == dlm->node_num);
834 			/* we arrived too late.  the master does not
835 			 * have a ref for us. retry. */
836 			mlog(0, "%s:%.*s: late on %s\n",
837 			     dlm->name, namelen, lockid,
838 			     mig ?  "MIGRATION" : "BLOCK");
839 			spin_unlock(&dlm->master_lock);
840 			spin_unlock(&dlm->spinlock);
841 
842 			/* master is known, detach */
843 			if (!mig)
844 				dlm_mle_detach_hb_events(dlm, mle);
845 			dlm_put_mle(mle);
846 			mle = NULL;
847 			/* this is lame, but we can't wait on either
848 			 * the mle or lockres waitqueue here */
849 			if (mig)
850 				msleep(100);
851 			goto lookup;
852 		}
853 	} else {
854 		/* go ahead and try to master lock on this node */
855 		mle = alloc_mle;
856 		/* make sure this does not get freed below */
857 		alloc_mle = NULL;
858 		dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0);
859 		set_bit(dlm->node_num, mle->maybe_map);
860 		__dlm_insert_mle(dlm, mle);
861 
862 		/* still holding the dlm spinlock, check the recovery map
863 		 * to see if there are any nodes that still need to be
864 		 * considered.  these will not appear in the mle nodemap
865 		 * but they might own this lockres.  wait on them. */
866 		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
867 		if (bit < O2NM_MAX_NODES) {
868 			mlog(0, "%s: res %.*s, At least one node (%d) "
869 			     "to recover before lock mastery can begin\n",
870 			     dlm->name, namelen, (char *)lockid, bit);
871 			wait_on_recovery = 1;
872 		}
873 	}
874 
875 	/* at this point there is either a DLM_MLE_BLOCK or a
876 	 * DLM_MLE_MASTER on the master list, so it's safe to add the
877 	 * lockres to the hashtable.  anyone who finds the lock will
878 	 * still have to wait on the IN_PROGRESS. */
879 
880 	/* finally add the lockres to its hash bucket */
881 	__dlm_insert_lockres(dlm, res);
882 
883 	/* since this lockres is new it doesn't not require the spinlock */
884 	__dlm_lockres_grab_inflight_ref(dlm, res);
885 
886 	/* get an extra ref on the mle in case this is a BLOCK
887 	 * if so, the creator of the BLOCK may try to put the last
888 	 * ref at this time in the assert master handler, so we
889 	 * need an extra one to keep from a bad ptr deref. */
890 	dlm_get_mle_inuse(mle);
891 	spin_unlock(&dlm->master_lock);
892 	spin_unlock(&dlm->spinlock);
893 
894 redo_request:
895 	while (wait_on_recovery) {
896 		/* any cluster changes that occurred after dropping the
897 		 * dlm spinlock would be detectable be a change on the mle,
898 		 * so we only need to clear out the recovery map once. */
899 		if (dlm_is_recovery_lock(lockid, namelen)) {
900 			mlog(0, "%s: Recovery map is not empty, but must "
901 			     "master $RECOVERY lock now\n", dlm->name);
902 			if (!dlm_pre_master_reco_lockres(dlm, res))
903 				wait_on_recovery = 0;
904 			else {
905 				mlog(0, "%s: waiting 500ms for heartbeat state "
906 				    "change\n", dlm->name);
907 				msleep(500);
908 			}
909 			continue;
910 		}
911 
912 		dlm_kick_recovery_thread(dlm);
913 		msleep(1000);
914 		dlm_wait_for_recovery(dlm);
915 
916 		spin_lock(&dlm->spinlock);
917 		bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
918 		if (bit < O2NM_MAX_NODES) {
919 			mlog(0, "%s: res %.*s, At least one node (%d) "
920 			     "to recover before lock mastery can begin\n",
921 			     dlm->name, namelen, (char *)lockid, bit);
922 			wait_on_recovery = 1;
923 		} else
924 			wait_on_recovery = 0;
925 		spin_unlock(&dlm->spinlock);
926 
927 		if (wait_on_recovery)
928 			dlm_wait_for_node_recovery(dlm, bit, 10000);
929 	}
930 
931 	/* must wait for lock to be mastered elsewhere */
932 	if (blocked)
933 		goto wait;
934 
935 	ret = -EINVAL;
936 	dlm_node_iter_init(mle->vote_map, &iter);
937 	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
938 		ret = dlm_do_master_request(res, mle, nodenum);
939 		if (ret < 0)
940 			mlog_errno(ret);
941 		if (mle->master != O2NM_MAX_NODES) {
942 			/* found a master ! */
943 			if (mle->master <= nodenum)
944 				break;
945 			/* if our master request has not reached the master
946 			 * yet, keep going until it does.  this is how the
947 			 * master will know that asserts are needed back to
948 			 * the lower nodes. */
949 			mlog(0, "%s: res %.*s, Requests only up to %u but "
950 			     "master is %u, keep going\n", dlm->name, namelen,
951 			     lockid, nodenum, mle->master);
952 		}
953 	}
954 
955 wait:
956 	/* keep going until the response map includes all nodes */
957 	ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
958 	if (ret < 0) {
959 		wait_on_recovery = 1;
960 		mlog(0, "%s: res %.*s, Node map changed, redo the master "
961 		     "request now, blocked=%d\n", dlm->name, res->lockname.len,
962 		     res->lockname.name, blocked);
963 		if (++tries > 20) {
964 			mlog(ML_ERROR, "%s: res %.*s, Spinning on "
965 			     "dlm_wait_for_lock_mastery, blocked = %d\n",
966 			     dlm->name, res->lockname.len,
967 			     res->lockname.name, blocked);
968 			dlm_print_one_lock_resource(res);
969 			dlm_print_one_mle(mle);
970 			tries = 0;
971 		}
972 		goto redo_request;
973 	}
974 
975 	mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
976 	     res->lockname.name, res->owner);
977 	/* make sure we never continue without this */
978 	BUG_ON(res->owner == O2NM_MAX_NODES);
979 
980 	/* master is known, detach if not already detached */
981 	dlm_mle_detach_hb_events(dlm, mle);
982 	dlm_put_mle(mle);
983 	/* put the extra ref */
984 	dlm_put_mle_inuse(mle);
985 
986 wake_waiters:
987 	spin_lock(&res->spinlock);
988 	res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
989 	spin_unlock(&res->spinlock);
990 	wake_up(&res->wq);
991 
992 leave:
993 	/* need to free the unused mle */
994 	if (alloc_mle)
995 		kmem_cache_free(dlm_mle_cache, alloc_mle);
996 
997 	return res;
998 }
999 
1000 
/* upper bound, in milliseconds, for one sleep on mle->wq in
 * dlm_wait_for_lock_mastery() before the state is rechecked */
#define DLM_MASTERY_TIMEOUT_MS   5000

/*
 * Wait until the owner of @res has been decided.
 *
 * @dlm:     domain the lock resource belongs to
 * @res:     lock resource whose owner is being determined
 * @mle:     master list entry tracking the votes for @res
 * @blocked: in/out; nonzero while @mle is of type DLM_MLE_BLOCK.  It is
 *           refreshed from mle->type whenever the node map changed.
 *
 * Each pass ("recheck") does the following:
 *  - if res->owner is already known, optionally sends a master request to
 *    that owner and returns 0;
 *  - if mle->vote_map no longer matches mle->node_map, calls
 *    dlm_restart_lock_mastery() and either returns its negative result or
 *    rechecks;
 *  - if no master has been recorded in the mle, sleeps on mle->wq unless
 *    voting is complete, this node is not blocked and this node's number is
 *    not above the lowest bit in mle->maybe_map, in which case this node
 *    asserts mastery itself;
 *  - finally records the master as the owner of @res.
 *
 * Returns 0 once the owner of @res is known, or the negative value returned
 * by dlm_restart_lock_mastery() when the mastery attempt must be redone.
 */
static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
				     struct dlm_lock_resource *res,
				     struct dlm_master_list_entry *mle,
				     int *blocked)
{
	u8 m;
	int ret, bit;
	int map_changed, voting_done;
	int assert, sleep;

recheck:
	ret = 0;
	assert = 0;

	/* check if another node has already become the owner */
	spin_lock(&res->spinlock);
	if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
		mlog(0, "%s:%.*s: owner is suddenly %u\n", dlm->name,
		     res->lockname.len, res->lockname.name, res->owner);
		spin_unlock(&res->spinlock);
		/* this will cause the master to re-assert across
		 * the whole cluster, freeing up mles */
		/* NOTE(review): res->owner is re-read below without
		 * res->spinlock held - confirm it cannot change here */
		if (res->owner != dlm->node_num) {
			ret = dlm_do_master_request(res, mle, res->owner);
			if (ret < 0) {
				/* give recovery a chance to run */
				mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
				msleep(500);
				goto recheck;
			}
		}
		ret = 0;
		goto leave;
	}
	spin_unlock(&res->spinlock);

	/* snapshot the voting state under the mle lock */
	spin_lock(&mle->spinlock);
	m = mle->master;
	/* the set of nodes we are voting with differs from the live set */
	map_changed = (memcmp(mle->vote_map, mle->node_map,
			      sizeof(mle->vote_map)) != 0);
	/* every node we are voting with has answered */
	voting_done = (memcmp(mle->vote_map, mle->response_map,
			     sizeof(mle->vote_map)) == 0);

	/* restart if we hit any errors */
	if (map_changed) {
		int b;
		mlog(0, "%s: %.*s: node map changed, restarting\n",
		     dlm->name, res->lockname.len, res->lockname.name);
		ret = dlm_restart_lock_mastery(dlm, res, mle, *blocked);
		/* the restart may have turned a BLOCK mle into a MASTER
		 * mle; report the new state back to the caller */
		b = (mle->type == DLM_MLE_BLOCK);
		if ((*blocked && !b) || (!*blocked && b)) {
			mlog(0, "%s:%.*s: status change: old=%d new=%d\n",
			     dlm->name, res->lockname.len, res->lockname.name,
			     *blocked, b);
			*blocked = b;
		}
		spin_unlock(&mle->spinlock);
		if (ret < 0) {
			mlog_errno(ret);
			goto leave;
		}
		mlog(0, "%s:%.*s: restart lock mastery succeeded, "
		     "rechecking now\n", dlm->name, res->lockname.len,
		     res->lockname.name);
		goto recheck;
	} else {
		if (!voting_done) {
			mlog(0, "map not changed and voting not done "
			     "for %s:%.*s\n", dlm->name, res->lockname.len,
			     res->lockname.name);
		}
	}

	if (m != O2NM_MAX_NODES) {
		/* another node has done an assert!
		 * all done! */
		sleep = 0;
	} else {
		sleep = 1;
		/* have all nodes responded? */
		if (voting_done && !*blocked) {
			/* lowest node number that answered MAYBE */
			bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
			if (dlm->node_num <= bit) {
				/* my node number is lowest.
				 * now tell other nodes that I am
				 * mastering this. */
				mle->master = dlm->node_num;
				/* ref was grabbed in get_lock_resource
				 * will be dropped in dlmlock_master */
				assert = 1;
				sleep = 0;
			}
			/* if voting is done, but we have not received
			 * an assert master yet, we must sleep */
		}
	}

	spin_unlock(&mle->spinlock);

	/* sleep if we haven't finished voting yet */
	if (sleep) {
		unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS);
		/* cleared here; the wait ends when it reads 1 again or
		 * when the timeout expires */
		atomic_set(&mle->woken, 0);
		(void)wait_event_timeout(mle->wq,
					 (atomic_read(&mle->woken) == 1),
					 timeo);
		/* NOTE(review): res->owner is read without res->spinlock */
		if (res->owner == O2NM_MAX_NODES) {
			mlog(0, "%s:%.*s: waiting again\n", dlm->name,
			     res->lockname.len, res->lockname.name);
			goto recheck;
		}
		mlog(0, "done waiting, master is %u\n", res->owner);
		ret = 0;
		goto leave;
	}

	ret = 0;   /* done */
	if (assert) {
		/* this node won: announce it to every node we voted with */
		m = dlm->node_num;
		mlog(0, "about to master %.*s here, this=%u\n",
		     res->lockname.len, res->lockname.name, m);
		ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0);
		if (ret) {
			/* This is a failure in the network path,
			 * not in the response to the assert_master
			 * (any nonzero response is a BUG on this node).
			 * Most likely a socket just got disconnected
			 * due to node death. */
			mlog_errno(ret);
		}
		/* no longer need to restart lock mastery.
		 * all living nodes have been contacted. */
		ret = 0;
	}

	/* set the lockres owner */
	spin_lock(&res->spinlock);
	/* mastery reference obtained either during
	 * assert_master_handler or in get_lock_resource */
	dlm_change_lockres_owner(dlm, res, m);
	spin_unlock(&res->spinlock);

leave:
	return ret;
}
1148 
/*
 * Iterator over the bit positions at which two node bitmaps differ.
 * Set up by dlm_bitmap_diff_iter_init(), advanced by
 * dlm_bitmap_diff_iter_next().
 */
struct dlm_bitmap_diff_iter
{
	/* last bit returned; -1 before the first call to _next(),
	 * O2NM_MAX_NODES once the iteration is exhausted */
	int curnode;
	/* the "before" bitmap; consulted to tell NODE_DOWN from NODE_UP */
	unsigned long *orig_bm;
	/* the "after" bitmap */
	unsigned long *cur_bm;
	/* bits that are set in exactly one of orig_bm and cur_bm */
	unsigned long diff_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
};
1156 
/*
 * Direction of a node's state change as reported by
 * dlm_bitmap_diff_iter_next().
 */
enum dlm_node_state_change
{
	/* bit was set in the original bitmap and differs now */
	NODE_DOWN = -1,
	/* not produced by dlm_bitmap_diff_iter_next() */
	NODE_NO_CHANGE = 0,
	/* bit was clear in the original bitmap and differs now */
	NODE_UP
};
1163 
1164 static void dlm_bitmap_diff_iter_init(struct dlm_bitmap_diff_iter *iter,
1165 				      unsigned long *orig_bm,
1166 				      unsigned long *cur_bm)
1167 {
1168 	unsigned long p1, p2;
1169 	int i;
1170 
1171 	iter->curnode = -1;
1172 	iter->orig_bm = orig_bm;
1173 	iter->cur_bm = cur_bm;
1174 
1175 	for (i = 0; i < BITS_TO_LONGS(O2NM_MAX_NODES); i++) {
1176        		p1 = *(iter->orig_bm + i);
1177 	       	p2 = *(iter->cur_bm + i);
1178 		iter->diff_bm[i] = (p1 & ~p2) | (p2 & ~p1);
1179 	}
1180 }
1181 
1182 static int dlm_bitmap_diff_iter_next(struct dlm_bitmap_diff_iter *iter,
1183 				     enum dlm_node_state_change *state)
1184 {
1185 	int bit;
1186 
1187 	if (iter->curnode >= O2NM_MAX_NODES)
1188 		return -ENOENT;
1189 
1190 	bit = find_next_bit(iter->diff_bm, O2NM_MAX_NODES,
1191 			    iter->curnode+1);
1192 	if (bit >= O2NM_MAX_NODES) {
1193 		iter->curnode = O2NM_MAX_NODES;
1194 		return -ENOENT;
1195 	}
1196 
1197 	/* if it was there in the original then this node died */
1198 	if (test_bit(bit, iter->orig_bm))
1199 		*state = NODE_DOWN;
1200 	else
1201 		*state = NODE_UP;
1202 
1203 	iter->curnode = bit;
1204 	return bit;
1205 }
1206 
1207 
1208 static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
1209 				    struct dlm_lock_resource *res,
1210 				    struct dlm_master_list_entry *mle,
1211 				    int blocked)
1212 {
1213 	struct dlm_bitmap_diff_iter bdi;
1214 	enum dlm_node_state_change sc;
1215 	int node;
1216 	int ret = 0;
1217 
1218 	mlog(0, "something happened such that the "
1219 	     "master process may need to be restarted!\n");
1220 
1221 	assert_spin_locked(&mle->spinlock);
1222 
1223 	dlm_bitmap_diff_iter_init(&bdi, mle->vote_map, mle->node_map);
1224 	node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1225 	while (node >= 0) {
1226 		if (sc == NODE_UP) {
1227 			/* a node came up.  clear any old vote from
1228 			 * the response map and set it in the vote map
1229 			 * then restart the mastery. */
1230 			mlog(ML_NOTICE, "node %d up while restarting\n", node);
1231 
1232 			/* redo the master request, but only for the new node */
1233 			mlog(0, "sending request to new node\n");
1234 			clear_bit(node, mle->response_map);
1235 			set_bit(node, mle->vote_map);
1236 		} else {
1237 			mlog(ML_ERROR, "node down! %d\n", node);
1238 			if (blocked) {
1239 				int lowest = find_next_bit(mle->maybe_map,
1240 						       O2NM_MAX_NODES, 0);
1241 
1242 				/* act like it was never there */
1243 				clear_bit(node, mle->maybe_map);
1244 
1245 			       	if (node == lowest) {
1246 					mlog(0, "expected master %u died"
1247 					    " while this node was blocked "
1248 					    "waiting on it!\n", node);
1249 					lowest = find_next_bit(mle->maybe_map,
1250 						       	O2NM_MAX_NODES,
1251 						       	lowest+1);
1252 					if (lowest < O2NM_MAX_NODES) {
1253 						mlog(0, "%s:%.*s:still "
1254 						     "blocked. waiting on %u "
1255 						     "now\n", dlm->name,
1256 						     res->lockname.len,
1257 						     res->lockname.name,
1258 						     lowest);
1259 					} else {
1260 						/* mle is an MLE_BLOCK, but
1261 						 * there is now nothing left to
1262 						 * block on.  we need to return
1263 						 * all the way back out and try
1264 						 * again with an MLE_MASTER.
1265 						 * dlm_do_local_recovery_cleanup
1266 						 * has already run, so the mle
1267 						 * refcount is ok */
1268 						mlog(0, "%s:%.*s: no "
1269 						     "longer blocking. try to "
1270 						     "master this here\n",
1271 						     dlm->name,
1272 						     res->lockname.len,
1273 						     res->lockname.name);
1274 						mle->type = DLM_MLE_MASTER;
1275 						mle->mleres = res;
1276 					}
1277 				}
1278 			}
1279 
1280 			/* now blank out everything, as if we had never
1281 			 * contacted anyone */
1282 			memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
1283 			memset(mle->response_map, 0, sizeof(mle->response_map));
1284 			/* reset the vote_map to the current node_map */
1285 			memcpy(mle->vote_map, mle->node_map,
1286 			       sizeof(mle->node_map));
1287 			/* put myself into the maybe map */
1288 			if (mle->type != DLM_MLE_BLOCK)
1289 				set_bit(dlm->node_num, mle->maybe_map);
1290 		}
1291 		ret = -EAGAIN;
1292 		node = dlm_bitmap_diff_iter_next(&bdi, &sc);
1293 	}
1294 	return ret;
1295 }
1296 
1297 
1298 /*
1299  * DLM_MASTER_REQUEST_MSG
1300  *
1301  * returns: 0 on success,
1302  *          -errno on a network error
1303  *
1304  * on error, the caller should assume the target node is "dead"
1305  *
1306  */
1307 
1308 static int dlm_do_master_request(struct dlm_lock_resource *res,
1309 				 struct dlm_master_list_entry *mle, int to)
1310 {
1311 	struct dlm_ctxt *dlm = mle->dlm;
1312 	struct dlm_master_request request;
1313 	int ret, response=0, resend;
1314 
1315 	memset(&request, 0, sizeof(request));
1316 	request.node_idx = dlm->node_num;
1317 
1318 	BUG_ON(mle->type == DLM_MLE_MIGRATION);
1319 
1320 	request.namelen = (u8)mle->mnamelen;
1321 	memcpy(request.name, mle->mname, request.namelen);
1322 
1323 again:
1324 	ret = o2net_send_message(DLM_MASTER_REQUEST_MSG, dlm->key, &request,
1325 				 sizeof(request), to, &response);
1326 	if (ret < 0)  {
1327 		if (ret == -ESRCH) {
1328 			/* should never happen */
1329 			mlog(ML_ERROR, "TCP stack not ready!\n");
1330 			BUG();
1331 		} else if (ret == -EINVAL) {
1332 			mlog(ML_ERROR, "bad args passed to o2net!\n");
1333 			BUG();
1334 		} else if (ret == -ENOMEM) {
1335 			mlog(ML_ERROR, "out of memory while trying to send "
1336 			     "network message!  retrying\n");
1337 			/* this is totally crude */
1338 			msleep(50);
1339 			goto again;
1340 		} else if (!dlm_is_host_down(ret)) {
1341 			/* not a network error. bad. */
1342 			mlog_errno(ret);
1343 			mlog(ML_ERROR, "unhandled error!");
1344 			BUG();
1345 		}
1346 		/* all other errors should be network errors,
1347 		 * and likely indicate node death */
1348 		mlog(ML_ERROR, "link to %d went down!\n", to);
1349 		goto out;
1350 	}
1351 
1352 	ret = 0;
1353 	resend = 0;
1354 	spin_lock(&mle->spinlock);
1355 	switch (response) {
1356 		case DLM_MASTER_RESP_YES:
1357 			set_bit(to, mle->response_map);
1358 			mlog(0, "node %u is the master, response=YES\n", to);
1359 			mlog(0, "%s:%.*s: master node %u now knows I have a "
1360 			     "reference\n", dlm->name, res->lockname.len,
1361 			     res->lockname.name, to);
1362 			mle->master = to;
1363 			break;
1364 		case DLM_MASTER_RESP_NO:
1365 			mlog(0, "node %u not master, response=NO\n", to);
1366 			set_bit(to, mle->response_map);
1367 			break;
1368 		case DLM_MASTER_RESP_MAYBE:
1369 			mlog(0, "node %u not master, response=MAYBE\n", to);
1370 			set_bit(to, mle->response_map);
1371 			set_bit(to, mle->maybe_map);
1372 			break;
1373 		case DLM_MASTER_RESP_ERROR:
1374 			mlog(0, "node %u hit an error, resending\n", to);
1375 			resend = 1;
1376 			response = 0;
1377 			break;
1378 		default:
1379 			mlog(ML_ERROR, "bad response! %u\n", response);
1380 			BUG();
1381 	}
1382 	spin_unlock(&mle->spinlock);
1383 	if (resend) {
1384 		/* this is also totally crude */
1385 		msleep(50);
1386 		goto again;
1387 	}
1388 
1389 out:
1390 	return ret;
1391 }
1392 
1393 /*
1394  * locks that can be taken here:
1395  * dlm->spinlock
1396  * res->spinlock
1397  * mle->spinlock
1398  * dlm->master_list
1399  *
1400  * if possible, TRIM THIS DOWN!!!
1401  */
1402 int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
1403 			       void **ret_data)
1404 {
1405 	u8 response = DLM_MASTER_RESP_MAYBE;
1406 	struct dlm_ctxt *dlm = data;
1407 	struct dlm_lock_resource *res = NULL;
1408 	struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
1409 	struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
1410 	char *name;
1411 	unsigned int namelen, hash;
1412 	int found, ret;
1413 	int set_maybe;
1414 	int dispatch_assert = 0;
1415 	int dispatched = 0;
1416 
1417 	if (!dlm_grab(dlm))
1418 		return DLM_MASTER_RESP_NO;
1419 
1420 	if (!dlm_domain_fully_joined(dlm)) {
1421 		response = DLM_MASTER_RESP_NO;
1422 		goto send_response;
1423 	}
1424 
1425 	name = request->name;
1426 	namelen = request->namelen;
1427 	hash = dlm_lockid_hash(name, namelen);
1428 
1429 	if (namelen > DLM_LOCKID_NAME_MAX) {
1430 		response = DLM_IVBUFLEN;
1431 		goto send_response;
1432 	}
1433 
1434 way_up_top:
1435 	spin_lock(&dlm->spinlock);
1436 	res = __dlm_lookup_lockres(dlm, name, namelen, hash);
1437 	if (res) {
1438 		spin_unlock(&dlm->spinlock);
1439 
1440 		/* take care of the easy cases up front */
1441 		spin_lock(&res->spinlock);
1442 
1443 		/*
1444 		 * Right after dlm spinlock was released, dlm_thread could have
1445 		 * purged the lockres. Check if lockres got unhashed. If so
1446 		 * start over.
1447 		 */
1448 		if (hlist_unhashed(&res->hash_node)) {
1449 			spin_unlock(&res->spinlock);
1450 			dlm_lockres_put(res);
1451 			goto way_up_top;
1452 		}
1453 
1454 		if (res->state & (DLM_LOCK_RES_RECOVERING|
1455 				  DLM_LOCK_RES_MIGRATING)) {
1456 			spin_unlock(&res->spinlock);
1457 			mlog(0, "returning DLM_MASTER_RESP_ERROR since res is "
1458 			     "being recovered/migrated\n");
1459 			response = DLM_MASTER_RESP_ERROR;
1460 			if (mle)
1461 				kmem_cache_free(dlm_mle_cache, mle);
1462 			goto send_response;
1463 		}
1464 
1465 		if (res->owner == dlm->node_num) {
1466 			dlm_lockres_set_refmap_bit(dlm, res, request->node_idx);
1467 			spin_unlock(&res->spinlock);
1468 			response = DLM_MASTER_RESP_YES;
1469 			if (mle)
1470 				kmem_cache_free(dlm_mle_cache, mle);
1471 
1472 			/* this node is the owner.
1473 			 * there is some extra work that needs to
1474 			 * happen now.  the requesting node has
1475 			 * caused all nodes up to this one to
1476 			 * create mles.  this node now needs to
1477 			 * go back and clean those up. */
1478 			dispatch_assert = 1;
1479 			goto send_response;
1480 		} else if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1481 			spin_unlock(&res->spinlock);
1482 			// mlog(0, "node %u is the master\n", res->owner);
1483 			response = DLM_MASTER_RESP_NO;
1484 			if (mle)
1485 				kmem_cache_free(dlm_mle_cache, mle);
1486 			goto send_response;
1487 		}
1488 
1489 		/* ok, there is no owner.  either this node is
1490 		 * being blocked, or it is actively trying to
1491 		 * master this lock. */
1492 		if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
1493 			mlog(ML_ERROR, "lock with no owner should be "
1494 			     "in-progress!\n");
1495 			BUG();
1496 		}
1497 
1498 		// mlog(0, "lockres is in progress...\n");
1499 		spin_lock(&dlm->master_lock);
1500 		found = dlm_find_mle(dlm, &tmpmle, name, namelen);
1501 		if (!found) {
1502 			mlog(ML_ERROR, "no mle found for this lock!\n");
1503 			BUG();
1504 		}
1505 		set_maybe = 1;
1506 		spin_lock(&tmpmle->spinlock);
1507 		if (tmpmle->type == DLM_MLE_BLOCK) {
1508 			// mlog(0, "this node is waiting for "
1509 			// "lockres to be mastered\n");
1510 			response = DLM_MASTER_RESP_NO;
1511 		} else if (tmpmle->type == DLM_MLE_MIGRATION) {
1512 			mlog(0, "node %u is master, but trying to migrate to "
1513 			     "node %u.\n", tmpmle->master, tmpmle->new_master);
1514 			if (tmpmle->master == dlm->node_num) {
1515 				mlog(ML_ERROR, "no owner on lockres, but this "
1516 				     "node is trying to migrate it to %u?!\n",
1517 				     tmpmle->new_master);
1518 				BUG();
1519 			} else {
1520 				/* the real master can respond on its own */
1521 				response = DLM_MASTER_RESP_NO;
1522 			}
1523 		} else if (tmpmle->master != DLM_LOCK_RES_OWNER_UNKNOWN) {
1524 			set_maybe = 0;
1525 			if (tmpmle->master == dlm->node_num) {
1526 				response = DLM_MASTER_RESP_YES;
1527 				/* this node will be the owner.
1528 				 * go back and clean the mles on any
1529 				 * other nodes */
1530 				dispatch_assert = 1;
1531 				dlm_lockres_set_refmap_bit(dlm, res,
1532 							   request->node_idx);
1533 			} else
1534 				response = DLM_MASTER_RESP_NO;
1535 		} else {
1536 			// mlog(0, "this node is attempting to "
1537 			// "master lockres\n");
1538 			response = DLM_MASTER_RESP_MAYBE;
1539 		}
1540 		if (set_maybe)
1541 			set_bit(request->node_idx, tmpmle->maybe_map);
1542 		spin_unlock(&tmpmle->spinlock);
1543 
1544 		spin_unlock(&dlm->master_lock);
1545 		spin_unlock(&res->spinlock);
1546 
1547 		/* keep the mle attached to heartbeat events */
1548 		dlm_put_mle(tmpmle);
1549 		if (mle)
1550 			kmem_cache_free(dlm_mle_cache, mle);
1551 		goto send_response;
1552 	}
1553 
1554 	/*
1555 	 * lockres doesn't exist on this node
1556 	 * if there is an MLE_BLOCK, return NO
1557 	 * if there is an MLE_MASTER, return MAYBE
1558 	 * otherwise, add an MLE_BLOCK, return NO
1559 	 */
1560 	spin_lock(&dlm->master_lock);
1561 	found = dlm_find_mle(dlm, &tmpmle, name, namelen);
1562 	if (!found) {
1563 		/* this lockid has never been seen on this node yet */
1564 		// mlog(0, "no mle found\n");
1565 		if (!mle) {
1566 			spin_unlock(&dlm->master_lock);
1567 			spin_unlock(&dlm->spinlock);
1568 
1569 			mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
1570 			if (!mle) {
1571 				response = DLM_MASTER_RESP_ERROR;
1572 				mlog_errno(-ENOMEM);
1573 				goto send_response;
1574 			}
1575 			goto way_up_top;
1576 		}
1577 
1578 		// mlog(0, "this is second time thru, already allocated, "
1579 		// "add the block.\n");
1580 		dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
1581 		set_bit(request->node_idx, mle->maybe_map);
1582 		__dlm_insert_mle(dlm, mle);
1583 		response = DLM_MASTER_RESP_NO;
1584 	} else {
1585 		spin_lock(&tmpmle->spinlock);
1586 		if (tmpmle->master == dlm->node_num) {
1587 			mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n");
1588 			BUG();
1589 		}
1590 		if (tmpmle->type == DLM_MLE_BLOCK)
1591 			response = DLM_MASTER_RESP_NO;
1592 		else if (tmpmle->type == DLM_MLE_MIGRATION) {
1593 			mlog(0, "migration mle was found (%u->%u)\n",
1594 			     tmpmle->master, tmpmle->new_master);
1595 			/* real master can respond on its own */
1596 			response = DLM_MASTER_RESP_NO;
1597 		} else
1598 			response = DLM_MASTER_RESP_MAYBE;
1599 		set_bit(request->node_idx, tmpmle->maybe_map);
1600 		spin_unlock(&tmpmle->spinlock);
1601 	}
1602 	spin_unlock(&dlm->master_lock);
1603 	spin_unlock(&dlm->spinlock);
1604 
1605 	if (found) {
1606 		/* keep the mle attached to heartbeat events */
1607 		dlm_put_mle(tmpmle);
1608 	}
1609 send_response:
1610 	/*
1611 	 * __dlm_lookup_lockres() grabbed a reference to this lockres.
1612 	 * The reference is released by dlm_assert_master_worker() under
1613 	 * the call to dlm_dispatch_assert_master().  If
1614 	 * dlm_assert_master_worker() isn't called, we drop it here.
1615 	 */
1616 	if (dispatch_assert) {
1617 		mlog(0, "%u is the owner of %.*s, cleaning everyone else\n",
1618 			     dlm->node_num, res->lockname.len, res->lockname.name);
1619 		spin_lock(&res->spinlock);
1620 		ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
1621 						 DLM_ASSERT_MASTER_MLE_CLEANUP);
1622 		if (ret < 0) {
1623 			mlog(ML_ERROR, "failed to dispatch assert master work\n");
1624 			response = DLM_MASTER_RESP_ERROR;
1625 			spin_unlock(&res->spinlock);
1626 			dlm_lockres_put(res);
1627 		} else {
1628 			dispatched = 1;
1629 			__dlm_lockres_grab_inflight_worker(dlm, res);
1630 			spin_unlock(&res->spinlock);
1631 		}
1632 	} else {
1633 		if (res)
1634 			dlm_lockres_put(res);
1635 	}
1636 
1637 	if (!dispatched)
1638 		dlm_put(dlm);
1639 	return response;
1640 }
1641 
1642 /*
1643  * DLM_ASSERT_MASTER_MSG
1644  */
1645 
1646 
1647 /*
1648  * NOTE: this can be used for debugging
1649  * can periodically run all locks owned by this node
1650  * and re-assert across the cluster...
1651  */
1652 static int dlm_do_assert_master(struct dlm_ctxt *dlm,
1653 				struct dlm_lock_resource *res,
1654 				void *nodemap, u32 flags)
1655 {
1656 	struct dlm_assert_master assert;
1657 	int to, tmpret;
1658 	struct dlm_node_iter iter;
1659 	int ret = 0;
1660 	int reassert;
1661 	const char *lockname = res->lockname.name;
1662 	unsigned int namelen = res->lockname.len;
1663 
1664 	BUG_ON(namelen > O2NM_MAX_NAME_LEN);
1665 
1666 	spin_lock(&res->spinlock);
1667 	res->state |= DLM_LOCK_RES_SETREF_INPROG;
1668 	spin_unlock(&res->spinlock);
1669 
1670 again:
1671 	reassert = 0;
1672 
1673 	/* note that if this nodemap is empty, it returns 0 */
1674 	dlm_node_iter_init(nodemap, &iter);
1675 	while ((to = dlm_node_iter_next(&iter)) >= 0) {
1676 		int r = 0;
1677 		struct dlm_master_list_entry *mle = NULL;
1678 
1679 		mlog(0, "sending assert master to %d (%.*s)\n", to,
1680 		     namelen, lockname);
1681 		memset(&assert, 0, sizeof(assert));
1682 		assert.node_idx = dlm->node_num;
1683 		assert.namelen = namelen;
1684 		memcpy(assert.name, lockname, namelen);
1685 		assert.flags = cpu_to_be32(flags);
1686 
1687 		tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
1688 					    &assert, sizeof(assert), to, &r);
1689 		if (tmpret < 0) {
1690 			mlog(ML_ERROR, "Error %d when sending message %u (key "
1691 			     "0x%x) to node %u\n", tmpret,
1692 			     DLM_ASSERT_MASTER_MSG, dlm->key, to);
1693 			if (!dlm_is_host_down(tmpret)) {
1694 				mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
1695 				BUG();
1696 			}
1697 			/* a node died.  finish out the rest of the nodes. */
1698 			mlog(0, "link to %d went down!\n", to);
1699 			/* any nonzero status return will do */
1700 			ret = tmpret;
1701 			r = 0;
1702 		} else if (r < 0) {
1703 			/* ok, something horribly messed.  kill thyself. */
1704 			mlog(ML_ERROR,"during assert master of %.*s to %u, "
1705 			     "got %d.\n", namelen, lockname, to, r);
1706 			spin_lock(&dlm->spinlock);
1707 			spin_lock(&dlm->master_lock);
1708 			if (dlm_find_mle(dlm, &mle, (char *)lockname,
1709 					 namelen)) {
1710 				dlm_print_one_mle(mle);
1711 				__dlm_put_mle(mle);
1712 			}
1713 			spin_unlock(&dlm->master_lock);
1714 			spin_unlock(&dlm->spinlock);
1715 			BUG();
1716 		}
1717 
1718 		if (r & DLM_ASSERT_RESPONSE_REASSERT &&
1719 		    !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) {
1720 				mlog(ML_ERROR, "%.*s: very strange, "
1721 				     "master MLE but no lockres on %u\n",
1722 				     namelen, lockname, to);
1723 		}
1724 
1725 		if (r & DLM_ASSERT_RESPONSE_REASSERT) {
1726 			mlog(0, "%.*s: node %u create mles on other "
1727 			     "nodes and requests a re-assert\n",
1728 			     namelen, lockname, to);
1729 			reassert = 1;
1730 		}
1731 		if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) {
1732 			mlog(0, "%.*s: node %u has a reference to this "
1733 			     "lockres, set the bit in the refmap\n",
1734 			     namelen, lockname, to);
1735 			spin_lock(&res->spinlock);
1736 			dlm_lockres_set_refmap_bit(dlm, res, to);
1737 			spin_unlock(&res->spinlock);
1738 		}
1739 	}
1740 
1741 	if (reassert)
1742 		goto again;
1743 
1744 	spin_lock(&res->spinlock);
1745 	res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
1746 	spin_unlock(&res->spinlock);
1747 	wake_up(&res->wq);
1748 
1749 	return ret;
1750 }
1751 
1752 /*
1753  * locks that can be taken here:
1754  * dlm->spinlock
1755  * res->spinlock
1756  * mle->spinlock
1757  * dlm->master_list
1758  *
1759  * if possible, TRIM THIS DOWN!!!
1760  */
1761 int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data,
1762 			      void **ret_data)
1763 {
1764 	struct dlm_ctxt *dlm = data;
1765 	struct dlm_master_list_entry *mle = NULL;
1766 	struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
1767 	struct dlm_lock_resource *res = NULL;
1768 	char *name;
1769 	unsigned int namelen, hash;
1770 	u32 flags;
1771 	int master_request = 0, have_lockres_ref = 0;
1772 	int ret = 0;
1773 
1774 	if (!dlm_grab(dlm))
1775 		return 0;
1776 
1777 	name = assert->name;
1778 	namelen = assert->namelen;
1779 	hash = dlm_lockid_hash(name, namelen);
1780 	flags = be32_to_cpu(assert->flags);
1781 
1782 	if (namelen > DLM_LOCKID_NAME_MAX) {
1783 		mlog(ML_ERROR, "Invalid name length!");
1784 		goto done;
1785 	}
1786 
1787 	spin_lock(&dlm->spinlock);
1788 
1789 	if (flags)
1790 		mlog(0, "assert_master with flags: %u\n", flags);
1791 
1792 	/* find the MLE */
1793 	spin_lock(&dlm->master_lock);
1794 	if (!dlm_find_mle(dlm, &mle, name, namelen)) {
1795 		/* not an error, could be master just re-asserting */
1796 		mlog(0, "just got an assert_master from %u, but no "
1797 		     "MLE for it! (%.*s)\n", assert->node_idx,
1798 		     namelen, name);
1799 	} else {
1800 		int bit = find_next_bit (mle->maybe_map, O2NM_MAX_NODES, 0);
1801 		if (bit >= O2NM_MAX_NODES) {
1802 			/* not necessarily an error, though less likely.
1803 			 * could be master just re-asserting. */
1804 			mlog(0, "no bits set in the maybe_map, but %u "
1805 			     "is asserting! (%.*s)\n", assert->node_idx,
1806 			     namelen, name);
1807 		} else if (bit != assert->node_idx) {
1808 			if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
1809 				mlog(0, "master %u was found, %u should "
1810 				     "back off\n", assert->node_idx, bit);
1811 			} else {
1812 				/* with the fix for bug 569, a higher node
1813 				 * number winning the mastery will respond
1814 				 * YES to mastery requests, but this node
1815 				 * had no way of knowing.  let it pass. */
1816 				mlog(0, "%u is the lowest node, "
1817 				     "%u is asserting. (%.*s)  %u must "
1818 				     "have begun after %u won.\n", bit,
1819 				     assert->node_idx, namelen, name, bit,
1820 				     assert->node_idx);
1821 			}
1822 		}
1823 		if (mle->type == DLM_MLE_MIGRATION) {
1824 			if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
1825 				mlog(0, "%s:%.*s: got cleanup assert"
1826 				     " from %u for migration\n",
1827 				     dlm->name, namelen, name,
1828 				     assert->node_idx);
1829 			} else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) {
1830 				mlog(0, "%s:%.*s: got unrelated assert"
1831 				     " from %u for migration, ignoring\n",
1832 				     dlm->name, namelen, name,
1833 				     assert->node_idx);
1834 				__dlm_put_mle(mle);
1835 				spin_unlock(&dlm->master_lock);
1836 				spin_unlock(&dlm->spinlock);
1837 				goto done;
1838 			}
1839 		}
1840 	}
1841 	spin_unlock(&dlm->master_lock);
1842 
1843 	/* ok everything checks out with the MLE
1844 	 * now check to see if there is a lockres */
1845 	res = __dlm_lookup_lockres(dlm, name, namelen, hash);
1846 	if (res) {
1847 		spin_lock(&res->spinlock);
1848 		if (res->state & DLM_LOCK_RES_RECOVERING)  {
1849 			mlog(ML_ERROR, "%u asserting but %.*s is "
1850 			     "RECOVERING!\n", assert->node_idx, namelen, name);
1851 			goto kill;
1852 		}
1853 		if (!mle) {
1854 			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
1855 			    res->owner != assert->node_idx) {
1856 				mlog(ML_ERROR, "DIE! Mastery assert from %u, "
1857 				     "but current owner is %u! (%.*s)\n",
1858 				     assert->node_idx, res->owner, namelen,
1859 				     name);
1860 				__dlm_print_one_lock_resource(res);
1861 				BUG();
1862 			}
1863 		} else if (mle->type != DLM_MLE_MIGRATION) {
1864 			if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) {
1865 				/* owner is just re-asserting */
1866 				if (res->owner == assert->node_idx) {
1867 					mlog(0, "owner %u re-asserting on "
1868 					     "lock %.*s\n", assert->node_idx,
1869 					     namelen, name);
1870 					goto ok;
1871 				}
1872 				mlog(ML_ERROR, "got assert_master from "
1873 				     "node %u, but %u is the owner! "
1874 				     "(%.*s)\n", assert->node_idx,
1875 				     res->owner, namelen, name);
1876 				goto kill;
1877 			}
1878 			if (!(res->state & DLM_LOCK_RES_IN_PROGRESS)) {
1879 				mlog(ML_ERROR, "got assert from %u, but lock "
1880 				     "with no owner should be "
1881 				     "in-progress! (%.*s)\n",
1882 				     assert->node_idx,
1883 				     namelen, name);
1884 				goto kill;
1885 			}
1886 		} else /* mle->type == DLM_MLE_MIGRATION */ {
1887 			/* should only be getting an assert from new master */
1888 			if (assert->node_idx != mle->new_master) {
1889 				mlog(ML_ERROR, "got assert from %u, but "
1890 				     "new master is %u, and old master "
1891 				     "was %u (%.*s)\n",
1892 				     assert->node_idx, mle->new_master,
1893 				     mle->master, namelen, name);
1894 				goto kill;
1895 			}
1896 
1897 		}
1898 ok:
1899 		spin_unlock(&res->spinlock);
1900 	}
1901 
1902 	// mlog(0, "woo!  got an assert_master from node %u!\n",
1903 	// 	     assert->node_idx);
1904 	if (mle) {
1905 		int extra_ref = 0;
1906 		int nn = -1;
1907 		int rr, err = 0;
1908 
1909 		spin_lock(&mle->spinlock);
1910 		if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
1911 			extra_ref = 1;
1912 		else {
1913 			/* MASTER mle: if any bits set in the response map
1914 			 * then the calling node needs to re-assert to clear
1915 			 * up nodes that this node contacted */
1916 			while ((nn = find_next_bit (mle->response_map, O2NM_MAX_NODES,
1917 						    nn+1)) < O2NM_MAX_NODES) {
1918 				if (nn != dlm->node_num && nn != assert->node_idx) {
1919 					master_request = 1;
1920 					break;
1921 				}
1922 			}
1923 		}
1924 		mle->master = assert->node_idx;
1925 		atomic_set(&mle->woken, 1);
1926 		wake_up(&mle->wq);
1927 		spin_unlock(&mle->spinlock);
1928 
1929 		if (res) {
1930 			int wake = 0;
1931 			spin_lock(&res->spinlock);
1932 			if (mle->type == DLM_MLE_MIGRATION) {
1933 				mlog(0, "finishing off migration of lockres %.*s, "
1934 			     		"from %u to %u\n",
1935 			       		res->lockname.len, res->lockname.name,
1936 			       		dlm->node_num, mle->new_master);
1937 				res->state &= ~DLM_LOCK_RES_MIGRATING;
1938 				wake = 1;
1939 				dlm_change_lockres_owner(dlm, res, mle->new_master);
1940 				BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
1941 			} else {
1942 				dlm_change_lockres_owner(dlm, res, mle->master);
1943 			}
1944 			spin_unlock(&res->spinlock);
1945 			have_lockres_ref = 1;
1946 			if (wake)
1947 				wake_up(&res->wq);
1948 		}
1949 
1950 		/* master is known, detach if not already detached.
1951 		 * ensures that only one assert_master call will happen
1952 		 * on this mle. */
1953 		spin_lock(&dlm->master_lock);
1954 
1955 		rr = kref_read(&mle->mle_refs);
1956 		if (mle->inuse > 0) {
1957 			if (extra_ref && rr < 3)
1958 				err = 1;
1959 			else if (!extra_ref && rr < 2)
1960 				err = 1;
1961 		} else {
1962 			if (extra_ref && rr < 2)
1963 				err = 1;
1964 			else if (!extra_ref && rr < 1)
1965 				err = 1;
1966 		}
1967 		if (err) {
1968 			mlog(ML_ERROR, "%s:%.*s: got assert master from %u "
1969 			     "that will mess up this node, refs=%d, extra=%d, "
1970 			     "inuse=%d\n", dlm->name, namelen, name,
1971 			     assert->node_idx, rr, extra_ref, mle->inuse);
1972 			dlm_print_one_mle(mle);
1973 		}
1974 		__dlm_unlink_mle(dlm, mle);
1975 		__dlm_mle_detach_hb_events(dlm, mle);
1976 		__dlm_put_mle(mle);
1977 		if (extra_ref) {
1978 			/* the assert master message now balances the extra
1979 		 	 * ref given by the master / migration request message.
1980 		 	 * if this is the last put, it will be removed
1981 		 	 * from the list. */
1982 			__dlm_put_mle(mle);
1983 		}
1984 		spin_unlock(&dlm->master_lock);
1985 	} else if (res) {
1986 		if (res->owner != assert->node_idx) {
1987 			mlog(0, "assert_master from %u, but current "
1988 			     "owner is %u (%.*s), no mle\n", assert->node_idx,
1989 			     res->owner, namelen, name);
1990 		}
1991 	}
1992 	spin_unlock(&dlm->spinlock);
1993 
1994 done:
1995 	ret = 0;
1996 	if (res) {
1997 		spin_lock(&res->spinlock);
1998 		res->state |= DLM_LOCK_RES_SETREF_INPROG;
1999 		spin_unlock(&res->spinlock);
2000 		*ret_data = (void *)res;
2001 	}
2002 	dlm_put(dlm);
2003 	if (master_request) {
2004 		mlog(0, "need to tell master to reassert\n");
2005 		/* positive. negative would shoot down the node. */
2006 		ret |= DLM_ASSERT_RESPONSE_REASSERT;
2007 		if (!have_lockres_ref) {
2008 			mlog(ML_ERROR, "strange, got assert from %u, MASTER "
2009 			     "mle present here for %s:%.*s, but no lockres!\n",
2010 			     assert->node_idx, dlm->name, namelen, name);
2011 		}
2012 	}
2013 	if (have_lockres_ref) {
2014 		/* let the master know we have a reference to the lockres */
2015 		ret |= DLM_ASSERT_RESPONSE_MASTERY_REF;
2016 		mlog(0, "%s:%.*s: got assert from %u, need a ref\n",
2017 		     dlm->name, namelen, name, assert->node_idx);
2018 	}
2019 	return ret;
2020 
2021 kill:
2022 	/* kill the caller! */
2023 	mlog(ML_ERROR, "Bad message received from another node.  Dumping state "
2024 	     "and killing the other node now!  This node is OK and can continue.\n");
2025 	__dlm_print_one_lock_resource(res);
2026 	spin_unlock(&res->spinlock);
2027 	spin_lock(&dlm->master_lock);
2028 	if (mle)
2029 		__dlm_put_mle(mle);
2030 	spin_unlock(&dlm->master_lock);
2031 	spin_unlock(&dlm->spinlock);
2032 	*ret_data = (void *)res;
2033 	dlm_put(dlm);
2034 	return -EINVAL;
2035 }
2036 
2037 void dlm_assert_master_post_handler(int status, void *data, void *ret_data)
2038 {
2039 	struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data;
2040 
2041 	if (ret_data) {
2042 		spin_lock(&res->spinlock);
2043 		res->state &= ~DLM_LOCK_RES_SETREF_INPROG;
2044 		spin_unlock(&res->spinlock);
2045 		wake_up(&res->wq);
2046 		dlm_lockres_put(res);
2047 	}
2048 	return;
2049 }
2050 
2051 int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
2052 			       struct dlm_lock_resource *res,
2053 			       int ignore_higher, u8 request_from, u32 flags)
2054 {
2055 	struct dlm_work_item *item;
2056 	item = kzalloc(sizeof(*item), GFP_ATOMIC);
2057 	if (!item)
2058 		return -ENOMEM;
2059 
2060 
2061 	/* queue up work for dlm_assert_master_worker */
2062 	dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
2063 	item->u.am.lockres = res; /* already have a ref */
2064 	/* can optionally ignore node numbers higher than this node */
2065 	item->u.am.ignore_higher = ignore_higher;
2066 	item->u.am.request_from = request_from;
2067 	item->u.am.flags = flags;
2068 
2069 	if (ignore_higher)
2070 		mlog(0, "IGNORE HIGHER: %.*s\n", res->lockname.len,
2071 		     res->lockname.name);
2072 
2073 	spin_lock(&dlm->work_lock);
2074 	list_add_tail(&item->list, &dlm->work_list);
2075 	spin_unlock(&dlm->work_lock);
2076 
2077 	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
2078 	return 0;
2079 }
2080 
2081 static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
2082 {
2083 	struct dlm_ctxt *dlm = data;
2084 	int ret = 0;
2085 	struct dlm_lock_resource *res;
2086 	unsigned long nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
2087 	int ignore_higher;
2088 	int bit;
2089 	u8 request_from;
2090 	u32 flags;
2091 
2092 	dlm = item->dlm;
2093 	res = item->u.am.lockres;
2094 	ignore_higher = item->u.am.ignore_higher;
2095 	request_from = item->u.am.request_from;
2096 	flags = item->u.am.flags;
2097 
2098 	spin_lock(&dlm->spinlock);
2099 	memcpy(nodemap, dlm->domain_map, sizeof(nodemap));
2100 	spin_unlock(&dlm->spinlock);
2101 
2102 	clear_bit(dlm->node_num, nodemap);
2103 	if (ignore_higher) {
2104 		/* if is this just to clear up mles for nodes below
2105 		 * this node, do not send the message to the original
2106 		 * caller or any node number higher than this */
2107 		clear_bit(request_from, nodemap);
2108 		bit = dlm->node_num;
2109 		while (1) {
2110 			bit = find_next_bit(nodemap, O2NM_MAX_NODES,
2111 					    bit+1);
2112 		       	if (bit >= O2NM_MAX_NODES)
2113 				break;
2114 			clear_bit(bit, nodemap);
2115 		}
2116 	}
2117 
2118 	/*
2119 	 * If we're migrating this lock to someone else, we are no
2120 	 * longer allowed to assert out own mastery.  OTOH, we need to
2121 	 * prevent migration from starting while we're still asserting
2122 	 * our dominance.  The reserved ast delays migration.
2123 	 */
2124 	spin_lock(&res->spinlock);
2125 	if (res->state & DLM_LOCK_RES_MIGRATING) {
2126 		mlog(0, "Someone asked us to assert mastery, but we're "
2127 		     "in the middle of migration.  Skipping assert, "
2128 		     "the new master will handle that.\n");
2129 		spin_unlock(&res->spinlock);
2130 		goto put;
2131 	} else
2132 		__dlm_lockres_reserve_ast(res);
2133 	spin_unlock(&res->spinlock);
2134 
2135 	/* this call now finishes out the nodemap
2136 	 * even if one or more nodes die */
2137 	mlog(0, "worker about to master %.*s here, this=%u\n",
2138 		     res->lockname.len, res->lockname.name, dlm->node_num);
2139 	ret = dlm_do_assert_master(dlm, res, nodemap, flags);
2140 	if (ret < 0) {
2141 		/* no need to restart, we are done */
2142 		if (!dlm_is_host_down(ret))
2143 			mlog_errno(ret);
2144 	}
2145 
2146 	/* Ok, we've asserted ourselves.  Let's let migration start. */
2147 	dlm_lockres_release_ast(dlm, res);
2148 
2149 put:
2150 	dlm_lockres_drop_inflight_worker(dlm, res);
2151 
2152 	dlm_lockres_put(res);
2153 
2154 	mlog(0, "finished with dlm_assert_master_worker\n");
2155 }
2156 
2157 /* SPECIAL CASE for the $RECOVERY lock used by the recovery thread.
2158  * We cannot wait for node recovery to complete to begin mastering this
2159  * lockres because this lockres is used to kick off recovery! ;-)
2160  * So, do a pre-check on all living nodes to see if any of those nodes
2161  * think that $RECOVERY is currently mastered by a dead node.  If so,
2162  * we wait a short time to allow that node to get notified by its own
2163  * heartbeat stack, then check again.  All $RECOVERY lock resources
2164  * mastered by dead nodes are purged when the heartbeat callback is
2165  * fired, so we can know for sure that it is safe to continue once
2166  * the node returns a live node or no node.  */
2167 static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
2168 				       struct dlm_lock_resource *res)
2169 {
2170 	struct dlm_node_iter iter;
2171 	int nodenum;
2172 	int ret = 0;
2173 	u8 master = DLM_LOCK_RES_OWNER_UNKNOWN;
2174 
2175 	spin_lock(&dlm->spinlock);
2176 	dlm_node_iter_init(dlm->domain_map, &iter);
2177 	spin_unlock(&dlm->spinlock);
2178 
2179 	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
2180 		/* do not send to self */
2181 		if (nodenum == dlm->node_num)
2182 			continue;
2183 		ret = dlm_do_master_requery(dlm, res, nodenum, &master);
2184 		if (ret < 0) {
2185 			mlog_errno(ret);
2186 			if (!dlm_is_host_down(ret))
2187 				BUG();
2188 			/* host is down, so answer for that node would be
2189 			 * DLM_LOCK_RES_OWNER_UNKNOWN.  continue. */
2190 			ret = 0;
2191 		}
2192 
2193 		if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
2194 			/* check to see if this master is in the recovery map */
2195 			spin_lock(&dlm->spinlock);
2196 			if (test_bit(master, dlm->recovery_map)) {
2197 				mlog(ML_NOTICE, "%s: node %u has not seen "
2198 				     "node %u go down yet, and thinks the "
2199 				     "dead node is mastering the recovery "
2200 				     "lock.  must wait.\n", dlm->name,
2201 				     nodenum, master);
2202 				ret = -EAGAIN;
2203 			}
2204 			spin_unlock(&dlm->spinlock);
2205 			mlog(0, "%s: reco lock master is %u\n", dlm->name,
2206 			     master);
2207 			break;
2208 		}
2209 	}
2210 	return ret;
2211 }
2212 
2213 /*
2214  * DLM_DEREF_LOCKRES_MSG
2215  */
2216 
2217 int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2218 {
2219 	struct dlm_deref_lockres deref;
2220 	int ret = 0, r;
2221 	const char *lockname;
2222 	unsigned int namelen;
2223 
2224 	lockname = res->lockname.name;
2225 	namelen = res->lockname.len;
2226 	BUG_ON(namelen > O2NM_MAX_NAME_LEN);
2227 
2228 	memset(&deref, 0, sizeof(deref));
2229 	deref.node_idx = dlm->node_num;
2230 	deref.namelen = namelen;
2231 	memcpy(deref.name, lockname, namelen);
2232 
2233 	ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
2234 				 &deref, sizeof(deref), res->owner, &r);
2235 	if (ret < 0)
2236 		mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n",
2237 		     dlm->name, namelen, lockname, ret, res->owner);
2238 	else if (r < 0) {
2239 		/* BAD.  other node says I did not have a ref. */
2240 		mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
2241 		     dlm->name, namelen, lockname, res->owner, r);
2242 		dlm_print_one_lock_resource(res);
2243 		if (r == -ENOMEM)
2244 			BUG();
2245 	} else
2246 		ret = r;
2247 
2248 	return ret;
2249 }
2250 
2251 int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
2252 			      void **ret_data)
2253 {
2254 	struct dlm_ctxt *dlm = data;
2255 	struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf;
2256 	struct dlm_lock_resource *res = NULL;
2257 	char *name;
2258 	unsigned int namelen;
2259 	int ret = -EINVAL;
2260 	u8 node;
2261 	unsigned int hash;
2262 	struct dlm_work_item *item;
2263 	int cleared = 0;
2264 	int dispatch = 0;
2265 
2266 	if (!dlm_grab(dlm))
2267 		return 0;
2268 
2269 	name = deref->name;
2270 	namelen = deref->namelen;
2271 	node = deref->node_idx;
2272 
2273 	if (namelen > DLM_LOCKID_NAME_MAX) {
2274 		mlog(ML_ERROR, "Invalid name length!");
2275 		goto done;
2276 	}
2277 	if (deref->node_idx >= O2NM_MAX_NODES) {
2278 		mlog(ML_ERROR, "Invalid node number: %u\n", node);
2279 		goto done;
2280 	}
2281 
2282 	hash = dlm_lockid_hash(name, namelen);
2283 
2284 	spin_lock(&dlm->spinlock);
2285 	res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
2286 	if (!res) {
2287 		spin_unlock(&dlm->spinlock);
2288 		mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
2289 		     dlm->name, namelen, name);
2290 		goto done;
2291 	}
2292 	spin_unlock(&dlm->spinlock);
2293 
2294 	spin_lock(&res->spinlock);
2295 	if (res->state & DLM_LOCK_RES_SETREF_INPROG)
2296 		dispatch = 1;
2297 	else {
2298 		BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2299 		if (test_bit(node, res->refmap)) {
2300 			dlm_lockres_clear_refmap_bit(dlm, res, node);
2301 			cleared = 1;
2302 		}
2303 	}
2304 	spin_unlock(&res->spinlock);
2305 
2306 	if (!dispatch) {
2307 		if (cleared)
2308 			dlm_lockres_calc_usage(dlm, res);
2309 		else {
2310 			mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
2311 		     	"but it is already dropped!\n", dlm->name,
2312 		     	res->lockname.len, res->lockname.name, node);
2313 			dlm_print_one_lock_resource(res);
2314 		}
2315 		ret = DLM_DEREF_RESPONSE_DONE;
2316 		goto done;
2317 	}
2318 
2319 	item = kzalloc(sizeof(*item), GFP_NOFS);
2320 	if (!item) {
2321 		ret = -ENOMEM;
2322 		mlog_errno(ret);
2323 		goto done;
2324 	}
2325 
2326 	dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL);
2327 	item->u.dl.deref_res = res;
2328 	item->u.dl.deref_node = node;
2329 
2330 	spin_lock(&dlm->work_lock);
2331 	list_add_tail(&item->list, &dlm->work_list);
2332 	spin_unlock(&dlm->work_lock);
2333 
2334 	queue_work(dlm->dlm_worker, &dlm->dispatched_work);
2335 	return DLM_DEREF_RESPONSE_INPROG;
2336 
2337 done:
2338 	if (res)
2339 		dlm_lockres_put(res);
2340 	dlm_put(dlm);
2341 
2342 	return ret;
2343 }
2344 
2345 int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
2346 			      void **ret_data)
2347 {
2348 	struct dlm_ctxt *dlm = data;
2349 	struct dlm_deref_lockres_done *deref
2350 			= (struct dlm_deref_lockres_done *)msg->buf;
2351 	struct dlm_lock_resource *res = NULL;
2352 	char *name;
2353 	unsigned int namelen;
2354 	int ret = -EINVAL;
2355 	u8 node;
2356 	unsigned int hash;
2357 
2358 	if (!dlm_grab(dlm))
2359 		return 0;
2360 
2361 	name = deref->name;
2362 	namelen = deref->namelen;
2363 	node = deref->node_idx;
2364 
2365 	if (namelen > DLM_LOCKID_NAME_MAX) {
2366 		mlog(ML_ERROR, "Invalid name length!");
2367 		goto done;
2368 	}
2369 	if (deref->node_idx >= O2NM_MAX_NODES) {
2370 		mlog(ML_ERROR, "Invalid node number: %u\n", node);
2371 		goto done;
2372 	}
2373 
2374 	hash = dlm_lockid_hash(name, namelen);
2375 
2376 	spin_lock(&dlm->spinlock);
2377 	res = __dlm_lookup_lockres_full(dlm, name, namelen, hash);
2378 	if (!res) {
2379 		spin_unlock(&dlm->spinlock);
2380 		mlog(ML_ERROR, "%s:%.*s: bad lockres name\n",
2381 		     dlm->name, namelen, name);
2382 		goto done;
2383 	}
2384 
2385 	spin_lock(&res->spinlock);
2386 	if (!(res->state & DLM_LOCK_RES_DROPPING_REF)) {
2387 		spin_unlock(&res->spinlock);
2388 		spin_unlock(&dlm->spinlock);
2389 		mlog(ML_NOTICE, "%s:%.*s: node %u sends deref done "
2390 			"but it is already derefed!\n", dlm->name,
2391 			res->lockname.len, res->lockname.name, node);
2392 		ret = 0;
2393 		goto done;
2394 	}
2395 
2396 	__dlm_do_purge_lockres(dlm, res);
2397 	spin_unlock(&res->spinlock);
2398 	wake_up(&res->wq);
2399 
2400 	spin_unlock(&dlm->spinlock);
2401 
2402 	ret = 0;
2403 done:
2404 	if (res)
2405 		dlm_lockres_put(res);
2406 	dlm_put(dlm);
2407 	return ret;
2408 }
2409 
2410 static void dlm_drop_lockres_ref_done(struct dlm_ctxt *dlm,
2411 		struct dlm_lock_resource *res, u8 node)
2412 {
2413 	struct dlm_deref_lockres_done deref;
2414 	int ret = 0, r;
2415 	const char *lockname;
2416 	unsigned int namelen;
2417 
2418 	lockname = res->lockname.name;
2419 	namelen = res->lockname.len;
2420 	BUG_ON(namelen > O2NM_MAX_NAME_LEN);
2421 
2422 	memset(&deref, 0, sizeof(deref));
2423 	deref.node_idx = dlm->node_num;
2424 	deref.namelen = namelen;
2425 	memcpy(deref.name, lockname, namelen);
2426 
2427 	ret = o2net_send_message(DLM_DEREF_LOCKRES_DONE, dlm->key,
2428 				 &deref, sizeof(deref), node, &r);
2429 	if (ret < 0) {
2430 		mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF DONE "
2431 				" to node %u\n", dlm->name, namelen,
2432 				lockname, ret, node);
2433 	} else if (r < 0) {
2434 		/* ignore the error */
2435 		mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
2436 		     dlm->name, namelen, lockname, node, r);
2437 		dlm_print_one_lock_resource(res);
2438 	}
2439 }
2440 
2441 static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2442 {
2443 	struct dlm_ctxt *dlm;
2444 	struct dlm_lock_resource *res;
2445 	u8 node;
2446 	u8 cleared = 0;
2447 
2448 	dlm = item->dlm;
2449 	res = item->u.dl.deref_res;
2450 	node = item->u.dl.deref_node;
2451 
2452 	spin_lock(&res->spinlock);
2453 	BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2454 	__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
2455 	if (test_bit(node, res->refmap)) {
2456 		dlm_lockres_clear_refmap_bit(dlm, res, node);
2457 		cleared = 1;
2458 	}
2459 	spin_unlock(&res->spinlock);
2460 
2461 	dlm_drop_lockres_ref_done(dlm, res, node);
2462 
2463 	if (cleared) {
2464 		mlog(0, "%s:%.*s node %u ref dropped in dispatch\n",
2465 		     dlm->name, res->lockname.len, res->lockname.name, node);
2466 		dlm_lockres_calc_usage(dlm, res);
2467 	} else {
2468 		mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref "
2469 		     "but it is already dropped!\n", dlm->name,
2470 		     res->lockname.len, res->lockname.name, node);
2471 		dlm_print_one_lock_resource(res);
2472 	}
2473 
2474 	dlm_lockres_put(res);
2475 }
2476 
2477 /*
2478  * A migratable resource is one that is :
2479  * 1. locally mastered, and,
2480  * 2. zero local locks, and,
2481  * 3. one or more non-local locks, or, one or more references
2482  * Returns 1 if yes, 0 if not.
2483  */
2484 static int dlm_is_lockres_migratable(struct dlm_ctxt *dlm,
2485 				      struct dlm_lock_resource *res)
2486 {
2487 	enum dlm_lockres_list idx;
2488 	int nonlocal = 0, node_ref;
2489 	struct list_head *queue;
2490 	struct dlm_lock *lock;
2491 	u64 cookie;
2492 
2493 	assert_spin_locked(&res->spinlock);
2494 
2495 	/* delay migration when the lockres is in MIGRATING state */
2496 	if (res->state & DLM_LOCK_RES_MIGRATING)
2497 		return 0;
2498 
2499 	/* delay migration when the lockres is in RECOCERING state */
2500 	if (res->state & (DLM_LOCK_RES_RECOVERING|
2501 			DLM_LOCK_RES_RECOVERY_WAITING))
2502 		return 0;
2503 
2504 	if (res->owner != dlm->node_num)
2505 		return 0;
2506 
2507         for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
2508 		queue = dlm_list_idx_to_ptr(res, idx);
2509 		list_for_each_entry(lock, queue, list) {
2510 			if (lock->ml.node != dlm->node_num) {
2511 				nonlocal++;
2512 				continue;
2513 			}
2514 			cookie = be64_to_cpu(lock->ml.cookie);
2515 			mlog(0, "%s: Not migratable res %.*s, lock %u:%llu on "
2516 			     "%s list\n", dlm->name, res->lockname.len,
2517 			     res->lockname.name,
2518 			     dlm_get_lock_cookie_node(cookie),
2519 			     dlm_get_lock_cookie_seq(cookie),
2520 			     dlm_list_in_text(idx));
2521 			return 0;
2522 		}
2523 	}
2524 
2525 	if (!nonlocal) {
2526 		node_ref = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
2527 		if (node_ref >= O2NM_MAX_NODES)
2528 			return 0;
2529 	}
2530 
2531 	mlog(0, "%s: res %.*s, Migratable\n", dlm->name, res->lockname.len,
2532 	     res->lockname.name);
2533 
2534 	return 1;
2535 }
2536 
2537 /*
2538  * DLM_MIGRATE_LOCKRES
2539  */
2540 
2541 
/*
 * Migrate mastery of @res from this node to @target.
 *
 * Visible sequence:
 *   - preallocate one page (mres) and a master list entry (mle);
 *   - add a migration mle under dlm->spinlock + dlm->master_lock and take
 *     an extra "inuse" hold on it unless the add returned -EEXIST;
 *   - set the MIGRATING state via dlm_mark_lockres_migrating();
 *   - flush the dlm work queue, then send the lock state to @target with
 *     dlm_send_one_lockres(..., DLM_MRES_MIGRATION);
 *   - wait in 5 second slices until the mle is woken or res->owner equals
 *     @target, bailing out if @target is found dead after a timeout;
 *   - on success set the owner to @target, clear MIGRATING and remove the
 *     nonlocal locks.
 *
 * Returns 0 on success.  Visible failure values: -EINVAL (dlm could not be
 * grabbed, or the target went down), -ENOMEM (preallocation failed),
 * -EEXIST (another migration of this lockres is already in progress), or
 * the negative result of dlm_send_one_lockres().  On any negative return
 * reached through "leave" the dlm thread is kicked to re-dirty the lockres.
 */
2542 static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
2543 			       struct dlm_lock_resource *res, u8 target)
2544 {
2545 	struct dlm_master_list_entry *mle = NULL;
2546 	struct dlm_master_list_entry *oldmle = NULL;
2547  	struct dlm_migratable_lockres *mres = NULL;
2548 	int ret = 0;
2549 	const char *name;
2550 	unsigned int namelen;
	/* mle_added: the migration mle made it into the master list */
2551 	int mle_added = 0;
	/* wake: MIGRATING was cleared on a failure path, wake res->wq at exit */
2552 	int wake = 0;
2553 
2554 	if (!dlm_grab(dlm))
2555 		return -EINVAL;
2556 
2557 	BUG_ON(target == O2NM_MAX_NODES);
2558 
2559 	name = res->lockname.name;
2560 	namelen = res->lockname.len;
2561 
2562 	mlog(0, "%s: Migrating %.*s to node %u\n", dlm->name, namelen, name,
2563 	     target);
2564 
2565 	/* preallocate up front. if this fails, abort */
2566 	ret = -ENOMEM;
2567 	mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
2568 	if (!mres) {
2569 		mlog_errno(ret);
2570 		goto leave;
2571 	}
2572 
2573 	mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
2574 	if (!mle) {
2575 		mlog_errno(ret);
2576 		goto leave;
2577 	}
2578 	ret = 0;
2579 
2580 	/*
2581 	 * clear any existing master requests and
2582 	 * add the migration mle to the list
2583 	 */
2584 	spin_lock(&dlm->spinlock);
2585 	spin_lock(&dlm->master_lock);
2586 	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle, name,
2587 				    namelen, target, dlm->node_num);
2588 	/* get an extra reference on the mle.
2589 	 * otherwise the assert_master from the new
2590 	 * master will destroy this.
2591 	 */
2592 	if (ret != -EEXIST)
2593 		dlm_get_mle_inuse(mle);
2594 
2595 	spin_unlock(&dlm->master_lock);
2596 	spin_unlock(&dlm->spinlock);
2597 
2598 	if (ret == -EEXIST) {
2599 		mlog(0, "another process is already migrating it\n");
2600 		goto fail;
2601 	}
2602 	mle_added = 1;
2603 
2604 	/*
2605 	 * set the MIGRATING flag and flush asts
2606 	 * if we fail after this we need to re-dirty the lockres
2607 	 */
2608 	if (dlm_mark_lockres_migrating(dlm, res, target) < 0) {
2609 		mlog(ML_ERROR, "tried to migrate %.*s to %u, but "
2610 		     "the target went down.\n", res->lockname.len,
2611 		     res->lockname.name, target);
2612 		spin_lock(&res->spinlock);
2613 		res->state &= ~DLM_LOCK_RES_MIGRATING;
2614 		wake = 1;
2615 		spin_unlock(&res->spinlock);
2616 		ret = -EINVAL;
2617 	}
2618 
2619 fail:
2620 	if (ret != -EEXIST && oldmle) {
2621 		/* master is known, detach if not already detached */
2622 		dlm_mle_detach_hb_events(dlm, oldmle);
2623 		dlm_put_mle(oldmle);
2624 	}
2625 
2626 	if (ret < 0) {
		/* an mle that reached the list is released through its refs
		 * (list ref + inuse hold); one that never did is freed
		 * straight back to the cache */
2627 		if (mle_added) {
2628 			dlm_mle_detach_hb_events(dlm, mle);
2629 			dlm_put_mle(mle);
2630 			dlm_put_mle_inuse(mle);
2631 		} else if (mle) {
2632 			kmem_cache_free(dlm_mle_cache, mle);
2633 			mle = NULL;
2634 		}
2635 		goto leave;
2636 	}
2637 
2638 	/*
2639 	 * at this point, we have a migration target, an mle
2640 	 * in the master list, and the MIGRATING flag set on
2641 	 * the lockres
2642 	 */
2643 
2644 	/* now that remote nodes are spinning on the MIGRATING flag,
2645 	 * ensure that all assert_master work is flushed. */
2646 	flush_workqueue(dlm->dlm_worker);
2647 
2648 	/* notify new node and send all lock state */
2649 	/* call send_one_lockres with migration flag.
2650 	 * this serves as notice to the target node that a
2651 	 * migration is starting. */
2652 	ret = dlm_send_one_lockres(dlm, res, mres, target,
2653 				   DLM_MRES_MIGRATION);
2654 
2655 	if (ret < 0) {
2656 		mlog(0, "migration to node %u failed with %d\n",
2657 		     target, ret);
2658 		/* migration failed, detach and clean up mle */
2659 		dlm_mle_detach_hb_events(dlm, mle);
2660 		dlm_put_mle(mle);
2661 		dlm_put_mle_inuse(mle);
2662 		spin_lock(&res->spinlock);
2663 		res->state &= ~DLM_LOCK_RES_MIGRATING;
2664 		wake = 1;
2665 		spin_unlock(&res->spinlock);
2666 		if (dlm_is_host_down(ret))
2667 			dlm_wait_for_node_death(dlm, target,
2668 						DLM_NODE_DEATH_WAIT_MAX);
2669 		goto leave;
2670 	}
2671 
2672 	/* at this point, the target sends a message to all nodes,
2673 	 * (using dlm_do_migrate_request).  this node is skipped since
2674 	 * we had to put an mle in the list to begin the process.  this
2675 	 * node now waits for target to do an assert master.  this node
2676 	 * will be the last one notified, ensuring that the migration
2677 	 * is complete everywhere.  if the target dies while this is
2678 	 * going on, some nodes could potentially see the target as the
2679 	 * master, so it is important that my recovery finds the migration
2680 	 * mle and sets the master to UNKNOWN. */
2681 
2682 
2683 	/* wait for new node to assert master */
2684 	while (1) {
2685 		ret = wait_event_interruptible_timeout(mle->wq,
2686 					(atomic_read(&mle->woken) == 1),
2687 					msecs_to_jiffies(5000));
2688 
		/* ret >= 0: condition met or timed out; ret < 0: a signal
		 * interrupted the wait, which is logged and retried */
2689 		if (ret >= 0) {
			/* NOTE(review): res->owner is read here without
			 * res->spinlock - confirm this is intentional */
2690 		       	if (atomic_read(&mle->woken) == 1 ||
2691 			    res->owner == target)
2692 				break;
2693 
2694 			mlog(0, "%s:%.*s: timed out during migration\n",
2695 			     dlm->name, res->lockname.len, res->lockname.name);
2696 			/* avoid hang during shutdown when migrating lockres
2697 			 * to a node which also goes down */
2698 			if (dlm_is_node_dead(dlm, target)) {
2699 				mlog(0, "%s:%.*s: expected migration "
2700 				     "target %u is no longer up, restarting\n",
2701 				     dlm->name, res->lockname.len,
2702 				     res->lockname.name, target);
2703 				ret = -EINVAL;
2704 				/* migration failed, detach and clean up mle */
2705 				dlm_mle_detach_hb_events(dlm, mle);
2706 				dlm_put_mle(mle);
2707 				dlm_put_mle_inuse(mle);
2708 				spin_lock(&res->spinlock);
2709 				res->state &= ~DLM_LOCK_RES_MIGRATING;
2710 				wake = 1;
2711 				spin_unlock(&res->spinlock);
2712 				goto leave;
2713 			}
2714 		} else
2715 			mlog(0, "%s:%.*s: caught signal during migration\n",
2716 			     dlm->name, res->lockname.len, res->lockname.name);
2717 	}
2718 
2719 	/* all done, set the owner, clear the flag */
2720 	spin_lock(&res->spinlock);
2721 	dlm_set_lockres_owner(dlm, res, target);
2722 	res->state &= ~DLM_LOCK_RES_MIGRATING;
2723 	dlm_remove_nonlocal_locks(dlm, res);
2724 	spin_unlock(&res->spinlock);
2725 	wake_up(&res->wq);
2726 
2727 	/* master is known, detach if not already detached */
2728 	dlm_mle_detach_hb_events(dlm, mle);
	/* only the inuse hold is dropped on success; no dlm_put_mle() here,
	 * unlike the failure paths above */
2729 	dlm_put_mle_inuse(mle);
2730 	ret = 0;
2731 
2732 	dlm_lockres_calc_usage(dlm, res);
2733 
2734 leave:
2735 	/* re-dirty the lockres if we failed */
2736 	if (ret < 0)
2737 		dlm_kick_thread(dlm, res);
2738 
2739 	/* wake up waiters if the MIGRATING flag got set
2740 	 * but migration failed */
2741 	if (wake)
2742 		wake_up(&res->wq);
2743 
2744 	if (mres)
2745 		free_page((unsigned long)mres);
2746 
2747 	dlm_put(dlm);
2748 
2749 	mlog(0, "%s: Migrating %.*s to %u, returns %d\n", dlm->name, namelen,
2750 	     name, target, ret);
2751 	return ret;
2752 }
2753 
2754 #define DLM_MIGRATION_RETRY_MS  100
2755 
2756 /*
2757  * Should be called only after beginning the domain leave process.
2758  * There should not be any remaining locks on nonlocal lock resources,
2759  * and there should be no local locks left on locally mastered resources.
2760  *
2761  * Called with the dlm spinlock held, may drop it to do migration, but
2762  * will re-acquire before exit.
2763  *
2764  * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped
2765  */
2766 int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2767 {
2768 	int ret;
2769 	int lock_dropped = 0;
2770 	u8 target = O2NM_MAX_NODES;
2771 
2772 	assert_spin_locked(&dlm->spinlock);
2773 
2774 	spin_lock(&res->spinlock);
2775 	if (dlm_is_lockres_migratable(dlm, res))
2776 		target = dlm_pick_migration_target(dlm, res);
2777 	spin_unlock(&res->spinlock);
2778 
2779 	if (target == O2NM_MAX_NODES)
2780 		goto leave;
2781 
2782 	/* Wheee! Migrate lockres here! Will sleep so drop spinlock. */
2783 	spin_unlock(&dlm->spinlock);
2784 	lock_dropped = 1;
2785 	ret = dlm_migrate_lockres(dlm, res, target);
2786 	if (ret)
2787 		mlog(0, "%s: res %.*s, Migrate to node %u failed with %d\n",
2788 		     dlm->name, res->lockname.len, res->lockname.name,
2789 		     target, ret);
2790 	spin_lock(&dlm->spinlock);
2791 leave:
2792 	return lock_dropped;
2793 }
2794 
2795 int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock)
2796 {
2797 	int ret;
2798 	spin_lock(&dlm->ast_lock);
2799 	spin_lock(&lock->spinlock);
2800 	ret = (list_empty(&lock->bast_list) && !lock->bast_pending);
2801 	spin_unlock(&lock->spinlock);
2802 	spin_unlock(&dlm->ast_lock);
2803 	return ret;
2804 }
2805 
2806 static int dlm_migration_can_proceed(struct dlm_ctxt *dlm,
2807 				     struct dlm_lock_resource *res,
2808 				     u8 mig_target)
2809 {
2810 	int can_proceed;
2811 	spin_lock(&res->spinlock);
2812 	can_proceed = !!(res->state & DLM_LOCK_RES_MIGRATING);
2813 	spin_unlock(&res->spinlock);
2814 
2815 	/* target has died, so make the caller break out of the
2816 	 * wait_event, but caller must recheck the domain_map */
2817 	spin_lock(&dlm->spinlock);
2818 	if (!test_bit(mig_target, dlm->domain_map))
2819 		can_proceed = 1;
2820 	spin_unlock(&dlm->spinlock);
2821 	return can_proceed;
2822 }
2823 
2824 static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm,
2825 				struct dlm_lock_resource *res)
2826 {
2827 	int ret;
2828 	spin_lock(&res->spinlock);
2829 	ret = !!(res->state & DLM_LOCK_RES_DIRTY);
2830 	spin_unlock(&res->spinlock);
2831 	return ret;
2832 }
2833 
2834 
/*
 * Drive @res into the MIGRATING state ahead of a migration to @target.
 *
 * Visible sequence: set res->migration_pending and reserve an extra ast,
 * kick the dlm thread, set DLM_LOCK_RES_BLOCK_DIRTY, wait until the lockres
 * is no longer DIRTY, release the reserved ast, then wait (1 second slices,
 * retried until satisfied) for dlm_migration_can_proceed().  Finally check
 * whether @target is still in dlm->domain_map and clear BLOCK_DIRTY.
 *
 * The MIGRATING flag itself is not set in this function; per the comments
 * below it is expected to be set as a side effect of releasing the
 * reserved ast (not visible here - confirm in dlm_lockres_release_ast).
 *
 * Returns 0 with MIGRATING set (enforced by BUG_ON), or -EHOSTDOWN if
 * @target has left the domain map, in which case migration_pending is
 * cleared again.
 */
2835 static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm,
2836 				       struct dlm_lock_resource *res,
2837 				       u8 target)
2838 {
2839 	int ret = 0;
2840 
2841 	mlog(0, "dlm_mark_lockres_migrating: %.*s, from %u to %u\n",
2842 	       res->lockname.len, res->lockname.name, dlm->node_num,
2843 	       target);
2844 	/* need to set MIGRATING flag on lockres.  this is done by
2845 	 * ensuring that all asts have been flushed for this lockres. */
2846 	spin_lock(&res->spinlock);
2847 	BUG_ON(res->migration_pending);
2848 	res->migration_pending = 1;
2849 	/* strategy is to reserve an extra ast then release
2850 	 * it below, letting the release do all of the work */
2851 	__dlm_lockres_reserve_ast(res);
2852 	spin_unlock(&res->spinlock);
2853 
2854 	/* now flush all the pending asts */
2855 	dlm_kick_thread(dlm, res);
2856 	/* before waiting on DIRTY, block processes which may
2857 	 * try to dirty the lockres before MIGRATING is set */
2858 	spin_lock(&res->spinlock);
2859 	BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY);
2860 	res->state |= DLM_LOCK_RES_BLOCK_DIRTY;
2861 	spin_unlock(&res->spinlock);
2862 	/* now wait on any pending asts and the DIRTY state */
2863 	wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res));
2864 	dlm_lockres_release_ast(dlm, res);
2865 
	/* NOTE(review): res->state and dlm->domain_map are read without
	 * their spinlocks in the mlog() calls below; these are debug-level
	 * messages only */
2866 	mlog(0, "about to wait on migration_wq, dirty=%s\n",
2867 	       res->state & DLM_LOCK_RES_DIRTY ? "yes" : "no");
2868 	/* if the extra ref we just put was the final one, this
2869 	 * will pass thru immediately.  otherwise, we need to wait
2870 	 * for the last ast to finish. */
2871 again:
2872 	ret = wait_event_interruptible_timeout(dlm->migration_wq,
2873 		   dlm_migration_can_proceed(dlm, res, target),
2874 		   msecs_to_jiffies(1000));
	/* ret < 0 means the wait was interrupted; both outcomes are only
	 * logged, the real decision is the recheck below */
2875 	if (ret < 0) {
2876 		mlog(0, "woken again: migrating? %s, dead? %s\n",
2877 		       res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
2878 		       test_bit(target, dlm->domain_map) ? "no":"yes");
2879 	} else {
2880 		mlog(0, "all is well: migrating? %s, dead? %s\n",
2881 		       res->state & DLM_LOCK_RES_MIGRATING ? "yes":"no",
2882 		       test_bit(target, dlm->domain_map) ? "no":"yes");
2883 	}
2884 	if (!dlm_migration_can_proceed(dlm, res, target)) {
2885 		mlog(0, "trying again...\n");
2886 		goto again;
2887 	}
2888 
2889 	ret = 0;
2890 	/* did the target go down or die? */
2891 	spin_lock(&dlm->spinlock);
2892 	if (!test_bit(target, dlm->domain_map)) {
2893 		mlog(ML_ERROR, "aha. migration target %u just went down\n",
2894 		     target);
2895 		ret = -EHOSTDOWN;
2896 	}
2897 	spin_unlock(&dlm->spinlock);
2898 
2899 	/*
2900 	 * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
2901 	 * another try; otherwise, we are sure the MIGRATING state is there,
2902 	 * drop the unneeded state which blocked threads trying to DIRTY
2903 	 */
2904 	spin_lock(&res->spinlock);
2905 	BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
2906 	res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
2907 	if (!ret)
2908 		BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
2909 	else
2910 		res->migration_pending = 0;
2911 	spin_unlock(&res->spinlock);
2912 
2913 	/*
2914 	 * at this point:
2915 	 *
2916 	 *   o the DLM_LOCK_RES_MIGRATING flag is set if target not down
2917 	 *   o there are no pending asts on this lockres
2918 	 *   o all processes trying to reserve an ast on this
2919 	 *     lockres must wait for the MIGRATING flag to clear
2920 	 */
2921 	return ret;
2922 }
2923 
2924 /* last step in the migration process.
2925  * original master calls this to free all of the dlm_lock
2926  * structures that used to be for other nodes. */
2927 static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2928 				      struct dlm_lock_resource *res)
2929 {
2930 	struct list_head *queue = &res->granted;
2931 	int i, bit;
2932 	struct dlm_lock *lock, *next;
2933 
2934 	assert_spin_locked(&res->spinlock);
2935 
2936 	BUG_ON(res->owner == dlm->node_num);
2937 
2938 	for (i=0; i<3; i++) {
2939 		list_for_each_entry_safe(lock, next, queue, list) {
2940 			if (lock->ml.node != dlm->node_num) {
2941 				mlog(0, "putting lock for node %u\n",
2942 				     lock->ml.node);
2943 				/* be extra careful */
2944 				BUG_ON(!list_empty(&lock->ast_list));
2945 				BUG_ON(!list_empty(&lock->bast_list));
2946 				BUG_ON(lock->ast_pending);
2947 				BUG_ON(lock->bast_pending);
2948 				dlm_lockres_clear_refmap_bit(dlm, res,
2949 							     lock->ml.node);
2950 				list_del_init(&lock->list);
2951 				dlm_lock_put(lock);
2952 				/* In a normal unlock, we would have added a
2953 				 * DLM_UNLOCK_FREE_LOCK action. Force it. */
2954 				dlm_lock_put(lock);
2955 			}
2956 		}
2957 		queue++;
2958 	}
2959 	bit = 0;
2960 	while (1) {
2961 		bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
2962 		if (bit >= O2NM_MAX_NODES)
2963 			break;
2964 		/* do not clear the local node reference, if there is a
2965 		 * process holding this, let it drop the ref itself */
2966 		if (bit != dlm->node_num) {
2967 			mlog(0, "%s:%.*s: node %u had a ref to this "
2968 			     "migrating lockres, clearing\n", dlm->name,
2969 			     res->lockname.len, res->lockname.name, bit);
2970 			dlm_lockres_clear_refmap_bit(dlm, res, bit);
2971 		}
2972 		bit++;
2973 	}
2974 }
2975 
2976 /*
2977  * Pick a node to migrate the lock resource to. This function selects a
2978  * potential target based first on the locks and then on refmap. It skips
2979  * nodes that are in the process of exiting the domain.
2980  */
2981 static u8 dlm_pick_migration_target(struct dlm_ctxt *dlm,
2982 				    struct dlm_lock_resource *res)
2983 {
2984 	enum dlm_lockres_list idx;
2985 	struct list_head *queue = &res->granted;
2986 	struct dlm_lock *lock;
2987 	int noderef;
2988 	u8 nodenum = O2NM_MAX_NODES;
2989 
2990 	assert_spin_locked(&dlm->spinlock);
2991 	assert_spin_locked(&res->spinlock);
2992 
2993 	/* Go through all the locks */
2994 	for (idx = DLM_GRANTED_LIST; idx <= DLM_BLOCKED_LIST; idx++) {
2995 		queue = dlm_list_idx_to_ptr(res, idx);
2996 		list_for_each_entry(lock, queue, list) {
2997 			if (lock->ml.node == dlm->node_num)
2998 				continue;
2999 			if (test_bit(lock->ml.node, dlm->exit_domain_map))
3000 				continue;
3001 			nodenum = lock->ml.node;
3002 			goto bail;
3003 		}
3004 	}
3005 
3006 	/* Go thru the refmap */
3007 	noderef = -1;
3008 	while (1) {
3009 		noderef = find_next_bit(res->refmap, O2NM_MAX_NODES,
3010 					noderef + 1);
3011 		if (noderef >= O2NM_MAX_NODES)
3012 			break;
3013 		if (noderef == dlm->node_num)
3014 			continue;
3015 		if (test_bit(noderef, dlm->exit_domain_map))
3016 			continue;
3017 		nodenum = noderef;
3018 		goto bail;
3019 	}
3020 
3021 bail:
3022 	return nodenum;
3023 }
3024 
3025 /* this is called by the new master once all lockres
3026  * data has been received */
3027 static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
3028 				  struct dlm_lock_resource *res,
3029 				  u8 master, u8 new_master,
3030 				  struct dlm_node_iter *iter)
3031 {
3032 	struct dlm_migrate_request migrate;
3033 	int ret, skip, status = 0;
3034 	int nodenum;
3035 
3036 	memset(&migrate, 0, sizeof(migrate));
3037 	migrate.namelen = res->lockname.len;
3038 	memcpy(migrate.name, res->lockname.name, migrate.namelen);
3039 	migrate.new_master = new_master;
3040 	migrate.master = master;
3041 
3042 	ret = 0;
3043 
3044 	/* send message to all nodes, except the master and myself */
3045 	while ((nodenum = dlm_node_iter_next(iter)) >= 0) {
3046 		if (nodenum == master ||
3047 		    nodenum == new_master)
3048 			continue;
3049 
3050 		/* We could race exit domain. If exited, skip. */
3051 		spin_lock(&dlm->spinlock);
3052 		skip = (!test_bit(nodenum, dlm->domain_map));
3053 		spin_unlock(&dlm->spinlock);
3054 		if (skip) {
3055 			clear_bit(nodenum, iter->node_map);
3056 			continue;
3057 		}
3058 
3059 		ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
3060 					 &migrate, sizeof(migrate), nodenum,
3061 					 &status);
3062 		if (ret < 0) {
3063 			mlog(ML_ERROR, "%s: res %.*s, Error %d send "
3064 			     "MIGRATE_REQUEST to node %u\n", dlm->name,
3065 			     migrate.namelen, migrate.name, ret, nodenum);
3066 			if (!dlm_is_host_down(ret)) {
3067 				mlog(ML_ERROR, "unhandled error=%d!\n", ret);
3068 				BUG();
3069 			}
3070 			clear_bit(nodenum, iter->node_map);
3071 			ret = 0;
3072 		} else if (status < 0) {
3073 			mlog(0, "migrate request (node %u) returned %d!\n",
3074 			     nodenum, status);
3075 			ret = status;
3076 		} else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) {
3077 			/* during the migration request we short-circuited
3078 			 * the mastery of the lockres.  make sure we have
3079 			 * a mastery ref for nodenum */
3080 			mlog(0, "%s:%.*s: need ref for node %u\n",
3081 			     dlm->name, res->lockname.len, res->lockname.name,
3082 			     nodenum);
3083 			spin_lock(&res->spinlock);
3084 			dlm_lockres_set_refmap_bit(dlm, res, nodenum);
3085 			spin_unlock(&res->spinlock);
3086 		}
3087 	}
3088 
3089 	if (ret < 0)
3090 		mlog_errno(ret);
3091 
3092 	mlog(0, "returning ret=%d\n", ret);
3093 	return ret;
3094 }
3095 
3096 
3097 /* if there is an existing mle for this lockres, we now know who the master is.
3098  * (the one who sent us *this* message) we can clear it up right away.
3099  * since the process that put the mle on the list still has a reference to it,
3100  * we can unhash it now, set the master and wake the process.  as a result,
3101  * we will have no mle in the list to start with.  now we can add an mle for
3102  * the migration and this should be the only one found for those scanning the
3103  * list.  */
3104 int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
3105 				void **ret_data)
3106 {
3107 	struct dlm_ctxt *dlm = data;
3108 	struct dlm_lock_resource *res = NULL;
3109 	struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
3110 	struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
3111 	const char *name;
3112 	unsigned int namelen, hash;
3113 	int ret = 0;
3114 
3115 	if (!dlm_grab(dlm))
3116 		return 0;
3117 
3118 	name = migrate->name;
3119 	namelen = migrate->namelen;
3120 	hash = dlm_lockid_hash(name, namelen);
3121 
3122 	/* preallocate.. if this fails, abort */
3123 	mle = kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
3124 
3125 	if (!mle) {
3126 		ret = -ENOMEM;
3127 		goto leave;
3128 	}
3129 
3130 	/* check for pre-existing lock */
3131 	spin_lock(&dlm->spinlock);
3132 	res = __dlm_lookup_lockres(dlm, name, namelen, hash);
3133 	if (res) {
3134 		spin_lock(&res->spinlock);
3135 		if (res->state & DLM_LOCK_RES_RECOVERING) {
3136 			/* if all is working ok, this can only mean that we got
3137 		 	* a migrate request from a node that we now see as
3138 		 	* dead.  what can we do here?  drop it to the floor? */
3139 			spin_unlock(&res->spinlock);
3140 			mlog(ML_ERROR, "Got a migrate request, but the "
3141 			     "lockres is marked as recovering!");
3142 			kmem_cache_free(dlm_mle_cache, mle);
3143 			ret = -EINVAL; /* need a better solution */
3144 			goto unlock;
3145 		}
3146 		res->state |= DLM_LOCK_RES_MIGRATING;
3147 		spin_unlock(&res->spinlock);
3148 	}
3149 
3150 	spin_lock(&dlm->master_lock);
3151 	/* ignore status.  only nonzero status would BUG. */
3152 	ret = dlm_add_migration_mle(dlm, res, mle, &oldmle,
3153 				    name, namelen,
3154 				    migrate->new_master,
3155 				    migrate->master);
3156 
3157 	if (ret < 0)
3158 		kmem_cache_free(dlm_mle_cache, mle);
3159 
3160 	spin_unlock(&dlm->master_lock);
3161 unlock:
3162 	spin_unlock(&dlm->spinlock);
3163 
3164 	if (oldmle) {
3165 		/* master is known, detach if not already detached */
3166 		dlm_mle_detach_hb_events(dlm, oldmle);
3167 		dlm_put_mle(oldmle);
3168 	}
3169 
3170 	if (res)
3171 		dlm_lockres_put(res);
3172 leave:
3173 	dlm_put(dlm);
3174 	return ret;
3175 }
3176 
3177 /* must be holding dlm->spinlock and dlm->master_lock
3178  * when adding a migration mle, we can clear any other mles
3179  * in the master list because we know with certainty that
3180  * the master is "master".  so we remove any old mle from
3181  * the list after setting it's master field, and then add
3182  * the new migration mle.  this way we can hold with the rule
3183  * of having only one mle for a given lock name at all times. */
3184 static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
3185 				 struct dlm_lock_resource *res,
3186 				 struct dlm_master_list_entry *mle,
3187 				 struct dlm_master_list_entry **oldmle,
3188 				 const char *name, unsigned int namelen,
3189 				 u8 new_master, u8 master)
3190 {
3191 	int found;
3192 	int ret = 0;
3193 
3194 	*oldmle = NULL;
3195 
3196 	assert_spin_locked(&dlm->spinlock);
3197 	assert_spin_locked(&dlm->master_lock);
3198 
3199 	/* caller is responsible for any ref taken here on oldmle */
3200 	found = dlm_find_mle(dlm, oldmle, (char *)name, namelen);
3201 	if (found) {
3202 		struct dlm_master_list_entry *tmp = *oldmle;
3203 		spin_lock(&tmp->spinlock);
3204 		if (tmp->type == DLM_MLE_MIGRATION) {
3205 			if (master == dlm->node_num) {
3206 				/* ah another process raced me to it */
3207 				mlog(0, "tried to migrate %.*s, but some "
3208 				     "process beat me to it\n",
3209 				     namelen, name);
3210 				spin_unlock(&tmp->spinlock);
3211 				return -EEXIST;
3212 			} else {
3213 				/* bad.  2 NODES are trying to migrate! */
3214 				mlog(ML_ERROR, "migration error  mle: "
3215 				     "master=%u new_master=%u // request: "
3216 				     "master=%u new_master=%u // "
3217 				     "lockres=%.*s\n",
3218 				     tmp->master, tmp->new_master,
3219 				     master, new_master,
3220 				     namelen, name);
3221 				BUG();
3222 			}
3223 		} else {
3224 			/* this is essentially what assert_master does */
3225 			tmp->master = master;
3226 			atomic_set(&tmp->woken, 1);
3227 			wake_up(&tmp->wq);
3228 			/* remove it so that only one mle will be found */
3229 			__dlm_unlink_mle(dlm, tmp);
3230 			__dlm_mle_detach_hb_events(dlm, tmp);
3231 			if (tmp->type == DLM_MLE_MASTER) {
3232 				ret = DLM_MIGRATE_RESPONSE_MASTERY_REF;
3233 				mlog(0, "%s:%.*s: master=%u, newmaster=%u, "
3234 						"telling master to get ref "
3235 						"for cleared out mle during "
3236 						"migration\n", dlm->name,
3237 						namelen, name, master,
3238 						new_master);
3239 			}
3240 		}
3241 		spin_unlock(&tmp->spinlock);
3242 	}
3243 
3244 	/* now add a migration mle to the tail of the list */
3245 	dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen);
3246 	mle->new_master = new_master;
3247 	/* the new master will be sending an assert master for this.
3248 	 * at that point we will get the refmap reference */
3249 	mle->master = master;
3250 	/* do this for consistency with other mle types */
3251 	set_bit(new_master, mle->maybe_map);
3252 	__dlm_insert_mle(dlm, mle);
3253 
3254 	return ret;
3255 }
3256 
3257 /*
3258  * Sets the owner of the lockres, associated to the mle, to UNKNOWN
3259  */
3260 static struct dlm_lock_resource *dlm_reset_mleres_owner(struct dlm_ctxt *dlm,
3261 					struct dlm_master_list_entry *mle)
3262 {
3263 	struct dlm_lock_resource *res;
3264 
3265 	/* Find the lockres associated to the mle and set its owner to UNK */
3266 	res = __dlm_lookup_lockres(dlm, mle->mname, mle->mnamelen,
3267 				   mle->mnamehash);
3268 	if (res) {
3269 		spin_unlock(&dlm->master_lock);
3270 
3271 		/* move lockres onto recovery list */
3272 		spin_lock(&res->spinlock);
3273 		dlm_set_lockres_owner(dlm, res, DLM_LOCK_RES_OWNER_UNKNOWN);
3274 		dlm_move_lockres_to_recovery_list(dlm, res);
3275 		spin_unlock(&res->spinlock);
3276 		dlm_lockres_put(res);
3277 
3278 		/* about to get rid of mle, detach from heartbeat */
3279 		__dlm_mle_detach_hb_events(dlm, mle);
3280 
3281 		/* dump the mle */
3282 		spin_lock(&dlm->master_lock);
3283 		__dlm_put_mle(mle);
3284 		spin_unlock(&dlm->master_lock);
3285 	}
3286 
3287 	return res;
3288 }
3289 
3290 static void dlm_clean_migration_mle(struct dlm_ctxt *dlm,
3291 				    struct dlm_master_list_entry *mle)
3292 {
3293 	__dlm_mle_detach_hb_events(dlm, mle);
3294 
3295 	spin_lock(&mle->spinlock);
3296 	__dlm_unlink_mle(dlm, mle);
3297 	atomic_set(&mle->woken, 1);
3298 	spin_unlock(&mle->spinlock);
3299 
3300 	wake_up(&mle->wq);
3301 }
3302 
3303 static void dlm_clean_block_mle(struct dlm_ctxt *dlm,
3304 				struct dlm_master_list_entry *mle, u8 dead_node)
3305 {
3306 	int bit;
3307 
3308 	BUG_ON(mle->type != DLM_MLE_BLOCK);
3309 
3310 	spin_lock(&mle->spinlock);
3311 	bit = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0);
3312 	if (bit != dead_node) {
3313 		mlog(0, "mle found, but dead node %u would not have been "
3314 		     "master\n", dead_node);
3315 		spin_unlock(&mle->spinlock);
3316 	} else {
3317 		/* Must drop the refcount by one since the assert_master will
3318 		 * never arrive. This may result in the mle being unlinked and
3319 		 * freed, but there may still be a process waiting in the
3320 		 * dlmlock path which is fine. */
3321 		mlog(0, "node %u was expected master\n", dead_node);
3322 		atomic_set(&mle->woken, 1);
3323 		spin_unlock(&mle->spinlock);
3324 		wake_up(&mle->wq);
3325 
3326 		/* Do not need events any longer, so detach from heartbeat */
3327 		__dlm_mle_detach_hb_events(dlm, mle);
3328 		__dlm_put_mle(mle);
3329 	}
3330 }
3331 
3332 void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
3333 {
3334 	struct dlm_master_list_entry *mle;
3335 	struct dlm_lock_resource *res;
3336 	struct hlist_head *bucket;
3337 	struct hlist_node *tmp;
3338 	unsigned int i;
3339 
3340 	mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node);
3341 top:
3342 	assert_spin_locked(&dlm->spinlock);
3343 
3344 	/* clean the master list */
3345 	spin_lock(&dlm->master_lock);
3346 	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3347 		bucket = dlm_master_hash(dlm, i);
3348 		hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
3349 			BUG_ON(mle->type != DLM_MLE_BLOCK &&
3350 			       mle->type != DLM_MLE_MASTER &&
3351 			       mle->type != DLM_MLE_MIGRATION);
3352 
3353 			/* MASTER mles are initiated locally. The waiting
3354 			 * process will notice the node map change shortly.
3355 			 * Let that happen as normal. */
3356 			if (mle->type == DLM_MLE_MASTER)
3357 				continue;
3358 
3359 			/* BLOCK mles are initiated by other nodes. Need to
3360 			 * clean up if the dead node would have been the
3361 			 * master. */
3362 			if (mle->type == DLM_MLE_BLOCK) {
3363 				dlm_clean_block_mle(dlm, mle, dead_node);
3364 				continue;
3365 			}
3366 
3367 			/* Everything else is a MIGRATION mle */
3368 
3369 			/* The rule for MIGRATION mles is that the master
3370 			 * becomes UNKNOWN if *either* the original or the new
3371 			 * master dies. All UNKNOWN lockres' are sent to
3372 			 * whichever node becomes the recovery master. The new
3373 			 * master is responsible for determining if there is
3374 			 * still a master for this lockres, or if he needs to
3375 			 * take over mastery. Either way, this node should
3376 			 * expect another message to resolve this. */
3377 
3378 			if (mle->master != dead_node &&
3379 			    mle->new_master != dead_node)
3380 				continue;
3381 
3382 			if (mle->new_master == dead_node && mle->inuse) {
3383 				mlog(ML_NOTICE, "%s: target %u died during "
3384 						"migration from %u, the MLE is "
3385 						"still keep used, ignore it!\n",
3386 						dlm->name, dead_node,
3387 						mle->master);
3388 				continue;
3389 			}
3390 
3391 			/* If we have reached this point, this mle needs to be
3392 			 * removed from the list and freed. */
3393 			dlm_clean_migration_mle(dlm, mle);
3394 
3395 			mlog(0, "%s: node %u died during migration from "
3396 			     "%u to %u!\n", dlm->name, dead_node, mle->master,
3397 			     mle->new_master);
3398 
3399 			/* If we find a lockres associated with the mle, we've
3400 			 * hit this rare case that messes up our lock ordering.
3401 			 * If so, we need to drop the master lock so that we can
3402 			 * take the lockres lock, meaning that we will have to
3403 			 * restart from the head of list. */
3404 			res = dlm_reset_mleres_owner(dlm, mle);
3405 			if (res)
3406 				/* restart */
3407 				goto top;
3408 
3409 			/* This may be the last reference */
3410 			__dlm_put_mle(mle);
3411 		}
3412 	}
3413 	spin_unlock(&dlm->master_lock);
3414 }
3415 
3416 int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
3417 			 u8 old_master)
3418 {
3419 	struct dlm_node_iter iter;
3420 	int ret = 0;
3421 
3422 	spin_lock(&dlm->spinlock);
3423 	dlm_node_iter_init(dlm->domain_map, &iter);
3424 	clear_bit(old_master, iter.node_map);
3425 	clear_bit(dlm->node_num, iter.node_map);
3426 	spin_unlock(&dlm->spinlock);
3427 
3428 	/* ownership of the lockres is changing.  account for the
3429 	 * mastery reference here since old_master will briefly have
3430 	 * a reference after the migration completes */
3431 	spin_lock(&res->spinlock);
3432 	dlm_lockres_set_refmap_bit(dlm, res, old_master);
3433 	spin_unlock(&res->spinlock);
3434 
3435 	mlog(0, "now time to do a migrate request to other nodes\n");
3436 	ret = dlm_do_migrate_request(dlm, res, old_master,
3437 				     dlm->node_num, &iter);
3438 	if (ret < 0) {
3439 		mlog_errno(ret);
3440 		goto leave;
3441 	}
3442 
3443 	mlog(0, "doing assert master of %.*s to all except the original node\n",
3444 	     res->lockname.len, res->lockname.name);
3445 	/* this call now finishes out the nodemap
3446 	 * even if one or more nodes die */
3447 	ret = dlm_do_assert_master(dlm, res, iter.node_map,
3448 				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
3449 	if (ret < 0) {
3450 		/* no longer need to retry.  all living nodes contacted. */
3451 		mlog_errno(ret);
3452 		ret = 0;
3453 	}
3454 
3455 	memset(iter.node_map, 0, sizeof(iter.node_map));
3456 	set_bit(old_master, iter.node_map);
3457 	mlog(0, "doing assert master of %.*s back to %u\n",
3458 	     res->lockname.len, res->lockname.name, old_master);
3459 	ret = dlm_do_assert_master(dlm, res, iter.node_map,
3460 				   DLM_ASSERT_MASTER_FINISH_MIGRATION);
3461 	if (ret < 0) {
3462 		mlog(0, "assert master to original master failed "
3463 		     "with %d.\n", ret);
3464 		/* the only nonzero status here would be because of
3465 		 * a dead original node.  we're done. */
3466 		ret = 0;
3467 	}
3468 
3469 	/* all done, set the owner, clear the flag */
3470 	spin_lock(&res->spinlock);
3471 	dlm_set_lockres_owner(dlm, res, dlm->node_num);
3472 	res->state &= ~DLM_LOCK_RES_MIGRATING;
3473 	spin_unlock(&res->spinlock);
3474 	/* re-dirty it on the new master */
3475 	dlm_kick_thread(dlm, res);
3476 	wake_up(&res->wq);
3477 leave:
3478 	return ret;
3479 }
3480 
3481 /*
3482  * LOCKRES AST REFCOUNT
3483  * this is integral to migration
3484  */
3485 
3486 /* for future intent to call an ast, reserve one ahead of time.
3487  * this should be called only after waiting on the lockres
3488  * with dlm_wait_on_lockres, and while still holding the
3489  * spinlock after the call. */
3490 void __dlm_lockres_reserve_ast(struct dlm_lock_resource *res)
3491 {
3492 	assert_spin_locked(&res->spinlock);
3493 	if (res->state & DLM_LOCK_RES_MIGRATING) {
3494 		__dlm_print_one_lock_resource(res);
3495 	}
3496 	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
3497 
3498 	atomic_inc(&res->asts_reserved);
3499 }
3500 
3501 /*
3502  * used to drop the reserved ast, either because it went unused,
3503  * or because the ast/bast was actually called.
3504  *
3505  * also, if there is a pending migration on this lockres,
3506  * and this was the last pending ast on the lockres,
3507  * atomically set the MIGRATING flag before we drop the lock.
3508  * this is how we ensure that migration can proceed with no
3509  * asts in progress.  note that it is ok if the state of the
3510  * queues is such that a lock should be granted in the future
3511  * or that a bast should be fired, because the new master will
3512  * shuffle the lists on this lockres as soon as it is migrated.
3513  */
3514 void dlm_lockres_release_ast(struct dlm_ctxt *dlm,
3515 			     struct dlm_lock_resource *res)
3516 {
3517 	if (!atomic_dec_and_lock(&res->asts_reserved, &res->spinlock))
3518 		return;
3519 
3520 	if (!res->migration_pending) {
3521 		spin_unlock(&res->spinlock);
3522 		return;
3523 	}
3524 
3525 	BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
3526 	res->migration_pending = 0;
3527 	res->state |= DLM_LOCK_RES_MIGRATING;
3528 	spin_unlock(&res->spinlock);
3529 	wake_up(&res->wq);
3530 	wake_up(&dlm->migration_wq);
3531 }
3532 
3533 void dlm_force_free_mles(struct dlm_ctxt *dlm)
3534 {
3535 	int i;
3536 	struct hlist_head *bucket;
3537 	struct dlm_master_list_entry *mle;
3538 	struct hlist_node *tmp;
3539 
3540 	/*
3541 	 * We notified all other nodes that we are exiting the domain and
3542 	 * marked the dlm state to DLM_CTXT_LEAVING. If any mles are still
3543 	 * around we force free them and wake any processes that are waiting
3544 	 * on the mles
3545 	 */
3546 	spin_lock(&dlm->spinlock);
3547 	spin_lock(&dlm->master_lock);
3548 
3549 	BUG_ON(dlm->dlm_state != DLM_CTXT_LEAVING);
3550 	BUG_ON((find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES));
3551 
3552 	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
3553 		bucket = dlm_master_hash(dlm, i);
3554 		hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) {
3555 			if (mle->type != DLM_MLE_BLOCK) {
3556 				mlog(ML_ERROR, "bad mle: %p\n", mle);
3557 				dlm_print_one_mle(mle);
3558 			}
3559 			atomic_set(&mle->woken, 1);
3560 			wake_up(&mle->wq);
3561 
3562 			__dlm_unlink_mle(dlm, mle);
3563 			__dlm_mle_detach_hb_events(dlm, mle);
3564 			__dlm_put_mle(mle);
3565 		}
3566 	}
3567 	spin_unlock(&dlm->master_lock);
3568 	spin_unlock(&dlm->spinlock);
3569 }
3570