xref: /linux/fs/ocfs2/dlmglue.c (revision 5e8d780d745c1619aba81fe7166c5a4b5cad2b84)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * dlmglue.c
5  *
6  * Code which implements an OCFS2 specific interface to our DLM.
7  *
8  * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25 
26 #include <linux/types.h>
27 #include <linux/slab.h>
28 #include <linux/highmem.h>
29 #include <linux/mm.h>
30 #include <linux/smp_lock.h>
31 #include <linux/crc32.h>
32 #include <linux/kthread.h>
33 #include <linux/pagemap.h>
34 #include <linux/debugfs.h>
35 #include <linux/seq_file.h>
36 
37 #include <cluster/heartbeat.h>
38 #include <cluster/nodemanager.h>
39 #include <cluster/tcp.h>
40 
41 #include <dlm/dlmapi.h>
42 
43 #define MLOG_MASK_PREFIX ML_DLM_GLUE
44 #include <cluster/masklog.h>
45 
46 #include "ocfs2.h"
47 
48 #include "alloc.h"
49 #include "dlmglue.h"
50 #include "extent_map.h"
51 #include "heartbeat.h"
52 #include "inode.h"
53 #include "journal.h"
54 #include "slot_map.h"
55 #include "super.h"
56 #include "uptodate.h"
57 #include "vote.h"
58 
59 #include "buffer_head_io.h"
60 
/*
 * Tracks a thread waiting for a lockres' flag bits to reach a target
 * state: the waiter sleeps on mw_complete until
 * (l_flags & mw_mask) == mw_goal, at which point lockres_set_flags()
 * removes it from the list and completes it with mw_status == 0.
 */
struct ocfs2_mask_waiter {
	struct list_head	mw_item;	/* entry on lockres->l_mask_waiters */
	int			mw_status;	/* result handed back to the waiter */
	struct completion	mw_complete;	/* waiter sleeps here */
	unsigned long		mw_mask;	/* which l_flags bits to test */
	unsigned long		mw_goal;	/* value the masked bits must equal */
};
68 
69 static void ocfs2_inode_ast_func(void *opaque);
70 static void ocfs2_inode_bast_func(void *opaque,
71 				  int level);
72 static void ocfs2_super_ast_func(void *opaque);
73 static void ocfs2_super_bast_func(void *opaque,
74 				  int level);
75 static void ocfs2_rename_ast_func(void *opaque);
76 static void ocfs2_rename_bast_func(void *opaque,
77 				   int level);
78 
79 /* so far, all locks have gotten along with the same unlock ast */
80 static void ocfs2_unlock_ast_func(void *opaque,
81 				  enum dlm_status status);
82 static int ocfs2_do_unblock_meta(struct inode *inode,
83 				 int *requeue);
84 static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
85 			      int *requeue);
86 static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
87 			      int *requeue);
88 static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
89 			      int *requeue);
90 static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
91 				  int *requeue);
92 typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int);
93 static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
94 				      struct ocfs2_lock_res *lockres,
95 				      int *requeue,
96 				      ocfs2_convert_worker_t *worker);
97 
/*
 * Per-lock-type callback table.  ast, bast and unlock_ast are handed
 * directly to the dlm via dlmlock()/dlmunlock(); unblock is called to
 * resolve a lock that another node has asked us to downconvert.  The
 * int* argument of unblock reports whether the lock should be
 * requeued for another pass.
 */
struct ocfs2_lock_res_ops {
	void (*ast)(void *);
	void (*bast)(void *, int);
	void (*unlock_ast)(void *, enum dlm_status);
	int  (*unblock)(struct ocfs2_lock_res *, int *);
};
104 
/* Callbacks for an inode's read/write (rw) lock resource. */
static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
	.ast		= ocfs2_inode_ast_func,
	.bast		= ocfs2_inode_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_inode_lock,
};
111 
/* Callbacks for an inode's metadata lock resource. */
static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
	.ast		= ocfs2_inode_ast_func,
	.bast		= ocfs2_inode_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_meta,
};
118 
119 static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
120 				      int blocking);
121 
/* Callbacks for an inode's data lock resource. */
static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
	.ast		= ocfs2_inode_ast_func,
	.bast		= ocfs2_inode_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_data,
};
128 
/* Callbacks for the (per-osb) superblock lock resource. */
static struct ocfs2_lock_res_ops ocfs2_super_lops = {
	.ast		= ocfs2_super_ast_func,
	.bast		= ocfs2_super_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_osb_lock,
};
135 
/* Callbacks for the (per-osb) rename lock resource. */
static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
	.ast		= ocfs2_rename_ast_func,
	.bast		= ocfs2_rename_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_osb_lock,
};
142 
143 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
144 {
145 	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
146 		lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
147 		lockres->l_type == OCFS2_LOCK_TYPE_RW;
148 }
149 
150 static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres)
151 {
152 	return lockres->l_type == OCFS2_LOCK_TYPE_SUPER;
153 }
154 
155 static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres)
156 {
157 	return lockres->l_type == OCFS2_LOCK_TYPE_RENAME;
158 }
159 
160 static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres)
161 {
162 	BUG_ON(!ocfs2_is_super_lock(lockres)
163 	       && !ocfs2_is_rename_lock(lockres));
164 
165 	return (struct ocfs2_super *) lockres->l_priv;
166 }
167 
168 static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
169 {
170 	BUG_ON(!ocfs2_is_inode_lock(lockres));
171 
172 	return (struct inode *) lockres->l_priv;
173 }
174 
175 static int ocfs2_lock_create(struct ocfs2_super *osb,
176 			     struct ocfs2_lock_res *lockres,
177 			     int level,
178 			     int dlm_flags);
179 static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
180 						     int wanted);
181 static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
182 				 struct ocfs2_lock_res *lockres,
183 				 int level);
184 static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
185 static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
186 static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
187 static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
188 static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
189 					struct ocfs2_lock_res *lockres);
190 static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
191 						int convert);
/* Log a dlm api failure: the error name/message for _stat, the api
 * function that was called and the lockres it was called against. */
#define ocfs2_log_dlm_error(_func, _stat, _lockres) do {	\
	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "	\
		"resource %s: %s\n", dlm_errname(_stat), _func,	\
		_lockres->l_name, dlm_errmsg(_stat));		\
} while (0)
197 static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
198 				 struct ocfs2_lock_res *lockres);
199 static int ocfs2_meta_lock_update(struct inode *inode,
200 				  struct buffer_head **bh);
201 static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
202 static inline int ocfs2_highest_compat_lock_level(int level);
203 static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
204 						  struct ocfs2_lock_res *lockres,
205 						  int new_level);
206 
/* Human-readable lock type names, indexed by enum ocfs2_lock_type. */
static char *ocfs2_lock_type_strings[] = {
	[OCFS2_LOCK_TYPE_META] = "Meta",
	[OCFS2_LOCK_TYPE_DATA] = "Data",
	[OCFS2_LOCK_TYPE_SUPER] = "Super",
	[OCFS2_LOCK_TYPE_RENAME] = "Rename",
	/* Need to differentiate from [R]ename.. serializing writes is the
	 * important job it does, anyway. */
	[OCFS2_LOCK_TYPE_RW] = "Write/Read",
};
216 
217 static char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
218 {
219 	mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
220 	return ocfs2_lock_type_strings[type];
221 }
222 
223 static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
224 				  u64 blkno,
225 				  u32 generation,
226 				  char *name)
227 {
228 	int len;
229 
230 	mlog_entry_void();
231 
232 	BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
233 
234 	len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
235 		       ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD,
236 		       (long long)blkno, generation);
237 
238 	BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
239 
240 	mlog(0, "built lock resource with name: %s\n", name);
241 
242 	mlog_exit_void();
243 }
244 
245 static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
246 
/* Publish @res on the per-osb debug tracking list (read by debugfs). */
static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
				       struct ocfs2_dlm_debug *dlm_debug)
{
	mlog(0, "Add tracking for lockres %s\n", res->l_name);

	/* ocfs2_dlm_tracking_lock serializes all tracking-list access. */
	spin_lock(&ocfs2_dlm_tracking_lock);
	list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking);
	spin_unlock(&ocfs2_dlm_tracking_lock);
}
256 
/* Unlink @res from the debug tracking list, if it is on it. */
static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
{
	spin_lock(&ocfs2_dlm_tracking_lock);
	/* NOTE(review): list_del_init() is already safe on an
	 * initialized-but-unlinked entry, so the emptiness check looks
	 * redundant — confirm l_debug_list is always initialized first. */
	if (!list_empty(&res->l_debug_list))
		list_del_init(&res->l_debug_list);
	spin_unlock(&ocfs2_dlm_tracking_lock);
}
264 
/*
 * Shared second-stage initialization for every lock resource type:
 * builds the dlm name, installs the callback table and private data,
 * resets all dlm state to "no lock held", and registers the resource
 * for debug tracking.  Callers must have run ocfs2_lock_res_init_once()
 * (or the slab constructor) on @res first.
 */
static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
				       struct ocfs2_lock_res *res,
				       enum ocfs2_lock_type type,
				       u64 blkno,
				       u32 generation,
				       struct ocfs2_lock_res_ops *ops,
				       void *priv)
{
	ocfs2_build_lock_name(type, blkno, generation, res->l_name);

	res->l_type          = type;
	res->l_ops           = ops;
	res->l_priv          = priv;

	/* LKM_IVMODE == no level granted, requested or blocking yet. */
	res->l_level         = LKM_IVMODE;
	res->l_requested     = LKM_IVMODE;
	res->l_blocking      = LKM_IVMODE;
	res->l_action        = OCFS2_AST_INVALID;
	res->l_unlock_action = OCFS2_UNLOCK_INVALID;

	/* INITIALIZED is what ocfs2_lock_res_free() keys off of. */
	res->l_flags         = OCFS2_LOCK_INITIALIZED;

	ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
}
289 
/*
 * One-time constructor for a lock resource: zero everything and set up
 * the embedded lock, waitqueue and lists.  Safe to call on memory that
 * has never been initialized.
 */
void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
{
	/* This also clears out the lock status block */
	memset(res, 0, sizeof(struct ocfs2_lock_res));
	spin_lock_init(&res->l_lock);
	init_waitqueue_head(&res->l_event);
	INIT_LIST_HEAD(&res->l_blocked_list);
	INIT_LIST_HEAD(&res->l_mask_waiters);
}
299 
300 void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
301 			       enum ocfs2_lock_type type,
302 			       struct inode *inode)
303 {
304 	struct ocfs2_lock_res_ops *ops;
305 
306 	switch(type) {
307 		case OCFS2_LOCK_TYPE_RW:
308 			ops = &ocfs2_inode_rw_lops;
309 			break;
310 		case OCFS2_LOCK_TYPE_META:
311 			ops = &ocfs2_inode_meta_lops;
312 			break;
313 		case OCFS2_LOCK_TYPE_DATA:
314 			ops = &ocfs2_inode_data_lops;
315 			break;
316 		default:
317 			mlog_bug_on_msg(1, "type: %d\n", type);
318 			ops = NULL; /* thanks, gcc */
319 			break;
320 	};
321 
322 	ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type,
323 				   OCFS2_I(inode)->ip_blkno,
324 				   inode->i_generation, ops, inode);
325 }
326 
/* Initialize the per-mount superblock lock resource, keyed by the
 * fixed superblock block number (generation 0). */
static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
				      struct ocfs2_super *osb)
{
	/* Superblock lockres doesn't come from a slab so we call init
	 * once on it manually.  */
	ocfs2_lock_res_init_once(res);
	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER,
				   OCFS2_SUPER_BLOCK_BLKNO, 0,
				   &ocfs2_super_lops, osb);
}
337 
/* Initialize the per-mount rename lock resource (blkno/generation are
 * both 0 — there is only one rename lock per cluster). */
static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
				       struct ocfs2_super *osb)
{
	/* Rename lockres doesn't come from a slab so we call init
	 * once on it manually.  */
	ocfs2_lock_res_init_once(res);
	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0,
				   &ocfs2_rename_lops, osb);
}
347 
/*
 * Tear down a lock resource before its memory is released.  A resource
 * that was never initialized is ignored.  BUGs if the resource is
 * still in use in any way (queued, waited on, locked, or holding
 * ro/ex references).
 */
void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
{
	mlog_entry_void();

	if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
		return;

	ocfs2_remove_lockres_tracking(res);

	mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
			"Lockres %s is on the blocked list\n",
			res->l_name);
	mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
			"Lockres %s has mask waiters pending\n",
			res->l_name);
	mlog_bug_on_msg(spin_is_locked(&res->l_lock),
			"Lockres %s is locked\n",
			res->l_name);
	mlog_bug_on_msg(res->l_ro_holders,
			"Lockres %s has %u ro holders\n",
			res->l_name, res->l_ro_holders);
	mlog_bug_on_msg(res->l_ex_holders,
			"Lockres %s has %u ex holders\n",
			res->l_name, res->l_ex_holders);

	/* Need to clear out the lock status block for the dlm */
	memset(&res->l_lksb, 0, sizeof(res->l_lksb));

	res->l_flags = 0UL;
	mlog_exit_void();
}
379 
380 static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
381 				     int level)
382 {
383 	mlog_entry_void();
384 
385 	BUG_ON(!lockres);
386 
387 	switch(level) {
388 	case LKM_EXMODE:
389 		lockres->l_ex_holders++;
390 		break;
391 	case LKM_PRMODE:
392 		lockres->l_ro_holders++;
393 		break;
394 	default:
395 		BUG();
396 	}
397 
398 	mlog_exit_void();
399 }
400 
401 static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
402 				     int level)
403 {
404 	mlog_entry_void();
405 
406 	BUG_ON(!lockres);
407 
408 	switch(level) {
409 	case LKM_EXMODE:
410 		BUG_ON(!lockres->l_ex_holders);
411 		lockres->l_ex_holders--;
412 		break;
413 	case LKM_PRMODE:
414 		BUG_ON(!lockres->l_ro_holders);
415 		lockres->l_ro_holders--;
416 		break;
417 	default:
418 		BUG();
419 	}
420 	mlog_exit_void();
421 }
422 
423 /* WARNING: This function lives in a world where the only three lock
424  * levels are EX, PR, and NL. It *will* have to be adjusted when more
425  * lock types are added. */
426 static inline int ocfs2_highest_compat_lock_level(int level)
427 {
428 	int new_level = LKM_EXMODE;
429 
430 	if (level == LKM_EXMODE)
431 		new_level = LKM_NLMODE;
432 	else if (level == LKM_PRMODE)
433 		new_level = LKM_PRMODE;
434 	return new_level;
435 }
436 
437 static void lockres_set_flags(struct ocfs2_lock_res *lockres,
438 			      unsigned long newflags)
439 {
440 	struct list_head *pos, *tmp;
441 	struct ocfs2_mask_waiter *mw;
442 
443  	assert_spin_locked(&lockres->l_lock);
444 
445 	lockres->l_flags = newflags;
446 
447 	list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) {
448 		mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
449 		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
450 			continue;
451 
452 		list_del_init(&mw->mw_item);
453 		mw->mw_status = 0;
454 		complete(&mw->mw_complete);
455 	}
456 }
/* Set flag bits; goes through lockres_set_flags() so waiters wake.
 * Caller holds l_lock. */
static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
{
	lockres_set_flags(lockres, lockres->l_flags | or);
}
/* Clear flag bits; goes through lockres_set_flags() so waiters wake.
 * Caller holds l_lock. */
static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
				unsigned long clear)
{
	lockres_set_flags(lockres, lockres->l_flags & ~clear);
}
466 
/*
 * AST handler body for a completed downconvert: commit the new
 * (lower) level and, if it no longer conflicts with the level that
 * blocked us, clear the BLOCKED state.  Caller holds l_lock.
 */
static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
{
	mlog_entry_void();

	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
	BUG_ON(lockres->l_blocking <= LKM_NLMODE);

	lockres->l_level = lockres->l_requested;
	/* We dropped low enough to coexist with the blocker, so the
	 * lock is no longer considered blocked. */
	if (lockres->l_level <=
	    ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
		lockres->l_blocking = LKM_NLMODE;
		lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
	}
	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);

	mlog_exit_void();
}
486 
/*
 * AST handler body for a completed upconvert: commit the new (higher)
 * level and, when coming up from NL, flag that the local view of the
 * protected data needs refreshing.  Caller holds l_lock.
 */
static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
{
	mlog_entry_void();

	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));

	/* Convert from RO to EX doesn't really need anything as our
	 * information is already up to date. Convert from NL to
	 * *anything* however should mark ourselves as needing an
	 * update */
	if (lockres->l_level == LKM_NLMODE)
		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);

	lockres->l_level = lockres->l_requested;
	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);

	mlog_exit_void();
}
506 
507 static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
508 {
509 	mlog_entry_void();
510 
511 	BUG_ON((!lockres->l_flags & OCFS2_LOCK_BUSY));
512 	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
513 
514 	if (lockres->l_requested > LKM_NLMODE &&
515 	    !(lockres->l_flags & OCFS2_LOCK_LOCAL))
516 		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
517 
518 	lockres->l_level = lockres->l_requested;
519 	lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
520 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
521 
522 	mlog_exit_void();
523 }
524 
/*
 * dlm AST callback for inode (rw/meta/data) locks.  Dispatches on the
 * pending l_action (attach/convert/downconvert), commits the result
 * into the lockres state, and wakes anyone sleeping on l_event.  A
 * non-DLM_NORMAL lksb status is logged and the AST is otherwise
 * ignored.
 */
static void ocfs2_inode_ast_func(void *opaque)
{
	struct ocfs2_lock_res *lockres = opaque;
	struct inode *inode;
	struct dlm_lockstatus *lksb;
	unsigned long flags;

	mlog_entry_void();

	inode = ocfs2_lock_res_inode(lockres);

	mlog(0, "AST fired for inode %llu, l_action = %u, type = %s\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno, lockres->l_action,
	     ocfs2_lock_type_string(lockres->l_type));

	BUG_ON(!ocfs2_is_inode_lock(lockres));

	spin_lock_irqsave(&lockres->l_lock, flags);

	lksb = &(lockres->l_lksb);
	if (lksb->status != DLM_NORMAL) {
		/* The dlm reported failure; leave l_action in place and
		 * bail without touching our state. */
		mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u "
		     "on inode %llu\n", lksb->status,
		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		mlog_exit_void();
		return;
	}

	switch(lockres->l_action) {
	case OCFS2_AST_ATTACH:
		ocfs2_generic_handle_attach_action(lockres);
		/* LOCAL only applies to the very first grant. */
		lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
		break;
	case OCFS2_AST_CONVERT:
		ocfs2_generic_handle_convert_action(lockres);
		break;
	case OCFS2_AST_DOWNCONVERT:
		ocfs2_generic_handle_downconvert_action(lockres);
		break;
	default:
		mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
		     "lockres flags = 0x%lx, unlock action: %u\n",
		     lockres->l_name, lockres->l_action, lockres->l_flags,
		     lockres->l_unlock_action);

		BUG();
	}

	/* data and rw locking ignores refresh flag for now. */
	if (lockres->l_type != OCFS2_LOCK_TYPE_META)
		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);

	/* set it to something invalid so if we get called again we
	 * can catch it. */
	lockres->l_action = OCFS2_AST_INVALID;
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	wake_up(&lockres->l_event);

	mlog_exit_void();
}
586 
/*
 * Record that another node is blocked on us at @level.  Returns 1 if
 * a (further) downconvert needs to be scheduled, 0 if an already
 * pending downconvert covers this request.  Caller holds l_lock.
 */
static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
				     int level)
{
	int needs_downconvert = 0;
	mlog_entry_void();

	assert_spin_locked(&lockres->l_lock);

	lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);

	if (level > lockres->l_blocking) {
		/* only schedule a downconvert if we haven't already scheduled
		 * one that goes low enough to satisfy the level we're
		 * blocking.  this also catches the case where we get
		 * duplicate BASTs */
		if (ocfs2_highest_compat_lock_level(level) <
		    ocfs2_highest_compat_lock_level(lockres->l_blocking))
			needs_downconvert = 1;

		lockres->l_blocking = level;
	}

	mlog_exit(needs_downconvert);
	return needs_downconvert;
}
612 
/*
 * Common BAST handling: mark the lockres blocked at @level, queue it
 * for downconvert if needed, and kick the vote thread to process the
 * queue.  Also wakes l_event so in-flight lockers re-evaluate.
 */
static void ocfs2_generic_bast_func(struct ocfs2_super *osb,
				    struct ocfs2_lock_res *lockres,
				    int level)
{
	int needs_downconvert;
	unsigned long flags;

	mlog_entry_void();

	/* A BAST for NL would be meaningless — nothing conflicts with NL. */
	BUG_ON(level <= LKM_NLMODE);

	spin_lock_irqsave(&lockres->l_lock, flags);
	needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
	if (needs_downconvert)
		ocfs2_schedule_blocked_lock(osb, lockres);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	ocfs2_kick_vote_thread(osb);

	wake_up(&lockres->l_event);
	mlog_exit_void();
}
635 
/* dlm BAST callback for inode locks: log and defer to the generic
 * handler with the owning inode's superblock. */
static void ocfs2_inode_bast_func(void *opaque, int level)
{
	struct ocfs2_lock_res *lockres = opaque;
	struct inode *inode;
	struct ocfs2_super *osb;

	mlog_entry_void();

	BUG_ON(!ocfs2_is_inode_lock(lockres));

	inode = ocfs2_lock_res_inode(lockres);
	osb = OCFS2_SB(inode->i_sb);

	mlog(0, "BAST fired for inode %llu, blocking %d, level %d type %s\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno, level,
	     lockres->l_level, ocfs2_lock_type_string(lockres->l_type));

	ocfs2_generic_bast_func(osb, lockres, level);

	mlog_exit_void();
}
657 
/*
 * Common AST handling for non-inode locks (super, rename): dispatch on
 * the pending l_action and commit the result.  @ignore_refresh causes
 * NEEDS_REFRESH to be cleared unconditionally (used by locks that do
 * not protect cached data needing a refresh).
 */
static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres,
				   int ignore_refresh)
{
	struct dlm_lockstatus *lksb = &lockres->l_lksb;
	unsigned long flags;

	spin_lock_irqsave(&lockres->l_lock, flags);

	if (lksb->status != DLM_NORMAL) {
		/* dlm reported failure — log and leave state untouched. */
		mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
		     lockres->l_name, lksb->status);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		return;
	}

	switch(lockres->l_action) {
	case OCFS2_AST_ATTACH:
		ocfs2_generic_handle_attach_action(lockres);
		break;
	case OCFS2_AST_CONVERT:
		ocfs2_generic_handle_convert_action(lockres);
		break;
	case OCFS2_AST_DOWNCONVERT:
		ocfs2_generic_handle_downconvert_action(lockres);
		break;
	default:
		BUG();
	}

	if (ignore_refresh)
		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);

	/* set it to something invalid so if we get called again we
	 * can catch it. */
	lockres->l_action = OCFS2_AST_INVALID;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	wake_up(&lockres->l_event);
}
697 
/* dlm AST callback for the superblock lock; refresh flag is honored
 * (ignore_refresh == 0) since the super lock guards cached state. */
static void ocfs2_super_ast_func(void *opaque)
{
	struct ocfs2_lock_res *lockres = opaque;

	mlog_entry_void();
	mlog(0, "Superblock AST fired\n");

	BUG_ON(!ocfs2_is_super_lock(lockres));
	ocfs2_generic_ast_func(lockres, 0);

	mlog_exit_void();
}
710 
/* dlm BAST callback for the superblock lock: delegate to the generic
 * BAST handler with the owning ocfs2_super. */
static void ocfs2_super_bast_func(void *opaque,
				  int level)
{
	struct ocfs2_lock_res *lockres = opaque;
	struct ocfs2_super *osb;

	mlog_entry_void();
	mlog(0, "Superblock BAST fired\n");

	BUG_ON(!ocfs2_is_super_lock(lockres));

	osb = ocfs2_lock_res_super(lockres);
	ocfs2_generic_bast_func(osb, lockres, level);

	mlog_exit_void();
}
726 
/* dlm AST callback for the rename lock; NEEDS_REFRESH is always
 * cleared (ignore_refresh == 1) — the rename lock protects no cached
 * data. */
static void ocfs2_rename_ast_func(void *opaque)
{
	struct ocfs2_lock_res *lockres = opaque;

	mlog_entry_void();

	mlog(0, "Rename AST fired\n");

	BUG_ON(!ocfs2_is_rename_lock(lockres));

	ocfs2_generic_ast_func(lockres, 1);

	mlog_exit_void();
}
741 
/* dlm BAST callback for the rename lock: delegate to the generic BAST
 * handler with the owning ocfs2_super. */
static void ocfs2_rename_bast_func(void *opaque,
				   int level)
{
	struct ocfs2_lock_res *lockres = opaque;
	struct ocfs2_super *osb;

	mlog_entry_void();

	mlog(0, "Rename BAST fired\n");

	BUG_ON(!ocfs2_is_rename_lock(lockres));

	osb = ocfs2_lock_res_super(lockres);
	ocfs2_generic_bast_func(osb, lockres, level);

	mlog_exit_void();
}
759 
/*
 * Roll back local lockres state after a dlmlock()/dlmunlock() call
 * failed: clear BUSY and invalidate the pending action (@convert != 0)
 * or unlock action (@convert == 0), then wake waiters so they can
 * re-evaluate.
 */
static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
						int convert)
{
	unsigned long flags;

	mlog_entry_void();
	spin_lock_irqsave(&lockres->l_lock, flags);
	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
	if (convert)
		lockres->l_action = OCFS2_AST_INVALID;
	else
		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	wake_up(&lockres->l_event);
	mlog_exit_void();
}
777 
778 /* Note: If we detect another process working on the lock (i.e.,
779  * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller
780  * to do the right thing in that case.
781  */
782 static int ocfs2_lock_create(struct ocfs2_super *osb,
783 			     struct ocfs2_lock_res *lockres,
784 			     int level,
785 			     int dlm_flags)
786 {
787 	int ret = 0;
788 	enum dlm_status status;
789 	unsigned long flags;
790 
791 	mlog_entry_void();
792 
793 	mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
794 	     dlm_flags);
795 
796 	spin_lock_irqsave(&lockres->l_lock, flags);
797 	if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) ||
798 	    (lockres->l_flags & OCFS2_LOCK_BUSY)) {
799 		spin_unlock_irqrestore(&lockres->l_lock, flags);
800 		goto bail;
801 	}
802 
803 	lockres->l_action = OCFS2_AST_ATTACH;
804 	lockres->l_requested = level;
805 	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
806 	spin_unlock_irqrestore(&lockres->l_lock, flags);
807 
808 	status = dlmlock(osb->dlm,
809 			 level,
810 			 &lockres->l_lksb,
811 			 dlm_flags,
812 			 lockres->l_name,
813 			 lockres->l_ops->ast,
814 			 lockres,
815 			 lockres->l_ops->bast);
816 	if (status != DLM_NORMAL) {
817 		ocfs2_log_dlm_error("dlmlock", status, lockres);
818 		ret = -EINVAL;
819 		ocfs2_recover_from_dlm_error(lockres, 1);
820 	}
821 
822 	mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name);
823 
824 bail:
825 	mlog_exit(ret);
826 	return ret;
827 }
828 
/* Snapshot (l_flags & flag) under l_lock; used as a wait_event()
 * condition below. */
static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
					int flag)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&lockres->l_lock, flags);
	ret = lockres->l_flags & flag;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	return ret;
}
841 
/* Sleep until no dlm operation is in flight on @lockres (BUSY clear). */
static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres)

{
	wait_event(lockres->l_event,
		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY));
}
848 
/* Sleep until another thread finishes refreshing the protected data
 * (REFRESHING clear). */
static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres)

{
	wait_event(lockres->l_event,
		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING));
}
855 
/* predict what lock level we'll be dropping down to on behalf
 * of another node, and return true if the currently wanted
 * level will be compatible with it. */
static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
						     int wanted)
{
	/* Only meaningful while a remote node has us blocked. */
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));

	return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
}
866 
/* Prepare a (typically stack-allocated) mask waiter for use. */
static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
{
	INIT_LIST_HEAD(&mw->mw_item);
	init_completion(&mw->mw_complete);
}
872 
/* Block until the waiter is completed by lockres_set_flags(); returns
 * the status recorded at wakeup. */
static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
{
	wait_for_completion(&mw->mw_complete);
	/* Re-arm the completion in case we want to wait on it again */
	INIT_COMPLETION(mw->mw_complete);
	return mw->mw_status;
}
880 
/* Queue @mw to be completed when (l_flags & mask) == goal.  The waiter
 * must not already be queued; caller holds l_lock. */
static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
				    struct ocfs2_mask_waiter *mw,
				    unsigned long mask,
				    unsigned long goal)
{
	BUG_ON(!list_empty(&mw->mw_item));

	assert_spin_locked(&lockres->l_lock);

	list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
	mw->mw_mask = mask;
	mw->mw_goal = goal;
}
894 
/* returns 0 if the mw that was removed was already satisfied, -EBUSY
 * if the mask still hadn't reached its goal */
static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
				      struct ocfs2_mask_waiter *mw)
{
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&lockres->l_lock, flags);
	/* An empty mw_item means lockres_set_flags() already unlinked
	 * and completed us — report success (ret stays 0). */
	if (!list_empty(&mw->mw_item)) {
		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
			ret = -EBUSY;

		list_del_init(&mw->mw_item);
		/* Re-arm so the waiter can be reused. */
		init_completion(&mw->mw_complete);
	}
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	return ret;

}
916 
/*
 * Acquire a cluster lock on @lockres at @level, creating the dlm lock
 * on first use and upconverting as needed.  Loops ("again") until the
 * granted level covers the request, sleeping on a mask waiter whenever
 * the lock is BUSY or BLOCKED.
 *
 * arg_flags: OCFS2_LOCK_NONBLOCK makes a would-sleep acquisition fail
 * with -EAGAIN instead (see the lock-inversion note at "out:").
 *
 * Returns 0 on success and increments the holder count; -ERESTARTSYS
 * if a signal arrived (unless mounted nointr), -EAGAIN for
 * NONBLOCK/LKM_NOQUEUE refusals, -EINVAL on dlm errors.
 */
static int ocfs2_cluster_lock(struct ocfs2_super *osb,
			      struct ocfs2_lock_res *lockres,
			      int level,
			      int lkm_flags,
			      int arg_flags)
{
	struct ocfs2_mask_waiter mw;
	enum dlm_status status;
	int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
	int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
	unsigned long flags;

	mlog_entry_void();

	ocfs2_init_mask_waiter(&mw);

again:
	wait = 0;

	if (catch_signals && signal_pending(current)) {
		ret = -ERESTARTSYS;
		goto out;
	}

	spin_lock_irqsave(&lockres->l_lock, flags);

	mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
			"Cluster lock called on freeing lockres %s! flags "
			"0x%lx\n", lockres->l_name, lockres->l_flags);

	/* We only compare against the currently granted level
	 * here. If the lock is blocked waiting on a downconvert,
	 * we'll get caught below. */
	if (lockres->l_flags & OCFS2_LOCK_BUSY &&
	    level > lockres->l_level) {
		/* is someone sitting in dlm_lock? If so, wait on
		 * them. */
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
		wait = 1;
		goto unlock;
	}

	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
		/* lock has not been created yet. */
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		/* Attach at NL first; the convert path below raises it. */
		ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
		goto again;
	}

	if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
	    !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
		/* is the lock is currently blocked on behalf of
		 * another node */
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
		wait = 1;
		goto unlock;
	}

	if (level > lockres->l_level) {
		if (lockres->l_action != OCFS2_AST_INVALID)
			mlog(ML_ERROR, "lockres %s has action %u pending\n",
			     lockres->l_name, lockres->l_action);

		lockres->l_action = OCFS2_AST_CONVERT;
		lockres->l_requested = level;
		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		BUG_ON(level == LKM_IVMODE);
		BUG_ON(level == LKM_NLMODE);

		mlog(0, "lock %s, convert from %d to level = %d\n",
		     lockres->l_name, lockres->l_level, level);

		/* call dlm_lock to upgrade lock now */
		status = dlmlock(osb->dlm,
				 level,
				 &lockres->l_lksb,
				 lkm_flags|LKM_CONVERT|LKM_VALBLK,
				 lockres->l_name,
				 lockres->l_ops->ast,
				 lockres,
				 lockres->l_ops->bast);
		if (status != DLM_NORMAL) {
			if ((lkm_flags & LKM_NOQUEUE) &&
			    (status == DLM_NOTQUEUED))
				ret = -EAGAIN;
			else {
				ocfs2_log_dlm_error("dlmlock", status,
						    lockres);
				ret = -EINVAL;
			}
			ocfs2_recover_from_dlm_error(lockres, 1);
			goto out;
		}

		mlog(0, "lock %s, successfull return from dlmlock\n",
		     lockres->l_name);

		/* At this point we've gone inside the dlm and need to
		 * complete our work regardless. */
		catch_signals = 0;

		/* wait for busy to clear and carry on */
		goto again;
	}

	/* Ok, if we get here then we're good to go. */
	ocfs2_inc_holders(lockres, level);

	ret = 0;
unlock:
	spin_unlock_irqrestore(&lockres->l_lock, flags);
out:
	/*
	 * This is helping work around a lock inversion between the page lock
	 * and dlm locks.  One path holds the page lock while calling aops
	 * which block acquiring dlm locks.  The voting thread holds dlm
	 * locks while acquiring page locks while down converting data locks.
	 * This block is helping an aop path notice the inversion and back
	 * off to unlock its page lock before trying the dlm lock again.
	 */
	if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
	    mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
		wait = 0;
		if (lockres_remove_mask_waiter(lockres, &mw))
			ret = -EAGAIN;
		else
			goto again;
	}
	if (wait) {
		ret = ocfs2_wait_for_mask(&mw);
		if (ret == 0)
			goto again;
		mlog_errno(ret);
	}

	mlog_exit(ret);
	return ret;
}
1062 
1063 static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
1064 				 struct ocfs2_lock_res *lockres,
1065 				 int level)
1066 {
1067 	unsigned long flags;
1068 
1069 	mlog_entry_void();
1070 	spin_lock_irqsave(&lockres->l_lock, flags);
1071 	ocfs2_dec_holders(lockres, level);
1072 	ocfs2_vote_on_unlock(osb, lockres);
1073 	spin_unlock_irqrestore(&lockres->l_lock, flags);
1074 	mlog_exit_void();
1075 }
1076 
1077 static int ocfs2_create_new_inode_lock(struct inode *inode,
1078 				       struct ocfs2_lock_res *lockres)
1079 {
1080 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1081 	unsigned long flags;
1082 
1083 	spin_lock_irqsave(&lockres->l_lock, flags);
1084 	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
1085 	lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
1086 	spin_unlock_irqrestore(&lockres->l_lock, flags);
1087 
1088 	return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
1089 }
1090 
1091 /* Grants us an EX lock on the data and metadata resources, skipping
1092  * the normal cluster directory lookup. Use this ONLY on newly created
1093  * inodes which other nodes can't possibly see, and which haven't been
1094  * hashed in the inode hash yet. This can give us a good performance
1095  * increase as it'll skip the network broadcast normally associated
1096  * with creating a new lock resource. */
1097 int ocfs2_create_new_inode_locks(struct inode *inode)
1098 {
1099 	int ret;
1100 
1101 	BUG_ON(!inode);
1102 	BUG_ON(!ocfs2_inode_is_new(inode));
1103 
1104 	mlog_entry_void();
1105 
1106 	mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
1107 
1108 	/* NOTE: That we don't increment any of the holder counts, nor
1109 	 * do we add anything to a journal handle. Since this is
1110 	 * supposed to be a new inode which the cluster doesn't know
1111 	 * about yet, there is no need to.  As far as the LVB handling
1112 	 * is concerned, this is basically like acquiring an EX lock
1113 	 * on a resource which has an invalid one -- we'll set it
1114 	 * valid when we release the EX. */
1115 
1116 	ret = ocfs2_create_new_inode_lock(inode,
1117 					  &OCFS2_I(inode)->ip_rw_lockres);
1118 	if (ret) {
1119 		mlog_errno(ret);
1120 		goto bail;
1121 	}
1122 
1123 	ret = ocfs2_create_new_inode_lock(inode,
1124 					  &OCFS2_I(inode)->ip_meta_lockres);
1125 	if (ret) {
1126 		mlog_errno(ret);
1127 		goto bail;
1128 	}
1129 
1130 	ret = ocfs2_create_new_inode_lock(inode,
1131 					  &OCFS2_I(inode)->ip_data_lockres);
1132 	if (ret) {
1133 		mlog_errno(ret);
1134 		goto bail;
1135 	}
1136 
1137 bail:
1138 	mlog_exit(ret);
1139 	return ret;
1140 }
1141 
1142 int ocfs2_rw_lock(struct inode *inode, int write)
1143 {
1144 	int status, level;
1145 	struct ocfs2_lock_res *lockres;
1146 
1147 	BUG_ON(!inode);
1148 
1149 	mlog_entry_void();
1150 
1151 	mlog(0, "inode %llu take %s RW lock\n",
1152 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1153 	     write ? "EXMODE" : "PRMODE");
1154 
1155 	lockres = &OCFS2_I(inode)->ip_rw_lockres;
1156 
1157 	level = write ? LKM_EXMODE : LKM_PRMODE;
1158 
1159 	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
1160 				    0);
1161 	if (status < 0)
1162 		mlog_errno(status);
1163 
1164 	mlog_exit(status);
1165 	return status;
1166 }
1167 
1168 void ocfs2_rw_unlock(struct inode *inode, int write)
1169 {
1170 	int level = write ? LKM_EXMODE : LKM_PRMODE;
1171 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
1172 
1173 	mlog_entry_void();
1174 
1175 	mlog(0, "inode %llu drop %s RW lock\n",
1176 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1177 	     write ? "EXMODE" : "PRMODE");
1178 
1179 	ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1180 
1181 	mlog_exit_void();
1182 }
1183 
1184 int ocfs2_data_lock_full(struct inode *inode,
1185 			 int write,
1186 			 int arg_flags)
1187 {
1188 	int status = 0, level;
1189 	struct ocfs2_lock_res *lockres;
1190 
1191 	BUG_ON(!inode);
1192 
1193 	mlog_entry_void();
1194 
1195 	mlog(0, "inode %llu take %s DATA lock\n",
1196 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1197 	     write ? "EXMODE" : "PRMODE");
1198 
1199 	/* We'll allow faking a readonly data lock for
1200 	 * rodevices. */
1201 	if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
1202 		if (write) {
1203 			status = -EROFS;
1204 			mlog_errno(status);
1205 		}
1206 		goto out;
1207 	}
1208 
1209 	lockres = &OCFS2_I(inode)->ip_data_lockres;
1210 
1211 	level = write ? LKM_EXMODE : LKM_PRMODE;
1212 
1213 	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
1214 				    0, arg_flags);
1215 	if (status < 0 && status != -EAGAIN)
1216 		mlog_errno(status);
1217 
1218 out:
1219 	mlog_exit(status);
1220 	return status;
1221 }
1222 
1223 /* see ocfs2_meta_lock_with_page() */
1224 int ocfs2_data_lock_with_page(struct inode *inode,
1225 			      int write,
1226 			      struct page *page)
1227 {
1228 	int ret;
1229 
1230 	ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
1231 	if (ret == -EAGAIN) {
1232 		unlock_page(page);
1233 		if (ocfs2_data_lock(inode, write) == 0)
1234 			ocfs2_data_unlock(inode, write);
1235 		ret = AOP_TRUNCATED_PAGE;
1236 	}
1237 
1238 	return ret;
1239 }
1240 
1241 static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
1242 				 struct ocfs2_lock_res *lockres)
1243 {
1244 	int kick = 0;
1245 
1246 	mlog_entry_void();
1247 
1248 	/* If we know that another node is waiting on our lock, kick
1249 	 * the vote thread * pre-emptively when we reach a release
1250 	 * condition. */
1251 	if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
1252 		switch(lockres->l_blocking) {
1253 		case LKM_EXMODE:
1254 			if (!lockres->l_ex_holders && !lockres->l_ro_holders)
1255 				kick = 1;
1256 			break;
1257 		case LKM_PRMODE:
1258 			if (!lockres->l_ex_holders)
1259 				kick = 1;
1260 			break;
1261 		default:
1262 			BUG();
1263 		}
1264 	}
1265 
1266 	if (kick)
1267 		ocfs2_kick_vote_thread(osb);
1268 
1269 	mlog_exit_void();
1270 }
1271 
1272 void ocfs2_data_unlock(struct inode *inode,
1273 		       int write)
1274 {
1275 	int level = write ? LKM_EXMODE : LKM_PRMODE;
1276 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
1277 
1278 	mlog_entry_void();
1279 
1280 	mlog(0, "inode %llu drop %s DATA lock\n",
1281 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1282 	     write ? "EXMODE" : "PRMODE");
1283 
1284 	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
1285 		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1286 
1287 	mlog_exit_void();
1288 }
1289 
/* The LVB packs a timespec into one u64: the top OCFS2_SEC_BITS hold
 * the seconds, the remaining low bits hold the nanoseconds.  Derive
 * the shift from OCFS2_SEC_BITS rather than repeating the magic 34. */
#define OCFS2_SEC_BITS   34
#define OCFS2_SEC_SHIFT  (64 - OCFS2_SEC_BITS)
#define OCFS2_NSEC_MASK  ((1ULL << OCFS2_SEC_SHIFT) - 1)
1293 
1294 /* LVB only has room for 64 bits of time here so we pack it for
1295  * now. */
1296 static u64 ocfs2_pack_timespec(struct timespec *spec)
1297 {
1298 	u64 res;
1299 	u64 sec = spec->tv_sec;
1300 	u32 nsec = spec->tv_nsec;
1301 
1302 	res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
1303 
1304 	return res;
1305 }
1306 
/* Call this with the lockres locked. I am reasonably sure we don't
 * need ip_lock in this function as anyone who would be changing those
 * values is supposed to be blocked in ocfs2_meta_lock right now.
 *
 * Copies the cached inode fields into the meta lock's LVB so other
 * nodes can refresh without a disk read (see
 * ocfs2_refresh_inode_from_lvb).  All fields are stored big-endian;
 * timestamps are packed via ocfs2_pack_timespec(). */
static void __ocfs2_stuff_meta_lvb(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
	struct ocfs2_meta_lvb *lvb;

	mlog_entry_void();

	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;

	/* Version stamp lets readers validate the LVB contents
	 * (see ocfs2_meta_lvb_is_trustable). */
	lvb->lvb_version   = cpu_to_be32(OCFS2_LVB_VERSION);
	lvb->lvb_isize	   = cpu_to_be64(i_size_read(inode));
	lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
	lvb->lvb_iuid      = cpu_to_be32(inode->i_uid);
	lvb->lvb_igid      = cpu_to_be32(inode->i_gid);
	lvb->lvb_imode     = cpu_to_be16(inode->i_mode);
	lvb->lvb_inlink    = cpu_to_be16(inode->i_nlink);
	lvb->lvb_iatime_packed  =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
	lvb->lvb_ictime_packed =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
	lvb->lvb_imtime_packed =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));

	mlog_meta_lvb(0, lockres);

	mlog_exit_void();
}
1338 
1339 static void ocfs2_unpack_timespec(struct timespec *spec,
1340 				  u64 packed_time)
1341 {
1342 	spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
1343 	spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
1344 }
1345 
/* Refresh the in-memory inode from the values another node stashed in
 * the meta lock's LVB -- the inverse of __ocfs2_stuff_meta_lvb().
 * Only called once the LVB has been deemed trustable (see
 * ocfs2_meta_lock_update). */
static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
	struct ocfs2_meta_lvb *lvb;

	mlog_entry_void();

	mlog_meta_lvb(0, lockres);

	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;

	/* We're safe here without the lockres lock... */
	spin_lock(&oi->ip_lock);
	oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
	i_size_write(inode, be64_to_cpu(lvb->lvb_isize));

	/* fast-symlinks are a special case */
	if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
		inode->i_blocks = 0;
	else
		inode->i_blocks =
			ocfs2_align_bytes_to_sectors(i_size_read(inode));

	inode->i_uid     = be32_to_cpu(lvb->lvb_iuid);
	inode->i_gid     = be32_to_cpu(lvb->lvb_igid);
	inode->i_mode    = be16_to_cpu(lvb->lvb_imode);
	inode->i_nlink   = be16_to_cpu(lvb->lvb_inlink);
	/* Unpack the timestamps last; they were packed with
	 * ocfs2_pack_timespec(). */
	ocfs2_unpack_timespec(&inode->i_atime,
			      be64_to_cpu(lvb->lvb_iatime_packed));
	ocfs2_unpack_timespec(&inode->i_mtime,
			      be64_to_cpu(lvb->lvb_imtime_packed));
	ocfs2_unpack_timespec(&inode->i_ctime,
			      be64_to_cpu(lvb->lvb_ictime_packed));
	spin_unlock(&oi->ip_lock);

	mlog_exit_void();
}
1384 
1385 static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres)
1386 {
1387 	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1388 
1389 	if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION)
1390 		return 1;
1391 	return 0;
1392 }
1393 
1394 /* Determine whether a lock resource needs to be refreshed, and
1395  * arbitrate who gets to refresh it.
1396  *
1397  *   0 means no refresh needed.
1398  *
1399  *   > 0 means you need to refresh this and you MUST call
1400  *   ocfs2_complete_lock_res_refresh afterwards. */
1401 static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
1402 {
1403 	unsigned long flags;
1404 	int status = 0;
1405 
1406 	mlog_entry_void();
1407 
1408 refresh_check:
1409 	spin_lock_irqsave(&lockres->l_lock, flags);
1410 	if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
1411 		spin_unlock_irqrestore(&lockres->l_lock, flags);
1412 		goto bail;
1413 	}
1414 
1415 	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
1416 		spin_unlock_irqrestore(&lockres->l_lock, flags);
1417 
1418 		ocfs2_wait_on_refreshing_lock(lockres);
1419 		goto refresh_check;
1420 	}
1421 
1422 	/* Ok, I'll be the one to refresh this lock. */
1423 	lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
1424 	spin_unlock_irqrestore(&lockres->l_lock, flags);
1425 
1426 	status = 1;
1427 bail:
1428 	mlog_exit(status);
1429 	return status;
1430 }
1431 
1432 /* If status is non zero, I'll mark it as not being in refresh
1433  * anymroe, but i won't clear the needs refresh flag. */
1434 static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
1435 						   int status)
1436 {
1437 	unsigned long flags;
1438 	mlog_entry_void();
1439 
1440 	spin_lock_irqsave(&lockres->l_lock, flags);
1441 	lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
1442 	if (!status)
1443 		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
1444 	spin_unlock_irqrestore(&lockres->l_lock, flags);
1445 
1446 	wake_up(&lockres->l_event);
1447 
1448 	mlog_exit_void();
1449 }
1450 
/* may or may not return a bh if it went to disk.
 *
 * Brings the local inode up to date after the cluster lock has been
 * obtained: bails out for inodes deleted under us, arbitrates the
 * refresh via ocfs2_should_refresh_lock_res(), purges stale cached
 * metadata, then refreshes either from the LVB (when trustable) or
 * from the on-disk dinode read into *bh. */
static int ocfs2_meta_lock_update(struct inode *inode,
				  struct buffer_head **bh)
{
	int status = 0;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres;
	struct ocfs2_dinode *fe;

	mlog_entry_void();

	/* The inode may have been wiped by another node while we
	 * waited for the lock -- nothing to update in that case. */
	spin_lock(&oi->ip_lock);
	if (oi->ip_flags & OCFS2_INODE_DELETED) {
		mlog(0, "Orphaned inode %llu was deleted while we "
		     "were waiting on a lock. ip_flags = 0x%x\n",
		     (unsigned long long)oi->ip_blkno, oi->ip_flags);
		spin_unlock(&oi->ip_lock);
		status = -ENOENT;
		goto bail;
	}
	spin_unlock(&oi->ip_lock);

	lockres = &oi->ip_meta_lockres;

	if (!ocfs2_should_refresh_lock_res(lockres))
		goto bail;

	/* This will discard any caching information we might have had
	 * for the inode metadata. */
	ocfs2_metadata_cache_purge(inode);

	/* will do nothing for inode types that don't use the extent
	 * map (directories, bitmap files, etc) */
	ocfs2_extent_map_trunc(inode, 0);

	if (ocfs2_meta_lvb_is_trustable(lockres)) {
		mlog(0, "Trusting LVB on inode %llu\n",
		     (unsigned long long)oi->ip_blkno);
		ocfs2_refresh_inode_from_lvb(inode);
	} else {
		/* Boo, we have to go to disk. */
		/* read bh, cast, ocfs2_refresh_inode */
		status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
					  bh, OCFS2_BH_CACHED, inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail_refresh;
		}
		fe = (struct ocfs2_dinode *) (*bh)->b_data;

		/* This is a good chance to make sure we're not
		 * locking an invalid object.
		 *
		 * We bug on a stale inode here because we checked
		 * above whether it was wiped from disk. The wiping
		 * node provides a guarantee that we receive that
		 * message and can mark the inode before dropping any
		 * locks associated with it. */
		if (!OCFS2_IS_VALID_DINODE(fe)) {
			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
			status = -EIO;
			goto bail_refresh;
		}
		mlog_bug_on_msg(inode->i_generation !=
				le32_to_cpu(fe->i_generation),
				"Invalid dinode %llu disk generation: %u "
				"inode->i_generation: %u\n",
				(unsigned long long)oi->ip_blkno,
				le32_to_cpu(fe->i_generation),
				inode->i_generation);
		mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
				!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
				"Stale dinode %llu dtime: %llu flags: 0x%x\n",
				(unsigned long long)oi->ip_blkno,
				(unsigned long long)le64_to_cpu(fe->i_dtime),
				le32_to_cpu(fe->i_flags));

		ocfs2_refresh_inode(inode, fe);
	}

	status = 0;
bail_refresh:
	/* We won the refresh arbitration above, so we must always
	 * signal completion -- success or not. */
	ocfs2_complete_lock_res_refresh(lockres, status);
bail:
	mlog_exit(status);
	return status;
}
1538 
1539 static int ocfs2_assign_bh(struct inode *inode,
1540 			   struct buffer_head **ret_bh,
1541 			   struct buffer_head *passed_bh)
1542 {
1543 	int status;
1544 
1545 	if (passed_bh) {
1546 		/* Ok, the update went to disk for us, use the
1547 		 * returned bh. */
1548 		*ret_bh = passed_bh;
1549 		get_bh(*ret_bh);
1550 
1551 		return 0;
1552 	}
1553 
1554 	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
1555 				  OCFS2_I(inode)->ip_blkno,
1556 				  ret_bh,
1557 				  OCFS2_BH_CACHED,
1558 				  inode);
1559 	if (status < 0)
1560 		mlog_errno(status);
1561 
1562 	return status;
1563 }
1564 
1565 /*
1566  * returns < 0 error if the callback will never be called, otherwise
1567  * the result of the lock will be communicated via the callback.
1568  */
1569 int ocfs2_meta_lock_full(struct inode *inode,
1570 			 struct ocfs2_journal_handle *handle,
1571 			 struct buffer_head **ret_bh,
1572 			 int ex,
1573 			 int arg_flags)
1574 {
1575 	int status, level, dlm_flags, acquired;
1576 	struct ocfs2_lock_res *lockres;
1577 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1578 	struct buffer_head *local_bh = NULL;
1579 
1580 	BUG_ON(!inode);
1581 
1582 	mlog_entry_void();
1583 
1584 	mlog(0, "inode %llu, take %s META lock\n",
1585 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1586 	     ex ? "EXMODE" : "PRMODE");
1587 
1588 	status = 0;
1589 	acquired = 0;
1590 	/* We'll allow faking a readonly metadata lock for
1591 	 * rodevices. */
1592 	if (ocfs2_is_hard_readonly(osb)) {
1593 		if (ex)
1594 			status = -EROFS;
1595 		goto bail;
1596 	}
1597 
1598 	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1599 		wait_event(osb->recovery_event,
1600 			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1601 
1602 	acquired = 0;
1603 	lockres = &OCFS2_I(inode)->ip_meta_lockres;
1604 	level = ex ? LKM_EXMODE : LKM_PRMODE;
1605 	dlm_flags = 0;
1606 	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
1607 		dlm_flags |= LKM_NOQUEUE;
1608 
1609 	status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
1610 	if (status < 0) {
1611 		if (status != -EAGAIN && status != -EIOCBRETRY)
1612 			mlog_errno(status);
1613 		goto bail;
1614 	}
1615 
1616 	/* Notify the error cleanup path to drop the cluster lock. */
1617 	acquired = 1;
1618 
1619 	/* We wait twice because a node may have died while we were in
1620 	 * the lower dlm layers. The second time though, we've
1621 	 * committed to owning this lock so we don't allow signals to
1622 	 * abort the operation. */
1623 	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1624 		wait_event(osb->recovery_event,
1625 			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1626 
1627 	/* This is fun. The caller may want a bh back, or it may
1628 	 * not. ocfs2_meta_lock_update definitely wants one in, but
1629 	 * may or may not read one, depending on what's in the
1630 	 * LVB. The result of all of this is that we've *only* gone to
1631 	 * disk if we have to, so the complexity is worthwhile. */
1632 	status = ocfs2_meta_lock_update(inode, &local_bh);
1633 	if (status < 0) {
1634 		if (status != -ENOENT)
1635 			mlog_errno(status);
1636 		goto bail;
1637 	}
1638 
1639 	if (ret_bh) {
1640 		status = ocfs2_assign_bh(inode, ret_bh, local_bh);
1641 		if (status < 0) {
1642 			mlog_errno(status);
1643 			goto bail;
1644 		}
1645 	}
1646 
1647 	if (handle) {
1648 		status = ocfs2_handle_add_lock(handle, inode);
1649 		if (status < 0)
1650 			mlog_errno(status);
1651 	}
1652 
1653 bail:
1654 	if (status < 0) {
1655 		if (ret_bh && (*ret_bh)) {
1656 			brelse(*ret_bh);
1657 			*ret_bh = NULL;
1658 		}
1659 		if (acquired)
1660 			ocfs2_meta_unlock(inode, ex);
1661 	}
1662 
1663 	if (local_bh)
1664 		brelse(local_bh);
1665 
1666 	mlog_exit(status);
1667 	return status;
1668 }
1669 
1670 /*
1671  * This is working around a lock inversion between tasks acquiring DLM locks
1672  * while holding a page lock and the vote thread which blocks dlm lock acquiry
1673  * while acquiring page locks.
1674  *
1675  * ** These _with_page variantes are only intended to be called from aop
1676  * methods that hold page locks and return a very specific *positive* error
1677  * code that aop methods pass up to the VFS -- test for errors with != 0. **
1678  *
1679  * The DLM is called such that it returns -EAGAIN if it would have blocked
1680  * waiting for the vote thread.  In that case we unlock our page so the vote
1681  * thread can make progress.  Once we've done this we have to return
1682  * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
1683  * into the VFS who will then immediately retry the aop call.
1684  *
1685  * We do a blocking lock and immediate unlock before returning, though, so that
1686  * the lock has a great chance of being cached on this node by the time the VFS
1687  * calls back to retry the aop.    This has a potential to livelock as nodes
1688  * ping locks back and forth, but that's a risk we're willing to take to avoid
1689  * the lock inversion simply.
1690  */
1691 int ocfs2_meta_lock_with_page(struct inode *inode,
1692 			      struct ocfs2_journal_handle *handle,
1693 			      struct buffer_head **ret_bh,
1694 			      int ex,
1695 			      struct page *page)
1696 {
1697 	int ret;
1698 
1699 	ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex,
1700 				   OCFS2_LOCK_NONBLOCK);
1701 	if (ret == -EAGAIN) {
1702 		unlock_page(page);
1703 		if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0)
1704 			ocfs2_meta_unlock(inode, ex);
1705 		ret = AOP_TRUNCATED_PAGE;
1706 	}
1707 
1708 	return ret;
1709 }
1710 
1711 void ocfs2_meta_unlock(struct inode *inode,
1712 		       int ex)
1713 {
1714 	int level = ex ? LKM_EXMODE : LKM_PRMODE;
1715 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
1716 
1717 	mlog_entry_void();
1718 
1719 	mlog(0, "inode %llu drop %s META lock\n",
1720 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1721 	     ex ? "EXMODE" : "PRMODE");
1722 
1723 	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
1724 		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1725 
1726 	mlog_exit_void();
1727 }
1728 
/* Take the global superblock lock at PR (ex == 0) or EX (ex != 0).
 * Winning the lock also makes this node responsible for refreshing
 * the in-memory slot map from disk when a refresh is pending. */
int ocfs2_super_lock(struct ocfs2_super *osb,
		     int ex)
{
	int status;
	int level = ex ? LKM_EXMODE : LKM_PRMODE;
	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
	struct buffer_head *bh;
	struct ocfs2_slot_info *si = osb->slot_info;

	mlog_entry_void();

	if (ocfs2_is_hard_readonly(osb))
		return -EROFS;

	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* The super block lock path is really in the best position to
	 * know when resources covered by the lock need to be
	 * refreshed, so we do it here. Of course, making sense of
	 * everything is up to the caller :) */
	status = ocfs2_should_refresh_lock_res(lockres);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	if (status) {
		/* We won the refresh arbitration: re-read the slot
		 * map block and update the cached slot info. */
		bh = si->si_bh;
		status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
					  si->si_inode);
		if (status == 0)
			ocfs2_update_slot_info(si);

		/* Must always complete the refresh we claimed above,
		 * whether the read succeeded or not. */
		ocfs2_complete_lock_res_refresh(lockres, status);

		if (status < 0)
			mlog_errno(status);
	}
bail:
	mlog_exit(status);
	return status;
}
1774 
1775 void ocfs2_super_unlock(struct ocfs2_super *osb,
1776 			int ex)
1777 {
1778 	int level = ex ? LKM_EXMODE : LKM_PRMODE;
1779 	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
1780 
1781 	ocfs2_cluster_unlock(osb, lockres, level);
1782 }
1783 
1784 int ocfs2_rename_lock(struct ocfs2_super *osb)
1785 {
1786 	int status;
1787 	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
1788 
1789 	if (ocfs2_is_hard_readonly(osb))
1790 		return -EROFS;
1791 
1792 	status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
1793 	if (status < 0)
1794 		mlog_errno(status);
1795 
1796 	return status;
1797 }
1798 
1799 void ocfs2_rename_unlock(struct ocfs2_super *osb)
1800 {
1801 	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
1802 
1803 	ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
1804 }
1805 
1806 /* Reference counting of the dlm debug structure. We want this because
1807  * open references on the debug inodes can live on after a mount, so
1808  * we can't rely on the ocfs2_super to always exist. */
1809 static void ocfs2_dlm_debug_free(struct kref *kref)
1810 {
1811 	struct ocfs2_dlm_debug *dlm_debug;
1812 
1813 	dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);
1814 
1815 	kfree(dlm_debug);
1816 }
1817 
1818 void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
1819 {
1820 	if (dlm_debug)
1821 		kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
1822 }
1823 
/* Take an extra reference on @debug; paired with ocfs2_put_dlm_debug(). */
static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
{
	kref_get(&debug->d_refcnt);
}
1828 
1829 struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
1830 {
1831 	struct ocfs2_dlm_debug *dlm_debug;
1832 
1833 	dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
1834 	if (!dlm_debug) {
1835 		mlog_errno(-ENOMEM);
1836 		goto out;
1837 	}
1838 
1839 	kref_init(&dlm_debug->d_refcnt);
1840 	INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
1841 	dlm_debug->d_locking_state = NULL;
1842 out:
1843 	return dlm_debug;
1844 }
1845 
/* Access to this is arbitrated for us via seq_file->sem. */
struct ocfs2_dlm_seq_priv {
	struct ocfs2_dlm_debug *p_dlm_debug;	/* referenced at open, put at release */
	struct ocfs2_lock_res p_iter_res;	/* dummy lockres used as a list cursor */
	struct ocfs2_lock_res p_tmp_res;	/* snapshot of the lockres being shown */
};
1852 
/* Walk forward from @start on the global tracking list and return the
 * next real lockres, or NULL when the list head is reached.  Caller
 * must hold ocfs2_dlm_tracking_lock. */
static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start,
						 struct ocfs2_dlm_seq_priv *priv)
{
	struct ocfs2_lock_res *iter, *ret = NULL;
	struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;

	assert_spin_locked(&ocfs2_dlm_tracking_lock);

	list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
		/* discover the head of the list */
		if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
			mlog(0, "End of list found, %p\n", ret);
			break;
		}

		/* We track our "dummy" iteration lockres' by a NULL
		 * l_ops field. */
		if (iter->l_ops != NULL) {
			ret = iter;
			break;
		}
	}

	return ret;
}
1878 
/* seq_file ->start(): return a snapshot of the first real lockres
 * past our iteration cursor, or NULL at end of list. */
static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos)
{
	struct ocfs2_dlm_seq_priv *priv = m->private;
	struct ocfs2_lock_res *iter;

	spin_lock(&ocfs2_dlm_tracking_lock);
	iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
	if (iter) {
		/* Since lockres' have the lifetime of their container
		 * (which can be inodes, ocfs2_supers, etc) we want to
		 * copy this out to a temporary lockres while still
		 * under the spinlock. Obviously after this we can't
		 * trust any pointers on the copy returned, but that's
		 * ok as the information we want isn't typically held
		 * in them. */
		priv->p_tmp_res = *iter;
		iter = &priv->p_tmp_res;
	}
	spin_unlock(&ocfs2_dlm_tracking_lock);

	return iter;
}
1901 
/* seq_file ->stop(): nothing to release -- the tracking spinlock is
 * only held within ->start() and ->next(). */
static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
{
}
1905 
/* seq_file ->next(): advance the dummy cursor past the entry we just
 * showed and return a snapshot of the next real lockres (NULL at end
 * of list). */
static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct ocfs2_dlm_seq_priv *priv = m->private;
	struct ocfs2_lock_res *iter = v;
	struct ocfs2_lock_res *dummy = &priv->p_iter_res;

	spin_lock(&ocfs2_dlm_tracking_lock);
	iter = ocfs2_dlm_next_res(iter, priv);
	/* Move the cursor: unhook it and, if there is a next entry,
	 * re-insert it right after that entry.  A copy is returned
	 * for the same lifetime reasons as in ->start(). */
	list_del_init(&dummy->l_debug_list);
	if (iter) {
		list_add(&dummy->l_debug_list, &iter->l_debug_list);
		priv->p_tmp_res = *iter;
		iter = &priv->p_tmp_res;
	}
	spin_unlock(&ocfs2_dlm_tracking_lock);

	return iter;
}
1924 
1925 /* So that debugfs.ocfs2 can determine which format is being used */
1926 #define OCFS2_DLM_DEBUG_STR_VERSION 1
1927 static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
1928 {
1929 	int i;
1930 	char *lvb;
1931 	struct ocfs2_lock_res *lockres = v;
1932 
1933 	if (!lockres)
1934 		return -EINVAL;
1935 
1936 	seq_printf(m, "0x%x\t"
1937 		   "%.*s\t"
1938 		   "%d\t"
1939 		   "0x%lx\t"
1940 		   "0x%x\t"
1941 		   "0x%x\t"
1942 		   "%u\t"
1943 		   "%u\t"
1944 		   "%d\t"
1945 		   "%d\t",
1946 		   OCFS2_DLM_DEBUG_STR_VERSION,
1947 		   OCFS2_LOCK_ID_MAX_LEN, lockres->l_name,
1948 		   lockres->l_level,
1949 		   lockres->l_flags,
1950 		   lockres->l_action,
1951 		   lockres->l_unlock_action,
1952 		   lockres->l_ro_holders,
1953 		   lockres->l_ex_holders,
1954 		   lockres->l_requested,
1955 		   lockres->l_blocking);
1956 
1957 	/* Dump the raw LVB */
1958 	lvb = lockres->l_lksb.lvb;
1959 	for(i = 0; i < DLM_LVB_LEN; i++)
1960 		seq_printf(m, "0x%x\t", lvb[i]);
1961 
1962 	/* End the line */
1963 	seq_printf(m, "\n");
1964 	return 0;
1965 }
1966 
/* seq_file iterator over all tracked lockres' for the locking_state
 * debugfs file. */
static struct seq_operations ocfs2_dlm_seq_ops = {
	.start =	ocfs2_dlm_seq_start,
	.stop =		ocfs2_dlm_seq_stop,
	.next =		ocfs2_dlm_seq_next,
	.show =		ocfs2_dlm_seq_show,
};
1973 
1974 static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
1975 {
1976 	struct seq_file *seq = (struct seq_file *) file->private_data;
1977 	struct ocfs2_dlm_seq_priv *priv = seq->private;
1978 	struct ocfs2_lock_res *res = &priv->p_iter_res;
1979 
1980 	ocfs2_remove_lockres_tracking(res);
1981 	ocfs2_put_dlm_debug(priv->p_dlm_debug);
1982 	return seq_release_private(inode, file);
1983 }
1984 
1985 static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
1986 {
1987 	int ret;
1988 	struct ocfs2_dlm_seq_priv *priv;
1989 	struct seq_file *seq;
1990 	struct ocfs2_super *osb;
1991 
1992 	priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
1993 	if (!priv) {
1994 		ret = -ENOMEM;
1995 		mlog_errno(ret);
1996 		goto out;
1997 	}
1998 	osb = (struct ocfs2_super *) inode->u.generic_ip;
1999 	ocfs2_get_dlm_debug(osb->osb_dlm_debug);
2000 	priv->p_dlm_debug = osb->osb_dlm_debug;
2001 	INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
2002 
2003 	ret = seq_open(file, &ocfs2_dlm_seq_ops);
2004 	if (ret) {
2005 		kfree(priv);
2006 		mlog_errno(ret);
2007 		goto out;
2008 	}
2009 
2010 	seq = (struct seq_file *) file->private_data;
2011 	seq->private = priv;
2012 
2013 	ocfs2_add_lockres_tracking(&priv->p_iter_res,
2014 				   priv->p_dlm_debug);
2015 
2016 out:
2017 	return ret;
2018 }
2019 
/* File operations for the "locking_state" debugfs file. */
static const struct file_operations ocfs2_dlm_debug_fops = {
	.open =		ocfs2_dlm_debug_open,
	.release =	ocfs2_dlm_debug_release,
	.read =		seq_read,
	.llseek =	seq_lseek,
};
2026 
/* Create the per-mount "locking_state" debugfs file and take a debug
 * reference; undone by ocfs2_dlm_shutdown_debug(). */
static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
{
	int ret = 0;
	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;

	dlm_debug->d_locking_state = debugfs_create_file("locking_state",
							 S_IFREG|S_IRUSR,
							 osb->osb_debug_root,
							 osb,
							 &ocfs2_dlm_debug_fops);
	if (!dlm_debug->d_locking_state) {
		ret = -EINVAL;
		mlog(ML_ERROR,
		     "Unable to create locking state debugfs file.\n");
		goto out;
	}

	ocfs2_get_dlm_debug(dlm_debug);
out:
	return ret;
}
2048 
2049 static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
2050 {
2051 	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
2052 
2053 	if (dlm_debug) {
2054 		debugfs_remove(dlm_debug->d_locking_state);
2055 		ocfs2_put_dlm_debug(dlm_debug);
2056 	}
2057 }
2058 
/* Bring up this node's DLM state: debugfs file, vote thread, dlm
 * domain registration, and the osb-global lock resources.  On any
 * failure, everything started so far is torn down before returning. */
int ocfs2_dlm_init(struct ocfs2_super *osb)
{
	int status;
	u32 dlm_key;
	struct dlm_ctxt *dlm;

	mlog_entry_void();

	status = ocfs2_dlm_init_debug(osb);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* launch vote thread */
	osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote");
	if (IS_ERR(osb->vote_task)) {
		status = PTR_ERR(osb->vote_task);
		/* NULL it so the error path below doesn't try to stop
		 * a thread that never started. */
		osb->vote_task = NULL;
		mlog_errno(status);
		goto bail;
	}

	/* used by the dlm code to make message headers unique, each
	 * node in this domain must agree on this. */
	dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));

	/* for now, uuid == domain */
	dlm = dlm_register_domain(osb->uuid_str, dlm_key);
	if (IS_ERR(dlm)) {
		status = PTR_ERR(dlm);
		mlog_errno(status);
		goto bail;
	}

	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);

	dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);

	osb->dlm = dlm;

	status = 0;
bail:
	if (status < 0) {
		/* Unwind whatever was set up before the failure. */
		ocfs2_dlm_shutdown_debug(osb);
		if (osb->vote_task)
			kthread_stop(osb->vote_task);
	}

	mlog_exit(status);
	return status;
}
2112 
/* Tear down what ocfs2_dlm_init() set up, in reverse dependency
 * order: unhook the eviction callback first (so we aren't called
 * back mid-teardown), drop the osb-global locks, stop the vote
 * thread, free the lock resources, leave the dlm domain, and finally
 * remove the debugfs state. */
void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
{
	mlog_entry_void();

	dlm_unregister_eviction_cb(&osb->osb_eviction_cb);

	ocfs2_drop_osb_locks(osb);

	if (osb->vote_task) {
		kthread_stop(osb->vote_task);
		osb->vote_task = NULL;
	}

	ocfs2_lock_res_free(&osb->osb_super_lockres);
	ocfs2_lock_res_free(&osb->osb_rename_lockres);

	dlm_unregister_domain(osb->dlm);
	osb->dlm = NULL;

	ocfs2_dlm_shutdown_debug(osb);

	mlog_exit_void();
}
2136 
/* Unlock AST: called by the dlm when an unlock or cancel submitted
 * via dlmunlock() completes. Updates the lockres state according to
 * l_unlock_action and wakes anyone sleeping on l_event. */
static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status)
{
	struct ocfs2_lock_res *lockres = opaque;
	unsigned long flags;

	mlog_entry_void();

	mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
	     lockres->l_unlock_action);

	spin_lock_irqsave(&lockres->l_lock, flags);
	/* We tried to cancel a convert request, but it was already
	 * granted. All we want to do here is clear our unlock
	 * state. The wake_up call done at the bottom is redundant
	 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
	 * hurt anything anyway */
	if (status == DLM_CANCELGRANT &&
	    lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
		mlog(0, "Got cancelgrant for %s\n", lockres->l_name);

		/* We don't clear the busy flag in this case as it
		 * should have been cleared by the ast which the dlm
		 * has called. */
		goto complete_unlock;
	}

	/* Unexpected dlm status: log and leave the lockres state
	 * untouched (BUSY stays set). */
	if (status != DLM_NORMAL) {
		mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
		     "unlock_action %d\n", status, lockres->l_name,
		     lockres->l_unlock_action);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		return;
	}

	switch(lockres->l_unlock_action) {
	case OCFS2_UNLOCK_CANCEL_CONVERT:
		/* The convert was successfully cancelled; forget the
		 * pending AST action. */
		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
		lockres->l_action = OCFS2_AST_INVALID;
		break;
	case OCFS2_UNLOCK_DROP_LOCK:
		/* Lock fully released - mark the level invalid. */
		lockres->l_level = LKM_IVMODE;
		break;
	default:
		BUG();
	}

	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
complete_unlock:
	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	wake_up(&lockres->l_event);

	mlog_exit_void();
}
2192 
/* Optional pre-drop hook for ocfs2_drop_lock(), invoked with the
 * lockres spinlock held, right before the final unlock is queued. */
typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);

struct drop_lock_cb {
	ocfs2_pre_drop_cb_t	*drop_func;	/* hook to run before the drop */
	void			*drop_data;	/* opaque argument to drop_func */
};
2199 
2200 static int ocfs2_drop_lock(struct ocfs2_super *osb,
2201 			   struct ocfs2_lock_res *lockres,
2202 			   struct drop_lock_cb *dcb)
2203 {
2204 	enum dlm_status status;
2205 	unsigned long flags;
2206 
2207 	/* We didn't get anywhere near actually using this lockres. */
2208 	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
2209 		goto out;
2210 
2211 	spin_lock_irqsave(&lockres->l_lock, flags);
2212 
2213 	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
2214 			"lockres %s, flags 0x%lx\n",
2215 			lockres->l_name, lockres->l_flags);
2216 
2217 	while (lockres->l_flags & OCFS2_LOCK_BUSY) {
2218 		mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
2219 		     "%u, unlock_action = %u\n",
2220 		     lockres->l_name, lockres->l_flags, lockres->l_action,
2221 		     lockres->l_unlock_action);
2222 
2223 		spin_unlock_irqrestore(&lockres->l_lock, flags);
2224 
2225 		/* XXX: Today we just wait on any busy
2226 		 * locks... Perhaps we need to cancel converts in the
2227 		 * future? */
2228 		ocfs2_wait_on_busy_lock(lockres);
2229 
2230 		spin_lock_irqsave(&lockres->l_lock, flags);
2231 	}
2232 
2233 	if (dcb)
2234 		dcb->drop_func(lockres, dcb->drop_data);
2235 
2236 	if (lockres->l_flags & OCFS2_LOCK_BUSY)
2237 		mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
2238 		     lockres->l_name);
2239 	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
2240 		mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
2241 
2242 	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
2243 		spin_unlock_irqrestore(&lockres->l_lock, flags);
2244 		goto out;
2245 	}
2246 
2247 	lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
2248 
2249 	/* make sure we never get here while waiting for an ast to
2250 	 * fire. */
2251 	BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
2252 
2253 	/* is this necessary? */
2254 	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2255 	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
2256 	spin_unlock_irqrestore(&lockres->l_lock, flags);
2257 
2258 	mlog(0, "lock %s\n", lockres->l_name);
2259 
2260 	status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK,
2261 			   lockres->l_ops->unlock_ast, lockres);
2262 	if (status != DLM_NORMAL) {
2263 		ocfs2_log_dlm_error("dlmunlock", status, lockres);
2264 		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
2265 		dlm_print_one_lock(lockres->l_lksb.lockid);
2266 		BUG();
2267 	}
2268 	mlog(0, "lock %s, successfull return from dlmunlock\n",
2269 	     lockres->l_name);
2270 
2271 	ocfs2_wait_on_busy_lock(lockres);
2272 out:
2273 	mlog_exit(0);
2274 	return 0;
2275 }
2276 
2277 /* Mark the lockres as being dropped. It will no longer be
2278  * queued if blocking, but we still may have to wait on it
2279  * being dequeued from the vote thread before we can consider
2280  * it safe to drop.
2281  *
2282  * You can *not* attempt to call cluster_lock on this lockres anymore. */
void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
{
	int status;
	struct ocfs2_mask_waiter mw;
	unsigned long flags;

	ocfs2_init_mask_waiter(&mw);

	spin_lock_irqsave(&lockres->l_lock, flags);
	lockres->l_flags |= OCFS2_LOCK_FREEING;
	/* Wait until the vote thread dequeues this lockres -
	 * ocfs2_process_blocked_lock clears OCFS2_LOCK_QUEUED once it
	 * sees the FREEING flag, which releases our mask waiter. */
	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		mlog(0, "Waiting on lockres %s\n", lockres->l_name);

		status = ocfs2_wait_for_mask(&mw);
		if (status)
			mlog_errno(status);

		/* Re-take the lock and re-check: the lockres may have
		 * been queued again in the meantime. */
		spin_lock_irqsave(&lockres->l_lock, flags);
	}
	spin_unlock_irqrestore(&lockres->l_lock, flags);
}
2307 
2308 static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
2309 {
2310 	int status;
2311 
2312 	mlog_entry_void();
2313 
2314 	ocfs2_mark_lockres_freeing(&osb->osb_super_lockres);
2315 
2316 	status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL);
2317 	if (status < 0)
2318 		mlog_errno(status);
2319 
2320 	ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres);
2321 
2322 	status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
2323 	if (status < 0)
2324 		mlog_errno(status);
2325 
2326 	mlog_exit(status);
2327 }
2328 
2329 static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data)
2330 {
2331 	struct inode *inode = data;
2332 
2333 	/* the metadata lock requires a bit more work as we have an
2334 	 * LVB to worry about. */
2335 	if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
2336 	    lockres->l_level == LKM_EXMODE &&
2337 	    !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
2338 		__ocfs2_stuff_meta_lvb(inode);
2339 }
2340 
2341 int ocfs2_drop_inode_locks(struct inode *inode)
2342 {
2343 	int status, err;
2344 	struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, };
2345 
2346 	mlog_entry_void();
2347 
2348 	/* No need to call ocfs2_mark_lockres_freeing here -
2349 	 * ocfs2_clear_inode has done it for us. */
2350 
2351 	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2352 			      &OCFS2_I(inode)->ip_data_lockres,
2353 			      NULL);
2354 	if (err < 0)
2355 		mlog_errno(err);
2356 
2357 	status = err;
2358 
2359 	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2360 			      &OCFS2_I(inode)->ip_meta_lockres,
2361 			      &meta_dcb);
2362 	if (err < 0)
2363 		mlog_errno(err);
2364 	if (err < 0 && !status)
2365 		status = err;
2366 
2367 	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2368 			      &OCFS2_I(inode)->ip_rw_lockres,
2369 			      NULL);
2370 	if (err < 0)
2371 		mlog_errno(err);
2372 	if (err < 0 && !status)
2373 		status = err;
2374 
2375 	mlog_exit(status);
2376 	return status;
2377 }
2378 
/* Record a pending downconvert on the lockres: set the AST action
 * and requested level and mark the lock busy. Caller must hold the
 * lockres spinlock and follow up with ocfs2_downconvert_lock(). */
static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
				      int new_level)
{
	assert_spin_locked(&lockres->l_lock);

	/* Only a real lock level can block us. */
	BUG_ON(lockres->l_blocking <= LKM_NLMODE);

	/* A downconvert must strictly lower the level. */
	if (lockres->l_level <= new_level) {
		mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
		     lockres->l_level, new_level);
		BUG();
	}

	mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
	     lockres->l_name, new_level, lockres->l_blocking);

	lockres->l_action = OCFS2_AST_DOWNCONVERT;
	lockres->l_requested = new_level;
	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
}
2399 
2400 static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
2401 				  struct ocfs2_lock_res *lockres,
2402 				  int new_level,
2403 				  int lvb)
2404 {
2405 	int ret, dlm_flags = LKM_CONVERT;
2406 	enum dlm_status status;
2407 
2408 	mlog_entry_void();
2409 
2410 	if (lvb)
2411 		dlm_flags |= LKM_VALBLK;
2412 
2413 	status = dlmlock(osb->dlm,
2414 			 new_level,
2415 			 &lockres->l_lksb,
2416 			 dlm_flags,
2417 			 lockres->l_name,
2418 			 lockres->l_ops->ast,
2419 			 lockres,
2420 			 lockres->l_ops->bast);
2421 	if (status != DLM_NORMAL) {
2422 		ocfs2_log_dlm_error("dlmlock", status, lockres);
2423 		ret = -EINVAL;
2424 		ocfs2_recover_from_dlm_error(lockres, 1);
2425 		goto bail;
2426 	}
2427 
2428 	ret = 0;
2429 bail:
2430 	mlog_exit(ret);
2431 	return ret;
2432 }
2433 
/* Prepare to cancel an in-flight convert. Caller holds the lockres
 * spinlock. Returns 1 when the caller should drop the spinlock and
 * issue the LKM_CANCEL via ocfs2_cancel_convert(); returns 0 when a
 * cancel is already pending and the lock should just be requeued. */
static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
				        struct ocfs2_lock_res *lockres)
{
	assert_spin_locked(&lockres->l_lock);

	mlog_entry_void();
	mlog(0, "lock %s\n", lockres->l_name);

	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
		/* If we're already trying to cancel a lock conversion
		 * then just drop the spinlock and allow the caller to
		 * requeue this lock. */

		mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
		return 0;
	}

	/* were we in a convert when we got the bast fire? */
	BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
	       lockres->l_action != OCFS2_AST_DOWNCONVERT);
	/* set things up for the unlockast to know to just
	 * clear out the ast_action and unset busy, etc. */
	lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;

	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
			"lock %s, invalid flags: 0x%lx\n",
			lockres->l_name, lockres->l_flags);

	return 1;
}
2465 
2466 static int ocfs2_cancel_convert(struct ocfs2_super *osb,
2467 				struct ocfs2_lock_res *lockres)
2468 {
2469 	int ret;
2470 	enum dlm_status status;
2471 
2472 	mlog_entry_void();
2473 	mlog(0, "lock %s\n", lockres->l_name);
2474 
2475 	ret = 0;
2476 	status = dlmunlock(osb->dlm,
2477 			   &lockres->l_lksb,
2478 			   LKM_CANCEL,
2479 			   lockres->l_ops->unlock_ast,
2480 			   lockres);
2481 	if (status != DLM_NORMAL) {
2482 		ocfs2_log_dlm_error("dlmunlock", status, lockres);
2483 		ret = -EINVAL;
2484 		ocfs2_recover_from_dlm_error(lockres, 0);
2485 	}
2486 
2487 	mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
2488 
2489 	mlog_exit(ret);
2490 	return ret;
2491 }
2492 
/* Returns nonzero when the meta lock may be downconverted to
 * new_level (LKM_NLMODE or LKM_PRMODE) right now: no refresh in
 * progress, no holders incompatible with new_level, and the inode's
 * journaled metadata fully checkpointed. Called under the lockres
 * spinlock. Note the short-circuit: the checkpoint state is only
 * consulted when the holder counts already permit the convert. */
static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
						  struct ocfs2_lock_res *lockres,
						  int new_level)
{
	int ret;

	mlog_entry_void();

	BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);

	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
		ret = 0;
		mlog(0, "lockres %s currently being refreshed -- backing "
		     "off!\n", lockres->l_name);
	} else if (new_level == LKM_PRMODE)
		/* PR is compatible with other readers, so only EX
		 * holders block the convert. */
		ret = !lockres->l_ex_holders &&
			ocfs2_inode_fully_checkpointed(inode);
	else /* Must be NLMODE we're converting to. */
		ret = !lockres->l_ro_holders && !lockres->l_ex_holders &&
			ocfs2_inode_fully_checkpointed(inode);

	mlog_exit(ret);
	return ret;
}
2517 
/* Try to resolve a blocked meta lock: cancel a busy convert, or
 * downconvert to the highest level compatible with what's blocking
 * us (stuffing the LVB when giving up EX). If the inode isn't fully
 * checkpointed yet we kick the checkpointer and requeue instead.
 * *requeue is set when the vote thread should revisit this lock. */
static int ocfs2_do_unblock_meta(struct inode *inode,
				 int *requeue)
{
	int new_level;
	int set_lvb = 0;
	int ret = 0;
	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
	unsigned long flags;

	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog_entry_void();

	spin_lock_irqsave(&lockres->l_lock, flags);

	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));

	mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level,
	     lockres->l_blocking);

	BUG_ON(lockres->l_level != LKM_EXMODE &&
	       lockres->l_level != LKM_PRMODE);

	/* An operation is already in flight - try to cancel it and
	 * requeue so we come back once that resolves. */
	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
		*requeue = 1;
		ret = ocfs2_prepare_cancel_convert(osb, lockres);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		if (ret) {
			ret = ocfs2_cancel_convert(osb, lockres);
			if (ret < 0)
				mlog_errno(ret);
		}
		goto leave;
	}

	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);

	mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n",
	     lockres->l_level, lockres->l_blocking, new_level);

	if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
		/* Giving up EX means other nodes may read the LVB, so
		 * it must carry our latest inode values. */
		if (lockres->l_level == LKM_EXMODE)
			set_lvb = 1;

		/* If the lock hasn't been refreshed yet (rare), then
		 * our memory inode values are old and we skip
		 * stuffing the lvb. There's no need to actually clear
		 * out the lvb here as it's value is still valid. */
		if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
			if (set_lvb)
				__ocfs2_stuff_meta_lvb(inode);
		} else
			mlog(0, "lockres %s: downconverting stale lock!\n",
			     lockres->l_name);

		mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
		     "l_blocking=%d, new_level=%d\n",
		     lockres->l_level, lockres->l_blocking, new_level);

		ocfs2_prepare_downconvert(lockres, new_level);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
		goto leave;
	}
	/* Can't downconvert yet - make sure a checkpoint is under way
	 * so a future pass can succeed, then requeue. */
	if (!ocfs2_inode_fully_checkpointed(inode))
		ocfs2_start_checkpoint(osb);

	*requeue = 1;
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	ret = 0;
leave:
	mlog_exit(ret);
	return ret;
}
2592 
/* Generic path for resolving a blocked (non-meta) lock: cancel a
 * busy convert, requeue while incompatible holders remain, run the
 * optional pre-downconvert worker, and finally downconvert to the
 * highest compatible level. *requeue is set when the vote thread
 * should revisit this lockres later. */
static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
				      struct ocfs2_lock_res *lockres,
				      int *requeue,
				      ocfs2_convert_worker_t *worker)
{
	unsigned long flags;
	int blocking;
	int new_level;
	int ret = 0;

	mlog_entry_void();

	spin_lock_irqsave(&lockres->l_lock, flags);

	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));

recheck:
	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
		*requeue = 1;
		ret = ocfs2_prepare_cancel_convert(osb, lockres);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		if (ret) {
			ret = ocfs2_cancel_convert(osb, lockres);
			if (ret < 0)
				mlog_errno(ret);
		}
		goto leave;
	}

	/* if we're blocking an exclusive and we have *any* holders,
	 * then requeue. */
	if ((lockres->l_blocking == LKM_EXMODE)
	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		*requeue = 1;
		ret = 0;
		goto leave;
	}

	/* If it's a PR we're blocking, then only
	 * requeue if we've got any EX holders */
	if (lockres->l_blocking == LKM_PRMODE &&
	    lockres->l_ex_holders) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		*requeue = 1;
		ret = 0;
		goto leave;
	}

	/* If we get here, then we know that there are no more
	 * incompatible holders (and anyone asking for an incompatible
	 * lock is blocked). We can now downconvert the lock */
	if (!worker)
		goto downconvert;

	/* Some lockres types want to do a bit of work before
	 * downconverting a lock. Allow that here. The worker function
	 * may sleep, so we save off a copy of what we're blocking as
	 * it may change while we're not holding the spin lock. */
	blocking = lockres->l_blocking;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	worker(lockres, blocking);

	spin_lock_irqsave(&lockres->l_lock, flags);
	if (blocking != lockres->l_blocking) {
		/* If this changed underneath us, then we can't drop
		 * it just yet. */
		goto recheck;
	}

downconvert:
	*requeue = 0;
	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);

	ocfs2_prepare_downconvert(lockres, new_level);
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0);
leave:
	mlog_exit(ret);
	return ret;
}
2675 
/* Pre-downconvert worker for the data lock: flush dirty pages before
 * giving up the lock. When dropping below PR (another node wants EX)
 * the page cache and mappings are invalidated entirely; otherwise
 * the pages are kept and we just wait for writeback. May sleep. */
static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
				      int blocking)
{
	struct inode *inode;
	struct address_space *mapping;

	mlog_entry_void();

       	inode = ocfs2_lock_res_inode(lockres);
	mapping = inode->i_mapping;

	if (filemap_fdatawrite(mapping)) {
		mlog(ML_ERROR, "Could not sync inode %llu for downconvert!",
		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
	}
	sync_mapping_buffers(mapping);
	if (blocking == LKM_EXMODE) {
		truncate_inode_pages(mapping, 0);
		unmap_mapping_range(mapping, 0, 0, 0);
	} else {
		/* We only need to wait on the I/O if we're not also
		 * truncating pages because truncate_inode_pages waits
		 * for us above. We don't truncate pages if we're
		 * blocking anything < EXMODE because we want to keep
		 * them around in that case. */
		filemap_fdatawait(mapping);
	}

	mlog_exit_void();
}
2706 
2707 int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
2708 		       int *requeue)
2709 {
2710 	int status;
2711 	struct inode *inode;
2712 	struct ocfs2_super *osb;
2713 
2714 	mlog_entry_void();
2715 
2716 	inode = ocfs2_lock_res_inode(lockres);
2717 	osb = OCFS2_SB(inode->i_sb);
2718 
2719 	mlog(0, "unblock inode %llu\n",
2720 	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
2721 
2722 	status = ocfs2_generic_unblock_lock(osb,
2723 					    lockres,
2724 					    requeue,
2725 					    ocfs2_data_convert_worker);
2726 	if (status < 0)
2727 		mlog_errno(status);
2728 
2729 	mlog(0, "inode %llu, requeue = %d\n",
2730 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, *requeue);
2731 
2732 	mlog_exit(status);
2733 	return status;
2734 }
2735 
2736 static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
2737 				    int *requeue)
2738 {
2739 	int status;
2740 	struct inode *inode;
2741 
2742 	mlog_entry_void();
2743 
2744 	mlog(0, "Unblock lockres %s\n", lockres->l_name);
2745 
2746 	inode  = ocfs2_lock_res_inode(lockres);
2747 
2748 	status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb),
2749 					    lockres,
2750 					    requeue,
2751 					    NULL);
2752 	if (status < 0)
2753 		mlog_errno(status);
2754 
2755 	mlog_exit(status);
2756 	return status;
2757 }
2758 
2759 
2760 int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
2761 		       int *requeue)
2762 {
2763 	int status;
2764 	struct inode *inode;
2765 
2766 	mlog_entry_void();
2767 
2768        	inode = ocfs2_lock_res_inode(lockres);
2769 
2770 	mlog(0, "unblock inode %llu\n",
2771 	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
2772 
2773 	status = ocfs2_do_unblock_meta(inode, requeue);
2774 	if (status < 0)
2775 		mlog_errno(status);
2776 
2777 	mlog(0, "inode %llu, requeue = %d\n",
2778 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, *requeue);
2779 
2780 	mlog_exit(status);
2781 	return status;
2782 }
2783 
2784 /* Generic unblock function for any lockres whose private data is an
2785  * ocfs2_super pointer. */
2786 static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
2787 				  int *requeue)
2788 {
2789 	int status;
2790 	struct ocfs2_super *osb;
2791 
2792 	mlog_entry_void();
2793 
2794 	mlog(0, "Unblock lockres %s\n", lockres->l_name);
2795 
2796 	osb = ocfs2_lock_res_super(lockres);
2797 
2798 	status = ocfs2_generic_unblock_lock(osb,
2799 					    lockres,
2800 					    requeue,
2801 					    NULL);
2802 	if (status < 0)
2803 		mlog_errno(status);
2804 
2805 	mlog_exit(status);
2806 	return status;
2807 }
2808 
/* Vote-thread entry point for a queued blocked lock: run the
 * lockres's unblock handler and either clear OCFS2_LOCK_QUEUED or
 * requeue for another pass. Lockres marked FREEING are dequeued
 * without processing. */
void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
				struct ocfs2_lock_res *lockres)
{
	int status;
	int requeue = 0;
	unsigned long flags;

	/* Our reference to the lockres in this function can be
	 * considered valid until we remove the OCFS2_LOCK_QUEUED
	 * flag. */

	mlog_entry_void();

	BUG_ON(!lockres);
	BUG_ON(!lockres->l_ops);
	BUG_ON(!lockres->l_ops->unblock);

	mlog(0, "lockres %s blocked.\n", lockres->l_name);

	/* Detect whether a lock has been marked as going away while
	 * the vote thread was processing other things. A lock can
	 * still be marked with OCFS2_LOCK_FREEING after this check,
	 * but short circuiting here will still save us some
	 * performance. */
	spin_lock_irqsave(&lockres->l_lock, flags);
	if (lockres->l_flags & OCFS2_LOCK_FREEING)
		goto unqueue;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	status = lockres->l_ops->unblock(lockres, &requeue);
	if (status < 0)
		mlog_errno(status);

	spin_lock_irqsave(&lockres->l_lock, flags);
unqueue:
	/* Clearing QUEUED releases anyone waiting in
	 * ocfs2_mark_lockres_freeing via the mask waiters. */
	if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) {
		lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
	} else
		ocfs2_schedule_blocked_lock(osb, lockres);

	mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
	     requeue ? "yes" : "no");
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	mlog_exit_void();
}
2855 
/* Queue a lockres on the vote thread's blocked-lock list for later
 * processing. Caller must hold the lockres spinlock; the vote task
 * lock is nested inside it here. */
static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres)
{
	mlog_entry_void();

	assert_spin_locked(&lockres->l_lock);

	if (lockres->l_flags & OCFS2_LOCK_FREEING) {
		/* Do not schedule a lock for downconvert when it's on
		 * the way to destruction - any nodes wanting access
		 * to the resource will get it soon. */
		mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
		     lockres->l_name, lockres->l_flags);
		return;
	}

	lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);

	spin_lock(&osb->vote_task_lock);
	/* Only add if not already on the list - requeues of an
	 * already-queued lockres must not corrupt the list. */
	if (list_empty(&lockres->l_blocked_list)) {
		list_add_tail(&lockres->l_blocked_list,
			      &osb->blocked_lock_list);
		osb->blocked_lock_count++;
	}
	spin_unlock(&osb->vote_task_lock);

	mlog_exit_void();
}
2884 
/* This aids in debugging situations where a bad LVB might be
 * involved: dump every field of the meta lock's LVB at the given
 * mlog level, tagged with the caller's function name and line. */
void ocfs2_dump_meta_lvb_info(u64 level,
			      const char *function,
			      unsigned int line,
			      struct ocfs2_lock_res *lockres)
{
	/* The LVB fields are stored big-endian on the wire/in the lksb. */
	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;

	mlog(level, "LVB information for %s (called from %s:%u):\n",
	     lockres->l_name, function, line);
	mlog(level, "version: %u, clusters: %u\n",
	     be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters));
	mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
	     (unsigned long long)be64_to_cpu(lvb->lvb_isize),
	     be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
	     be16_to_cpu(lvb->lvb_imode));
	mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
	     "mtime_packed 0x%llx\n", be16_to_cpu(lvb->lvb_inlink),
	     (long long)be64_to_cpu(lvb->lvb_iatime_packed),
	     (long long)be64_to_cpu(lvb->lvb_ictime_packed),
	     (long long)be64_to_cpu(lvb->lvb_imtime_packed));
}
2907