xref: /linux/fs/ocfs2/dlmglue.c (revision f3d9478b2ce468c3115b02ecae7e975990697f15)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * dlmglue.c
5  *
6  * Code which implements an OCFS2 specific interface to our DLM.
7  *
8  * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25 
26 #include <linux/types.h>
27 #include <linux/slab.h>
28 #include <linux/highmem.h>
29 #include <linux/mm.h>
30 #include <linux/smp_lock.h>
31 #include <linux/crc32.h>
32 #include <linux/kthread.h>
33 #include <linux/pagemap.h>
34 #include <linux/debugfs.h>
35 #include <linux/seq_file.h>
36 
37 #include <cluster/heartbeat.h>
38 #include <cluster/nodemanager.h>
39 #include <cluster/tcp.h>
40 
41 #include <dlm/dlmapi.h>
42 
43 #define MLOG_MASK_PREFIX ML_DLM_GLUE
44 #include <cluster/masklog.h>
45 
46 #include "ocfs2.h"
47 
48 #include "alloc.h"
49 #include "dlmglue.h"
50 #include "extent_map.h"
51 #include "heartbeat.h"
52 #include "inode.h"
53 #include "journal.h"
54 #include "slot_map.h"
55 #include "super.h"
56 #include "uptodate.h"
57 #include "vote.h"
58 
59 #include "buffer_head_io.h"
60 
/*
 * A mask waiter lets a task sleep until a lockres's l_flags, masked by
 * mw_mask, equal mw_goal; lockres_set_flags() completes the waiter when
 * the condition is met.
 */
struct ocfs2_mask_waiter {
	struct list_head	mw_item;	/* entry on lockres->l_mask_waiters */
	int			mw_status;	/* result handed back to the waiter */
	struct completion	mw_complete;	/* signalled when goal is reached */
	unsigned long		mw_mask;	/* flag bits we care about */
	unsigned long		mw_goal;	/* required value of those bits */
};
68 
/* Per-lock-type dlm AST/BAST handlers, wired up via ocfs2_lock_res_ops. */
static void ocfs2_inode_ast_func(void *opaque);
static void ocfs2_inode_bast_func(void *opaque,
				  int level);
static void ocfs2_super_ast_func(void *opaque);
static void ocfs2_super_bast_func(void *opaque,
				  int level);
static void ocfs2_rename_ast_func(void *opaque);
static void ocfs2_rename_bast_func(void *opaque,
				   int level);

/* so far, all locks have gotten along with the same unlock ast */
static void ocfs2_unlock_ast_func(void *opaque,
				  enum dlm_status status);
/* "unblock" handlers, one per lock type, invoked when another node is
 * blocked on a level we hold; *requeue presumably asks the caller to
 * retry later — confirm against the vote-thread callers. */
static int ocfs2_do_unblock_meta(struct inode *inode,
				 int *requeue);
static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
			      int *requeue);
static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
			      int *requeue);
static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
			      int *requeue);
static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
				  int *requeue);
/* Worker run by ocfs2_generic_unblock_lock(); second arg is the level
 * the other node is blocked at (see ocfs2_data_convert_worker). */
typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int);
static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
				      struct ocfs2_lock_res *lockres,
				      int *requeue,
				      ocfs2_convert_worker_t *worker);
97 
/* Callback table attached to each lockres: the three dlm callbacks plus
 * the unblock action used when another node contends for the lock. */
struct ocfs2_lock_res_ops {
	void (*ast)(void *);
	void (*bast)(void *, int);
	void (*unlock_ast)(void *, enum dlm_status);
	int  (*unblock)(struct ocfs2_lock_res *, int *);
};
104 
/* Per-inode read/write lock. */
static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
	.ast		= ocfs2_inode_ast_func,
	.bast		= ocfs2_inode_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_inode_lock,
};

/* Per-inode metadata lock. */
static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
	.ast		= ocfs2_inode_ast_func,
	.bast		= ocfs2_inode_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_meta,
};

static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
				      int blocking);

/* Per-inode data lock. */
static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
	.ast		= ocfs2_inode_ast_func,
	.bast		= ocfs2_inode_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_data,
};

/* Superblock lock (one per osb). */
static struct ocfs2_lock_res_ops ocfs2_super_lops = {
	.ast		= ocfs2_super_ast_func,
	.bast		= ocfs2_super_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_osb_lock,
};

/* Cluster-wide rename serialization lock (one per osb). */
static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
	.ast		= ocfs2_rename_ast_func,
	.bast		= ocfs2_rename_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_osb_lock,
};
142 
143 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
144 {
145 	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
146 		lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
147 		lockres->l_type == OCFS2_LOCK_TYPE_RW;
148 }
149 
/* True for the per-osb superblock lock. */
static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres)
{
	return lockres->l_type == OCFS2_LOCK_TYPE_SUPER;
}
154 
/* True for the per-osb rename lock. */
static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres)
{
	return lockres->l_type == OCFS2_LOCK_TYPE_RENAME;
}
159 
160 static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres)
161 {
162 	BUG_ON(!ocfs2_is_super_lock(lockres)
163 	       && !ocfs2_is_rename_lock(lockres));
164 
165 	return (struct ocfs2_super *) lockres->l_priv;
166 }
167 
168 static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
169 {
170 	BUG_ON(!ocfs2_is_inode_lock(lockres));
171 
172 	return (struct inode *) lockres->l_priv;
173 }
174 
/* Core cluster-locking helpers, defined below. */
static int ocfs2_lock_create(struct ocfs2_super *osb,
			     struct ocfs2_lock_res *lockres,
			     int level,
			     int dlm_flags);
static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
						     int wanted);
static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
				 struct ocfs2_lock_res *lockres,
				 int level);
static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres);
static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
						int convert);
/* Uniform reporting of dlm API failures: function, status, lock name. */
#define ocfs2_log_dlm_error(_func, _stat, _lockres) do {	\
	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "	\
		"resource %s: %s\n", dlm_errname(_stat), _func,	\
		_lockres->l_name, dlm_errmsg(_stat));		\
} while (0)
static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
				 struct ocfs2_lock_res *lockres);
static int ocfs2_meta_lock_update(struct inode *inode,
				  struct buffer_head **bh);
static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
static inline int ocfs2_highest_compat_lock_level(int level);
static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
						  struct ocfs2_lock_res *lockres,
						  int new_level);
206 
/* Human-readable lock type names, indexed by enum ocfs2_lock_type. */
static char *ocfs2_lock_type_strings[] = {
	[OCFS2_LOCK_TYPE_META] = "Meta",
	[OCFS2_LOCK_TYPE_DATA] = "Data",
	[OCFS2_LOCK_TYPE_SUPER] = "Super",
	[OCFS2_LOCK_TYPE_RENAME] = "Rename",
	/* Need to differentiate from [R]ename.. serializing writes is the
	 * important job it does, anyway. */
	[OCFS2_LOCK_TYPE_RW] = "Write/Read",
};
216 
/* Map a lock type to its printable name; BUGs on out-of-range types. */
static char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
{
	mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
	return ocfs2_lock_type_strings[type];
}
222 
223 static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
224 				  u64 blkno,
225 				  u32 generation,
226 				  char *name)
227 {
228 	int len;
229 
230 	mlog_entry_void();
231 
232 	BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
233 
234 	len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
235 		       ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD,
236 		       (long long)blkno, generation);
237 
238 	BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
239 
240 	mlog(0, "built lock resource with name: %s\n", name);
241 
242 	mlog_exit_void();
243 }
244 
245 static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED;
246 
/* Put @res on the debug tracking list so it shows up in debugfs. */
static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
				       struct ocfs2_dlm_debug *dlm_debug)
{
	mlog(0, "Add tracking for lockres %s\n", res->l_name);

	spin_lock(&ocfs2_dlm_tracking_lock);
	list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking);
	spin_unlock(&ocfs2_dlm_tracking_lock);
}
256 
/* Take @res off the debug tracking list; safe if it was never added. */
static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
{
	spin_lock(&ocfs2_dlm_tracking_lock);
	if (!list_empty(&res->l_debug_list))
		list_del_init(&res->l_debug_list);
	spin_unlock(&ocfs2_dlm_tracking_lock);
}
264 
/*
 * Shared initialization for every lockres: build its dlm name, wire up
 * the callback table and private data, reset the lock state machine to
 * "invalid/no lock", and register it for debugfs tracking.
 */
static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
				       struct ocfs2_lock_res *res,
				       enum ocfs2_lock_type type,
				       u64 blkno,
				       u32 generation,
				       struct ocfs2_lock_res_ops *ops,
				       void *priv)
{
	ocfs2_build_lock_name(type, blkno, generation, res->l_name);

	res->l_type          = type;
	res->l_ops           = ops;
	res->l_priv          = priv;

	/* No dlm lock exists yet and no ast/unlock is in flight. */
	res->l_level         = LKM_IVMODE;
	res->l_requested     = LKM_IVMODE;
	res->l_blocking      = LKM_IVMODE;
	res->l_action        = OCFS2_AST_INVALID;
	res->l_unlock_action = OCFS2_UNLOCK_INVALID;

	res->l_flags         = OCFS2_LOCK_INITIALIZED;

	ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
}
289 
290 void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
291 {
292 	/* This also clears out the lock status block */
293 	memset(res, 0, sizeof(struct ocfs2_lock_res));
294 	spin_lock_init(&res->l_lock);
295 	init_waitqueue_head(&res->l_event);
296 	INIT_LIST_HEAD(&res->l_blocked_list);
297 	INIT_LIST_HEAD(&res->l_mask_waiters);
298 }
299 
300 void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
301 			       enum ocfs2_lock_type type,
302 			       struct inode *inode)
303 {
304 	struct ocfs2_lock_res_ops *ops;
305 
306 	switch(type) {
307 		case OCFS2_LOCK_TYPE_RW:
308 			ops = &ocfs2_inode_rw_lops;
309 			break;
310 		case OCFS2_LOCK_TYPE_META:
311 			ops = &ocfs2_inode_meta_lops;
312 			break;
313 		case OCFS2_LOCK_TYPE_DATA:
314 			ops = &ocfs2_inode_data_lops;
315 			break;
316 		default:
317 			mlog_bug_on_msg(1, "type: %d\n", type);
318 			ops = NULL; /* thanks, gcc */
319 			break;
320 	};
321 
322 	ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type,
323 				   OCFS2_I(inode)->ip_blkno,
324 				   inode->i_generation, ops, inode);
325 }
326 
/* Initialize the per-osb superblock lockres, named after the fixed
 * superblock block number. */
static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
				      struct ocfs2_super *osb)
{
	/* Superblock lockres doesn't come from a slab so we call init
	 * once on it manually.  */
	ocfs2_lock_res_init_once(res);
	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER,
				   OCFS2_SUPER_BLOCK_BLKNO, 0,
				   &ocfs2_super_lops, osb);
}
337 
/* Initialize the per-osb rename lockres (blkno/generation are unused
 * for this lock type, hence 0, 0). */
static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
				       struct ocfs2_super *osb)
{
	/* Rename lockres doesn't come from a slab so we call init
	 * once on it manually.  */
	ocfs2_lock_res_init_once(res);
	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0,
				   &ocfs2_rename_lops, osb);
}
347 
/*
 * Tear down a lockres.  Sanity-checks that nothing is still using it
 * (no queued blocked work, no mask waiters, no holders, not locked)
 * before clearing its state.  A lockres that was never initialized is
 * silently ignored.
 */
void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
{
	mlog_entry_void();

	if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
		return;

	ocfs2_remove_lockres_tracking(res);

	mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
			"Lockres %s is on the blocked list\n",
			res->l_name);
	mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
			"Lockres %s has mask waiters pending\n",
			res->l_name);
	mlog_bug_on_msg(spin_is_locked(&res->l_lock),
			"Lockres %s is locked\n",
			res->l_name);
	mlog_bug_on_msg(res->l_ro_holders,
			"Lockres %s has %u ro holders\n",
			res->l_name, res->l_ro_holders);
	mlog_bug_on_msg(res->l_ex_holders,
			"Lockres %s has %u ex holders\n",
			res->l_name, res->l_ex_holders);

	/* Need to clear out the lock status block for the dlm */
	memset(&res->l_lksb, 0, sizeof(res->l_lksb));

	res->l_flags = 0UL;
	mlog_exit_void();
}
379 
380 static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
381 				     int level)
382 {
383 	mlog_entry_void();
384 
385 	BUG_ON(!lockres);
386 
387 	switch(level) {
388 	case LKM_EXMODE:
389 		lockres->l_ex_holders++;
390 		break;
391 	case LKM_PRMODE:
392 		lockres->l_ro_holders++;
393 		break;
394 	default:
395 		BUG();
396 	}
397 
398 	mlog_exit_void();
399 }
400 
401 static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
402 				     int level)
403 {
404 	mlog_entry_void();
405 
406 	BUG_ON(!lockres);
407 
408 	switch(level) {
409 	case LKM_EXMODE:
410 		BUG_ON(!lockres->l_ex_holders);
411 		lockres->l_ex_holders--;
412 		break;
413 	case LKM_PRMODE:
414 		BUG_ON(!lockres->l_ro_holders);
415 		lockres->l_ro_holders--;
416 		break;
417 	default:
418 		BUG();
419 	}
420 	mlog_exit_void();
421 }
422 
423 /* WARNING: This function lives in a world where the only three lock
424  * levels are EX, PR, and NL. It *will* have to be adjusted when more
425  * lock types are added. */
426 static inline int ocfs2_highest_compat_lock_level(int level)
427 {
428 	int new_level = LKM_EXMODE;
429 
430 	if (level == LKM_EXMODE)
431 		new_level = LKM_NLMODE;
432 	else if (level == LKM_PRMODE)
433 		new_level = LKM_PRMODE;
434 	return new_level;
435 }
436 
437 static void lockres_set_flags(struct ocfs2_lock_res *lockres,
438 			      unsigned long newflags)
439 {
440 	struct list_head *pos, *tmp;
441 	struct ocfs2_mask_waiter *mw;
442 
443  	assert_spin_locked(&lockres->l_lock);
444 
445 	lockres->l_flags = newflags;
446 
447 	list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) {
448 		mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
449 		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
450 			continue;
451 
452 		list_del_init(&mw->mw_item);
453 		mw->mw_status = 0;
454 		complete(&mw->mw_complete);
455 	}
456 }
/* Convenience wrappers around lockres_set_flags(); caller holds l_lock. */
static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
{
	lockres_set_flags(lockres, lockres->l_flags | or);
}
static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
				unsigned long clear)
{
	lockres_set_flags(lockres, lockres->l_flags & ~clear);
}
466 
/*
 * AST-side completion of a downconvert: the dlm granted the lower level
 * stored in l_requested.  Caller holds l_lock (called from an ast).
 */
static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
{
	mlog_entry_void();

	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
	BUG_ON(lockres->l_blocking <= LKM_NLMODE);

	lockres->l_level = lockres->l_requested;
	/* If our new level no longer conflicts with the level the other
	 * node wants, this lock is no longer "blocked". */
	if (lockres->l_level <=
	    ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
		lockres->l_blocking = LKM_NLMODE;
		lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
	}
	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);

	mlog_exit_void();
}
486 
/*
 * AST-side completion of an upconvert: the dlm granted the higher level
 * stored in l_requested.  Caller holds l_lock (called from an ast).
 */
static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
{
	mlog_entry_void();

	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));

	/* Convert from RO to EX doesn't really need anything as our
	 * information is already up to date. Convert from NL to
	 * *anything* however should mark ourselves as needing an
	 * update */
	if (lockres->l_level == LKM_NLMODE)
		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);

	lockres->l_level = lockres->l_requested;
	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);

	mlog_exit_void();
}
506 
507 static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
508 {
509 	mlog_entry_void();
510 
511 	BUG_ON((!lockres->l_flags & OCFS2_LOCK_BUSY));
512 	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
513 
514 	if (lockres->l_requested > LKM_NLMODE &&
515 	    !(lockres->l_flags & OCFS2_LOCK_LOCAL))
516 		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
517 
518 	lockres->l_level = lockres->l_requested;
519 	lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
520 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
521 
522 	mlog_exit_void();
523 }
524 
/*
 * dlm AST callback for inode locks (rw/meta/data).  Dispatches on
 * l_action to finish the attach/convert/downconvert that was started,
 * then wakes anyone waiting on the lockres.
 */
static void ocfs2_inode_ast_func(void *opaque)
{
	struct ocfs2_lock_res *lockres = opaque;
	struct inode *inode;
	struct dlm_lockstatus *lksb;
	unsigned long flags;

	mlog_entry_void();

	inode = ocfs2_lock_res_inode(lockres);

	mlog(0, "AST fired for inode %llu, l_action = %u, type = %s\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno, lockres->l_action,
	     ocfs2_lock_type_string(lockres->l_type));

	BUG_ON(!ocfs2_is_inode_lock(lockres));

	spin_lock_irqsave(&lockres->l_lock, flags);

	/* A non-DLM_NORMAL status means the operation failed; log it
	 * and bail without touching the lock state machine. */
	lksb = &(lockres->l_lksb);
	if (lksb->status != DLM_NORMAL) {
		mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u "
		     "on inode %llu\n", lksb->status,
		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		mlog_exit_void();
		return;
	}

	switch(lockres->l_action) {
	case OCFS2_AST_ATTACH:
		ocfs2_generic_handle_attach_action(lockres);
		/* LOCAL only applies to the very first attach. */
		lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
		break;
	case OCFS2_AST_CONVERT:
		ocfs2_generic_handle_convert_action(lockres);
		break;
	case OCFS2_AST_DOWNCONVERT:
		ocfs2_generic_handle_downconvert_action(lockres);
		break;
	default:
		mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
		     "lockres flags = 0x%lx, unlock action: %u\n",
		     lockres->l_name, lockres->l_action, lockres->l_flags,
		     lockres->l_unlock_action);

		BUG();
	}

	/* data and rw locking ignores refresh flag for now. */
	if (lockres->l_type != OCFS2_LOCK_TYPE_META)
		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);

	/* set it to something invalid so if we get called again we
	 * can catch it. */
	lockres->l_action = OCFS2_AST_INVALID;
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	wake_up(&lockres->l_event);

	mlog_exit_void();
}
586 
587 static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
588 				     int level)
589 {
590 	int needs_downconvert = 0;
591 	mlog_entry_void();
592 
593 	assert_spin_locked(&lockres->l_lock);
594 
595 	lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
596 
597 	if (level > lockres->l_blocking) {
598 		/* only schedule a downconvert if we haven't already scheduled
599 		 * one that goes low enough to satisfy the level we're
600 		 * blocking.  this also catches the case where we get
601 		 * duplicate BASTs */
602 		if (ocfs2_highest_compat_lock_level(level) <
603 		    ocfs2_highest_compat_lock_level(lockres->l_blocking))
604 			needs_downconvert = 1;
605 
606 		lockres->l_blocking = level;
607 	}
608 
609 	mlog_exit(needs_downconvert);
610 	return needs_downconvert;
611 }
612 
/*
 * Common BAST handling: note the blocking request and, if a downconvert
 * is needed, queue the lockres for the vote thread and kick it.
 */
static void ocfs2_generic_bast_func(struct ocfs2_super *osb,
				    struct ocfs2_lock_res *lockres,
				    int level)
{
	int needs_downconvert;
	unsigned long flags;

	mlog_entry_void();

	/* A BAST at NL (or below) would be meaningless. */
	BUG_ON(level <= LKM_NLMODE);

	spin_lock_irqsave(&lockres->l_lock, flags);
	needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
	if (needs_downconvert)
		ocfs2_schedule_blocked_lock(osb, lockres);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	ocfs2_kick_vote_thread(osb);

	wake_up(&lockres->l_event);
	mlog_exit_void();
}
635 
636 static void ocfs2_inode_bast_func(void *opaque, int level)
637 {
638 	struct ocfs2_lock_res *lockres = opaque;
639 	struct inode *inode;
640 	struct ocfs2_super *osb;
641 
642 	mlog_entry_void();
643 
644 	BUG_ON(!ocfs2_is_inode_lock(lockres));
645 
646 	inode = ocfs2_lock_res_inode(lockres);
647 	osb = OCFS2_SB(inode->i_sb);
648 
649 	mlog(0, "BAST fired for inode %llu, blocking %d, level %d type %s\n",
650 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, level,
651 	     lockres->l_level, ocfs2_lock_type_string(lockres->l_type));
652 
653 	ocfs2_generic_bast_func(osb, lockres, level);
654 
655 	mlog_exit_void();
656 }
657 
/*
 * Common AST handling for the osb-level locks (super, rename).  Like
 * ocfs2_inode_ast_func() but with an @ignore_refresh knob: when set,
 * NEEDS_REFRESH is cleared because the lock type doesn't use it.
 */
static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres,
				   int ignore_refresh)
{
	struct dlm_lockstatus *lksb = &lockres->l_lksb;
	unsigned long flags;

	spin_lock_irqsave(&lockres->l_lock, flags);

	/* A non-DLM_NORMAL status means the operation failed; log and
	 * bail without touching the state machine. */
	if (lksb->status != DLM_NORMAL) {
		mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
		     lockres->l_name, lksb->status);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		return;
	}

	switch(lockres->l_action) {
	case OCFS2_AST_ATTACH:
		ocfs2_generic_handle_attach_action(lockres);
		break;
	case OCFS2_AST_CONVERT:
		ocfs2_generic_handle_convert_action(lockres);
		break;
	case OCFS2_AST_DOWNCONVERT:
		ocfs2_generic_handle_downconvert_action(lockres);
		break;
	default:
		BUG();
	}

	if (ignore_refresh)
		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);

	/* set it to something invalid so if we get called again we
	 * can catch it. */
	lockres->l_action = OCFS2_AST_INVALID;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	wake_up(&lockres->l_event);
}
697 
/* dlm AST callback for the superblock lock. */
static void ocfs2_super_ast_func(void *opaque)
{
	struct ocfs2_lock_res *lockres = opaque;

	mlog_entry_void();
	mlog(0, "Superblock AST fired\n");

	BUG_ON(!ocfs2_is_super_lock(lockres));
	ocfs2_generic_ast_func(lockres, 0);

	mlog_exit_void();
}
710 
/* dlm BAST callback for the superblock lock. */
static void ocfs2_super_bast_func(void *opaque,
				  int level)
{
	struct ocfs2_lock_res *lockres = opaque;
	struct ocfs2_super *osb;

	mlog_entry_void();
	mlog(0, "Superblock BAST fired\n");

	BUG_ON(!ocfs2_is_super_lock(lockres));

	osb = ocfs2_lock_res_super(lockres);
	ocfs2_generic_bast_func(osb, lockres, level);

	mlog_exit_void();
}
726 
/* dlm AST callback for the rename lock; refresh is not used here. */
static void ocfs2_rename_ast_func(void *opaque)
{
	struct ocfs2_lock_res *lockres = opaque;

	mlog_entry_void();

	mlog(0, "Rename AST fired\n");

	BUG_ON(!ocfs2_is_rename_lock(lockres));

	ocfs2_generic_ast_func(lockres, 1);

	mlog_exit_void();
}
741 
/* dlm BAST callback for the rename lock. */
static void ocfs2_rename_bast_func(void *opaque,
				   int level)
{
	struct ocfs2_lock_res *lockres = opaque;
	struct ocfs2_super *osb;

	mlog_entry_void();

	mlog(0, "Rename BAST fired\n");

	BUG_ON(!ocfs2_is_rename_lock(lockres));

	osb = ocfs2_lock_res_super(lockres);
	ocfs2_generic_bast_func(osb, lockres, level);

	mlog_exit_void();
}
759 
/*
 * Roll the lockres state machine back after a failed dlmlock/dlmunlock
 * call: clear BUSY and reset the pending action (@convert selects the
 * lock vs. unlock action field), then wake waiters.
 */
static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
						int convert)
{
	unsigned long flags;

	mlog_entry_void();
	spin_lock_irqsave(&lockres->l_lock, flags);
	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
	if (convert)
		lockres->l_action = OCFS2_AST_INVALID;
	else
		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	wake_up(&lockres->l_event);
	mlog_exit_void();
}
777 
/* Note: If we detect another process working on the lock (i.e.,
 * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller
 * to do the right thing in that case.
 */
/* Create the initial dlm lock at @level for @lockres.  Marks the
 * lockres BUSY and issues an async dlmlock(); the AST completes the
 * attach.  Returns 0 if the lock already exists or is in flight. */
static int ocfs2_lock_create(struct ocfs2_super *osb,
			     struct ocfs2_lock_res *lockres,
			     int level,
			     int dlm_flags)
{
	int ret = 0;
	enum dlm_status status;
	unsigned long flags;

	mlog_entry_void();

	mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
	     dlm_flags);

	spin_lock_irqsave(&lockres->l_lock, flags);
	if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) ||
	    (lockres->l_flags & OCFS2_LOCK_BUSY)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		goto bail;
	}

	lockres->l_action = OCFS2_AST_ATTACH;
	lockres->l_requested = level;
	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	status = dlmlock(osb->dlm,
			 level,
			 &lockres->l_lksb,
			 dlm_flags,
			 lockres->l_name,
			 lockres->l_ops->ast,
			 lockres,
			 lockres->l_ops->bast);
	if (status != DLM_NORMAL) {
		ocfs2_log_dlm_error("dlmlock", status, lockres);
		ret = -EINVAL;
		/* Undo the BUSY/action state we set above. */
		ocfs2_recover_from_dlm_error(lockres, 1);
	}

	mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name);

bail:
	mlog_exit(ret);
	return ret;
}
828 
829 static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
830 					int flag)
831 {
832 	unsigned long flags;
833 	int ret;
834 
835 	spin_lock_irqsave(&lockres->l_lock, flags);
836 	ret = lockres->l_flags & flag;
837 	spin_unlock_irqrestore(&lockres->l_lock, flags);
838 
839 	return ret;
840 }
841 
/* Sleep until no dlm operation is in flight on @lockres. */
static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres)

{
	wait_event(lockres->l_event,
		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY));
}
848 
/* Sleep until another task finishes refreshing state under @lockres. */
static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres)

{
	wait_event(lockres->l_event,
		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING));
}
855 
/* predict what lock level we'll be dropping down to on behalf
 * of another node, and return true if the currently wanted
 * level will be compatible with it. */
static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
						     int wanted)
{
	/* Only meaningful while a BAST has marked us blocked. */
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));

	return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
}
866 
/* Prepare a stack-allocated mask waiter for use. */
static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
{
	INIT_LIST_HEAD(&mw->mw_item);
	init_completion(&mw->mw_complete);
}
872 
/* Block until the waiter is completed (by lockres_set_flags()) and
 * return the status it recorded. */
static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
{
	wait_for_completion(&mw->mw_complete);
	/* Re-arm the completion in case we want to wait on it again */
	INIT_COMPLETION(mw->mw_complete);
	return mw->mw_status;
}
880 
/* Queue @mw to be completed once (l_flags & @mask) == @goal.  Caller
 * holds l_lock and must not already have @mw queued. */
static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
				    struct ocfs2_mask_waiter *mw,
				    unsigned long mask,
				    unsigned long goal)
{
	BUG_ON(!list_empty(&mw->mw_item));

	assert_spin_locked(&lockres->l_lock);

	list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
	mw->mw_mask = mask;
	mw->mw_goal = goal;
}
894 
/* returns 0 if the mw that was removed was already satisfied, -EBUSY
 * if the mask still hadn't reached its goal */
static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
				      struct ocfs2_mask_waiter *mw)
{
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&lockres->l_lock, flags);
	/* Still queued means lockres_set_flags() never completed us. */
	if (!list_empty(&mw->mw_item)) {
		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
			ret = -EBUSY;

		list_del_init(&mw->mw_item);
		init_completion(&mw->mw_complete);
	}
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	return ret;

}
916 
/*
 * Take a cluster lock at @level, creating and/or upconverting the
 * underlying dlm lock as needed.  Loops ("again") through the state
 * machine: wait for in-flight operations, create the lock at NL if it
 * doesn't exist, back off while blocked on behalf of another node, and
 * convert upward when the granted level is too low.  On success the
 * holder count for @level is bumped (pair with ocfs2_cluster_unlock()).
 *
 * @lkm_flags are passed to dlmlock() (LKM_NOQUEUE turns a would-block
 * into -EAGAIN); @arg_flags: OCFS2_LOCK_NONBLOCK returns -EAGAIN
 * instead of sleeping, to break the page lock/dlm lock inversion
 * described below.  Returns 0, -ERESTARTSYS, -EAGAIN, or a dlm error
 * mapped to -EINVAL.
 */
static int ocfs2_cluster_lock(struct ocfs2_super *osb,
			      struct ocfs2_lock_res *lockres,
			      int level,
			      int lkm_flags,
			      int arg_flags)
{
	struct ocfs2_mask_waiter mw;
	enum dlm_status status;
	int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
	int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
	unsigned long flags;

	mlog_entry_void();

	ocfs2_init_mask_waiter(&mw);

again:
	wait = 0;

	if (catch_signals && signal_pending(current)) {
		ret = -ERESTARTSYS;
		goto out;
	}

	spin_lock_irqsave(&lockres->l_lock, flags);

	mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
			"Cluster lock called on freeing lockres %s! flags "
			"0x%lx\n", lockres->l_name, lockres->l_flags);

	/* We only compare against the currently granted level
	 * here. If the lock is blocked waiting on a downconvert,
	 * we'll get caught below. */
	if (lockres->l_flags & OCFS2_LOCK_BUSY &&
	    level > lockres->l_level) {
		/* is someone sitting in dlm_lock? If so, wait on
		 * them. */
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
		wait = 1;
		goto unlock;
	}

	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
		/* lock has not been created yet. */
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		/* Create at NL; the convert path below raises it. */
		ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
		goto again;
	}

	if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
	    !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
		/* is the lock is currently blocked on behalf of
		 * another node */
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
		wait = 1;
		goto unlock;
	}

	if (level > lockres->l_level) {
		if (lockres->l_action != OCFS2_AST_INVALID)
			mlog(ML_ERROR, "lockres %s has action %u pending\n",
			     lockres->l_name, lockres->l_action);

		lockres->l_action = OCFS2_AST_CONVERT;
		lockres->l_requested = level;
		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		BUG_ON(level == LKM_IVMODE);
		BUG_ON(level == LKM_NLMODE);

		mlog(0, "lock %s, convert from %d to level = %d\n",
		     lockres->l_name, lockres->l_level, level);

		/* call dlm_lock to upgrade lock now */
		status = dlmlock(osb->dlm,
				 level,
				 &lockres->l_lksb,
				 lkm_flags|LKM_CONVERT|LKM_VALBLK,
				 lockres->l_name,
				 lockres->l_ops->ast,
				 lockres,
				 lockres->l_ops->bast);
		if (status != DLM_NORMAL) {
			if ((lkm_flags & LKM_NOQUEUE) &&
			    (status == DLM_NOTQUEUED))
				ret = -EAGAIN;
			else {
				ocfs2_log_dlm_error("dlmlock", status,
						    lockres);
				ret = -EINVAL;
			}
			ocfs2_recover_from_dlm_error(lockres, 1);
			goto out;
		}

		mlog(0, "lock %s, successfull return from dlmlock\n",
		     lockres->l_name);

		/* At this point we've gone inside the dlm and need to
		 * complete our work regardless. */
		catch_signals = 0;

		/* wait for busy to clear and carry on */
		goto again;
	}

	/* Ok, if we get here then we're good to go. */
	ocfs2_inc_holders(lockres, level);

	ret = 0;
unlock:
	spin_unlock_irqrestore(&lockres->l_lock, flags);
out:
	/*
	 * This is helping work around a lock inversion between the page lock
	 * and dlm locks.  One path holds the page lock while calling aops
	 * which block acquiring dlm locks.  The voting thread holds dlm
	 * locks while acquiring page locks while down converting data locks.
	 * This block is helping an aop path notice the inversion and back
	 * off to unlock its page lock before trying the dlm lock again.
	 */
	if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
	    mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
		wait = 0;
		if (lockres_remove_mask_waiter(lockres, &mw))
			ret = -EAGAIN;
		else
			goto again;
	}
	if (wait) {
		ret = ocfs2_wait_for_mask(&mw);
		if (ret == 0)
			goto again;
		mlog_errno(ret);
	}

	mlog_exit(ret);
	return ret;
}
1062 
/* Drop one holder reference on @lockres at @level. No dlm call is
 * made here -- we only decrement the holder count under the lockres
 * spinlock and let ocfs2_vote_on_unlock() kick the vote thread if
 * another node is blocked waiting on this lock. */
static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
				 struct ocfs2_lock_res *lockres,
				 int level)
{
	unsigned long flags;

	mlog_entry_void();
	spin_lock_irqsave(&lockres->l_lock, flags);
	ocfs2_dec_holders(lockres, level);
	/* holder counts and the blocked check are evaluated atomically
	 * under l_lock */
	ocfs2_vote_on_unlock(osb, lockres);
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	mlog_exit_void();
}
1076 
/* Create the initial EX lock on one lock resource of a brand new
 * inode. Marks the lockres OCFS2_LOCK_LOCAL before handing it to
 * ocfs2_lock_create() with LKM_LOCAL. Must only be called on a
 * lockres that has never been attached to the dlm. */
static int ocfs2_create_new_inode_lock(struct inode *inode,
				       struct ocfs2_lock_res *lockres)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	unsigned long flags;

	spin_lock_irqsave(&lockres->l_lock, flags);
	/* a new inode's lockres can never already be attached */
	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
	lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
}
1090 
1091 /* Grants us an EX lock on the data and metadata resources, skipping
1092  * the normal cluster directory lookup. Use this ONLY on newly created
1093  * inodes which other nodes can't possibly see, and which haven't been
1094  * hashed in the inode hash yet. This can give us a good performance
1095  * increase as it'll skip the network broadcast normally associated
1096  * with creating a new lock resource. */
1097 int ocfs2_create_new_inode_locks(struct inode *inode)
1098 {
1099 	int ret;
1100 
1101 	BUG_ON(!inode);
1102 	BUG_ON(!ocfs2_inode_is_new(inode));
1103 
1104 	mlog_entry_void();
1105 
1106 	mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
1107 
1108 	/* NOTE: That we don't increment any of the holder counts, nor
1109 	 * do we add anything to a journal handle. Since this is
1110 	 * supposed to be a new inode which the cluster doesn't know
1111 	 * about yet, there is no need to.  As far as the LVB handling
1112 	 * is concerned, this is basically like acquiring an EX lock
1113 	 * on a resource which has an invalid one -- we'll set it
1114 	 * valid when we release the EX. */
1115 
1116 	ret = ocfs2_create_new_inode_lock(inode,
1117 					  &OCFS2_I(inode)->ip_rw_lockres);
1118 	if (ret) {
1119 		mlog_errno(ret);
1120 		goto bail;
1121 	}
1122 
1123 	ret = ocfs2_create_new_inode_lock(inode,
1124 					  &OCFS2_I(inode)->ip_meta_lockres);
1125 	if (ret) {
1126 		mlog_errno(ret);
1127 		goto bail;
1128 	}
1129 
1130 	ret = ocfs2_create_new_inode_lock(inode,
1131 					  &OCFS2_I(inode)->ip_data_lockres);
1132 	if (ret) {
1133 		mlog_errno(ret);
1134 		goto bail;
1135 	}
1136 
1137 bail:
1138 	mlog_exit(ret);
1139 	return ret;
1140 }
1141 
1142 int ocfs2_rw_lock(struct inode *inode, int write)
1143 {
1144 	int status, level;
1145 	struct ocfs2_lock_res *lockres;
1146 
1147 	BUG_ON(!inode);
1148 
1149 	mlog_entry_void();
1150 
1151 	mlog(0, "inode %llu take %s RW lock\n",
1152 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1153 	     write ? "EXMODE" : "PRMODE");
1154 
1155 	lockres = &OCFS2_I(inode)->ip_rw_lockres;
1156 
1157 	level = write ? LKM_EXMODE : LKM_PRMODE;
1158 
1159 	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
1160 				    0);
1161 	if (status < 0)
1162 		mlog_errno(status);
1163 
1164 	mlog_exit(status);
1165 	return status;
1166 }
1167 
1168 void ocfs2_rw_unlock(struct inode *inode, int write)
1169 {
1170 	int level = write ? LKM_EXMODE : LKM_PRMODE;
1171 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
1172 
1173 	mlog_entry_void();
1174 
1175 	mlog(0, "inode %llu drop %s RW lock\n",
1176 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1177 	     write ? "EXMODE" : "PRMODE");
1178 
1179 	ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1180 
1181 	mlog_exit_void();
1182 }
1183 
1184 int ocfs2_data_lock_full(struct inode *inode,
1185 			 int write,
1186 			 int arg_flags)
1187 {
1188 	int status = 0, level;
1189 	struct ocfs2_lock_res *lockres;
1190 
1191 	BUG_ON(!inode);
1192 
1193 	mlog_entry_void();
1194 
1195 	mlog(0, "inode %llu take %s DATA lock\n",
1196 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1197 	     write ? "EXMODE" : "PRMODE");
1198 
1199 	/* We'll allow faking a readonly data lock for
1200 	 * rodevices. */
1201 	if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
1202 		if (write) {
1203 			status = -EROFS;
1204 			mlog_errno(status);
1205 		}
1206 		goto out;
1207 	}
1208 
1209 	lockres = &OCFS2_I(inode)->ip_data_lockres;
1210 
1211 	level = write ? LKM_EXMODE : LKM_PRMODE;
1212 
1213 	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
1214 				    0, arg_flags);
1215 	if (status < 0 && status != -EAGAIN)
1216 		mlog_errno(status);
1217 
1218 out:
1219 	mlog_exit(status);
1220 	return status;
1221 }
1222 
1223 /* see ocfs2_meta_lock_with_page() */
1224 int ocfs2_data_lock_with_page(struct inode *inode,
1225 			      int write,
1226 			      struct page *page)
1227 {
1228 	int ret;
1229 
1230 	ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
1231 	if (ret == -EAGAIN) {
1232 		unlock_page(page);
1233 		if (ocfs2_data_lock(inode, write) == 0)
1234 			ocfs2_data_unlock(inode, write);
1235 		ret = AOP_TRUNCATED_PAGE;
1236 	}
1237 
1238 	return ret;
1239 }
1240 
1241 static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
1242 				 struct ocfs2_lock_res *lockres)
1243 {
1244 	int kick = 0;
1245 
1246 	mlog_entry_void();
1247 
1248 	/* If we know that another node is waiting on our lock, kick
1249 	 * the vote thread * pre-emptively when we reach a release
1250 	 * condition. */
1251 	if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
1252 		switch(lockres->l_blocking) {
1253 		case LKM_EXMODE:
1254 			if (!lockres->l_ex_holders && !lockres->l_ro_holders)
1255 				kick = 1;
1256 			break;
1257 		case LKM_PRMODE:
1258 			if (!lockres->l_ex_holders)
1259 				kick = 1;
1260 			break;
1261 		default:
1262 			BUG();
1263 		}
1264 	}
1265 
1266 	if (kick)
1267 		ocfs2_kick_vote_thread(osb);
1268 
1269 	mlog_exit_void();
1270 }
1271 
1272 void ocfs2_data_unlock(struct inode *inode,
1273 		       int write)
1274 {
1275 	int level = write ? LKM_EXMODE : LKM_PRMODE;
1276 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
1277 
1278 	mlog_entry_void();
1279 
1280 	mlog(0, "inode %llu drop %s DATA lock\n",
1281 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1282 	     write ? "EXMODE" : "PRMODE");
1283 
1284 	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
1285 		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1286 
1287 	mlog_exit_void();
1288 }
1289 
1290 #define OCFS2_SEC_BITS   34
1291 #define OCFS2_SEC_SHIFT  (64 - 34)
1292 #define OCFS2_NSEC_MASK  ((1ULL << OCFS2_SEC_SHIFT) - 1)
1293 
1294 /* LVB only has room for 64 bits of time here so we pack it for
1295  * now. */
1296 static u64 ocfs2_pack_timespec(struct timespec *spec)
1297 {
1298 	u64 res;
1299 	u64 sec = spec->tv_sec;
1300 	u32 nsec = spec->tv_nsec;
1301 
1302 	res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
1303 
1304 	return res;
1305 }
1306 
/* Call this with the lockres locked. I am reasonably sure we don't
 * need ip_lock in this function as anyone who would be changing those
 * values is supposed to be blocked in ocfs2_meta_lock right now.
 *
 * Copies the inode's cached metadata (size, clusters, ownership,
 * mode, link count and packed timestamps) into the meta lock's LVB,
 * all in big-endian, and stamps it with OCFS2_LVB_VERSION so other
 * nodes know they can trust it. */
static void __ocfs2_stuff_meta_lvb(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
	struct ocfs2_meta_lvb *lvb;

	mlog_entry_void();

	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;

	lvb->lvb_version   = cpu_to_be32(OCFS2_LVB_VERSION);
	lvb->lvb_isize	   = cpu_to_be64(i_size_read(inode));
	lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
	lvb->lvb_iuid      = cpu_to_be32(inode->i_uid);
	lvb->lvb_igid      = cpu_to_be32(inode->i_gid);
	lvb->lvb_imode     = cpu_to_be16(inode->i_mode);
	lvb->lvb_inlink    = cpu_to_be16(inode->i_nlink);
	lvb->lvb_iatime_packed  =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
	lvb->lvb_ictime_packed =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
	lvb->lvb_imtime_packed =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));

	mlog_meta_lvb(0, lockres);

	mlog_exit_void();
}
1338 
/* Inverse of ocfs2_pack_timespec(): split a 64-bit packed LVB time
 * back into seconds and nanoseconds. */
static void ocfs2_unpack_timespec(struct timespec *spec,
				  u64 packed_time)
{
	spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
	spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
}
1345 
/* Refresh the in-memory inode from the meta lock's LVB instead of
 * going to disk. Counterpart of __ocfs2_stuff_meta_lvb(); only valid
 * when ocfs2_meta_lvb_is_trustable() said the LVB version matches. */
static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
	struct ocfs2_meta_lvb *lvb;

	mlog_entry_void();

	mlog_meta_lvb(0, lockres);

	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;

	/* We're safe here without the lockres lock... */
	spin_lock(&oi->ip_lock);
	oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
	i_size_write(inode, be64_to_cpu(lvb->lvb_isize));

	/* fast-symlinks are a special case */
	if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
		inode->i_blocks = 0;
	else
		inode->i_blocks =
			ocfs2_align_bytes_to_sectors(i_size_read(inode));

	inode->i_uid     = be32_to_cpu(lvb->lvb_iuid);
	inode->i_gid     = be32_to_cpu(lvb->lvb_igid);
	inode->i_mode    = be16_to_cpu(lvb->lvb_imode);
	inode->i_nlink   = be16_to_cpu(lvb->lvb_inlink);
	ocfs2_unpack_timespec(&inode->i_atime,
			      be64_to_cpu(lvb->lvb_iatime_packed));
	ocfs2_unpack_timespec(&inode->i_mtime,
			      be64_to_cpu(lvb->lvb_imtime_packed));
	ocfs2_unpack_timespec(&inode->i_ctime,
			      be64_to_cpu(lvb->lvb_ictime_packed));
	spin_unlock(&oi->ip_lock);

	mlog_exit_void();
}
1384 
1385 static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres)
1386 {
1387 	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1388 
1389 	if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION)
1390 		return 1;
1391 	return 0;
1392 }
1393 
/* Determine whether a lock resource needs to be refreshed, and
 * arbitrate who gets to refresh it.
 *
 *   0 means no refresh needed.
 *
 *   > 0 means you need to refresh this and you MUST call
 *   ocfs2_complete_lock_res_refresh afterwards. */
static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
{
	unsigned long flags;
	int status = 0;

	mlog_entry_void();

refresh_check:
	spin_lock_irqsave(&lockres->l_lock, flags);
	if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		goto bail;
	}

	/* Someone else is already refreshing; wait for them and then
	 * re-check from the top, since they may fail and leave
	 * NEEDS_REFRESH set. */
	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		ocfs2_wait_on_refreshing_lock(lockres);
		goto refresh_check;
	}

	/* Ok, I'll be the one to refresh this lock. */
	lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	status = 1;
bail:
	mlog_exit(status);
	return status;
}
1431 
/* If status is non zero, I'll mark it as not being in refresh
 * anymore, but I won't clear the needs refresh flag -- so the next
 * caller of ocfs2_should_refresh_lock_res() retries the refresh. */
static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
						   int status)
{
	unsigned long flags;
	mlog_entry_void();

	spin_lock_irqsave(&lockres->l_lock, flags);
	lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
	if (!status)
		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	/* wake anyone blocked in ocfs2_wait_on_refreshing_lock() */
	wake_up(&lockres->l_event);

	mlog_exit_void();
}
1450 
/* may or may not return a bh if it went to disk.
 *
 * Called after the meta cluster lock is acquired: if this node won
 * the refresh race, bring the in-memory inode up to date -- from the
 * LVB when its version is trustable, otherwise by reading the dinode
 * block from disk (returned via *bh). */
static int ocfs2_meta_lock_update(struct inode *inode,
				  struct buffer_head **bh)
{
	int status = 0;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres;
	struct ocfs2_dinode *fe;

	mlog_entry_void();

	/* Another node may have wiped the inode while we slept on the
	 * cluster lock; bail out before touching anything. */
	spin_lock(&oi->ip_lock);
	if (oi->ip_flags & OCFS2_INODE_DELETED) {
		mlog(0, "Orphaned inode %llu was deleted while we "
		     "were waiting on a lock. ip_flags = 0x%x\n",
		     (unsigned long long)oi->ip_blkno, oi->ip_flags);
		spin_unlock(&oi->ip_lock);
		status = -ENOENT;
		goto bail;
	}
	spin_unlock(&oi->ip_lock);

	lockres = &oi->ip_meta_lockres;

	/* non-zero means we won the refresh race and must call
	 * ocfs2_complete_lock_res_refresh() below */
	if (!ocfs2_should_refresh_lock_res(lockres))
		goto bail;

	/* This will discard any caching information we might have had
	 * for the inode metadata. */
	ocfs2_metadata_cache_purge(inode);

	/* will do nothing for inode types that don't use the extent
	 * map (directories, bitmap files, etc) */
	ocfs2_extent_map_trunc(inode, 0);

	if (ocfs2_meta_lvb_is_trustable(lockres)) {
		mlog(0, "Trusting LVB on inode %llu\n",
		     (unsigned long long)oi->ip_blkno);
		ocfs2_refresh_inode_from_lvb(inode);
	} else {
		/* Boo, we have to go to disk. */
		/* read bh, cast, ocfs2_refresh_inode */
		status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
					  bh, OCFS2_BH_CACHED, inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail_refresh;
		}
		fe = (struct ocfs2_dinode *) (*bh)->b_data;

		/* This is a good chance to make sure we're not
		 * locking an invalid object.
		 *
		 * We bug on a stale inode here because we checked
		 * above whether it was wiped from disk. The wiping
		 * node provides a guarantee that we receive that
		 * message and can mark the inode before dropping any
		 * locks associated with it. */
		if (!OCFS2_IS_VALID_DINODE(fe)) {
			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
			status = -EIO;
			goto bail_refresh;
		}
		mlog_bug_on_msg(inode->i_generation !=
				le32_to_cpu(fe->i_generation),
				"Invalid dinode %llu disk generation: %u "
				"inode->i_generation: %u\n",
				(unsigned long long)oi->ip_blkno,
				le32_to_cpu(fe->i_generation),
				inode->i_generation);
		mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
				!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
				"Stale dinode %llu dtime: %llu flags: 0x%x\n",
				(unsigned long long)oi->ip_blkno,
				(unsigned long long)le64_to_cpu(fe->i_dtime),
				le32_to_cpu(fe->i_flags));

		ocfs2_refresh_inode(inode, fe);
	}

	status = 0;
bail_refresh:
	/* must run whether the refresh succeeded or not */
	ocfs2_complete_lock_res_refresh(lockres, status);
bail:
	mlog_exit(status);
	return status;
}
1538 
1539 static int ocfs2_assign_bh(struct inode *inode,
1540 			   struct buffer_head **ret_bh,
1541 			   struct buffer_head *passed_bh)
1542 {
1543 	int status;
1544 
1545 	if (passed_bh) {
1546 		/* Ok, the update went to disk for us, use the
1547 		 * returned bh. */
1548 		*ret_bh = passed_bh;
1549 		get_bh(*ret_bh);
1550 
1551 		return 0;
1552 	}
1553 
1554 	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
1555 				  OCFS2_I(inode)->ip_blkno,
1556 				  ret_bh,
1557 				  OCFS2_BH_CACHED,
1558 				  inode);
1559 	if (status < 0)
1560 		mlog_errno(status);
1561 
1562 	return status;
1563 }
1564 
1565 /*
1566  * returns < 0 error if the callback will never be called, otherwise
1567  * the result of the lock will be communicated via the callback.
1568  */
1569 int ocfs2_meta_lock_full(struct inode *inode,
1570 			 struct ocfs2_journal_handle *handle,
1571 			 struct buffer_head **ret_bh,
1572 			 int ex,
1573 			 int arg_flags)
1574 {
1575 	int status, level, dlm_flags, acquired;
1576 	struct ocfs2_lock_res *lockres;
1577 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1578 	struct buffer_head *local_bh = NULL;
1579 
1580 	BUG_ON(!inode);
1581 
1582 	mlog_entry_void();
1583 
1584 	mlog(0, "inode %llu, take %s META lock\n",
1585 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1586 	     ex ? "EXMODE" : "PRMODE");
1587 
1588 	status = 0;
1589 	acquired = 0;
1590 	/* We'll allow faking a readonly metadata lock for
1591 	 * rodevices. */
1592 	if (ocfs2_is_hard_readonly(osb)) {
1593 		if (ex)
1594 			status = -EROFS;
1595 		goto bail;
1596 	}
1597 
1598 	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1599 		wait_event(osb->recovery_event,
1600 			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1601 
1602 	acquired = 0;
1603 	lockres = &OCFS2_I(inode)->ip_meta_lockres;
1604 	level = ex ? LKM_EXMODE : LKM_PRMODE;
1605 	dlm_flags = 0;
1606 	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
1607 		dlm_flags |= LKM_NOQUEUE;
1608 
1609 	status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
1610 	if (status < 0) {
1611 		if (status != -EAGAIN && status != -EIOCBRETRY)
1612 			mlog_errno(status);
1613 		goto bail;
1614 	}
1615 
1616 	/* Notify the error cleanup path to drop the cluster lock. */
1617 	acquired = 1;
1618 
1619 	/* We wait twice because a node may have died while we were in
1620 	 * the lower dlm layers. The second time though, we've
1621 	 * committed to owning this lock so we don't allow signals to
1622 	 * abort the operation. */
1623 	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1624 		wait_event(osb->recovery_event,
1625 			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1626 
1627 	/* This is fun. The caller may want a bh back, or it may
1628 	 * not. ocfs2_meta_lock_update definitely wants one in, but
1629 	 * may or may not read one, depending on what's in the
1630 	 * LVB. The result of all of this is that we've *only* gone to
1631 	 * disk if we have to, so the complexity is worthwhile. */
1632 	status = ocfs2_meta_lock_update(inode, &local_bh);
1633 	if (status < 0) {
1634 		if (status != -ENOENT)
1635 			mlog_errno(status);
1636 		goto bail;
1637 	}
1638 
1639 	if (ret_bh) {
1640 		status = ocfs2_assign_bh(inode, ret_bh, local_bh);
1641 		if (status < 0) {
1642 			mlog_errno(status);
1643 			goto bail;
1644 		}
1645 	}
1646 
1647 	if (handle) {
1648 		status = ocfs2_handle_add_lock(handle, inode);
1649 		if (status < 0)
1650 			mlog_errno(status);
1651 	}
1652 
1653 bail:
1654 	if (status < 0) {
1655 		if (ret_bh && (*ret_bh)) {
1656 			brelse(*ret_bh);
1657 			*ret_bh = NULL;
1658 		}
1659 		if (acquired)
1660 			ocfs2_meta_unlock(inode, ex);
1661 	}
1662 
1663 	if (local_bh)
1664 		brelse(local_bh);
1665 
1666 	mlog_exit(status);
1667 	return status;
1668 }
1669 
/*
 * This is working around a lock inversion between tasks acquiring DLM locks
 * while holding a page lock and the vote thread which blocks dlm lock acquiry
 * while acquiring page locks.
 *
 * ** These _with_page variants are only intended to be called from aop
 * methods that hold page locks and return a very specific *positive* error
 * code that aop methods pass up to the VFS -- test for errors with != 0. **
 *
 * The DLM is called such that it returns -EAGAIN if it would have blocked
 * waiting for the vote thread.  In that case we unlock our page so the vote
 * thread can make progress.  Once we've done this we have to return
 * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
 * into the VFS who will then immediately retry the aop call.
 *
 * We do a blocking lock and immediate unlock before returning, though, so that
 * the lock has a great chance of being cached on this node by the time the VFS
 * calls back to retry the aop.    This has a potential to livelock as nodes
 * ping locks back and forth, but that's a risk we're willing to take to avoid
 * the lock inversion simply.
 */
int ocfs2_meta_lock_with_page(struct inode *inode,
			      struct ocfs2_journal_handle *handle,
			      struct buffer_head **ret_bh,
			      int ex,
			      struct page *page)
{
	int ret;

	ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex,
				   OCFS2_LOCK_NONBLOCK);
	if (ret == -EAGAIN) {
		unlock_page(page);
		/* blocking lock/unlock cycle to pull the lock here
		 * before the VFS retries (see comment above) */
		if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0)
			ocfs2_meta_unlock(inode, ex);
		ret = AOP_TRUNCATED_PAGE;
	}

	return ret;
}
1710 
1711 void ocfs2_meta_unlock(struct inode *inode,
1712 		       int ex)
1713 {
1714 	int level = ex ? LKM_EXMODE : LKM_PRMODE;
1715 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
1716 
1717 	mlog_entry_void();
1718 
1719 	mlog(0, "inode %llu drop %s META lock\n",
1720 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1721 	     ex ? "EXMODE" : "PRMODE");
1722 
1723 	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
1724 		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1725 
1726 	mlog_exit_void();
1727 }
1728 
/* Take the global superblock cluster lock at PR or EX. The node
 * that wins the refresh race also re-reads the slot map from disk
 * here, so slot information is current under the lock. */
int ocfs2_super_lock(struct ocfs2_super *osb,
		     int ex)
{
	int status;
	int level = ex ? LKM_EXMODE : LKM_PRMODE;
	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
	struct buffer_head *bh;
	struct ocfs2_slot_info *si = osb->slot_info;

	mlog_entry_void();

	if (ocfs2_is_hard_readonly(osb))
		return -EROFS;

	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* The super block lock path is really in the best position to
	 * know when resources covered by the lock need to be
	 * refreshed, so we do it here. Of course, making sense of
	 * everything is up to the caller :) */
	status = ocfs2_should_refresh_lock_res(lockres);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	if (status) {
		/* re-read the slot map into its existing buffer;
		 * flags == 0 here, not OCFS2_BH_CACHED as used for
		 * the inode reads elsewhere in this file */
		bh = si->si_bh;
		status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
					  si->si_inode);
		if (status == 0)
			ocfs2_update_slot_info(si);

		/* must be called whether the read succeeded or not */
		ocfs2_complete_lock_res_refresh(lockres, status);

		if (status < 0)
			mlog_errno(status);
	}
bail:
	mlog_exit(status);
	return status;
}
1774 
1775 void ocfs2_super_unlock(struct ocfs2_super *osb,
1776 			int ex)
1777 {
1778 	int level = ex ? LKM_EXMODE : LKM_PRMODE;
1779 	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
1780 
1781 	ocfs2_cluster_unlock(osb, lockres, level);
1782 }
1783 
1784 int ocfs2_rename_lock(struct ocfs2_super *osb)
1785 {
1786 	int status;
1787 	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
1788 
1789 	if (ocfs2_is_hard_readonly(osb))
1790 		return -EROFS;
1791 
1792 	status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
1793 	if (status < 0)
1794 		mlog_errno(status);
1795 
1796 	return status;
1797 }
1798 
1799 void ocfs2_rename_unlock(struct ocfs2_super *osb)
1800 {
1801 	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
1802 
1803 	ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
1804 }
1805 
/* Reference counting of the dlm debug structure. We want this because
 * open references on the debug inodes can live on after a mount, so
 * we can't rely on the ocfs2_super to always exist. */

/* kref release callback -- runs when the last reference from
 * ocfs2_put_dlm_debug() goes away. */
static void ocfs2_dlm_debug_free(struct kref *kref)
{
	struct ocfs2_dlm_debug *dlm_debug;

	dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);

	kfree(dlm_debug);
}
1817 
1818 void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
1819 {
1820 	if (dlm_debug)
1821 		kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
1822 }
1823 
/* Take an additional reference on @debug; paired with
 * ocfs2_put_dlm_debug(). */
static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
{
	kref_get(&debug->d_refcnt);
}
1828 
1829 struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
1830 {
1831 	struct ocfs2_dlm_debug *dlm_debug;
1832 
1833 	dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
1834 	if (!dlm_debug) {
1835 		mlog_errno(-ENOMEM);
1836 		goto out;
1837 	}
1838 
1839 	kref_init(&dlm_debug->d_refcnt);
1840 	INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
1841 	dlm_debug->d_locking_state = NULL;
1842 out:
1843 	return dlm_debug;
1844 }
1845 
/* Access to this is arbitrated for us via seq_file->sem. */
struct ocfs2_dlm_seq_priv {
	struct ocfs2_dlm_debug *p_dlm_debug;	/* referenced debug state */
	struct ocfs2_lock_res p_iter_res;	/* dummy list anchor marking
						 * our iteration position */
	struct ocfs2_lock_res p_tmp_res;	/* snapshot of the lockres
						 * currently being shown */
};
1852 
/* Walk forward from @start on the d_lockres_tracking list and return
 * the next real lockres, or NULL at the end of the list. Dummy
 * iteration anchors (l_ops == NULL) are skipped. Caller must hold
 * ocfs2_dlm_tracking_lock. */
static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start,
						 struct ocfs2_dlm_seq_priv *priv)
{
	struct ocfs2_lock_res *iter, *ret = NULL;
	struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;

	assert_spin_locked(&ocfs2_dlm_tracking_lock);

	list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
		/* discover the head of the list */
		if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
			mlog(0, "End of list found, %p\n", ret);
			break;
		}

		/* We track our "dummy" iteration lockres' by a NULL
		 * l_ops field. */
		if (iter->l_ops != NULL) {
			ret = iter;
			break;
		}
	}

	return ret;
}
1878 
/* seq_file .start: find the first real lockres after our iteration
 * anchor and return a stable snapshot of it (or NULL when there is
 * nothing to show). */
static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos)
{
	struct ocfs2_dlm_seq_priv *priv = m->private;
	struct ocfs2_lock_res *iter;

	spin_lock(&ocfs2_dlm_tracking_lock);
	iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
	if (iter) {
		/* Since lockres' have the lifetime of their container
		 * (which can be inodes, ocfs2_supers, etc) we want to
		 * copy this out to a temporary lockres while still
		 * under the spinlock. Obviously after this we can't
		 * trust any pointers on the copy returned, but that's
		 * ok as the information we want isn't typically held
		 * in them. */
		priv->p_tmp_res = *iter;
		iter = &priv->p_tmp_res;
	}
	spin_unlock(&ocfs2_dlm_tracking_lock);

	return iter;
}
1901 
/* seq_file .stop: nothing to tear down per read pass -- iterator
 * state lives in the file's private data until release. */
static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
{
}
1905 
/* seq_file .next: advance to the next real lockres. The dummy anchor
 * is unhooked and re-inserted right after the new position so a later
 * pass resumes from there; as in start, we return a snapshot copy,
 * not the live lockres. */
static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct ocfs2_dlm_seq_priv *priv = m->private;
	struct ocfs2_lock_res *iter = v;
	struct ocfs2_lock_res *dummy = &priv->p_iter_res;

	spin_lock(&ocfs2_dlm_tracking_lock);
	iter = ocfs2_dlm_next_res(iter, priv);
	list_del_init(&dummy->l_debug_list);
	if (iter) {
		list_add(&dummy->l_debug_list, &iter->l_debug_list);
		priv->p_tmp_res = *iter;
		iter = &priv->p_tmp_res;
	}
	spin_unlock(&ocfs2_dlm_tracking_lock);

	return iter;
}
1924 
1925 /* So that debugfs.ocfs2 can determine which format is being used */
1926 #define OCFS2_DLM_DEBUG_STR_VERSION 1
1927 static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
1928 {
1929 	int i;
1930 	char *lvb;
1931 	struct ocfs2_lock_res *lockres = v;
1932 
1933 	if (!lockres)
1934 		return -EINVAL;
1935 
1936 	seq_printf(m, "0x%x\t"
1937 		   "%.*s\t"
1938 		   "%d\t"
1939 		   "0x%lx\t"
1940 		   "0x%x\t"
1941 		   "0x%x\t"
1942 		   "%u\t"
1943 		   "%u\t"
1944 		   "%d\t"
1945 		   "%d\t",
1946 		   OCFS2_DLM_DEBUG_STR_VERSION,
1947 		   OCFS2_LOCK_ID_MAX_LEN, lockres->l_name,
1948 		   lockres->l_level,
1949 		   lockres->l_flags,
1950 		   lockres->l_action,
1951 		   lockres->l_unlock_action,
1952 		   lockres->l_ro_holders,
1953 		   lockres->l_ex_holders,
1954 		   lockres->l_requested,
1955 		   lockres->l_blocking);
1956 
1957 	/* Dump the raw LVB */
1958 	lvb = lockres->l_lksb.lvb;
1959 	for(i = 0; i < DLM_LVB_LEN; i++)
1960 		seq_printf(m, "0x%x\t", lvb[i]);
1961 
1962 	/* End the line */
1963 	seq_printf(m, "\n");
1964 	return 0;
1965 }
1966 
1967 static struct seq_operations ocfs2_dlm_seq_ops = {
1968 	.start =	ocfs2_dlm_seq_start,
1969 	.stop =		ocfs2_dlm_seq_stop,
1970 	.next =		ocfs2_dlm_seq_next,
1971 	.show =		ocfs2_dlm_seq_show,
1972 };
1973 
/* Release for the debugfs file: unhook our iteration anchor from the
 * tracking list, drop the dlm_debug reference taken at open, then let
 * seq_file free its private state. */
static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = (struct seq_file *) file->private_data;
	struct ocfs2_dlm_seq_priv *priv = seq->private;
	struct ocfs2_lock_res *res = &priv->p_iter_res;

	ocfs2_remove_lockres_tracking(res);
	ocfs2_put_dlm_debug(priv->p_dlm_debug);
	return seq_release_private(inode, file);
}
1984 
1985 static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
1986 {
1987 	int ret;
1988 	struct ocfs2_dlm_seq_priv *priv;
1989 	struct seq_file *seq;
1990 	struct ocfs2_super *osb;
1991 
1992 	priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
1993 	if (!priv) {
1994 		ret = -ENOMEM;
1995 		mlog_errno(ret);
1996 		goto out;
1997 	}
1998 	osb = (struct ocfs2_super *) inode->u.generic_ip;
1999 	ocfs2_get_dlm_debug(osb->osb_dlm_debug);
2000 	priv->p_dlm_debug = osb->osb_dlm_debug;
2001 	INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
2002 
2003 	ret = seq_open(file, &ocfs2_dlm_seq_ops);
2004 	if (ret) {
2005 		kfree(priv);
2006 		mlog_errno(ret);
2007 		goto out;
2008 	}
2009 
2010 	seq = (struct seq_file *) file->private_data;
2011 	seq->private = priv;
2012 
2013 	ocfs2_add_lockres_tracking(&priv->p_iter_res,
2014 				   priv->p_dlm_debug);
2015 
2016 out:
2017 	return ret;
2018 }
2019 
/* file_operations for the debugfs "locking_state" file; reads go
 * through the seq_file machinery above. */
static const struct file_operations ocfs2_dlm_debug_fops = {
	.open =		ocfs2_dlm_debug_open,
	.release =	ocfs2_dlm_debug_release,
	.read =		seq_read,
	.llseek =	seq_lseek,
};
2026 
/* Create the per-mount "locking_state" debugfs file and take a
 * dlm_debug reference for it. Paired with
 * ocfs2_dlm_shutdown_debug(). */
static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
{
	int ret = 0;
	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;

	dlm_debug->d_locking_state = debugfs_create_file("locking_state",
							 S_IFREG|S_IRUSR,
							 osb->osb_debug_root,
							 osb,
							 &ocfs2_dlm_debug_fops);
	if (!dlm_debug->d_locking_state) {
		ret = -EINVAL;
		mlog(ML_ERROR,
		     "Unable to create locking state debugfs file.\n");
		goto out;
	}

	/* reference held by the debugfs file, dropped at shutdown */
	ocfs2_get_dlm_debug(dlm_debug);
out:
	return ret;
}
2048 
2049 static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
2050 {
2051 	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
2052 
2053 	if (dlm_debug) {
2054 		debugfs_remove(dlm_debug->d_locking_state);
2055 		ocfs2_put_dlm_debug(dlm_debug);
2056 	}
2057 }
2058 
/* Bring up this node's dlm state for a mount: debugfs file, vote
 * thread, dlm domain (keyed off the fs uuid), the osb-global lock
 * resources and the eviction callback. On failure, anything already
 * started is torn down before returning. */
int ocfs2_dlm_init(struct ocfs2_super *osb)
{
	int status;
	u32 dlm_key;
	struct dlm_ctxt *dlm;

	mlog_entry_void();

	status = ocfs2_dlm_init_debug(osb);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* launch vote thread */
	osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d",
				     osb->osb_id);
	if (IS_ERR(osb->vote_task)) {
		status = PTR_ERR(osb->vote_task);
		/* NULL it so the cleanup path below doesn't
		 * kthread_stop() an ERR_PTR */
		osb->vote_task = NULL;
		mlog_errno(status);
		goto bail;
	}

	/* used by the dlm code to make message headers unique, each
	 * node in this domain must agree on this. */
	dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));

	/* for now, uuid == domain */
	dlm = dlm_register_domain(osb->uuid_str, dlm_key);
	if (IS_ERR(dlm)) {
		status = PTR_ERR(dlm);
		mlog_errno(status);
		goto bail;
	}

	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);

	dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);

	osb->dlm = dlm;

	status = 0;
bail:
	if (status < 0) {
		ocfs2_dlm_shutdown_debug(osb);
		if (osb->vote_task)
			kthread_stop(osb->vote_task);
	}

	mlog_exit(status);
	return status;
}
2113 
/*
 * Undo ocfs2_dlm_init(): unregister the eviction callback, drop the
 * osb-global locks, stop the vote thread, leave the dlm domain and
 * finally release the debugfs state -- in that order.
 */
void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
{
	mlog_entry_void();

	/* Stop receiving dlm eviction callbacks before teardown. */
	dlm_unregister_eviction_cb(&osb->osb_eviction_cb);

	ocfs2_drop_osb_locks(osb);

	if (osb->vote_task) {
		kthread_stop(osb->vote_task);
		osb->vote_task = NULL;
	}

	ocfs2_lock_res_free(&osb->osb_super_lockres);
	ocfs2_lock_res_free(&osb->osb_rename_lockres);

	/* Our locks are dropped; we can now leave the domain. */
	dlm_unregister_domain(osb->dlm);
	osb->dlm = NULL;

	ocfs2_dlm_shutdown_debug(osb);

	mlog_exit_void();
}
2137 
/*
 * Callback fired by the dlm when one of our unlock or cancel requests
 * completes. Advances the lockres unlock state machine according to
 * l_unlock_action and wakes anyone waiting on l_event.
 */
static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status)
{
	struct ocfs2_lock_res *lockres = opaque;
	unsigned long flags;

	mlog_entry_void();

	mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
	     lockres->l_unlock_action);

	spin_lock_irqsave(&lockres->l_lock, flags);
	/* We tried to cancel a convert request, but it was already
	 * granted. All we want to do here is clear our unlock
	 * state. The wake_up call done at the bottom is redundant
	 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
	 * hurt anything anyway */
	if (status == DLM_CANCELGRANT &&
	    lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
		mlog(0, "Got cancelgrant for %s\n", lockres->l_name);

		/* We don't clear the busy flag in this case as it
		 * should have been cleared by the ast which the dlm
		 * has called. */
		goto complete_unlock;
	}

	/* An abnormal status leaves lockres state untouched: log it
	 * and bail without waking any waiters. */
	if (status != DLM_NORMAL) {
		mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
		     "unlock_action %d\n", status, lockres->l_name,
		     lockres->l_unlock_action);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		return;
	}

	switch(lockres->l_unlock_action) {
	case OCFS2_UNLOCK_CANCEL_CONVERT:
		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
		/* Forget the convert that was cancelled. */
		lockres->l_action = OCFS2_AST_INVALID;
		break;
	case OCFS2_UNLOCK_DROP_LOCK:
		/* The lock is gone as far as the dlm is concerned. */
		lockres->l_level = LKM_IVMODE;
		break;
	default:
		BUG();
	}

	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
complete_unlock:
	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	wake_up(&lockres->l_event);

	mlog_exit_void();
}
2193 
/* A lock type may supply a callback to run just before its lock is
 * finally dropped (see ocfs2_drop_lock()). */
typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);

/* Bundles a pre-drop callback with its private data pointer. */
struct drop_lock_cb {
	ocfs2_pre_drop_cb_t	*drop_func;
	void			*drop_data;
};
2200 
2201 static int ocfs2_drop_lock(struct ocfs2_super *osb,
2202 			   struct ocfs2_lock_res *lockres,
2203 			   struct drop_lock_cb *dcb)
2204 {
2205 	enum dlm_status status;
2206 	unsigned long flags;
2207 
2208 	/* We didn't get anywhere near actually using this lockres. */
2209 	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
2210 		goto out;
2211 
2212 	spin_lock_irqsave(&lockres->l_lock, flags);
2213 
2214 	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
2215 			"lockres %s, flags 0x%lx\n",
2216 			lockres->l_name, lockres->l_flags);
2217 
2218 	while (lockres->l_flags & OCFS2_LOCK_BUSY) {
2219 		mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
2220 		     "%u, unlock_action = %u\n",
2221 		     lockres->l_name, lockres->l_flags, lockres->l_action,
2222 		     lockres->l_unlock_action);
2223 
2224 		spin_unlock_irqrestore(&lockres->l_lock, flags);
2225 
2226 		/* XXX: Today we just wait on any busy
2227 		 * locks... Perhaps we need to cancel converts in the
2228 		 * future? */
2229 		ocfs2_wait_on_busy_lock(lockres);
2230 
2231 		spin_lock_irqsave(&lockres->l_lock, flags);
2232 	}
2233 
2234 	if (dcb)
2235 		dcb->drop_func(lockres, dcb->drop_data);
2236 
2237 	if (lockres->l_flags & OCFS2_LOCK_BUSY)
2238 		mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
2239 		     lockres->l_name);
2240 	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
2241 		mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
2242 
2243 	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
2244 		spin_unlock_irqrestore(&lockres->l_lock, flags);
2245 		goto out;
2246 	}
2247 
2248 	lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
2249 
2250 	/* make sure we never get here while waiting for an ast to
2251 	 * fire. */
2252 	BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
2253 
2254 	/* is this necessary? */
2255 	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2256 	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
2257 	spin_unlock_irqrestore(&lockres->l_lock, flags);
2258 
2259 	mlog(0, "lock %s\n", lockres->l_name);
2260 
2261 	status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK,
2262 			   lockres->l_ops->unlock_ast, lockres);
2263 	if (status != DLM_NORMAL) {
2264 		ocfs2_log_dlm_error("dlmunlock", status, lockres);
2265 		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
2266 		dlm_print_one_lock(lockres->l_lksb.lockid);
2267 		BUG();
2268 	}
2269 	mlog(0, "lock %s, successfull return from dlmunlock\n",
2270 	     lockres->l_name);
2271 
2272 	ocfs2_wait_on_busy_lock(lockres);
2273 out:
2274 	mlog_exit(0);
2275 	return 0;
2276 }
2277 
/* Mark the lockres as being dropped. It will no longer be
 * queued if blocking, but we still may have to wait on it
 * being dequeued from the vote thread before we can consider
 * it safe to drop.
 *
 * You can *not* attempt to call cluster_lock on this lockres anymore. */
void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
{
	int status;
	struct ocfs2_mask_waiter mw;
	unsigned long flags;

	ocfs2_init_mask_waiter(&mw);

	spin_lock_irqsave(&lockres->l_lock, flags);
	lockres->l_flags |= OCFS2_LOCK_FREEING;
	/* Wait until OCFS2_LOCK_QUEUED clears (mask waiter armed with
	 * mask OCFS2_LOCK_QUEUED, goal 0). l_lock is dropped across
	 * each wait and retaken to re-check the flag. */
	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		mlog(0, "Waiting on lockres %s\n", lockres->l_name);

		status = ocfs2_wait_for_mask(&mw);
		if (status)
			mlog_errno(status);

		spin_lock_irqsave(&lockres->l_lock, flags);
	}
	spin_unlock_irqrestore(&lockres->l_lock, flags);
}
2308 
2309 static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
2310 {
2311 	int status;
2312 
2313 	mlog_entry_void();
2314 
2315 	ocfs2_mark_lockres_freeing(&osb->osb_super_lockres);
2316 
2317 	status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL);
2318 	if (status < 0)
2319 		mlog_errno(status);
2320 
2321 	ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres);
2322 
2323 	status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
2324 	if (status < 0)
2325 		mlog_errno(status);
2326 
2327 	mlog_exit(status);
2328 }
2329 
2330 static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data)
2331 {
2332 	struct inode *inode = data;
2333 
2334 	/* the metadata lock requires a bit more work as we have an
2335 	 * LVB to worry about. */
2336 	if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
2337 	    lockres->l_level == LKM_EXMODE &&
2338 	    !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
2339 		__ocfs2_stuff_meta_lvb(inode);
2340 }
2341 
2342 int ocfs2_drop_inode_locks(struct inode *inode)
2343 {
2344 	int status, err;
2345 	struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, };
2346 
2347 	mlog_entry_void();
2348 
2349 	/* No need to call ocfs2_mark_lockres_freeing here -
2350 	 * ocfs2_clear_inode has done it for us. */
2351 
2352 	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2353 			      &OCFS2_I(inode)->ip_data_lockres,
2354 			      NULL);
2355 	if (err < 0)
2356 		mlog_errno(err);
2357 
2358 	status = err;
2359 
2360 	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2361 			      &OCFS2_I(inode)->ip_meta_lockres,
2362 			      &meta_dcb);
2363 	if (err < 0)
2364 		mlog_errno(err);
2365 	if (err < 0 && !status)
2366 		status = err;
2367 
2368 	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2369 			      &OCFS2_I(inode)->ip_rw_lockres,
2370 			      NULL);
2371 	if (err < 0)
2372 		mlog_errno(err);
2373 	if (err < 0 && !status)
2374 		status = err;
2375 
2376 	mlog_exit(status);
2377 	return status;
2378 }
2379 
2380 static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
2381 				      int new_level)
2382 {
2383 	assert_spin_locked(&lockres->l_lock);
2384 
2385 	BUG_ON(lockres->l_blocking <= LKM_NLMODE);
2386 
2387 	if (lockres->l_level <= new_level) {
2388 		mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
2389 		     lockres->l_level, new_level);
2390 		BUG();
2391 	}
2392 
2393 	mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
2394 	     lockres->l_name, new_level, lockres->l_blocking);
2395 
2396 	lockres->l_action = OCFS2_AST_DOWNCONVERT;
2397 	lockres->l_requested = new_level;
2398 	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2399 }
2400 
2401 static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
2402 				  struct ocfs2_lock_res *lockres,
2403 				  int new_level,
2404 				  int lvb)
2405 {
2406 	int ret, dlm_flags = LKM_CONVERT;
2407 	enum dlm_status status;
2408 
2409 	mlog_entry_void();
2410 
2411 	if (lvb)
2412 		dlm_flags |= LKM_VALBLK;
2413 
2414 	status = dlmlock(osb->dlm,
2415 			 new_level,
2416 			 &lockres->l_lksb,
2417 			 dlm_flags,
2418 			 lockres->l_name,
2419 			 lockres->l_ops->ast,
2420 			 lockres,
2421 			 lockres->l_ops->bast);
2422 	if (status != DLM_NORMAL) {
2423 		ocfs2_log_dlm_error("dlmlock", status, lockres);
2424 		ret = -EINVAL;
2425 		ocfs2_recover_from_dlm_error(lockres, 1);
2426 		goto bail;
2427 	}
2428 
2429 	ret = 0;
2430 bail:
2431 	mlog_exit(ret);
2432 	return ret;
2433 }
2434 
/* Decide whether the caller should cancel an in-flight convert.
 * Called with l_lock held. Returns 1 when the caller should drop the
 * spinlock and issue dlmunlock(LKM_CANCEL); returns 0 when a cancel
 * is already pending and the lock should simply be requeued. */
static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
				        struct ocfs2_lock_res *lockres)
{
	assert_spin_locked(&lockres->l_lock);

	mlog_entry_void();
	mlog(0, "lock %s\n", lockres->l_name);

	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
		/* If we're already trying to cancel a lock conversion
		 * then just drop the spinlock and allow the caller to
		 * requeue this lock. */

		mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
		return 0;
	}

	/* were we in a convert when we got the bast fire? */
	BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
	       lockres->l_action != OCFS2_AST_DOWNCONVERT);
	/* set things up for the unlockast to know to just
	 * clear out the ast_action and unset busy, etc. */
	lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;

	/* A cancel only makes sense while a request is in flight, so
	 * BUSY must still be set here. */
	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
			"lock %s, invalid flags: 0x%lx\n",
			lockres->l_name, lockres->l_flags);

	return 1;
}
2466 
2467 static int ocfs2_cancel_convert(struct ocfs2_super *osb,
2468 				struct ocfs2_lock_res *lockres)
2469 {
2470 	int ret;
2471 	enum dlm_status status;
2472 
2473 	mlog_entry_void();
2474 	mlog(0, "lock %s\n", lockres->l_name);
2475 
2476 	ret = 0;
2477 	status = dlmunlock(osb->dlm,
2478 			   &lockres->l_lksb,
2479 			   LKM_CANCEL,
2480 			   lockres->l_ops->unlock_ast,
2481 			   lockres);
2482 	if (status != DLM_NORMAL) {
2483 		ocfs2_log_dlm_error("dlmunlock", status, lockres);
2484 		ret = -EINVAL;
2485 		ocfs2_recover_from_dlm_error(lockres, 0);
2486 	}
2487 
2488 	mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
2489 
2490 	mlog_exit(ret);
2491 	return ret;
2492 }
2493 
/* Decide whether the meta lock may be downconverted to new_level
 * right now. Returns nonzero when a downconvert is safe. */
static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
						  struct ocfs2_lock_res *lockres,
						  int new_level)
{
	int ret;

	mlog_entry_void();

	/* Downconvert targets can only ever be PR or NL. */
	BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);

	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
		ret = 0;
		mlog(0, "lockres %s currently being refreshed -- backing "
		     "off!\n", lockres->l_name);
	} else if (new_level == LKM_PRMODE)
		/* PR keeps local readers valid, so only EX holders
		 * and un-checkpointed state block the downconvert. */
		ret = !lockres->l_ex_holders &&
			ocfs2_inode_fully_checkpointed(inode);
	else /* Must be NLMODE we're converting to. */
		ret = !lockres->l_ro_holders && !lockres->l_ex_holders &&
			ocfs2_inode_fully_checkpointed(inode);

	mlog_exit(ret);
	return ret;
}
2518 
/*
 * Resolve a blocked metadata lock. Three possible outcomes: cancel a
 * busy convert (and requeue), downconvert when nothing local prevents
 * it, or kick off a checkpoint and requeue until it is safe.
 */
static int ocfs2_do_unblock_meta(struct inode *inode,
				 int *requeue)
{
	int new_level;
	int set_lvb = 0;
	int ret = 0;
	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
	unsigned long flags;

	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog_entry_void();

	spin_lock_irqsave(&lockres->l_lock, flags);

	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));

	mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level,
	     lockres->l_blocking);

	BUG_ON(lockres->l_level != LKM_EXMODE &&
	       lockres->l_level != LKM_PRMODE);

	/* A request is already in flight -- try to cancel it and ask
	 * the vote thread to requeue us. */
	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
		*requeue = 1;
		ret = ocfs2_prepare_cancel_convert(osb, lockres);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		if (ret) {
			ret = ocfs2_cancel_convert(osb, lockres);
			if (ret < 0)
				mlog_errno(ret);
		}
		goto leave;
	}

	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);

	mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n",
	     lockres->l_level, lockres->l_blocking, new_level);

	if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
		/* Dropping from EX: publish our inode state via the
		 * LVB so other nodes see it. */
		if (lockres->l_level == LKM_EXMODE)
			set_lvb = 1;

		/* If the lock hasn't been refreshed yet (rare), then
		 * our memory inode values are old and we skip
		 * stuffing the lvb. There's no need to actually clear
		 * out the lvb here as it's value is still valid. */
		if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
			if (set_lvb)
				__ocfs2_stuff_meta_lvb(inode);
		} else
			mlog(0, "lockres %s: downconverting stale lock!\n",
			     lockres->l_name);

		mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
		     "l_blocking=%d, new_level=%d\n",
		     lockres->l_level, lockres->l_blocking, new_level);

		ocfs2_prepare_downconvert(lockres, new_level);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
		goto leave;
	}
	/* Can't downconvert yet -- make forward progress by starting
	 * a checkpoint if needed, then ask to be requeued. */
	if (!ocfs2_inode_fully_checkpointed(inode))
		ocfs2_start_checkpoint(osb);

	*requeue = 1;
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	ret = 0;
leave:
	mlog_exit(ret);
	return ret;
}
2593 
/*
 * Generic engine for resolving a blocked lockres. If a request is
 * already in flight we try to cancel it and ask to be requeued; if
 * local holders conflict with what's blocking us we just requeue;
 * otherwise we optionally run the lock type's worker and downconvert
 * to the highest level compatible with the blocking request.
 */
static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
				      struct ocfs2_lock_res *lockres,
				      int *requeue,
				      ocfs2_convert_worker_t *worker)
{
	unsigned long flags;
	int blocking;
	int new_level;
	int ret = 0;

	mlog_entry_void();

	spin_lock_irqsave(&lockres->l_lock, flags);

	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));

recheck:
	/* A request is already in flight -- try to cancel it and ask
	 * to be requeued. */
	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
		*requeue = 1;
		ret = ocfs2_prepare_cancel_convert(osb, lockres);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		if (ret) {
			ret = ocfs2_cancel_convert(osb, lockres);
			if (ret < 0)
				mlog_errno(ret);
		}
		goto leave;
	}

	/* if we're blocking an exclusive and we have *any* holders,
	 * then requeue. */
	if ((lockres->l_blocking == LKM_EXMODE)
	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		*requeue = 1;
		ret = 0;
		goto leave;
	}

	/* If it's a PR we're blocking, then only
	 * requeue if we've got any EX holders */
	if (lockres->l_blocking == LKM_PRMODE &&
	    lockres->l_ex_holders) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		*requeue = 1;
		ret = 0;
		goto leave;
	}

	/* If we get here, then we know that there are no more
	 * incompatible holders (and anyone asking for an incompatible
	 * lock is blocked). We can now downconvert the lock */
	if (!worker)
		goto downconvert;

	/* Some lockres types want to do a bit of work before
	 * downconverting a lock. Allow that here. The worker function
	 * may sleep, so we save off a copy of what we're blocking as
	 * it may change while we're not holding the spin lock. */
	blocking = lockres->l_blocking;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	worker(lockres, blocking);

	spin_lock_irqsave(&lockres->l_lock, flags);
	if (blocking != lockres->l_blocking) {
		/* If this changed underneath us, then we can't drop
		 * it just yet. */
		goto recheck;
	}

downconvert:
	*requeue = 0;
	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);

	ocfs2_prepare_downconvert(lockres, new_level);
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0);
leave:
	mlog_exit(ret);
	return ret;
}
2676 
/* Worker run before an inode data lock downconvert: flush the page
 * cache, and when an EX request is blocking us also drop the pages
 * and mappings entirely. */
static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
				      int blocking)
{
	struct inode *inode;
	struct address_space *mapping;

	mlog_entry_void();

       	inode = ocfs2_lock_res_inode(lockres);
	mapping = inode->i_mapping;

	if (filemap_fdatawrite(mapping)) {
		mlog(ML_ERROR, "Could not sync inode %llu for downconvert!",
		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
	}
	sync_mapping_buffers(mapping);
	if (blocking == LKM_EXMODE) {
		truncate_inode_pages(mapping, 0);
		unmap_mapping_range(mapping, 0, 0, 0);
	} else {
		/* We only need to wait on the I/O if we're not also
		 * truncating pages because truncate_inode_pages waits
		 * for us above. We don't truncate pages if we're
		 * blocking anything < EXMODE because we want to keep
		 * them around in that case. */
		filemap_fdatawait(mapping);
	}

	mlog_exit_void();
}
2707 
2708 int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
2709 		       int *requeue)
2710 {
2711 	int status;
2712 	struct inode *inode;
2713 	struct ocfs2_super *osb;
2714 
2715 	mlog_entry_void();
2716 
2717 	inode = ocfs2_lock_res_inode(lockres);
2718 	osb = OCFS2_SB(inode->i_sb);
2719 
2720 	mlog(0, "unblock inode %llu\n",
2721 	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
2722 
2723 	status = ocfs2_generic_unblock_lock(osb,
2724 					    lockres,
2725 					    requeue,
2726 					    ocfs2_data_convert_worker);
2727 	if (status < 0)
2728 		mlog_errno(status);
2729 
2730 	mlog(0, "inode %llu, requeue = %d\n",
2731 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, *requeue);
2732 
2733 	mlog_exit(status);
2734 	return status;
2735 }
2736 
2737 static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
2738 				    int *requeue)
2739 {
2740 	int status;
2741 	struct inode *inode;
2742 
2743 	mlog_entry_void();
2744 
2745 	mlog(0, "Unblock lockres %s\n", lockres->l_name);
2746 
2747 	inode  = ocfs2_lock_res_inode(lockres);
2748 
2749 	status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb),
2750 					    lockres,
2751 					    requeue,
2752 					    NULL);
2753 	if (status < 0)
2754 		mlog_errno(status);
2755 
2756 	mlog_exit(status);
2757 	return status;
2758 }
2759 
2760 
2761 int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
2762 		       int *requeue)
2763 {
2764 	int status;
2765 	struct inode *inode;
2766 
2767 	mlog_entry_void();
2768 
2769        	inode = ocfs2_lock_res_inode(lockres);
2770 
2771 	mlog(0, "unblock inode %llu\n",
2772 	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
2773 
2774 	status = ocfs2_do_unblock_meta(inode, requeue);
2775 	if (status < 0)
2776 		mlog_errno(status);
2777 
2778 	mlog(0, "inode %llu, requeue = %d\n",
2779 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, *requeue);
2780 
2781 	mlog_exit(status);
2782 	return status;
2783 }
2784 
2785 /* Generic unblock function for any lockres whose private data is an
2786  * ocfs2_super pointer. */
2787 static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
2788 				  int *requeue)
2789 {
2790 	int status;
2791 	struct ocfs2_super *osb;
2792 
2793 	mlog_entry_void();
2794 
2795 	mlog(0, "Unblock lockres %s\n", lockres->l_name);
2796 
2797 	osb = ocfs2_lock_res_super(lockres);
2798 
2799 	status = ocfs2_generic_unblock_lock(osb,
2800 					    lockres,
2801 					    requeue,
2802 					    NULL);
2803 	if (status < 0)
2804 		mlog_errno(status);
2805 
2806 	mlog_exit(status);
2807 	return status;
2808 }
2809 
/*
 * Called by the vote thread for each queued lockres: run the lock
 * type's ->unblock() handler, then either requeue the lockres or
 * clear OCFS2_LOCK_QUEUED -- after which our reference to it is no
 * longer valid.
 */
void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
				struct ocfs2_lock_res *lockres)
{
	int status;
	int requeue = 0;
	unsigned long flags;

	/* Our reference to the lockres in this function can be
	 * considered valid until we remove the OCFS2_LOCK_QUEUED
	 * flag. */

	mlog_entry_void();

	BUG_ON(!lockres);
	BUG_ON(!lockres->l_ops);
	BUG_ON(!lockres->l_ops->unblock);

	mlog(0, "lockres %s blocked.\n", lockres->l_name);

	/* Detect whether a lock has been marked as going away while
	 * the vote thread was processing other things. A lock can
	 * still be marked with OCFS2_LOCK_FREEING after this check,
	 * but short circuiting here will still save us some
	 * performance. */
	spin_lock_irqsave(&lockres->l_lock, flags);
	if (lockres->l_flags & OCFS2_LOCK_FREEING)
		goto unqueue;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	status = lockres->l_ops->unblock(lockres, &requeue);
	if (status < 0)
		mlog_errno(status);

	spin_lock_irqsave(&lockres->l_lock, flags);
unqueue:
	/* A freeing lock is never requeued, even if the handler asked
	 * for it. */
	if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) {
		lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
	} else
		ocfs2_schedule_blocked_lock(osb, lockres);

	mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
	     requeue ? "yes" : "no");
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	mlog_exit_void();
}
2856 
/* Queue a blocked lockres onto the vote thread's list. Caller must
 * hold l_lock. */
static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres)
{
	mlog_entry_void();

	assert_spin_locked(&lockres->l_lock);

	if (lockres->l_flags & OCFS2_LOCK_FREEING) {
		/* Do not schedule a lock for downconvert when it's on
		 * the way to destruction - any nodes wanting access
		 * to the resource will get it soon. */
		mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
		     lockres->l_name, lockres->l_flags);
		return;
	}

	lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);

	/* The blocked list and its count are protected by
	 * vote_task_lock, not l_lock. */
	spin_lock(&osb->vote_task_lock);
	if (list_empty(&lockres->l_blocked_list)) {
		list_add_tail(&lockres->l_blocked_list,
			      &osb->blocked_lock_list);
		osb->blocked_lock_count++;
	}
	spin_unlock(&osb->vote_task_lock);

	mlog_exit_void();
}
2885 
/* This aids in debugging situations where a bad LVB might be
 * involved. Pretty-prints every field of the meta LVB at the given
 * mlog level; function/line identify the call site. */
void ocfs2_dump_meta_lvb_info(u64 level,
			      const char *function,
			      unsigned int line,
			      struct ocfs2_lock_res *lockres)
{
	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;

	mlog(level, "LVB information for %s (called from %s:%u):\n",
	     lockres->l_name, function, line);
	mlog(level, "version: %u, clusters: %u\n",
	     be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters));
	mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
	     (unsigned long long)be64_to_cpu(lvb->lvb_isize),
	     be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
	     be16_to_cpu(lvb->lvb_imode));
	mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
	     "mtime_packed 0x%llx\n", be16_to_cpu(lvb->lvb_inlink),
	     (long long)be64_to_cpu(lvb->lvb_iatime_packed),
	     (long long)be64_to_cpu(lvb->lvb_ictime_packed),
	     (long long)be64_to_cpu(lvb->lvb_imtime_packed));
}
2908