xref: /linux/fs/ocfs2/dlmglue.c (revision f24e9f586b377749dff37554696cf3a105540c94)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * dlmglue.c
5  *
6  * Code which implements an OCFS2 specific interface to our DLM.
7  *
8  * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25 
26 #include <linux/types.h>
27 #include <linux/slab.h>
28 #include <linux/highmem.h>
29 #include <linux/mm.h>
30 #include <linux/smp_lock.h>
31 #include <linux/crc32.h>
32 #include <linux/kthread.h>
33 #include <linux/pagemap.h>
34 #include <linux/debugfs.h>
35 #include <linux/seq_file.h>
36 
37 #include <cluster/heartbeat.h>
38 #include <cluster/nodemanager.h>
39 #include <cluster/tcp.h>
40 
41 #include <dlm/dlmapi.h>
42 
43 #define MLOG_MASK_PREFIX ML_DLM_GLUE
44 #include <cluster/masklog.h>
45 
46 #include "ocfs2.h"
47 
48 #include "alloc.h"
49 #include "dlmglue.h"
50 #include "extent_map.h"
51 #include "heartbeat.h"
52 #include "inode.h"
53 #include "journal.h"
54 #include "slot_map.h"
55 #include "super.h"
56 #include "uptodate.h"
57 #include "vote.h"
58 
59 #include "buffer_head_io.h"
60 
/*
 * A waiter queued on a lock resource's l_mask_waiters list.
 * lockres_set_flags() completes it once (l_flags & mw_mask) == mw_goal.
 */
struct ocfs2_mask_waiter {
	/* entry on lockres->l_mask_waiters */
	struct list_head	mw_item;
	/* value handed back by ocfs2_wait_for_mask() */
	int			mw_status;
	/* completed when the masked flags reach the goal */
	struct completion	mw_complete;
	/* l_flags bits this waiter is watching */
	unsigned long		mw_mask;
	/* value the watched bits must take */
	unsigned long		mw_goal;
};
68 
/* AST and BAST callbacks, one pair per lock flavour (inode, super,
 * rename).  They are referenced from the ocfs2_lock_res_ops tables
 * below. */
static void ocfs2_inode_ast_func(void *opaque);
static void ocfs2_inode_bast_func(void *opaque,
				  int level);
static void ocfs2_super_ast_func(void *opaque);
static void ocfs2_super_bast_func(void *opaque,
				  int level);
static void ocfs2_rename_ast_func(void *opaque);
static void ocfs2_rename_bast_func(void *opaque,
				   int level);

/* so far, all locks have gotten along with the same unlock ast */
static void ocfs2_unlock_ast_func(void *opaque,
				  enum dlm_status status);
/* Per-type "unblock" handlers; each takes the lock resource and an
 * out-parameter named requeue. */
static int ocfs2_do_unblock_meta(struct inode *inode,
				 int *requeue);
static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
			      int *requeue);
static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
			      int *requeue);
static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
			      int *requeue);
static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
				  int *requeue);
/* Signature of the optional worker passed to
 * ocfs2_generic_unblock_lock(). */
typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int);
static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
				      struct ocfs2_lock_res *lockres,
				      int *requeue,
				      ocfs2_convert_worker_t *worker);
97 
/*
 * Per-lock-type callback table.  The ast and bast members are the
 * function pointers handed to dlmlock() together with the lock
 * resource itself.
 */
struct ocfs2_lock_res_ops {
	/* called with the lock resource as its opaque argument */
	void (*ast)(void *);
	/* called with the lock resource and the level being blocked */
	void (*bast)(void *, int);
	/* unlock completion callback, receives the dlm status */
	void (*unlock_ast)(void *, enum dlm_status);
	/* unblock handler; second argument is the requeue out-parameter */
	int  (*unblock)(struct ocfs2_lock_res *, int *);
};
104 
/* Callbacks for OCFS2_LOCK_TYPE_RW inode locks (selected in
 * ocfs2_inode_lock_res_init()). */
static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
	.ast		= ocfs2_inode_ast_func,
	.bast		= ocfs2_inode_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_inode_lock,
};
111 
/* Callbacks for OCFS2_LOCK_TYPE_META inode locks (selected in
 * ocfs2_inode_lock_res_init()). */
static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
	.ast		= ocfs2_inode_ast_func,
	.bast		= ocfs2_inode_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_meta,
};
118 
/* Worker matching ocfs2_convert_worker_t; defined later in the file. */
static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
				      int blocking);

/* Callbacks for OCFS2_LOCK_TYPE_DATA inode locks (selected in
 * ocfs2_inode_lock_res_init()). */
static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
	.ast		= ocfs2_inode_ast_func,
	.bast		= ocfs2_inode_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_data,
};
128 
/* Callbacks for the superblock lock (installed by
 * ocfs2_super_lock_res_init()). */
static struct ocfs2_lock_res_ops ocfs2_super_lops = {
	.ast		= ocfs2_super_ast_func,
	.bast		= ocfs2_super_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_osb_lock,
};
135 
/* Callbacks for the rename lock (installed by
 * ocfs2_rename_lock_res_init()). */
static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
	.ast		= ocfs2_rename_ast_func,
	.bast		= ocfs2_rename_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_osb_lock,
};
142 
143 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
144 {
145 	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
146 		lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
147 		lockres->l_type == OCFS2_LOCK_TYPE_RW;
148 }
149 
150 static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres)
151 {
152 	return lockres->l_type == OCFS2_LOCK_TYPE_SUPER;
153 }
154 
155 static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres)
156 {
157 	return lockres->l_type == OCFS2_LOCK_TYPE_RENAME;
158 }
159 
160 static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres)
161 {
162 	BUG_ON(!ocfs2_is_super_lock(lockres)
163 	       && !ocfs2_is_rename_lock(lockres));
164 
165 	return (struct ocfs2_super *) lockres->l_priv;
166 }
167 
168 static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
169 {
170 	BUG_ON(!ocfs2_is_inode_lock(lockres));
171 
172 	return (struct inode *) lockres->l_priv;
173 }
174 
/* Forward declarations for the generic lock/unlock helpers. */
static int ocfs2_lock_create(struct ocfs2_super *osb,
			     struct ocfs2_lock_res *lockres,
			     int level,
			     int dlm_flags);
static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
						     int wanted);
static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
				 struct ocfs2_lock_res *lockres,
				 int level);
/* Handlers run from the AST functions, one per l_action value. */
static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres);
static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
						int convert);
/*
 * Log a dlm failure at ML_ERROR.
 *
 * _func:    name of the dlm call that failed (a string)
 * _stat:    the enum dlm_status it returned
 * _lockres: pointer to the lock resource; parenthesized in the
 *           expansion so any pointer-valued expression is safe to pass
 */
#define ocfs2_log_dlm_error(_func, _stat, _lockres) do {	\
	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "	\
		"resource %s: %s\n", dlm_errname(_stat), _func,	\
		(_lockres)->l_name, dlm_errmsg(_stat));		\
} while (0)
/* Remaining forward declarations for helpers defined later in the file. */
static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
				 struct ocfs2_lock_res *lockres);
static int ocfs2_meta_lock_update(struct inode *inode,
				  struct buffer_head **bh);
static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
static inline int ocfs2_highest_compat_lock_level(int level);
static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
						  struct ocfs2_lock_res *lockres,
						  int new_level);
206 
/* Human-readable name for each lock type, indexed by
 * enum ocfs2_lock_type. */
static char *ocfs2_lock_type_strings[] = {
	[OCFS2_LOCK_TYPE_META] = "Meta",
	[OCFS2_LOCK_TYPE_DATA] = "Data",
	[OCFS2_LOCK_TYPE_SUPER] = "Super",
	[OCFS2_LOCK_TYPE_RENAME] = "Rename",
	/* Need to differentiate from [R]ename.. serializing writes is the
	 * important job it does, anyway. */
	[OCFS2_LOCK_TYPE_RW] = "Write/Read",
};
216 
217 static char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
218 {
219 	mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
220 	return ocfs2_lock_type_strings[type];
221 }
222 
223 static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
224 				  u64 blkno,
225 				  u32 generation,
226 				  char *name)
227 {
228 	int len;
229 
230 	mlog_entry_void();
231 
232 	BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
233 
234 	len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
235 		       ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD,
236 		       (long long)blkno, generation);
237 
238 	BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
239 
240 	mlog(0, "built lock resource with name: %s\n", name);
241 
242 	mlog_exit_void();
243 }
244 
/* Held while adding a lockres to, or removing it from, the debug
 * tracking list via its l_debug_list member. */
static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
246 
247 static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
248 				       struct ocfs2_dlm_debug *dlm_debug)
249 {
250 	mlog(0, "Add tracking for lockres %s\n", res->l_name);
251 
252 	spin_lock(&ocfs2_dlm_tracking_lock);
253 	list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking);
254 	spin_unlock(&ocfs2_dlm_tracking_lock);
255 }
256 
257 static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
258 {
259 	spin_lock(&ocfs2_dlm_tracking_lock);
260 	if (!list_empty(&res->l_debug_list))
261 		list_del_init(&res->l_debug_list);
262 	spin_unlock(&ocfs2_dlm_tracking_lock);
263 }
264 
265 static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
266 				       struct ocfs2_lock_res *res,
267 				       enum ocfs2_lock_type type,
268 				       u64 blkno,
269 				       u32 generation,
270 				       struct ocfs2_lock_res_ops *ops,
271 				       void *priv)
272 {
273 	ocfs2_build_lock_name(type, blkno, generation, res->l_name);
274 
275 	res->l_type          = type;
276 	res->l_ops           = ops;
277 	res->l_priv          = priv;
278 
279 	res->l_level         = LKM_IVMODE;
280 	res->l_requested     = LKM_IVMODE;
281 	res->l_blocking      = LKM_IVMODE;
282 	res->l_action        = OCFS2_AST_INVALID;
283 	res->l_unlock_action = OCFS2_UNLOCK_INVALID;
284 
285 	res->l_flags         = OCFS2_LOCK_INITIALIZED;
286 
287 	ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
288 }
289 
290 void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
291 {
292 	/* This also clears out the lock status block */
293 	memset(res, 0, sizeof(struct ocfs2_lock_res));
294 	spin_lock_init(&res->l_lock);
295 	init_waitqueue_head(&res->l_event);
296 	INIT_LIST_HEAD(&res->l_blocked_list);
297 	INIT_LIST_HEAD(&res->l_mask_waiters);
298 }
299 
300 void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
301 			       enum ocfs2_lock_type type,
302 			       struct inode *inode)
303 {
304 	struct ocfs2_lock_res_ops *ops;
305 
306 	switch(type) {
307 		case OCFS2_LOCK_TYPE_RW:
308 			ops = &ocfs2_inode_rw_lops;
309 			break;
310 		case OCFS2_LOCK_TYPE_META:
311 			ops = &ocfs2_inode_meta_lops;
312 			break;
313 		case OCFS2_LOCK_TYPE_DATA:
314 			ops = &ocfs2_inode_data_lops;
315 			break;
316 		default:
317 			mlog_bug_on_msg(1, "type: %d\n", type);
318 			ops = NULL; /* thanks, gcc */
319 			break;
320 	};
321 
322 	ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type,
323 				   OCFS2_I(inode)->ip_blkno,
324 				   inode->i_generation, ops, inode);
325 }
326 
327 static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
328 				      struct ocfs2_super *osb)
329 {
330 	/* Superblock lockres doesn't come from a slab so we call init
331 	 * once on it manually.  */
332 	ocfs2_lock_res_init_once(res);
333 	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER,
334 				   OCFS2_SUPER_BLOCK_BLKNO, 0,
335 				   &ocfs2_super_lops, osb);
336 }
337 
338 static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
339 				       struct ocfs2_super *osb)
340 {
341 	/* Rename lockres doesn't come from a slab so we call init
342 	 * once on it manually.  */
343 	ocfs2_lock_res_init_once(res);
344 	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0,
345 				   &ocfs2_rename_lops, osb);
346 }
347 
348 void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
349 {
350 	mlog_entry_void();
351 
352 	if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
353 		return;
354 
355 	ocfs2_remove_lockres_tracking(res);
356 
357 	mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
358 			"Lockres %s is on the blocked list\n",
359 			res->l_name);
360 	mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
361 			"Lockres %s has mask waiters pending\n",
362 			res->l_name);
363 	mlog_bug_on_msg(spin_is_locked(&res->l_lock),
364 			"Lockres %s is locked\n",
365 			res->l_name);
366 	mlog_bug_on_msg(res->l_ro_holders,
367 			"Lockres %s has %u ro holders\n",
368 			res->l_name, res->l_ro_holders);
369 	mlog_bug_on_msg(res->l_ex_holders,
370 			"Lockres %s has %u ex holders\n",
371 			res->l_name, res->l_ex_holders);
372 
373 	/* Need to clear out the lock status block for the dlm */
374 	memset(&res->l_lksb, 0, sizeof(res->l_lksb));
375 
376 	res->l_flags = 0UL;
377 	mlog_exit_void();
378 }
379 
380 static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
381 				     int level)
382 {
383 	mlog_entry_void();
384 
385 	BUG_ON(!lockres);
386 
387 	switch(level) {
388 	case LKM_EXMODE:
389 		lockres->l_ex_holders++;
390 		break;
391 	case LKM_PRMODE:
392 		lockres->l_ro_holders++;
393 		break;
394 	default:
395 		BUG();
396 	}
397 
398 	mlog_exit_void();
399 }
400 
401 static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
402 				     int level)
403 {
404 	mlog_entry_void();
405 
406 	BUG_ON(!lockres);
407 
408 	switch(level) {
409 	case LKM_EXMODE:
410 		BUG_ON(!lockres->l_ex_holders);
411 		lockres->l_ex_holders--;
412 		break;
413 	case LKM_PRMODE:
414 		BUG_ON(!lockres->l_ro_holders);
415 		lockres->l_ro_holders--;
416 		break;
417 	default:
418 		BUG();
419 	}
420 	mlog_exit_void();
421 }
422 
423 /* WARNING: This function lives in a world where the only three lock
424  * levels are EX, PR, and NL. It *will* have to be adjusted when more
425  * lock types are added. */
426 static inline int ocfs2_highest_compat_lock_level(int level)
427 {
428 	int new_level = LKM_EXMODE;
429 
430 	if (level == LKM_EXMODE)
431 		new_level = LKM_NLMODE;
432 	else if (level == LKM_PRMODE)
433 		new_level = LKM_PRMODE;
434 	return new_level;
435 }
436 
437 static void lockres_set_flags(struct ocfs2_lock_res *lockres,
438 			      unsigned long newflags)
439 {
440 	struct list_head *pos, *tmp;
441 	struct ocfs2_mask_waiter *mw;
442 
443  	assert_spin_locked(&lockres->l_lock);
444 
445 	lockres->l_flags = newflags;
446 
447 	list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) {
448 		mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
449 		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
450 			continue;
451 
452 		list_del_init(&mw->mw_item);
453 		mw->mw_status = 0;
454 		complete(&mw->mw_complete);
455 	}
456 }
457 static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
458 {
459 	lockres_set_flags(lockres, lockres->l_flags | or);
460 }
461 static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
462 				unsigned long clear)
463 {
464 	lockres_set_flags(lockres, lockres->l_flags & ~clear);
465 }
466 
467 static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
468 {
469 	mlog_entry_void();
470 
471 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
472 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
473 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
474 	BUG_ON(lockres->l_blocking <= LKM_NLMODE);
475 
476 	lockres->l_level = lockres->l_requested;
477 	if (lockres->l_level <=
478 	    ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
479 		lockres->l_blocking = LKM_NLMODE;
480 		lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
481 	}
482 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
483 
484 	mlog_exit_void();
485 }
486 
487 static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
488 {
489 	mlog_entry_void();
490 
491 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
492 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
493 
494 	/* Convert from RO to EX doesn't really need anything as our
495 	 * information is already up to data. Convert from NL to
496 	 * *anything* however should mark ourselves as needing an
497 	 * update */
498 	if (lockres->l_level == LKM_NLMODE)
499 		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
500 
501 	lockres->l_level = lockres->l_requested;
502 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
503 
504 	mlog_exit_void();
505 }
506 
507 static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
508 {
509 	mlog_entry_void();
510 
511 	BUG_ON((!lockres->l_flags & OCFS2_LOCK_BUSY));
512 	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
513 
514 	if (lockres->l_requested > LKM_NLMODE &&
515 	    !(lockres->l_flags & OCFS2_LOCK_LOCAL))
516 		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
517 
518 	lockres->l_level = lockres->l_requested;
519 	lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
520 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
521 
522 	mlog_exit_void();
523 }
524 
525 static void ocfs2_inode_ast_func(void *opaque)
526 {
527 	struct ocfs2_lock_res *lockres = opaque;
528 	struct inode *inode;
529 	struct dlm_lockstatus *lksb;
530 	unsigned long flags;
531 
532 	mlog_entry_void();
533 
534 	inode = ocfs2_lock_res_inode(lockres);
535 
536 	mlog(0, "AST fired for inode %llu, l_action = %u, type = %s\n",
537 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, lockres->l_action,
538 	     ocfs2_lock_type_string(lockres->l_type));
539 
540 	BUG_ON(!ocfs2_is_inode_lock(lockres));
541 
542 	spin_lock_irqsave(&lockres->l_lock, flags);
543 
544 	lksb = &(lockres->l_lksb);
545 	if (lksb->status != DLM_NORMAL) {
546 		mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u "
547 		     "on inode %llu\n", lksb->status,
548 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
549 		spin_unlock_irqrestore(&lockres->l_lock, flags);
550 		mlog_exit_void();
551 		return;
552 	}
553 
554 	switch(lockres->l_action) {
555 	case OCFS2_AST_ATTACH:
556 		ocfs2_generic_handle_attach_action(lockres);
557 		lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
558 		break;
559 	case OCFS2_AST_CONVERT:
560 		ocfs2_generic_handle_convert_action(lockres);
561 		break;
562 	case OCFS2_AST_DOWNCONVERT:
563 		ocfs2_generic_handle_downconvert_action(lockres);
564 		break;
565 	default:
566 		mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
567 		     "lockres flags = 0x%lx, unlock action: %u\n",
568 		     lockres->l_name, lockres->l_action, lockres->l_flags,
569 		     lockres->l_unlock_action);
570 
571 		BUG();
572 	}
573 
574 	/* data and rw locking ignores refresh flag for now. */
575 	if (lockres->l_type != OCFS2_LOCK_TYPE_META)
576 		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
577 
578 	/* set it to something invalid so if we get called again we
579 	 * can catch it. */
580 	lockres->l_action = OCFS2_AST_INVALID;
581 	spin_unlock_irqrestore(&lockres->l_lock, flags);
582 	wake_up(&lockres->l_event);
583 
584 	mlog_exit_void();
585 }
586 
587 static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
588 				     int level)
589 {
590 	int needs_downconvert = 0;
591 	mlog_entry_void();
592 
593 	assert_spin_locked(&lockres->l_lock);
594 
595 	lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
596 
597 	if (level > lockres->l_blocking) {
598 		/* only schedule a downconvert if we haven't already scheduled
599 		 * one that goes low enough to satisfy the level we're
600 		 * blocking.  this also catches the case where we get
601 		 * duplicate BASTs */
602 		if (ocfs2_highest_compat_lock_level(level) <
603 		    ocfs2_highest_compat_lock_level(lockres->l_blocking))
604 			needs_downconvert = 1;
605 
606 		lockres->l_blocking = level;
607 	}
608 
609 	mlog_exit(needs_downconvert);
610 	return needs_downconvert;
611 }
612 
613 static void ocfs2_generic_bast_func(struct ocfs2_super *osb,
614 				    struct ocfs2_lock_res *lockres,
615 				    int level)
616 {
617 	int needs_downconvert;
618 	unsigned long flags;
619 
620 	mlog_entry_void();
621 
622 	BUG_ON(level <= LKM_NLMODE);
623 
624 	spin_lock_irqsave(&lockres->l_lock, flags);
625 	needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
626 	if (needs_downconvert)
627 		ocfs2_schedule_blocked_lock(osb, lockres);
628 	spin_unlock_irqrestore(&lockres->l_lock, flags);
629 
630 	ocfs2_kick_vote_thread(osb);
631 
632 	wake_up(&lockres->l_event);
633 	mlog_exit_void();
634 }
635 
636 static void ocfs2_inode_bast_func(void *opaque, int level)
637 {
638 	struct ocfs2_lock_res *lockres = opaque;
639 	struct inode *inode;
640 	struct ocfs2_super *osb;
641 
642 	mlog_entry_void();
643 
644 	BUG_ON(!ocfs2_is_inode_lock(lockres));
645 
646 	inode = ocfs2_lock_res_inode(lockres);
647 	osb = OCFS2_SB(inode->i_sb);
648 
649 	mlog(0, "BAST fired for inode %llu, blocking %d, level %d type %s\n",
650 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, level,
651 	     lockres->l_level, ocfs2_lock_type_string(lockres->l_type));
652 
653 	ocfs2_generic_bast_func(osb, lockres, level);
654 
655 	mlog_exit_void();
656 }
657 
658 static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres,
659 				   int ignore_refresh)
660 {
661 	struct dlm_lockstatus *lksb = &lockres->l_lksb;
662 	unsigned long flags;
663 
664 	spin_lock_irqsave(&lockres->l_lock, flags);
665 
666 	if (lksb->status != DLM_NORMAL) {
667 		mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
668 		     lockres->l_name, lksb->status);
669 		spin_unlock_irqrestore(&lockres->l_lock, flags);
670 		return;
671 	}
672 
673 	switch(lockres->l_action) {
674 	case OCFS2_AST_ATTACH:
675 		ocfs2_generic_handle_attach_action(lockres);
676 		break;
677 	case OCFS2_AST_CONVERT:
678 		ocfs2_generic_handle_convert_action(lockres);
679 		break;
680 	case OCFS2_AST_DOWNCONVERT:
681 		ocfs2_generic_handle_downconvert_action(lockres);
682 		break;
683 	default:
684 		BUG();
685 	}
686 
687 	if (ignore_refresh)
688 		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
689 
690 	/* set it to something invalid so if we get called again we
691 	 * can catch it. */
692 	lockres->l_action = OCFS2_AST_INVALID;
693 	spin_unlock_irqrestore(&lockres->l_lock, flags);
694 
695 	wake_up(&lockres->l_event);
696 }
697 
/* AST callback for the superblock lock; keeps the refresh flag
 * (ignore_refresh == 0). */
static void ocfs2_super_ast_func(void *opaque)
{
	struct ocfs2_lock_res *super_res = opaque;

	mlog_entry_void();
	mlog(0, "Superblock AST fired\n");

	BUG_ON(!ocfs2_is_super_lock(super_res));

	ocfs2_generic_ast_func(super_res, 0);

	mlog_exit_void();
}
710 
/* BAST callback for the superblock lock. */
static void ocfs2_super_bast_func(void *opaque,
				  int level)
{
	struct ocfs2_lock_res *super_res = opaque;

	mlog_entry_void();
	mlog(0, "Superblock BAST fired\n");

	BUG_ON(!ocfs2_is_super_lock(super_res));

	ocfs2_generic_bast_func(ocfs2_lock_res_super(super_res),
				super_res, level);

	mlog_exit_void();
}
726 
/* AST callback for the rename lock; always drops the refresh flag
 * (ignore_refresh == 1). */
static void ocfs2_rename_ast_func(void *opaque)
{
	struct ocfs2_lock_res *rename_res = opaque;

	mlog_entry_void();

	mlog(0, "Rename AST fired\n");

	BUG_ON(!ocfs2_is_rename_lock(rename_res));

	ocfs2_generic_ast_func(rename_res, 1);

	mlog_exit_void();
}
741 
/* BAST callback for the rename lock. */
static void ocfs2_rename_bast_func(void *opaque,
				   int level)
{
	struct ocfs2_lock_res *rename_res = opaque;

	mlog_entry_void();

	mlog(0, "Rename BAST fired\n");

	BUG_ON(!ocfs2_is_rename_lock(rename_res));

	ocfs2_generic_bast_func(ocfs2_lock_res_super(rename_res),
				rename_res, level);

	mlog_exit_void();
}
759 
760 static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
761 						int convert)
762 {
763 	unsigned long flags;
764 
765 	mlog_entry_void();
766 	spin_lock_irqsave(&lockres->l_lock, flags);
767 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
768 	if (convert)
769 		lockres->l_action = OCFS2_AST_INVALID;
770 	else
771 		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
772 	spin_unlock_irqrestore(&lockres->l_lock, flags);
773 
774 	wake_up(&lockres->l_event);
775 	mlog_exit_void();
776 }
777 
778 /* Note: If we detect another process working on the lock (i.e.,
779  * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller
780  * to do the right thing in that case.
781  */
782 static int ocfs2_lock_create(struct ocfs2_super *osb,
783 			     struct ocfs2_lock_res *lockres,
784 			     int level,
785 			     int dlm_flags)
786 {
787 	int ret = 0;
788 	enum dlm_status status;
789 	unsigned long flags;
790 
791 	mlog_entry_void();
792 
793 	mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
794 	     dlm_flags);
795 
796 	spin_lock_irqsave(&lockres->l_lock, flags);
797 	if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) ||
798 	    (lockres->l_flags & OCFS2_LOCK_BUSY)) {
799 		spin_unlock_irqrestore(&lockres->l_lock, flags);
800 		goto bail;
801 	}
802 
803 	lockres->l_action = OCFS2_AST_ATTACH;
804 	lockres->l_requested = level;
805 	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
806 	spin_unlock_irqrestore(&lockres->l_lock, flags);
807 
808 	status = dlmlock(osb->dlm,
809 			 level,
810 			 &lockres->l_lksb,
811 			 dlm_flags,
812 			 lockres->l_name,
813 			 lockres->l_ops->ast,
814 			 lockres,
815 			 lockres->l_ops->bast);
816 	if (status != DLM_NORMAL) {
817 		ocfs2_log_dlm_error("dlmlock", status, lockres);
818 		ret = -EINVAL;
819 		ocfs2_recover_from_dlm_error(lockres, 1);
820 	}
821 
822 	mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name);
823 
824 bail:
825 	mlog_exit(ret);
826 	return ret;
827 }
828 
829 static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
830 					int flag)
831 {
832 	unsigned long flags;
833 	int ret;
834 
835 	spin_lock_irqsave(&lockres->l_lock, flags);
836 	ret = lockres->l_flags & flag;
837 	spin_unlock_irqrestore(&lockres->l_lock, flags);
838 
839 	return ret;
840 }
841 
842 static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres)
843 
844 {
845 	wait_event(lockres->l_event,
846 		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY));
847 }
848 
849 static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres)
850 
851 {
852 	wait_event(lockres->l_event,
853 		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING));
854 }
855 
856 /* predict what lock level we'll be dropping down to on behalf
857  * of another node, and return true if the currently wanted
858  * level will be compatible with it. */
859 static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
860 						     int wanted)
861 {
862 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
863 
864 	return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
865 }
866 
867 static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
868 {
869 	INIT_LIST_HEAD(&mw->mw_item);
870 	init_completion(&mw->mw_complete);
871 }
872 
873 static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
874 {
875 	wait_for_completion(&mw->mw_complete);
876 	/* Re-arm the completion in case we want to wait on it again */
877 	INIT_COMPLETION(mw->mw_complete);
878 	return mw->mw_status;
879 }
880 
881 static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
882 				    struct ocfs2_mask_waiter *mw,
883 				    unsigned long mask,
884 				    unsigned long goal)
885 {
886 	BUG_ON(!list_empty(&mw->mw_item));
887 
888 	assert_spin_locked(&lockres->l_lock);
889 
890 	list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
891 	mw->mw_mask = mask;
892 	mw->mw_goal = goal;
893 }
894 
895 /* returns 0 if the mw that was removed was already satisfied, -EBUSY
896  * if the mask still hadn't reached its goal */
897 static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
898 				      struct ocfs2_mask_waiter *mw)
899 {
900 	unsigned long flags;
901 	int ret = 0;
902 
903 	spin_lock_irqsave(&lockres->l_lock, flags);
904 	if (!list_empty(&mw->mw_item)) {
905 		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
906 			ret = -EBUSY;
907 
908 		list_del_init(&mw->mw_item);
909 		init_completion(&mw->mw_complete);
910 	}
911 	spin_unlock_irqrestore(&lockres->l_lock, flags);
912 
913 	return ret;
914 
915 }
916 
/*
 * Take a hold on @lockres at @level, talking to the dlm as needed.
 *
 * The function loops (label "again") until the lock is granted at a
 * sufficient level: it creates the lock at NL if it is not attached
 * yet, waits while another dlm operation is in flight or while the
 * lock is blocked on behalf of another node, and upconverts through
 * dlmlock() when @level is above the granted level.  On success the
 * holder count for @level has been incremented.
 *
 * @lkm_flags is passed to dlmlock() for the convert (LKM_CONVERT and
 * LKM_VALBLK are added).  @arg_flags may contain OCFS2_LOCK_NONBLOCK to
 * refuse to sleep on a busy or blocked lock.
 *
 * Returns 0 on success; -ERESTARTSYS if a signal is pending (unless
 * the OCFS2_MOUNT_NOINTR mount option is set or the dlm request has
 * already been issued); -EAGAIN if LKM_NOQUEUE was given and the dlm
 * returned DLM_NOTQUEUED, or if OCFS2_LOCK_NONBLOCK was given and the
 * lock would have had to wait; -EINVAL for any other dlmlock()
 * failure; or the negative error from ocfs2_lock_create().
 */
static int ocfs2_cluster_lock(struct ocfs2_super *osb,
			      struct ocfs2_lock_res *lockres,
			      int level,
			      int lkm_flags,
			      int arg_flags)
{
	struct ocfs2_mask_waiter mw;
	enum dlm_status status;
	int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
	int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
	unsigned long flags;

	mlog_entry_void();

	ocfs2_init_mask_waiter(&mw);

again:
	wait = 0;

	if (catch_signals && signal_pending(current)) {
		ret = -ERESTARTSYS;
		goto out;
	}

	spin_lock_irqsave(&lockres->l_lock, flags);

	mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
			"Cluster lock called on freeing lockres %s! flags "
			"0x%lx\n", lockres->l_name, lockres->l_flags);

	/* We only compare against the currently granted level
	 * here. If the lock is blocked waiting on a downconvert,
	 * we'll get caught below. */
	if (lockres->l_flags & OCFS2_LOCK_BUSY &&
	    level > lockres->l_level) {
		/* is someone sitting in dlm_lock? If so, wait on
		 * them. */
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
		wait = 1;
		goto unlock;
	}

	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
		/* lock has not been created yet. */
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		/* Attach at NL first; the retry then upconverts. */
		ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
		goto again;
	}

	if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
	    !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
		/* is the lock is currently blocked on behalf of
		 * another node */
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
		wait = 1;
		goto unlock;
	}

	if (level > lockres->l_level) {
		if (lockres->l_action != OCFS2_AST_INVALID)
			mlog(ML_ERROR, "lockres %s has action %u pending\n",
			     lockres->l_name, lockres->l_action);

		/* Tell the AST what it is completing, then mark the
		 * lock busy before dropping l_lock. */
		lockres->l_action = OCFS2_AST_CONVERT;
		lockres->l_requested = level;
		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		BUG_ON(level == LKM_IVMODE);
		BUG_ON(level == LKM_NLMODE);

		mlog(0, "lock %s, convert from %d to level = %d\n",
		     lockres->l_name, lockres->l_level, level);

		/* call dlm_lock to upgrade lock now */
		status = dlmlock(osb->dlm,
				 level,
				 &lockres->l_lksb,
				 lkm_flags|LKM_CONVERT|LKM_VALBLK,
				 lockres->l_name,
				 lockres->l_ops->ast,
				 lockres,
				 lockres->l_ops->bast);
		if (status != DLM_NORMAL) {
			if ((lkm_flags & LKM_NOQUEUE) &&
			    (status == DLM_NOTQUEUED))
				ret = -EAGAIN;
			else {
				ocfs2_log_dlm_error("dlmlock", status,
						    lockres);
				ret = -EINVAL;
			}
			ocfs2_recover_from_dlm_error(lockres, 1);
			goto out;
		}

		mlog(0, "lock %s, successfull return from dlmlock\n",
		     lockres->l_name);

		/* At this point we've gone inside the dlm and need to
		 * complete our work regardless. */
		catch_signals = 0;

		/* wait for busy to clear and carry on */
		goto again;
	}

	/* Ok, if we get here then we're good to go. */
	ocfs2_inc_holders(lockres, level);

	ret = 0;
unlock:
	spin_unlock_irqrestore(&lockres->l_lock, flags);
out:
	/*
	 * This is helping work around a lock inversion between the page lock
	 * and dlm locks.  One path holds the page lock while calling aops
	 * which block acquiring dlm locks.  The voting thread holds dlm
	 * locks while acquiring page locks while down converting data locks.
	 * This block is helping an aop path notice the inversion and back
	 * off to unlock its page lock before trying the dlm lock again.
	 */
	if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
	    mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
		wait = 0;
		/* Non-zero means the goal was not reached yet: give up
		 * instead of sleeping.  Zero means the condition already
		 * cleared, so simply retry. */
		if (lockres_remove_mask_waiter(lockres, &mw))
			ret = -EAGAIN;
		else
			goto again;
	}
	if (wait) {
		ret = ocfs2_wait_for_mask(&mw);
		if (ret == 0)
			goto again;
		mlog_errno(ret);
	}

	mlog_exit(ret);
	return ret;
}
1062 
1063 static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
1064 				 struct ocfs2_lock_res *lockres,
1065 				 int level)
1066 {
1067 	unsigned long flags;
1068 
1069 	mlog_entry_void();
1070 	spin_lock_irqsave(&lockres->l_lock, flags);
1071 	ocfs2_dec_holders(lockres, level);
1072 	ocfs2_vote_on_unlock(osb, lockres);
1073 	spin_unlock_irqrestore(&lockres->l_lock, flags);
1074 	mlog_exit_void();
1075 }
1076 
1077 static int ocfs2_create_new_inode_lock(struct inode *inode,
1078 				       struct ocfs2_lock_res *lockres)
1079 {
1080 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1081 	unsigned long flags;
1082 
1083 	spin_lock_irqsave(&lockres->l_lock, flags);
1084 	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
1085 	lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
1086 	spin_unlock_irqrestore(&lockres->l_lock, flags);
1087 
1088 	return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
1089 }
1090 
1091 /* Grants us an EX lock on the data and metadata resources, skipping
1092  * the normal cluster directory lookup. Use this ONLY on newly created
1093  * inodes which other nodes can't possibly see, and which haven't been
1094  * hashed in the inode hash yet. This can give us a good performance
1095  * increase as it'll skip the network broadcast normally associated
1096  * with creating a new lock resource. */
1097 int ocfs2_create_new_inode_locks(struct inode *inode)
1098 {
1099 	int ret;
1100 
1101 	BUG_ON(!inode);
1102 	BUG_ON(!ocfs2_inode_is_new(inode));
1103 
1104 	mlog_entry_void();
1105 
1106 	mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
1107 
1108 	/* NOTE: That we don't increment any of the holder counts, nor
1109 	 * do we add anything to a journal handle. Since this is
1110 	 * supposed to be a new inode which the cluster doesn't know
1111 	 * about yet, there is no need to.  As far as the LVB handling
1112 	 * is concerned, this is basically like acquiring an EX lock
1113 	 * on a resource which has an invalid one -- we'll set it
1114 	 * valid when we release the EX. */
1115 
1116 	ret = ocfs2_create_new_inode_lock(inode,
1117 					  &OCFS2_I(inode)->ip_rw_lockres);
1118 	if (ret) {
1119 		mlog_errno(ret);
1120 		goto bail;
1121 	}
1122 
1123 	ret = ocfs2_create_new_inode_lock(inode,
1124 					  &OCFS2_I(inode)->ip_meta_lockres);
1125 	if (ret) {
1126 		mlog_errno(ret);
1127 		goto bail;
1128 	}
1129 
1130 	ret = ocfs2_create_new_inode_lock(inode,
1131 					  &OCFS2_I(inode)->ip_data_lockres);
1132 	if (ret) {
1133 		mlog_errno(ret);
1134 		goto bail;
1135 	}
1136 
1137 bail:
1138 	mlog_exit(ret);
1139 	return ret;
1140 }
1141 
1142 int ocfs2_rw_lock(struct inode *inode, int write)
1143 {
1144 	int status, level;
1145 	struct ocfs2_lock_res *lockres;
1146 
1147 	BUG_ON(!inode);
1148 
1149 	mlog_entry_void();
1150 
1151 	mlog(0, "inode %llu take %s RW lock\n",
1152 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1153 	     write ? "EXMODE" : "PRMODE");
1154 
1155 	lockres = &OCFS2_I(inode)->ip_rw_lockres;
1156 
1157 	level = write ? LKM_EXMODE : LKM_PRMODE;
1158 
1159 	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
1160 				    0);
1161 	if (status < 0)
1162 		mlog_errno(status);
1163 
1164 	mlog_exit(status);
1165 	return status;
1166 }
1167 
1168 void ocfs2_rw_unlock(struct inode *inode, int write)
1169 {
1170 	int level = write ? LKM_EXMODE : LKM_PRMODE;
1171 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
1172 
1173 	mlog_entry_void();
1174 
1175 	mlog(0, "inode %llu drop %s RW lock\n",
1176 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1177 	     write ? "EXMODE" : "PRMODE");
1178 
1179 	ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1180 
1181 	mlog_exit_void();
1182 }
1183 
1184 int ocfs2_data_lock_full(struct inode *inode,
1185 			 int write,
1186 			 int arg_flags)
1187 {
1188 	int status = 0, level;
1189 	struct ocfs2_lock_res *lockres;
1190 
1191 	BUG_ON(!inode);
1192 
1193 	mlog_entry_void();
1194 
1195 	mlog(0, "inode %llu take %s DATA lock\n",
1196 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1197 	     write ? "EXMODE" : "PRMODE");
1198 
1199 	/* We'll allow faking a readonly data lock for
1200 	 * rodevices. */
1201 	if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
1202 		if (write) {
1203 			status = -EROFS;
1204 			mlog_errno(status);
1205 		}
1206 		goto out;
1207 	}
1208 
1209 	lockres = &OCFS2_I(inode)->ip_data_lockres;
1210 
1211 	level = write ? LKM_EXMODE : LKM_PRMODE;
1212 
1213 	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
1214 				    0, arg_flags);
1215 	if (status < 0 && status != -EAGAIN)
1216 		mlog_errno(status);
1217 
1218 out:
1219 	mlog_exit(status);
1220 	return status;
1221 }
1222 
1223 /* see ocfs2_meta_lock_with_page() */
1224 int ocfs2_data_lock_with_page(struct inode *inode,
1225 			      int write,
1226 			      struct page *page)
1227 {
1228 	int ret;
1229 
1230 	ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
1231 	if (ret == -EAGAIN) {
1232 		unlock_page(page);
1233 		if (ocfs2_data_lock(inode, write) == 0)
1234 			ocfs2_data_unlock(inode, write);
1235 		ret = AOP_TRUNCATED_PAGE;
1236 	}
1237 
1238 	return ret;
1239 }
1240 
1241 static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
1242 				 struct ocfs2_lock_res *lockres)
1243 {
1244 	int kick = 0;
1245 
1246 	mlog_entry_void();
1247 
1248 	/* If we know that another node is waiting on our lock, kick
1249 	 * the vote thread * pre-emptively when we reach a release
1250 	 * condition. */
1251 	if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
1252 		switch(lockres->l_blocking) {
1253 		case LKM_EXMODE:
1254 			if (!lockres->l_ex_holders && !lockres->l_ro_holders)
1255 				kick = 1;
1256 			break;
1257 		case LKM_PRMODE:
1258 			if (!lockres->l_ex_holders)
1259 				kick = 1;
1260 			break;
1261 		default:
1262 			BUG();
1263 		}
1264 	}
1265 
1266 	if (kick)
1267 		ocfs2_kick_vote_thread(osb);
1268 
1269 	mlog_exit_void();
1270 }
1271 
1272 void ocfs2_data_unlock(struct inode *inode,
1273 		       int write)
1274 {
1275 	int level = write ? LKM_EXMODE : LKM_PRMODE;
1276 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;
1277 
1278 	mlog_entry_void();
1279 
1280 	mlog(0, "inode %llu drop %s DATA lock\n",
1281 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1282 	     write ? "EXMODE" : "PRMODE");
1283 
1284 	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
1285 		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1286 
1287 	mlog_exit_void();
1288 }
1289 
1290 #define OCFS2_SEC_BITS   34
1291 #define OCFS2_SEC_SHIFT  (64 - 34)
1292 #define OCFS2_NSEC_MASK  ((1ULL << OCFS2_SEC_SHIFT) - 1)
1293 
1294 /* LVB only has room for 64 bits of time here so we pack it for
1295  * now. */
1296 static u64 ocfs2_pack_timespec(struct timespec *spec)
1297 {
1298 	u64 res;
1299 	u64 sec = spec->tv_sec;
1300 	u32 nsec = spec->tv_nsec;
1301 
1302 	res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);
1303 
1304 	return res;
1305 }
1306 
1307 /* Call this with the lockres locked. I am reasonably sure we don't
1308  * need ip_lock in this function as anyone who would be changing those
1309  * values is supposed to be blocked in ocfs2_meta_lock right now. */
1310 static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1311 {
1312 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
1313 	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
1314 	struct ocfs2_meta_lvb *lvb;
1315 
1316 	mlog_entry_void();
1317 
1318 	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1319 
1320 	lvb->lvb_version   = cpu_to_be32(OCFS2_LVB_VERSION);
1321 	lvb->lvb_isize	   = cpu_to_be64(i_size_read(inode));
1322 	lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
1323 	lvb->lvb_iuid      = cpu_to_be32(inode->i_uid);
1324 	lvb->lvb_igid      = cpu_to_be32(inode->i_gid);
1325 	lvb->lvb_imode     = cpu_to_be16(inode->i_mode);
1326 	lvb->lvb_inlink    = cpu_to_be16(inode->i_nlink);
1327 	lvb->lvb_iatime_packed  =
1328 		cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
1329 	lvb->lvb_ictime_packed =
1330 		cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
1331 	lvb->lvb_imtime_packed =
1332 		cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
1333 	lvb->lvb_iattr    = cpu_to_be32(oi->ip_attr);
1334 
1335 	mlog_meta_lvb(0, lockres);
1336 
1337 	mlog_exit_void();
1338 }
1339 
1340 static void ocfs2_unpack_timespec(struct timespec *spec,
1341 				  u64 packed_time)
1342 {
1343 	spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
1344 	spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
1345 }
1346 
1347 static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1348 {
1349 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
1350 	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
1351 	struct ocfs2_meta_lvb *lvb;
1352 
1353 	mlog_entry_void();
1354 
1355 	mlog_meta_lvb(0, lockres);
1356 
1357 	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1358 
1359 	/* We're safe here without the lockres lock... */
1360 	spin_lock(&oi->ip_lock);
1361 	oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
1362 	i_size_write(inode, be64_to_cpu(lvb->lvb_isize));
1363 
1364 	oi->ip_attr = be32_to_cpu(lvb->lvb_iattr);
1365 	ocfs2_set_inode_flags(inode);
1366 
1367 	/* fast-symlinks are a special case */
1368 	if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
1369 		inode->i_blocks = 0;
1370 	else
1371 		inode->i_blocks =
1372 			ocfs2_align_bytes_to_sectors(i_size_read(inode));
1373 
1374 	inode->i_uid     = be32_to_cpu(lvb->lvb_iuid);
1375 	inode->i_gid     = be32_to_cpu(lvb->lvb_igid);
1376 	inode->i_mode    = be16_to_cpu(lvb->lvb_imode);
1377 	inode->i_nlink   = be16_to_cpu(lvb->lvb_inlink);
1378 	ocfs2_unpack_timespec(&inode->i_atime,
1379 			      be64_to_cpu(lvb->lvb_iatime_packed));
1380 	ocfs2_unpack_timespec(&inode->i_mtime,
1381 			      be64_to_cpu(lvb->lvb_imtime_packed));
1382 	ocfs2_unpack_timespec(&inode->i_ctime,
1383 			      be64_to_cpu(lvb->lvb_ictime_packed));
1384 	spin_unlock(&oi->ip_lock);
1385 
1386 	mlog_exit_void();
1387 }
1388 
1389 static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres)
1390 {
1391 	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
1392 
1393 	if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION)
1394 		return 1;
1395 	return 0;
1396 }
1397 
1398 /* Determine whether a lock resource needs to be refreshed, and
1399  * arbitrate who gets to refresh it.
1400  *
1401  *   0 means no refresh needed.
1402  *
1403  *   > 0 means you need to refresh this and you MUST call
1404  *   ocfs2_complete_lock_res_refresh afterwards. */
1405 static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
1406 {
1407 	unsigned long flags;
1408 	int status = 0;
1409 
1410 	mlog_entry_void();
1411 
1412 refresh_check:
1413 	spin_lock_irqsave(&lockres->l_lock, flags);
1414 	if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
1415 		spin_unlock_irqrestore(&lockres->l_lock, flags);
1416 		goto bail;
1417 	}
1418 
1419 	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
1420 		spin_unlock_irqrestore(&lockres->l_lock, flags);
1421 
1422 		ocfs2_wait_on_refreshing_lock(lockres);
1423 		goto refresh_check;
1424 	}
1425 
1426 	/* Ok, I'll be the one to refresh this lock. */
1427 	lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
1428 	spin_unlock_irqrestore(&lockres->l_lock, flags);
1429 
1430 	status = 1;
1431 bail:
1432 	mlog_exit(status);
1433 	return status;
1434 }
1435 
1436 /* If status is non zero, I'll mark it as not being in refresh
1437  * anymroe, but i won't clear the needs refresh flag. */
1438 static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
1439 						   int status)
1440 {
1441 	unsigned long flags;
1442 	mlog_entry_void();
1443 
1444 	spin_lock_irqsave(&lockres->l_lock, flags);
1445 	lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
1446 	if (!status)
1447 		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
1448 	spin_unlock_irqrestore(&lockres->l_lock, flags);
1449 
1450 	wake_up(&lockres->l_event);
1451 
1452 	mlog_exit_void();
1453 }
1454 
1455 /* may or may not return a bh if it went to disk. */
1456 static int ocfs2_meta_lock_update(struct inode *inode,
1457 				  struct buffer_head **bh)
1458 {
1459 	int status = 0;
1460 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
1461 	struct ocfs2_lock_res *lockres;
1462 	struct ocfs2_dinode *fe;
1463 
1464 	mlog_entry_void();
1465 
1466 	spin_lock(&oi->ip_lock);
1467 	if (oi->ip_flags & OCFS2_INODE_DELETED) {
1468 		mlog(0, "Orphaned inode %llu was deleted while we "
1469 		     "were waiting on a lock. ip_flags = 0x%x\n",
1470 		     (unsigned long long)oi->ip_blkno, oi->ip_flags);
1471 		spin_unlock(&oi->ip_lock);
1472 		status = -ENOENT;
1473 		goto bail;
1474 	}
1475 	spin_unlock(&oi->ip_lock);
1476 
1477 	lockres = &oi->ip_meta_lockres;
1478 
1479 	if (!ocfs2_should_refresh_lock_res(lockres))
1480 		goto bail;
1481 
1482 	/* This will discard any caching information we might have had
1483 	 * for the inode metadata. */
1484 	ocfs2_metadata_cache_purge(inode);
1485 
1486 	/* will do nothing for inode types that don't use the extent
1487 	 * map (directories, bitmap files, etc) */
1488 	ocfs2_extent_map_trunc(inode, 0);
1489 
1490 	if (ocfs2_meta_lvb_is_trustable(lockres)) {
1491 		mlog(0, "Trusting LVB on inode %llu\n",
1492 		     (unsigned long long)oi->ip_blkno);
1493 		ocfs2_refresh_inode_from_lvb(inode);
1494 	} else {
1495 		/* Boo, we have to go to disk. */
1496 		/* read bh, cast, ocfs2_refresh_inode */
1497 		status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
1498 					  bh, OCFS2_BH_CACHED, inode);
1499 		if (status < 0) {
1500 			mlog_errno(status);
1501 			goto bail_refresh;
1502 		}
1503 		fe = (struct ocfs2_dinode *) (*bh)->b_data;
1504 
1505 		/* This is a good chance to make sure we're not
1506 		 * locking an invalid object.
1507 		 *
1508 		 * We bug on a stale inode here because we checked
1509 		 * above whether it was wiped from disk. The wiping
1510 		 * node provides a guarantee that we receive that
1511 		 * message and can mark the inode before dropping any
1512 		 * locks associated with it. */
1513 		if (!OCFS2_IS_VALID_DINODE(fe)) {
1514 			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
1515 			status = -EIO;
1516 			goto bail_refresh;
1517 		}
1518 		mlog_bug_on_msg(inode->i_generation !=
1519 				le32_to_cpu(fe->i_generation),
1520 				"Invalid dinode %llu disk generation: %u "
1521 				"inode->i_generation: %u\n",
1522 				(unsigned long long)oi->ip_blkno,
1523 				le32_to_cpu(fe->i_generation),
1524 				inode->i_generation);
1525 		mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
1526 				!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
1527 				"Stale dinode %llu dtime: %llu flags: 0x%x\n",
1528 				(unsigned long long)oi->ip_blkno,
1529 				(unsigned long long)le64_to_cpu(fe->i_dtime),
1530 				le32_to_cpu(fe->i_flags));
1531 
1532 		ocfs2_refresh_inode(inode, fe);
1533 	}
1534 
1535 	status = 0;
1536 bail_refresh:
1537 	ocfs2_complete_lock_res_refresh(lockres, status);
1538 bail:
1539 	mlog_exit(status);
1540 	return status;
1541 }
1542 
1543 static int ocfs2_assign_bh(struct inode *inode,
1544 			   struct buffer_head **ret_bh,
1545 			   struct buffer_head *passed_bh)
1546 {
1547 	int status;
1548 
1549 	if (passed_bh) {
1550 		/* Ok, the update went to disk for us, use the
1551 		 * returned bh. */
1552 		*ret_bh = passed_bh;
1553 		get_bh(*ret_bh);
1554 
1555 		return 0;
1556 	}
1557 
1558 	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
1559 				  OCFS2_I(inode)->ip_blkno,
1560 				  ret_bh,
1561 				  OCFS2_BH_CACHED,
1562 				  inode);
1563 	if (status < 0)
1564 		mlog_errno(status);
1565 
1566 	return status;
1567 }
1568 
1569 /*
1570  * returns < 0 error if the callback will never be called, otherwise
1571  * the result of the lock will be communicated via the callback.
1572  */
1573 int ocfs2_meta_lock_full(struct inode *inode,
1574 			 struct ocfs2_journal_handle *handle,
1575 			 struct buffer_head **ret_bh,
1576 			 int ex,
1577 			 int arg_flags)
1578 {
1579 	int status, level, dlm_flags, acquired;
1580 	struct ocfs2_lock_res *lockres;
1581 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1582 	struct buffer_head *local_bh = NULL;
1583 
1584 	BUG_ON(!inode);
1585 
1586 	mlog_entry_void();
1587 
1588 	mlog(0, "inode %llu, take %s META lock\n",
1589 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1590 	     ex ? "EXMODE" : "PRMODE");
1591 
1592 	status = 0;
1593 	acquired = 0;
1594 	/* We'll allow faking a readonly metadata lock for
1595 	 * rodevices. */
1596 	if (ocfs2_is_hard_readonly(osb)) {
1597 		if (ex)
1598 			status = -EROFS;
1599 		goto bail;
1600 	}
1601 
1602 	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1603 		wait_event(osb->recovery_event,
1604 			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1605 
1606 	acquired = 0;
1607 	lockres = &OCFS2_I(inode)->ip_meta_lockres;
1608 	level = ex ? LKM_EXMODE : LKM_PRMODE;
1609 	dlm_flags = 0;
1610 	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
1611 		dlm_flags |= LKM_NOQUEUE;
1612 
1613 	status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
1614 	if (status < 0) {
1615 		if (status != -EAGAIN && status != -EIOCBRETRY)
1616 			mlog_errno(status);
1617 		goto bail;
1618 	}
1619 
1620 	/* Notify the error cleanup path to drop the cluster lock. */
1621 	acquired = 1;
1622 
1623 	/* We wait twice because a node may have died while we were in
1624 	 * the lower dlm layers. The second time though, we've
1625 	 * committed to owning this lock so we don't allow signals to
1626 	 * abort the operation. */
1627 	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1628 		wait_event(osb->recovery_event,
1629 			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1630 
1631 	/* This is fun. The caller may want a bh back, or it may
1632 	 * not. ocfs2_meta_lock_update definitely wants one in, but
1633 	 * may or may not read one, depending on what's in the
1634 	 * LVB. The result of all of this is that we've *only* gone to
1635 	 * disk if we have to, so the complexity is worthwhile. */
1636 	status = ocfs2_meta_lock_update(inode, &local_bh);
1637 	if (status < 0) {
1638 		if (status != -ENOENT)
1639 			mlog_errno(status);
1640 		goto bail;
1641 	}
1642 
1643 	if (ret_bh) {
1644 		status = ocfs2_assign_bh(inode, ret_bh, local_bh);
1645 		if (status < 0) {
1646 			mlog_errno(status);
1647 			goto bail;
1648 		}
1649 	}
1650 
1651 	if (handle) {
1652 		status = ocfs2_handle_add_lock(handle, inode);
1653 		if (status < 0)
1654 			mlog_errno(status);
1655 	}
1656 
1657 bail:
1658 	if (status < 0) {
1659 		if (ret_bh && (*ret_bh)) {
1660 			brelse(*ret_bh);
1661 			*ret_bh = NULL;
1662 		}
1663 		if (acquired)
1664 			ocfs2_meta_unlock(inode, ex);
1665 	}
1666 
1667 	if (local_bh)
1668 		brelse(local_bh);
1669 
1670 	mlog_exit(status);
1671 	return status;
1672 }
1673 
1674 /*
1675  * This is working around a lock inversion between tasks acquiring DLM locks
1676  * while holding a page lock and the vote thread which blocks dlm lock acquiry
1677  * while acquiring page locks.
1678  *
1679  * ** These _with_page variantes are only intended to be called from aop
1680  * methods that hold page locks and return a very specific *positive* error
1681  * code that aop methods pass up to the VFS -- test for errors with != 0. **
1682  *
1683  * The DLM is called such that it returns -EAGAIN if it would have blocked
1684  * waiting for the vote thread.  In that case we unlock our page so the vote
1685  * thread can make progress.  Once we've done this we have to return
1686  * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
1687  * into the VFS who will then immediately retry the aop call.
1688  *
1689  * We do a blocking lock and immediate unlock before returning, though, so that
1690  * the lock has a great chance of being cached on this node by the time the VFS
1691  * calls back to retry the aop.    This has a potential to livelock as nodes
1692  * ping locks back and forth, but that's a risk we're willing to take to avoid
1693  * the lock inversion simply.
1694  */
1695 int ocfs2_meta_lock_with_page(struct inode *inode,
1696 			      struct ocfs2_journal_handle *handle,
1697 			      struct buffer_head **ret_bh,
1698 			      int ex,
1699 			      struct page *page)
1700 {
1701 	int ret;
1702 
1703 	ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex,
1704 				   OCFS2_LOCK_NONBLOCK);
1705 	if (ret == -EAGAIN) {
1706 		unlock_page(page);
1707 		if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0)
1708 			ocfs2_meta_unlock(inode, ex);
1709 		ret = AOP_TRUNCATED_PAGE;
1710 	}
1711 
1712 	return ret;
1713 }
1714 
1715 void ocfs2_meta_unlock(struct inode *inode,
1716 		       int ex)
1717 {
1718 	int level = ex ? LKM_EXMODE : LKM_PRMODE;
1719 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
1720 
1721 	mlog_entry_void();
1722 
1723 	mlog(0, "inode %llu drop %s META lock\n",
1724 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
1725 	     ex ? "EXMODE" : "PRMODE");
1726 
1727 	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
1728 		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
1729 
1730 	mlog_exit_void();
1731 }
1732 
1733 int ocfs2_super_lock(struct ocfs2_super *osb,
1734 		     int ex)
1735 {
1736 	int status;
1737 	int level = ex ? LKM_EXMODE : LKM_PRMODE;
1738 	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
1739 	struct buffer_head *bh;
1740 	struct ocfs2_slot_info *si = osb->slot_info;
1741 
1742 	mlog_entry_void();
1743 
1744 	if (ocfs2_is_hard_readonly(osb))
1745 		return -EROFS;
1746 
1747 	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
1748 	if (status < 0) {
1749 		mlog_errno(status);
1750 		goto bail;
1751 	}
1752 
1753 	/* The super block lock path is really in the best position to
1754 	 * know when resources covered by the lock need to be
1755 	 * refreshed, so we do it here. Of course, making sense of
1756 	 * everything is up to the caller :) */
1757 	status = ocfs2_should_refresh_lock_res(lockres);
1758 	if (status < 0) {
1759 		mlog_errno(status);
1760 		goto bail;
1761 	}
1762 	if (status) {
1763 		bh = si->si_bh;
1764 		status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
1765 					  si->si_inode);
1766 		if (status == 0)
1767 			ocfs2_update_slot_info(si);
1768 
1769 		ocfs2_complete_lock_res_refresh(lockres, status);
1770 
1771 		if (status < 0)
1772 			mlog_errno(status);
1773 	}
1774 bail:
1775 	mlog_exit(status);
1776 	return status;
1777 }
1778 
1779 void ocfs2_super_unlock(struct ocfs2_super *osb,
1780 			int ex)
1781 {
1782 	int level = ex ? LKM_EXMODE : LKM_PRMODE;
1783 	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
1784 
1785 	ocfs2_cluster_unlock(osb, lockres, level);
1786 }
1787 
1788 int ocfs2_rename_lock(struct ocfs2_super *osb)
1789 {
1790 	int status;
1791 	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
1792 
1793 	if (ocfs2_is_hard_readonly(osb))
1794 		return -EROFS;
1795 
1796 	status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
1797 	if (status < 0)
1798 		mlog_errno(status);
1799 
1800 	return status;
1801 }
1802 
1803 void ocfs2_rename_unlock(struct ocfs2_super *osb)
1804 {
1805 	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
1806 
1807 	ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
1808 }
1809 
1810 /* Reference counting of the dlm debug structure. We want this because
1811  * open references on the debug inodes can live on after a mount, so
1812  * we can't rely on the ocfs2_super to always exist. */
1813 static void ocfs2_dlm_debug_free(struct kref *kref)
1814 {
1815 	struct ocfs2_dlm_debug *dlm_debug;
1816 
1817 	dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);
1818 
1819 	kfree(dlm_debug);
1820 }
1821 
1822 void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
1823 {
1824 	if (dlm_debug)
1825 		kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
1826 }
1827 
1828 static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
1829 {
1830 	kref_get(&debug->d_refcnt);
1831 }
1832 
1833 struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
1834 {
1835 	struct ocfs2_dlm_debug *dlm_debug;
1836 
1837 	dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
1838 	if (!dlm_debug) {
1839 		mlog_errno(-ENOMEM);
1840 		goto out;
1841 	}
1842 
1843 	kref_init(&dlm_debug->d_refcnt);
1844 	INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
1845 	dlm_debug->d_locking_state = NULL;
1846 out:
1847 	return dlm_debug;
1848 }
1849 
/* Per-open state of the locking_state seq_file.
 * Access to this is arbitrated for us via seq_file->sem. */
struct ocfs2_dlm_seq_priv {
	/* debug structure whose tracking list is being walked */
	struct ocfs2_dlm_debug *p_dlm_debug;
	/* dummy lockres linked into the tracking list to remember the
	 * current iteration position */
	struct ocfs2_lock_res p_iter_res;
	/* snapshot of the lockres currently being shown */
	struct ocfs2_lock_res p_tmp_res;
};
1856 
1857 static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start,
1858 						 struct ocfs2_dlm_seq_priv *priv)
1859 {
1860 	struct ocfs2_lock_res *iter, *ret = NULL;
1861 	struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;
1862 
1863 	assert_spin_locked(&ocfs2_dlm_tracking_lock);
1864 
1865 	list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
1866 		/* discover the head of the list */
1867 		if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
1868 			mlog(0, "End of list found, %p\n", ret);
1869 			break;
1870 		}
1871 
1872 		/* We track our "dummy" iteration lockres' by a NULL
1873 		 * l_ops field. */
1874 		if (iter->l_ops != NULL) {
1875 			ret = iter;
1876 			break;
1877 		}
1878 	}
1879 
1880 	return ret;
1881 }
1882 
1883 static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos)
1884 {
1885 	struct ocfs2_dlm_seq_priv *priv = m->private;
1886 	struct ocfs2_lock_res *iter;
1887 
1888 	spin_lock(&ocfs2_dlm_tracking_lock);
1889 	iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
1890 	if (iter) {
1891 		/* Since lockres' have the lifetime of their container
1892 		 * (which can be inodes, ocfs2_supers, etc) we want to
1893 		 * copy this out to a temporary lockres while still
1894 		 * under the spinlock. Obviously after this we can't
1895 		 * trust any pointers on the copy returned, but that's
1896 		 * ok as the information we want isn't typically held
1897 		 * in them. */
1898 		priv->p_tmp_res = *iter;
1899 		iter = &priv->p_tmp_res;
1900 	}
1901 	spin_unlock(&ocfs2_dlm_tracking_lock);
1902 
1903 	return iter;
1904 }
1905 
/* seq_file ->stop: intentionally empty.  ->start and ->next drop
 * ocfs2_dlm_tracking_lock themselves before returning, so there is
 * nothing left to release here. */
static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
{
}
1909 
1910 static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
1911 {
1912 	struct ocfs2_dlm_seq_priv *priv = m->private;
1913 	struct ocfs2_lock_res *iter = v;
1914 	struct ocfs2_lock_res *dummy = &priv->p_iter_res;
1915 
1916 	spin_lock(&ocfs2_dlm_tracking_lock);
1917 	iter = ocfs2_dlm_next_res(iter, priv);
1918 	list_del_init(&dummy->l_debug_list);
1919 	if (iter) {
1920 		list_add(&dummy->l_debug_list, &iter->l_debug_list);
1921 		priv->p_tmp_res = *iter;
1922 		iter = &priv->p_tmp_res;
1923 	}
1924 	spin_unlock(&ocfs2_dlm_tracking_lock);
1925 
1926 	return iter;
1927 }
1928 
1929 /* So that debugfs.ocfs2 can determine which format is being used */
1930 #define OCFS2_DLM_DEBUG_STR_VERSION 1
1931 static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
1932 {
1933 	int i;
1934 	char *lvb;
1935 	struct ocfs2_lock_res *lockres = v;
1936 
1937 	if (!lockres)
1938 		return -EINVAL;
1939 
1940 	seq_printf(m, "0x%x\t"
1941 		   "%.*s\t"
1942 		   "%d\t"
1943 		   "0x%lx\t"
1944 		   "0x%x\t"
1945 		   "0x%x\t"
1946 		   "%u\t"
1947 		   "%u\t"
1948 		   "%d\t"
1949 		   "%d\t",
1950 		   OCFS2_DLM_DEBUG_STR_VERSION,
1951 		   OCFS2_LOCK_ID_MAX_LEN, lockres->l_name,
1952 		   lockres->l_level,
1953 		   lockres->l_flags,
1954 		   lockres->l_action,
1955 		   lockres->l_unlock_action,
1956 		   lockres->l_ro_holders,
1957 		   lockres->l_ex_holders,
1958 		   lockres->l_requested,
1959 		   lockres->l_blocking);
1960 
1961 	/* Dump the raw LVB */
1962 	lvb = lockres->l_lksb.lvb;
1963 	for(i = 0; i < DLM_LVB_LEN; i++)
1964 		seq_printf(m, "0x%x\t", lvb[i]);
1965 
1966 	/* End the line */
1967 	seq_printf(m, "\n");
1968 	return 0;
1969 }
1970 
/* seq_file iterator used for the locking_state debugfs file; handed to
 * seq_open() in ocfs2_dlm_debug_open(). */
static struct seq_operations ocfs2_dlm_seq_ops = {
	.start =	ocfs2_dlm_seq_start,
	.stop =		ocfs2_dlm_seq_stop,
	.next =		ocfs2_dlm_seq_next,
	.show =		ocfs2_dlm_seq_show,
};
1977 
1978 static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
1979 {
1980 	struct seq_file *seq = (struct seq_file *) file->private_data;
1981 	struct ocfs2_dlm_seq_priv *priv = seq->private;
1982 	struct ocfs2_lock_res *res = &priv->p_iter_res;
1983 
1984 	ocfs2_remove_lockres_tracking(res);
1985 	ocfs2_put_dlm_debug(priv->p_dlm_debug);
1986 	return seq_release_private(inode, file);
1987 }
1988 
1989 static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
1990 {
1991 	int ret;
1992 	struct ocfs2_dlm_seq_priv *priv;
1993 	struct seq_file *seq;
1994 	struct ocfs2_super *osb;
1995 
1996 	priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
1997 	if (!priv) {
1998 		ret = -ENOMEM;
1999 		mlog_errno(ret);
2000 		goto out;
2001 	}
2002 	osb = (struct ocfs2_super *) inode->u.generic_ip;
2003 	ocfs2_get_dlm_debug(osb->osb_dlm_debug);
2004 	priv->p_dlm_debug = osb->osb_dlm_debug;
2005 	INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);
2006 
2007 	ret = seq_open(file, &ocfs2_dlm_seq_ops);
2008 	if (ret) {
2009 		kfree(priv);
2010 		mlog_errno(ret);
2011 		goto out;
2012 	}
2013 
2014 	seq = (struct seq_file *) file->private_data;
2015 	seq->private = priv;
2016 
2017 	ocfs2_add_lockres_tracking(&priv->p_iter_res,
2018 				   priv->p_dlm_debug);
2019 
2020 out:
2021 	return ret;
2022 }
2023 
/* File operations of the locking_state debugfs file: open/release are
 * ours, reading and seeking are delegated to the seq_file helpers. */
static const struct file_operations ocfs2_dlm_debug_fops = {
	.open =		ocfs2_dlm_debug_open,
	.release =	ocfs2_dlm_debug_release,
	.read =		seq_read,
	.llseek =	seq_lseek,
};
2030 
2031 static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
2032 {
2033 	int ret = 0;
2034 	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
2035 
2036 	dlm_debug->d_locking_state = debugfs_create_file("locking_state",
2037 							 S_IFREG|S_IRUSR,
2038 							 osb->osb_debug_root,
2039 							 osb,
2040 							 &ocfs2_dlm_debug_fops);
2041 	if (!dlm_debug->d_locking_state) {
2042 		ret = -EINVAL;
2043 		mlog(ML_ERROR,
2044 		     "Unable to create locking state debugfs file.\n");
2045 		goto out;
2046 	}
2047 
2048 	ocfs2_get_dlm_debug(dlm_debug);
2049 out:
2050 	return ret;
2051 }
2052 
2053 static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
2054 {
2055 	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;
2056 
2057 	if (dlm_debug) {
2058 		debugfs_remove(dlm_debug->d_locking_state);
2059 		ocfs2_put_dlm_debug(dlm_debug);
2060 	}
2061 }
2062 
2063 int ocfs2_dlm_init(struct ocfs2_super *osb)
2064 {
2065 	int status;
2066 	u32 dlm_key;
2067 	struct dlm_ctxt *dlm;
2068 
2069 	mlog_entry_void();
2070 
2071 	status = ocfs2_dlm_init_debug(osb);
2072 	if (status < 0) {
2073 		mlog_errno(status);
2074 		goto bail;
2075 	}
2076 
2077 	/* launch vote thread */
2078 	osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote");
2079 	if (IS_ERR(osb->vote_task)) {
2080 		status = PTR_ERR(osb->vote_task);
2081 		osb->vote_task = NULL;
2082 		mlog_errno(status);
2083 		goto bail;
2084 	}
2085 
2086 	/* used by the dlm code to make message headers unique, each
2087 	 * node in this domain must agree on this. */
2088 	dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
2089 
2090 	/* for now, uuid == domain */
2091 	dlm = dlm_register_domain(osb->uuid_str, dlm_key);
2092 	if (IS_ERR(dlm)) {
2093 		status = PTR_ERR(dlm);
2094 		mlog_errno(status);
2095 		goto bail;
2096 	}
2097 
2098 	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
2099 	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
2100 
2101 	dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);
2102 
2103 	osb->dlm = dlm;
2104 
2105 	status = 0;
2106 bail:
2107 	if (status < 0) {
2108 		ocfs2_dlm_shutdown_debug(osb);
2109 		if (osb->vote_task)
2110 			kthread_stop(osb->vote_task);
2111 	}
2112 
2113 	mlog_exit(status);
2114 	return status;
2115 }
2116 
2117 void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
2118 {
2119 	mlog_entry_void();
2120 
2121 	dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
2122 
2123 	ocfs2_drop_osb_locks(osb);
2124 
2125 	if (osb->vote_task) {
2126 		kthread_stop(osb->vote_task);
2127 		osb->vote_task = NULL;
2128 	}
2129 
2130 	ocfs2_lock_res_free(&osb->osb_super_lockres);
2131 	ocfs2_lock_res_free(&osb->osb_rename_lockres);
2132 
2133 	dlm_unregister_domain(osb->dlm);
2134 	osb->dlm = NULL;
2135 
2136 	ocfs2_dlm_shutdown_debug(osb);
2137 
2138 	mlog_exit_void();
2139 }
2140 
2141 static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status)
2142 {
2143 	struct ocfs2_lock_res *lockres = opaque;
2144 	unsigned long flags;
2145 
2146 	mlog_entry_void();
2147 
2148 	mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
2149 	     lockres->l_unlock_action);
2150 
2151 	spin_lock_irqsave(&lockres->l_lock, flags);
2152 	/* We tried to cancel a convert request, but it was already
2153 	 * granted. All we want to do here is clear our unlock
2154 	 * state. The wake_up call done at the bottom is redundant
2155 	 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
2156 	 * hurt anything anyway */
2157 	if (status == DLM_CANCELGRANT &&
2158 	    lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
2159 		mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
2160 
2161 		/* We don't clear the busy flag in this case as it
2162 		 * should have been cleared by the ast which the dlm
2163 		 * has called. */
2164 		goto complete_unlock;
2165 	}
2166 
2167 	if (status != DLM_NORMAL) {
2168 		mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
2169 		     "unlock_action %d\n", status, lockres->l_name,
2170 		     lockres->l_unlock_action);
2171 		spin_unlock_irqrestore(&lockres->l_lock, flags);
2172 		return;
2173 	}
2174 
2175 	switch(lockres->l_unlock_action) {
2176 	case OCFS2_UNLOCK_CANCEL_CONVERT:
2177 		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
2178 		lockres->l_action = OCFS2_AST_INVALID;
2179 		break;
2180 	case OCFS2_UNLOCK_DROP_LOCK:
2181 		lockres->l_level = LKM_IVMODE;
2182 		break;
2183 	default:
2184 		BUG();
2185 	}
2186 
2187 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
2188 complete_unlock:
2189 	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
2190 	spin_unlock_irqrestore(&lockres->l_lock, flags);
2191 
2192 	wake_up(&lockres->l_event);
2193 
2194 	mlog_exit_void();
2195 }
2196 
/* Hook which ocfs2_drop_lock() runs on a lock resource, together with
 * an opaque argument, before the lock is dropped. */
typedef void ocfs2_pre_drop_cb_t(struct ocfs2_lock_res *lockres, void *data);

/* Optional pre-drop hook handed to ocfs2_drop_lock(). */
struct drop_lock_cb {
	ocfs2_pre_drop_cb_t	*drop_func;	/* hook to call */
	void			*drop_data;	/* second argument for drop_func */
};
2203 
2204 static int ocfs2_drop_lock(struct ocfs2_super *osb,
2205 			   struct ocfs2_lock_res *lockres,
2206 			   struct drop_lock_cb *dcb)
2207 {
2208 	enum dlm_status status;
2209 	unsigned long flags;
2210 
2211 	/* We didn't get anywhere near actually using this lockres. */
2212 	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
2213 		goto out;
2214 
2215 	spin_lock_irqsave(&lockres->l_lock, flags);
2216 
2217 	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
2218 			"lockres %s, flags 0x%lx\n",
2219 			lockres->l_name, lockres->l_flags);
2220 
2221 	while (lockres->l_flags & OCFS2_LOCK_BUSY) {
2222 		mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
2223 		     "%u, unlock_action = %u\n",
2224 		     lockres->l_name, lockres->l_flags, lockres->l_action,
2225 		     lockres->l_unlock_action);
2226 
2227 		spin_unlock_irqrestore(&lockres->l_lock, flags);
2228 
2229 		/* XXX: Today we just wait on any busy
2230 		 * locks... Perhaps we need to cancel converts in the
2231 		 * future? */
2232 		ocfs2_wait_on_busy_lock(lockres);
2233 
2234 		spin_lock_irqsave(&lockres->l_lock, flags);
2235 	}
2236 
2237 	if (dcb)
2238 		dcb->drop_func(lockres, dcb->drop_data);
2239 
2240 	if (lockres->l_flags & OCFS2_LOCK_BUSY)
2241 		mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
2242 		     lockres->l_name);
2243 	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
2244 		mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);
2245 
2246 	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
2247 		spin_unlock_irqrestore(&lockres->l_lock, flags);
2248 		goto out;
2249 	}
2250 
2251 	lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);
2252 
2253 	/* make sure we never get here while waiting for an ast to
2254 	 * fire. */
2255 	BUG_ON(lockres->l_action != OCFS2_AST_INVALID);
2256 
2257 	/* is this necessary? */
2258 	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2259 	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
2260 	spin_unlock_irqrestore(&lockres->l_lock, flags);
2261 
2262 	mlog(0, "lock %s\n", lockres->l_name);
2263 
2264 	status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK,
2265 			   lockres->l_ops->unlock_ast, lockres);
2266 	if (status != DLM_NORMAL) {
2267 		ocfs2_log_dlm_error("dlmunlock", status, lockres);
2268 		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
2269 		dlm_print_one_lock(lockres->l_lksb.lockid);
2270 		BUG();
2271 	}
2272 	mlog(0, "lock %s, successfull return from dlmunlock\n",
2273 	     lockres->l_name);
2274 
2275 	ocfs2_wait_on_busy_lock(lockres);
2276 out:
2277 	mlog_exit(0);
2278 	return 0;
2279 }
2280 
2281 /* Mark the lockres as being dropped. It will no longer be
2282  * queued if blocking, but we still may have to wait on it
2283  * being dequeued from the vote thread before we can consider
2284  * it safe to drop.
2285  *
2286  * You can *not* attempt to call cluster_lock on this lockres anymore. */
2287 void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
2288 {
2289 	int status;
2290 	struct ocfs2_mask_waiter mw;
2291 	unsigned long flags;
2292 
2293 	ocfs2_init_mask_waiter(&mw);
2294 
2295 	spin_lock_irqsave(&lockres->l_lock, flags);
2296 	lockres->l_flags |= OCFS2_LOCK_FREEING;
2297 	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
2298 		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
2299 		spin_unlock_irqrestore(&lockres->l_lock, flags);
2300 
2301 		mlog(0, "Waiting on lockres %s\n", lockres->l_name);
2302 
2303 		status = ocfs2_wait_for_mask(&mw);
2304 		if (status)
2305 			mlog_errno(status);
2306 
2307 		spin_lock_irqsave(&lockres->l_lock, flags);
2308 	}
2309 	spin_unlock_irqrestore(&lockres->l_lock, flags);
2310 }
2311 
2312 static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
2313 {
2314 	int status;
2315 
2316 	mlog_entry_void();
2317 
2318 	ocfs2_mark_lockres_freeing(&osb->osb_super_lockres);
2319 
2320 	status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL);
2321 	if (status < 0)
2322 		mlog_errno(status);
2323 
2324 	ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres);
2325 
2326 	status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
2327 	if (status < 0)
2328 		mlog_errno(status);
2329 
2330 	mlog_exit(status);
2331 }
2332 
2333 static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data)
2334 {
2335 	struct inode *inode = data;
2336 
2337 	/* the metadata lock requires a bit more work as we have an
2338 	 * LVB to worry about. */
2339 	if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
2340 	    lockres->l_level == LKM_EXMODE &&
2341 	    !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
2342 		__ocfs2_stuff_meta_lvb(inode);
2343 }
2344 
2345 int ocfs2_drop_inode_locks(struct inode *inode)
2346 {
2347 	int status, err;
2348 	struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, };
2349 
2350 	mlog_entry_void();
2351 
2352 	/* No need to call ocfs2_mark_lockres_freeing here -
2353 	 * ocfs2_clear_inode has done it for us. */
2354 
2355 	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2356 			      &OCFS2_I(inode)->ip_data_lockres,
2357 			      NULL);
2358 	if (err < 0)
2359 		mlog_errno(err);
2360 
2361 	status = err;
2362 
2363 	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2364 			      &OCFS2_I(inode)->ip_meta_lockres,
2365 			      &meta_dcb);
2366 	if (err < 0)
2367 		mlog_errno(err);
2368 	if (err < 0 && !status)
2369 		status = err;
2370 
2371 	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
2372 			      &OCFS2_I(inode)->ip_rw_lockres,
2373 			      NULL);
2374 	if (err < 0)
2375 		mlog_errno(err);
2376 	if (err < 0 && !status)
2377 		status = err;
2378 
2379 	mlog_exit(status);
2380 	return status;
2381 }
2382 
2383 static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
2384 				      int new_level)
2385 {
2386 	assert_spin_locked(&lockres->l_lock);
2387 
2388 	BUG_ON(lockres->l_blocking <= LKM_NLMODE);
2389 
2390 	if (lockres->l_level <= new_level) {
2391 		mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
2392 		     lockres->l_level, new_level);
2393 		BUG();
2394 	}
2395 
2396 	mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
2397 	     lockres->l_name, new_level, lockres->l_blocking);
2398 
2399 	lockres->l_action = OCFS2_AST_DOWNCONVERT;
2400 	lockres->l_requested = new_level;
2401 	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2402 }
2403 
2404 static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
2405 				  struct ocfs2_lock_res *lockres,
2406 				  int new_level,
2407 				  int lvb)
2408 {
2409 	int ret, dlm_flags = LKM_CONVERT;
2410 	enum dlm_status status;
2411 
2412 	mlog_entry_void();
2413 
2414 	if (lvb)
2415 		dlm_flags |= LKM_VALBLK;
2416 
2417 	status = dlmlock(osb->dlm,
2418 			 new_level,
2419 			 &lockres->l_lksb,
2420 			 dlm_flags,
2421 			 lockres->l_name,
2422 			 lockres->l_ops->ast,
2423 			 lockres,
2424 			 lockres->l_ops->bast);
2425 	if (status != DLM_NORMAL) {
2426 		ocfs2_log_dlm_error("dlmlock", status, lockres);
2427 		ret = -EINVAL;
2428 		ocfs2_recover_from_dlm_error(lockres, 1);
2429 		goto bail;
2430 	}
2431 
2432 	ret = 0;
2433 bail:
2434 	mlog_exit(ret);
2435 	return ret;
2436 }
2437 
2438 /* returns 1 when the caller should unlock and call dlmunlock */
2439 static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
2440 				        struct ocfs2_lock_res *lockres)
2441 {
2442 	assert_spin_locked(&lockres->l_lock);
2443 
2444 	mlog_entry_void();
2445 	mlog(0, "lock %s\n", lockres->l_name);
2446 
2447 	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
2448 		/* If we're already trying to cancel a lock conversion
2449 		 * then just drop the spinlock and allow the caller to
2450 		 * requeue this lock. */
2451 
2452 		mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
2453 		return 0;
2454 	}
2455 
2456 	/* were we in a convert when we got the bast fire? */
2457 	BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
2458 	       lockres->l_action != OCFS2_AST_DOWNCONVERT);
2459 	/* set things up for the unlockast to know to just
2460 	 * clear out the ast_action and unset busy, etc. */
2461 	lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;
2462 
2463 	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
2464 			"lock %s, invalid flags: 0x%lx\n",
2465 			lockres->l_name, lockres->l_flags);
2466 
2467 	return 1;
2468 }
2469 
2470 static int ocfs2_cancel_convert(struct ocfs2_super *osb,
2471 				struct ocfs2_lock_res *lockres)
2472 {
2473 	int ret;
2474 	enum dlm_status status;
2475 
2476 	mlog_entry_void();
2477 	mlog(0, "lock %s\n", lockres->l_name);
2478 
2479 	ret = 0;
2480 	status = dlmunlock(osb->dlm,
2481 			   &lockres->l_lksb,
2482 			   LKM_CANCEL,
2483 			   lockres->l_ops->unlock_ast,
2484 			   lockres);
2485 	if (status != DLM_NORMAL) {
2486 		ocfs2_log_dlm_error("dlmunlock", status, lockres);
2487 		ret = -EINVAL;
2488 		ocfs2_recover_from_dlm_error(lockres, 0);
2489 	}
2490 
2491 	mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);
2492 
2493 	mlog_exit(ret);
2494 	return ret;
2495 }
2496 
2497 static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
2498 						  struct ocfs2_lock_res *lockres,
2499 						  int new_level)
2500 {
2501 	int ret;
2502 
2503 	mlog_entry_void();
2504 
2505 	BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);
2506 
2507 	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
2508 		ret = 0;
2509 		mlog(0, "lockres %s currently being refreshed -- backing "
2510 		     "off!\n", lockres->l_name);
2511 	} else if (new_level == LKM_PRMODE)
2512 		ret = !lockres->l_ex_holders &&
2513 			ocfs2_inode_fully_checkpointed(inode);
2514 	else /* Must be NLMODE we're converting to. */
2515 		ret = !lockres->l_ro_holders && !lockres->l_ex_holders &&
2516 			ocfs2_inode_fully_checkpointed(inode);
2517 
2518 	mlog_exit(ret);
2519 	return ret;
2520 }
2521 
2522 static int ocfs2_do_unblock_meta(struct inode *inode,
2523 				 int *requeue)
2524 {
2525 	int new_level;
2526 	int set_lvb = 0;
2527 	int ret = 0;
2528 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
2529 	unsigned long flags;
2530 
2531 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2532 
2533 	mlog_entry_void();
2534 
2535 	spin_lock_irqsave(&lockres->l_lock, flags);
2536 
2537 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
2538 
2539 	mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level,
2540 	     lockres->l_blocking);
2541 
2542 	BUG_ON(lockres->l_level != LKM_EXMODE &&
2543 	       lockres->l_level != LKM_PRMODE);
2544 
2545 	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
2546 		*requeue = 1;
2547 		ret = ocfs2_prepare_cancel_convert(osb, lockres);
2548 		spin_unlock_irqrestore(&lockres->l_lock, flags);
2549 		if (ret) {
2550 			ret = ocfs2_cancel_convert(osb, lockres);
2551 			if (ret < 0)
2552 				mlog_errno(ret);
2553 		}
2554 		goto leave;
2555 	}
2556 
2557 	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
2558 
2559 	mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n",
2560 	     lockres->l_level, lockres->l_blocking, new_level);
2561 
2562 	if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
2563 		if (lockres->l_level == LKM_EXMODE)
2564 			set_lvb = 1;
2565 
2566 		/* If the lock hasn't been refreshed yet (rare), then
2567 		 * our memory inode values are old and we skip
2568 		 * stuffing the lvb. There's no need to actually clear
2569 		 * out the lvb here as it's value is still valid. */
2570 		if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
2571 			if (set_lvb)
2572 				__ocfs2_stuff_meta_lvb(inode);
2573 		} else
2574 			mlog(0, "lockres %s: downconverting stale lock!\n",
2575 			     lockres->l_name);
2576 
2577 		mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
2578 		     "l_blocking=%d, new_level=%d\n",
2579 		     lockres->l_level, lockres->l_blocking, new_level);
2580 
2581 		ocfs2_prepare_downconvert(lockres, new_level);
2582 		spin_unlock_irqrestore(&lockres->l_lock, flags);
2583 		ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
2584 		goto leave;
2585 	}
2586 	if (!ocfs2_inode_fully_checkpointed(inode))
2587 		ocfs2_start_checkpoint(osb);
2588 
2589 	*requeue = 1;
2590 	spin_unlock_irqrestore(&lockres->l_lock, flags);
2591 	ret = 0;
2592 leave:
2593 	mlog_exit(ret);
2594 	return ret;
2595 }
2596 
2597 static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
2598 				      struct ocfs2_lock_res *lockres,
2599 				      int *requeue,
2600 				      ocfs2_convert_worker_t *worker)
2601 {
2602 	unsigned long flags;
2603 	int blocking;
2604 	int new_level;
2605 	int ret = 0;
2606 
2607 	mlog_entry_void();
2608 
2609 	spin_lock_irqsave(&lockres->l_lock, flags);
2610 
2611 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
2612 
2613 recheck:
2614 	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
2615 		*requeue = 1;
2616 		ret = ocfs2_prepare_cancel_convert(osb, lockres);
2617 		spin_unlock_irqrestore(&lockres->l_lock, flags);
2618 		if (ret) {
2619 			ret = ocfs2_cancel_convert(osb, lockres);
2620 			if (ret < 0)
2621 				mlog_errno(ret);
2622 		}
2623 		goto leave;
2624 	}
2625 
2626 	/* if we're blocking an exclusive and we have *any* holders,
2627 	 * then requeue. */
2628 	if ((lockres->l_blocking == LKM_EXMODE)
2629 	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
2630 		spin_unlock_irqrestore(&lockres->l_lock, flags);
2631 		*requeue = 1;
2632 		ret = 0;
2633 		goto leave;
2634 	}
2635 
2636 	/* If it's a PR we're blocking, then only
2637 	 * requeue if we've got any EX holders */
2638 	if (lockres->l_blocking == LKM_PRMODE &&
2639 	    lockres->l_ex_holders) {
2640 		spin_unlock_irqrestore(&lockres->l_lock, flags);
2641 		*requeue = 1;
2642 		ret = 0;
2643 		goto leave;
2644 	}
2645 
2646 	/* If we get here, then we know that there are no more
2647 	 * incompatible holders (and anyone asking for an incompatible
2648 	 * lock is blocked). We can now downconvert the lock */
2649 	if (!worker)
2650 		goto downconvert;
2651 
2652 	/* Some lockres types want to do a bit of work before
2653 	 * downconverting a lock. Allow that here. The worker function
2654 	 * may sleep, so we save off a copy of what we're blocking as
2655 	 * it may change while we're not holding the spin lock. */
2656 	blocking = lockres->l_blocking;
2657 	spin_unlock_irqrestore(&lockres->l_lock, flags);
2658 
2659 	worker(lockres, blocking);
2660 
2661 	spin_lock_irqsave(&lockres->l_lock, flags);
2662 	if (blocking != lockres->l_blocking) {
2663 		/* If this changed underneath us, then we can't drop
2664 		 * it just yet. */
2665 		goto recheck;
2666 	}
2667 
2668 downconvert:
2669 	*requeue = 0;
2670 	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
2671 
2672 	ocfs2_prepare_downconvert(lockres, new_level);
2673 	spin_unlock_irqrestore(&lockres->l_lock, flags);
2674 	ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0);
2675 leave:
2676 	mlog_exit(ret);
2677 	return ret;
2678 }
2679 
2680 static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
2681 				      int blocking)
2682 {
2683 	struct inode *inode;
2684 	struct address_space *mapping;
2685 
2686 	mlog_entry_void();
2687 
2688        	inode = ocfs2_lock_res_inode(lockres);
2689 	mapping = inode->i_mapping;
2690 
2691 	if (filemap_fdatawrite(mapping)) {
2692 		mlog(ML_ERROR, "Could not sync inode %llu for downconvert!",
2693 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
2694 	}
2695 	sync_mapping_buffers(mapping);
2696 	if (blocking == LKM_EXMODE) {
2697 		truncate_inode_pages(mapping, 0);
2698 		unmap_mapping_range(mapping, 0, 0, 0);
2699 	} else {
2700 		/* We only need to wait on the I/O if we're not also
2701 		 * truncating pages because truncate_inode_pages waits
2702 		 * for us above. We don't truncate pages if we're
2703 		 * blocking anything < EXMODE because we want to keep
2704 		 * them around in that case. */
2705 		filemap_fdatawait(mapping);
2706 	}
2707 
2708 	mlog_exit_void();
2709 }
2710 
2711 int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
2712 		       int *requeue)
2713 {
2714 	int status;
2715 	struct inode *inode;
2716 	struct ocfs2_super *osb;
2717 
2718 	mlog_entry_void();
2719 
2720 	inode = ocfs2_lock_res_inode(lockres);
2721 	osb = OCFS2_SB(inode->i_sb);
2722 
2723 	mlog(0, "unblock inode %llu\n",
2724 	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
2725 
2726 	status = ocfs2_generic_unblock_lock(osb,
2727 					    lockres,
2728 					    requeue,
2729 					    ocfs2_data_convert_worker);
2730 	if (status < 0)
2731 		mlog_errno(status);
2732 
2733 	mlog(0, "inode %llu, requeue = %d\n",
2734 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, *requeue);
2735 
2736 	mlog_exit(status);
2737 	return status;
2738 }
2739 
2740 static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
2741 				    int *requeue)
2742 {
2743 	int status;
2744 	struct inode *inode;
2745 
2746 	mlog_entry_void();
2747 
2748 	mlog(0, "Unblock lockres %s\n", lockres->l_name);
2749 
2750 	inode  = ocfs2_lock_res_inode(lockres);
2751 
2752 	status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb),
2753 					    lockres,
2754 					    requeue,
2755 					    NULL);
2756 	if (status < 0)
2757 		mlog_errno(status);
2758 
2759 	mlog_exit(status);
2760 	return status;
2761 }
2762 
2763 
2764 int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
2765 		       int *requeue)
2766 {
2767 	int status;
2768 	struct inode *inode;
2769 
2770 	mlog_entry_void();
2771 
2772        	inode = ocfs2_lock_res_inode(lockres);
2773 
2774 	mlog(0, "unblock inode %llu\n",
2775 	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
2776 
2777 	status = ocfs2_do_unblock_meta(inode, requeue);
2778 	if (status < 0)
2779 		mlog_errno(status);
2780 
2781 	mlog(0, "inode %llu, requeue = %d\n",
2782 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, *requeue);
2783 
2784 	mlog_exit(status);
2785 	return status;
2786 }
2787 
2788 /* Generic unblock function for any lockres whose private data is an
2789  * ocfs2_super pointer. */
2790 static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
2791 				  int *requeue)
2792 {
2793 	int status;
2794 	struct ocfs2_super *osb;
2795 
2796 	mlog_entry_void();
2797 
2798 	mlog(0, "Unblock lockres %s\n", lockres->l_name);
2799 
2800 	osb = ocfs2_lock_res_super(lockres);
2801 
2802 	status = ocfs2_generic_unblock_lock(osb,
2803 					    lockres,
2804 					    requeue,
2805 					    NULL);
2806 	if (status < 0)
2807 		mlog_errno(status);
2808 
2809 	mlog_exit(status);
2810 	return status;
2811 }
2812 
2813 void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
2814 				struct ocfs2_lock_res *lockres)
2815 {
2816 	int status;
2817 	int requeue = 0;
2818 	unsigned long flags;
2819 
2820 	/* Our reference to the lockres in this function can be
2821 	 * considered valid until we remove the OCFS2_LOCK_QUEUED
2822 	 * flag. */
2823 
2824 	mlog_entry_void();
2825 
2826 	BUG_ON(!lockres);
2827 	BUG_ON(!lockres->l_ops);
2828 	BUG_ON(!lockres->l_ops->unblock);
2829 
2830 	mlog(0, "lockres %s blocked.\n", lockres->l_name);
2831 
2832 	/* Detect whether a lock has been marked as going away while
2833 	 * the vote thread was processing other things. A lock can
2834 	 * still be marked with OCFS2_LOCK_FREEING after this check,
2835 	 * but short circuiting here will still save us some
2836 	 * performance. */
2837 	spin_lock_irqsave(&lockres->l_lock, flags);
2838 	if (lockres->l_flags & OCFS2_LOCK_FREEING)
2839 		goto unqueue;
2840 	spin_unlock_irqrestore(&lockres->l_lock, flags);
2841 
2842 	status = lockres->l_ops->unblock(lockres, &requeue);
2843 	if (status < 0)
2844 		mlog_errno(status);
2845 
2846 	spin_lock_irqsave(&lockres->l_lock, flags);
2847 unqueue:
2848 	if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) {
2849 		lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
2850 	} else
2851 		ocfs2_schedule_blocked_lock(osb, lockres);
2852 
2853 	mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
2854 	     requeue ? "yes" : "no");
2855 	spin_unlock_irqrestore(&lockres->l_lock, flags);
2856 
2857 	mlog_exit_void();
2858 }
2859 
2860 static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
2861 					struct ocfs2_lock_res *lockres)
2862 {
2863 	mlog_entry_void();
2864 
2865 	assert_spin_locked(&lockres->l_lock);
2866 
2867 	if (lockres->l_flags & OCFS2_LOCK_FREEING) {
2868 		/* Do not schedule a lock for downconvert when it's on
2869 		 * the way to destruction - any nodes wanting access
2870 		 * to the resource will get it soon. */
2871 		mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
2872 		     lockres->l_name, lockres->l_flags);
2873 		return;
2874 	}
2875 
2876 	lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);
2877 
2878 	spin_lock(&osb->vote_task_lock);
2879 	if (list_empty(&lockres->l_blocked_list)) {
2880 		list_add_tail(&lockres->l_blocked_list,
2881 			      &osb->blocked_lock_list);
2882 		osb->blocked_lock_count++;
2883 	}
2884 	spin_unlock(&osb->vote_task_lock);
2885 
2886 	mlog_exit_void();
2887 }
2888 
2889 /* This aids in debugging situations where a bad LVB might be involved. */
2890 void ocfs2_dump_meta_lvb_info(u64 level,
2891 			      const char *function,
2892 			      unsigned int line,
2893 			      struct ocfs2_lock_res *lockres)
2894 {
2895 	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;
2896 
2897 	mlog(level, "LVB information for %s (called from %s:%u):\n",
2898 	     lockres->l_name, function, line);
2899 	mlog(level, "version: %u, clusters: %u\n",
2900 	     be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters));
2901 	mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
2902 	     (unsigned long long)be64_to_cpu(lvb->lvb_isize),
2903 	     be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
2904 	     be16_to_cpu(lvb->lvb_imode));
2905 	mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
2906 	     "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
2907 	     (long long)be64_to_cpu(lvb->lvb_iatime_packed),
2908 	     (long long)be64_to_cpu(lvb->lvb_ictime_packed),
2909 	     (long long)be64_to_cpu(lvb->lvb_imtime_packed),
2910 	     be32_to_cpu(lvb->lvb_iattr));
2911 }
2912