1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * dlmglue.c 5 * 6 * Code which implements an OCFS2 specific interface to our DLM. 7 * 8 * Copyright (C) 2003, 2004 Oracle. All rights reserved. 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public 12 * License as published by the Free Software Foundation; either 13 * version 2 of the License, or (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * General Public License for more details. 19 * 20 * You should have received a copy of the GNU General Public 21 * License along with this program; if not, write to the 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 * Boston, MA 021110-1307, USA. 24 */ 25 26 #include <linux/types.h> 27 #include <linux/slab.h> 28 #include <linux/highmem.h> 29 #include <linux/mm.h> 30 #include <linux/smp_lock.h> 31 #include <linux/crc32.h> 32 #include <linux/kthread.h> 33 #include <linux/pagemap.h> 34 #include <linux/debugfs.h> 35 #include <linux/seq_file.h> 36 37 #include <cluster/heartbeat.h> 38 #include <cluster/nodemanager.h> 39 #include <cluster/tcp.h> 40 41 #include <dlm/dlmapi.h> 42 43 #define MLOG_MASK_PREFIX ML_DLM_GLUE 44 #include <cluster/masklog.h> 45 46 #include "ocfs2.h" 47 48 #include "alloc.h" 49 #include "dlmglue.h" 50 #include "extent_map.h" 51 #include "heartbeat.h" 52 #include "inode.h" 53 #include "journal.h" 54 #include "slot_map.h" 55 #include "super.h" 56 #include "uptodate.h" 57 #include "vote.h" 58 59 #include "buffer_head_io.h" 60 61 struct ocfs2_mask_waiter { 62 struct list_head mw_item; 63 int mw_status; 64 struct completion mw_complete; 65 unsigned long mw_mask; 66 unsigned long mw_goal; 67 }; 68 69 static void ocfs2_inode_ast_func(void 
*opaque); 70 static void ocfs2_inode_bast_func(void *opaque, 71 int level); 72 static void ocfs2_super_ast_func(void *opaque); 73 static void ocfs2_super_bast_func(void *opaque, 74 int level); 75 static void ocfs2_rename_ast_func(void *opaque); 76 static void ocfs2_rename_bast_func(void *opaque, 77 int level); 78 79 /* so far, all locks have gotten along with the same unlock ast */ 80 static void ocfs2_unlock_ast_func(void *opaque, 81 enum dlm_status status); 82 static int ocfs2_do_unblock_meta(struct inode *inode, 83 int *requeue); 84 static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres, 85 int *requeue); 86 static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres, 87 int *requeue); 88 static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres, 89 int *requeue); 90 static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres, 91 int *requeue); 92 typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int); 93 static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb, 94 struct ocfs2_lock_res *lockres, 95 int *requeue, 96 ocfs2_convert_worker_t *worker); 97 98 struct ocfs2_lock_res_ops { 99 void (*ast)(void *); 100 void (*bast)(void *, int); 101 void (*unlock_ast)(void *, enum dlm_status); 102 int (*unblock)(struct ocfs2_lock_res *, int *); 103 }; 104 105 static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = { 106 .ast = ocfs2_inode_ast_func, 107 .bast = ocfs2_inode_bast_func, 108 .unlock_ast = ocfs2_unlock_ast_func, 109 .unblock = ocfs2_unblock_inode_lock, 110 }; 111 112 static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = { 113 .ast = ocfs2_inode_ast_func, 114 .bast = ocfs2_inode_bast_func, 115 .unlock_ast = ocfs2_unlock_ast_func, 116 .unblock = ocfs2_unblock_meta, 117 }; 118 119 static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, 120 int blocking); 121 122 static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = { 123 .ast = ocfs2_inode_ast_func, 124 .bast = ocfs2_inode_bast_func, 125 .unlock_ast = 
ocfs2_unlock_ast_func, 126 .unblock = ocfs2_unblock_data, 127 }; 128 129 static struct ocfs2_lock_res_ops ocfs2_super_lops = { 130 .ast = ocfs2_super_ast_func, 131 .bast = ocfs2_super_bast_func, 132 .unlock_ast = ocfs2_unlock_ast_func, 133 .unblock = ocfs2_unblock_osb_lock, 134 }; 135 136 static struct ocfs2_lock_res_ops ocfs2_rename_lops = { 137 .ast = ocfs2_rename_ast_func, 138 .bast = ocfs2_rename_bast_func, 139 .unlock_ast = ocfs2_unlock_ast_func, 140 .unblock = ocfs2_unblock_osb_lock, 141 }; 142 143 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 144 { 145 return lockres->l_type == OCFS2_LOCK_TYPE_META || 146 lockres->l_type == OCFS2_LOCK_TYPE_DATA || 147 lockres->l_type == OCFS2_LOCK_TYPE_RW; 148 } 149 150 static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres) 151 { 152 return lockres->l_type == OCFS2_LOCK_TYPE_SUPER; 153 } 154 155 static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres) 156 { 157 return lockres->l_type == OCFS2_LOCK_TYPE_RENAME; 158 } 159 160 static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres) 161 { 162 BUG_ON(!ocfs2_is_super_lock(lockres) 163 && !ocfs2_is_rename_lock(lockres)); 164 165 return (struct ocfs2_super *) lockres->l_priv; 166 } 167 168 static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) 169 { 170 BUG_ON(!ocfs2_is_inode_lock(lockres)); 171 172 return (struct inode *) lockres->l_priv; 173 } 174 175 static int ocfs2_lock_create(struct ocfs2_super *osb, 176 struct ocfs2_lock_res *lockres, 177 int level, 178 int dlm_flags); 179 static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, 180 int wanted); 181 static void ocfs2_cluster_unlock(struct ocfs2_super *osb, 182 struct ocfs2_lock_res *lockres, 183 int level); 184 static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres); 185 static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res 
*lockres);
static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres);
static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
						int convert);

/* Report a dlm API failure: log the symbolic error name and message
 * for _stat along with the failing call and lock resource name. */
#define ocfs2_log_dlm_error(_func, _stat, _lockres) do {	\
	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "	\
	     "resource %s: %s\n", dlm_errname(_stat), _func,	\
	     _lockres->l_name, dlm_errmsg(_stat));		\
} while (0)
static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
				 struct ocfs2_lock_res *lockres);
static int ocfs2_meta_lock_update(struct inode *inode,
				  struct buffer_head **bh);
static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
static inline int ocfs2_highest_compat_lock_level(int level);
static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
						  struct ocfs2_lock_res *lockres,
						  int new_level);

/* Human-readable names for each ocfs2_lock_type, indexed by type;
 * used by ocfs2_lock_type_string() for debug output. */
static char *ocfs2_lock_type_strings[] = {
	[OCFS2_LOCK_TYPE_META] = "Meta",
	[OCFS2_LOCK_TYPE_DATA] = "Data",
	[OCFS2_LOCK_TYPE_SUPER] = "Super",
	[OCFS2_LOCK_TYPE_RENAME] = "Rename",
	/* Need to differentiate from [R]ename.. serializing writes is the
	 * important job it does, anyway.
*/ 214 [OCFS2_LOCK_TYPE_RW] = "Write/Read", 215 }; 216 217 static char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 218 { 219 mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type); 220 return ocfs2_lock_type_strings[type]; 221 } 222 223 static void ocfs2_build_lock_name(enum ocfs2_lock_type type, 224 u64 blkno, 225 u32 generation, 226 char *name) 227 { 228 int len; 229 230 mlog_entry_void(); 231 232 BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); 233 234 len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x", 235 ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD, 236 (long long)blkno, generation); 237 238 BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1)); 239 240 mlog(0, "built lock resource with name: %s\n", name); 241 242 mlog_exit_void(); 243 } 244 245 static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock); 246 247 static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res, 248 struct ocfs2_dlm_debug *dlm_debug) 249 { 250 mlog(0, "Add tracking for lockres %s\n", res->l_name); 251 252 spin_lock(&ocfs2_dlm_tracking_lock); 253 list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking); 254 spin_unlock(&ocfs2_dlm_tracking_lock); 255 } 256 257 static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) 258 { 259 spin_lock(&ocfs2_dlm_tracking_lock); 260 if (!list_empty(&res->l_debug_list)) 261 list_del_init(&res->l_debug_list); 262 spin_unlock(&ocfs2_dlm_tracking_lock); 263 } 264 265 static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, 266 struct ocfs2_lock_res *res, 267 enum ocfs2_lock_type type, 268 u64 blkno, 269 u32 generation, 270 struct ocfs2_lock_res_ops *ops, 271 void *priv) 272 { 273 ocfs2_build_lock_name(type, blkno, generation, res->l_name); 274 275 res->l_type = type; 276 res->l_ops = ops; 277 res->l_priv = priv; 278 279 res->l_level = LKM_IVMODE; 280 res->l_requested = LKM_IVMODE; 281 res->l_blocking = LKM_IVMODE; 282 res->l_action = OCFS2_AST_INVALID; 283 res->l_unlock_action = OCFS2_UNLOCK_INVALID; 284 285 res->l_flags = 
OCFS2_LOCK_INITIALIZED; 286 287 ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug); 288 } 289 290 void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) 291 { 292 /* This also clears out the lock status block */ 293 memset(res, 0, sizeof(struct ocfs2_lock_res)); 294 spin_lock_init(&res->l_lock); 295 init_waitqueue_head(&res->l_event); 296 INIT_LIST_HEAD(&res->l_blocked_list); 297 INIT_LIST_HEAD(&res->l_mask_waiters); 298 } 299 300 void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, 301 enum ocfs2_lock_type type, 302 struct inode *inode) 303 { 304 struct ocfs2_lock_res_ops *ops; 305 306 switch(type) { 307 case OCFS2_LOCK_TYPE_RW: 308 ops = &ocfs2_inode_rw_lops; 309 break; 310 case OCFS2_LOCK_TYPE_META: 311 ops = &ocfs2_inode_meta_lops; 312 break; 313 case OCFS2_LOCK_TYPE_DATA: 314 ops = &ocfs2_inode_data_lops; 315 break; 316 default: 317 mlog_bug_on_msg(1, "type: %d\n", type); 318 ops = NULL; /* thanks, gcc */ 319 break; 320 }; 321 322 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, 323 OCFS2_I(inode)->ip_blkno, 324 inode->i_generation, ops, inode); 325 } 326 327 static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res, 328 struct ocfs2_super *osb) 329 { 330 /* Superblock lockres doesn't come from a slab so we call init 331 * once on it manually. */ 332 ocfs2_lock_res_init_once(res); 333 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER, 334 OCFS2_SUPER_BLOCK_BLKNO, 0, 335 &ocfs2_super_lops, osb); 336 } 337 338 static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res, 339 struct ocfs2_super *osb) 340 { 341 /* Rename lockres doesn't come from a slab so we call init 342 * once on it manually. 
*/ 343 ocfs2_lock_res_init_once(res); 344 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0, 345 &ocfs2_rename_lops, osb); 346 } 347 348 void ocfs2_lock_res_free(struct ocfs2_lock_res *res) 349 { 350 mlog_entry_void(); 351 352 if (!(res->l_flags & OCFS2_LOCK_INITIALIZED)) 353 return; 354 355 ocfs2_remove_lockres_tracking(res); 356 357 mlog_bug_on_msg(!list_empty(&res->l_blocked_list), 358 "Lockres %s is on the blocked list\n", 359 res->l_name); 360 mlog_bug_on_msg(!list_empty(&res->l_mask_waiters), 361 "Lockres %s has mask waiters pending\n", 362 res->l_name); 363 mlog_bug_on_msg(spin_is_locked(&res->l_lock), 364 "Lockres %s is locked\n", 365 res->l_name); 366 mlog_bug_on_msg(res->l_ro_holders, 367 "Lockres %s has %u ro holders\n", 368 res->l_name, res->l_ro_holders); 369 mlog_bug_on_msg(res->l_ex_holders, 370 "Lockres %s has %u ex holders\n", 371 res->l_name, res->l_ex_holders); 372 373 /* Need to clear out the lock status block for the dlm */ 374 memset(&res->l_lksb, 0, sizeof(res->l_lksb)); 375 376 res->l_flags = 0UL; 377 mlog_exit_void(); 378 } 379 380 static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, 381 int level) 382 { 383 mlog_entry_void(); 384 385 BUG_ON(!lockres); 386 387 switch(level) { 388 case LKM_EXMODE: 389 lockres->l_ex_holders++; 390 break; 391 case LKM_PRMODE: 392 lockres->l_ro_holders++; 393 break; 394 default: 395 BUG(); 396 } 397 398 mlog_exit_void(); 399 } 400 401 static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, 402 int level) 403 { 404 mlog_entry_void(); 405 406 BUG_ON(!lockres); 407 408 switch(level) { 409 case LKM_EXMODE: 410 BUG_ON(!lockres->l_ex_holders); 411 lockres->l_ex_holders--; 412 break; 413 case LKM_PRMODE: 414 BUG_ON(!lockres->l_ro_holders); 415 lockres->l_ro_holders--; 416 break; 417 default: 418 BUG(); 419 } 420 mlog_exit_void(); 421 } 422 423 /* WARNING: This function lives in a world where the only three lock 424 * levels are EX, PR, and NL. 
It *will* have to be adjusted when more 425 * lock types are added. */ 426 static inline int ocfs2_highest_compat_lock_level(int level) 427 { 428 int new_level = LKM_EXMODE; 429 430 if (level == LKM_EXMODE) 431 new_level = LKM_NLMODE; 432 else if (level == LKM_PRMODE) 433 new_level = LKM_PRMODE; 434 return new_level; 435 } 436 437 static void lockres_set_flags(struct ocfs2_lock_res *lockres, 438 unsigned long newflags) 439 { 440 struct list_head *pos, *tmp; 441 struct ocfs2_mask_waiter *mw; 442 443 assert_spin_locked(&lockres->l_lock); 444 445 lockres->l_flags = newflags; 446 447 list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) { 448 mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item); 449 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) 450 continue; 451 452 list_del_init(&mw->mw_item); 453 mw->mw_status = 0; 454 complete(&mw->mw_complete); 455 } 456 } 457 static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or) 458 { 459 lockres_set_flags(lockres, lockres->l_flags | or); 460 } 461 static void lockres_clear_flags(struct ocfs2_lock_res *lockres, 462 unsigned long clear) 463 { 464 lockres_set_flags(lockres, lockres->l_flags & ~clear); 465 } 466 467 static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres) 468 { 469 mlog_entry_void(); 470 471 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); 472 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); 473 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); 474 BUG_ON(lockres->l_blocking <= LKM_NLMODE); 475 476 lockres->l_level = lockres->l_requested; 477 if (lockres->l_level <= 478 ocfs2_highest_compat_lock_level(lockres->l_blocking)) { 479 lockres->l_blocking = LKM_NLMODE; 480 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); 481 } 482 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 483 484 mlog_exit_void(); 485 } 486 487 static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres) 488 { 489 mlog_entry_void(); 490 491 
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); 492 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); 493 494 /* Convert from RO to EX doesn't really need anything as our 495 * information is already up to data. Convert from NL to 496 * *anything* however should mark ourselves as needing an 497 * update */ 498 if (lockres->l_level == LKM_NLMODE) 499 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 500 501 lockres->l_level = lockres->l_requested; 502 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 503 504 mlog_exit_void(); 505 } 506 507 static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres) 508 { 509 mlog_entry_void(); 510 511 BUG_ON((!lockres->l_flags & OCFS2_LOCK_BUSY)); 512 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); 513 514 if (lockres->l_requested > LKM_NLMODE && 515 !(lockres->l_flags & OCFS2_LOCK_LOCAL)) 516 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 517 518 lockres->l_level = lockres->l_requested; 519 lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED); 520 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 521 522 mlog_exit_void(); 523 } 524 525 static void ocfs2_inode_ast_func(void *opaque) 526 { 527 struct ocfs2_lock_res *lockres = opaque; 528 struct inode *inode; 529 struct dlm_lockstatus *lksb; 530 unsigned long flags; 531 532 mlog_entry_void(); 533 534 inode = ocfs2_lock_res_inode(lockres); 535 536 mlog(0, "AST fired for inode %llu, l_action = %u, type = %s\n", 537 (unsigned long long)OCFS2_I(inode)->ip_blkno, lockres->l_action, 538 ocfs2_lock_type_string(lockres->l_type)); 539 540 BUG_ON(!ocfs2_is_inode_lock(lockres)); 541 542 spin_lock_irqsave(&lockres->l_lock, flags); 543 544 lksb = &(lockres->l_lksb); 545 if (lksb->status != DLM_NORMAL) { 546 mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u " 547 "on inode %llu\n", lksb->status, 548 (unsigned long long)OCFS2_I(inode)->ip_blkno); 549 spin_unlock_irqrestore(&lockres->l_lock, flags); 550 mlog_exit_void(); 551 return; 552 } 553 554 
switch(lockres->l_action) { 555 case OCFS2_AST_ATTACH: 556 ocfs2_generic_handle_attach_action(lockres); 557 lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL); 558 break; 559 case OCFS2_AST_CONVERT: 560 ocfs2_generic_handle_convert_action(lockres); 561 break; 562 case OCFS2_AST_DOWNCONVERT: 563 ocfs2_generic_handle_downconvert_action(lockres); 564 break; 565 default: 566 mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u " 567 "lockres flags = 0x%lx, unlock action: %u\n", 568 lockres->l_name, lockres->l_action, lockres->l_flags, 569 lockres->l_unlock_action); 570 571 BUG(); 572 } 573 574 /* data and rw locking ignores refresh flag for now. */ 575 if (lockres->l_type != OCFS2_LOCK_TYPE_META) 576 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 577 578 /* set it to something invalid so if we get called again we 579 * can catch it. */ 580 lockres->l_action = OCFS2_AST_INVALID; 581 spin_unlock_irqrestore(&lockres->l_lock, flags); 582 wake_up(&lockres->l_event); 583 584 mlog_exit_void(); 585 } 586 587 static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, 588 int level) 589 { 590 int needs_downconvert = 0; 591 mlog_entry_void(); 592 593 assert_spin_locked(&lockres->l_lock); 594 595 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); 596 597 if (level > lockres->l_blocking) { 598 /* only schedule a downconvert if we haven't already scheduled 599 * one that goes low enough to satisfy the level we're 600 * blocking. 
this also catches the case where we get 601 * duplicate BASTs */ 602 if (ocfs2_highest_compat_lock_level(level) < 603 ocfs2_highest_compat_lock_level(lockres->l_blocking)) 604 needs_downconvert = 1; 605 606 lockres->l_blocking = level; 607 } 608 609 mlog_exit(needs_downconvert); 610 return needs_downconvert; 611 } 612 613 static void ocfs2_generic_bast_func(struct ocfs2_super *osb, 614 struct ocfs2_lock_res *lockres, 615 int level) 616 { 617 int needs_downconvert; 618 unsigned long flags; 619 620 mlog_entry_void(); 621 622 BUG_ON(level <= LKM_NLMODE); 623 624 spin_lock_irqsave(&lockres->l_lock, flags); 625 needs_downconvert = ocfs2_generic_handle_bast(lockres, level); 626 if (needs_downconvert) 627 ocfs2_schedule_blocked_lock(osb, lockres); 628 spin_unlock_irqrestore(&lockres->l_lock, flags); 629 630 ocfs2_kick_vote_thread(osb); 631 632 wake_up(&lockres->l_event); 633 mlog_exit_void(); 634 } 635 636 static void ocfs2_inode_bast_func(void *opaque, int level) 637 { 638 struct ocfs2_lock_res *lockres = opaque; 639 struct inode *inode; 640 struct ocfs2_super *osb; 641 642 mlog_entry_void(); 643 644 BUG_ON(!ocfs2_is_inode_lock(lockres)); 645 646 inode = ocfs2_lock_res_inode(lockres); 647 osb = OCFS2_SB(inode->i_sb); 648 649 mlog(0, "BAST fired for inode %llu, blocking %d, level %d type %s\n", 650 (unsigned long long)OCFS2_I(inode)->ip_blkno, level, 651 lockres->l_level, ocfs2_lock_type_string(lockres->l_type)); 652 653 ocfs2_generic_bast_func(osb, lockres, level); 654 655 mlog_exit_void(); 656 } 657 658 static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres, 659 int ignore_refresh) 660 { 661 struct dlm_lockstatus *lksb = &lockres->l_lksb; 662 unsigned long flags; 663 664 spin_lock_irqsave(&lockres->l_lock, flags); 665 666 if (lksb->status != DLM_NORMAL) { 667 mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n", 668 lockres->l_name, lksb->status); 669 spin_unlock_irqrestore(&lockres->l_lock, flags); 670 return; 671 } 672 673 switch(lockres->l_action) { 674 
case OCFS2_AST_ATTACH: 675 ocfs2_generic_handle_attach_action(lockres); 676 break; 677 case OCFS2_AST_CONVERT: 678 ocfs2_generic_handle_convert_action(lockres); 679 break; 680 case OCFS2_AST_DOWNCONVERT: 681 ocfs2_generic_handle_downconvert_action(lockres); 682 break; 683 default: 684 BUG(); 685 } 686 687 if (ignore_refresh) 688 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 689 690 /* set it to something invalid so if we get called again we 691 * can catch it. */ 692 lockres->l_action = OCFS2_AST_INVALID; 693 spin_unlock_irqrestore(&lockres->l_lock, flags); 694 695 wake_up(&lockres->l_event); 696 } 697 698 static void ocfs2_super_ast_func(void *opaque) 699 { 700 struct ocfs2_lock_res *lockres = opaque; 701 702 mlog_entry_void(); 703 mlog(0, "Superblock AST fired\n"); 704 705 BUG_ON(!ocfs2_is_super_lock(lockres)); 706 ocfs2_generic_ast_func(lockres, 0); 707 708 mlog_exit_void(); 709 } 710 711 static void ocfs2_super_bast_func(void *opaque, 712 int level) 713 { 714 struct ocfs2_lock_res *lockres = opaque; 715 struct ocfs2_super *osb; 716 717 mlog_entry_void(); 718 mlog(0, "Superblock BAST fired\n"); 719 720 BUG_ON(!ocfs2_is_super_lock(lockres)); 721 osb = ocfs2_lock_res_super(lockres); 722 ocfs2_generic_bast_func(osb, lockres, level); 723 724 mlog_exit_void(); 725 } 726 727 static void ocfs2_rename_ast_func(void *opaque) 728 { 729 struct ocfs2_lock_res *lockres = opaque; 730 731 mlog_entry_void(); 732 733 mlog(0, "Rename AST fired\n"); 734 735 BUG_ON(!ocfs2_is_rename_lock(lockres)); 736 737 ocfs2_generic_ast_func(lockres, 1); 738 739 mlog_exit_void(); 740 } 741 742 static void ocfs2_rename_bast_func(void *opaque, 743 int level) 744 { 745 struct ocfs2_lock_res *lockres = opaque; 746 struct ocfs2_super *osb; 747 748 mlog_entry_void(); 749 750 mlog(0, "Rename BAST fired\n"); 751 752 BUG_ON(!ocfs2_is_rename_lock(lockres)); 753 754 osb = ocfs2_lock_res_super(lockres); 755 ocfs2_generic_bast_func(osb, lockres, level); 756 757 mlog_exit_void(); 758 } 759 760 
static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, 761 int convert) 762 { 763 unsigned long flags; 764 765 mlog_entry_void(); 766 spin_lock_irqsave(&lockres->l_lock, flags); 767 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 768 if (convert) 769 lockres->l_action = OCFS2_AST_INVALID; 770 else 771 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; 772 spin_unlock_irqrestore(&lockres->l_lock, flags); 773 774 wake_up(&lockres->l_event); 775 mlog_exit_void(); 776 } 777 778 /* Note: If we detect another process working on the lock (i.e., 779 * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller 780 * to do the right thing in that case. 781 */ 782 static int ocfs2_lock_create(struct ocfs2_super *osb, 783 struct ocfs2_lock_res *lockres, 784 int level, 785 int dlm_flags) 786 { 787 int ret = 0; 788 enum dlm_status status; 789 unsigned long flags; 790 791 mlog_entry_void(); 792 793 mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level, 794 dlm_flags); 795 796 spin_lock_irqsave(&lockres->l_lock, flags); 797 if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) || 798 (lockres->l_flags & OCFS2_LOCK_BUSY)) { 799 spin_unlock_irqrestore(&lockres->l_lock, flags); 800 goto bail; 801 } 802 803 lockres->l_action = OCFS2_AST_ATTACH; 804 lockres->l_requested = level; 805 lockres_or_flags(lockres, OCFS2_LOCK_BUSY); 806 spin_unlock_irqrestore(&lockres->l_lock, flags); 807 808 status = dlmlock(osb->dlm, 809 level, 810 &lockres->l_lksb, 811 dlm_flags, 812 lockres->l_name, 813 lockres->l_ops->ast, 814 lockres, 815 lockres->l_ops->bast); 816 if (status != DLM_NORMAL) { 817 ocfs2_log_dlm_error("dlmlock", status, lockres); 818 ret = -EINVAL; 819 ocfs2_recover_from_dlm_error(lockres, 1); 820 } 821 822 mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name); 823 824 bail: 825 mlog_exit(ret); 826 return ret; 827 } 828 829 static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres, 830 int flag) 831 { 832 unsigned long 
flags; 833 int ret; 834 835 spin_lock_irqsave(&lockres->l_lock, flags); 836 ret = lockres->l_flags & flag; 837 spin_unlock_irqrestore(&lockres->l_lock, flags); 838 839 return ret; 840 } 841 842 static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres) 843 844 { 845 wait_event(lockres->l_event, 846 !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY)); 847 } 848 849 static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres) 850 851 { 852 wait_event(lockres->l_event, 853 !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING)); 854 } 855 856 /* predict what lock level we'll be dropping down to on behalf 857 * of another node, and return true if the currently wanted 858 * level will be compatible with it. */ 859 static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, 860 int wanted) 861 { 862 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); 863 864 return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking); 865 } 866 867 static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw) 868 { 869 INIT_LIST_HEAD(&mw->mw_item); 870 init_completion(&mw->mw_complete); 871 } 872 873 static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) 874 { 875 wait_for_completion(&mw->mw_complete); 876 /* Re-arm the completion in case we want to wait on it again */ 877 INIT_COMPLETION(mw->mw_complete); 878 return mw->mw_status; 879 } 880 881 static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres, 882 struct ocfs2_mask_waiter *mw, 883 unsigned long mask, 884 unsigned long goal) 885 { 886 BUG_ON(!list_empty(&mw->mw_item)); 887 888 assert_spin_locked(&lockres->l_lock); 889 890 list_add_tail(&mw->mw_item, &lockres->l_mask_waiters); 891 mw->mw_mask = mask; 892 mw->mw_goal = goal; 893 } 894 895 /* returns 0 if the mw that was removed was already satisfied, -EBUSY 896 * if the mask still hadn't reached its goal */ 897 static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, 898 struct 
ocfs2_mask_waiter *mw) 899 { 900 unsigned long flags; 901 int ret = 0; 902 903 spin_lock_irqsave(&lockres->l_lock, flags); 904 if (!list_empty(&mw->mw_item)) { 905 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) 906 ret = -EBUSY; 907 908 list_del_init(&mw->mw_item); 909 init_completion(&mw->mw_complete); 910 } 911 spin_unlock_irqrestore(&lockres->l_lock, flags); 912 913 return ret; 914 915 } 916 917 static int ocfs2_cluster_lock(struct ocfs2_super *osb, 918 struct ocfs2_lock_res *lockres, 919 int level, 920 int lkm_flags, 921 int arg_flags) 922 { 923 struct ocfs2_mask_waiter mw; 924 enum dlm_status status; 925 int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); 926 int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ 927 unsigned long flags; 928 929 mlog_entry_void(); 930 931 ocfs2_init_mask_waiter(&mw); 932 933 again: 934 wait = 0; 935 936 if (catch_signals && signal_pending(current)) { 937 ret = -ERESTARTSYS; 938 goto out; 939 } 940 941 spin_lock_irqsave(&lockres->l_lock, flags); 942 943 mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING, 944 "Cluster lock called on freeing lockres %s! flags " 945 "0x%lx\n", lockres->l_name, lockres->l_flags); 946 947 /* We only compare against the currently granted level 948 * here. If the lock is blocked waiting on a downconvert, 949 * we'll get caught below. */ 950 if (lockres->l_flags & OCFS2_LOCK_BUSY && 951 level > lockres->l_level) { 952 /* is someone sitting in dlm_lock? If so, wait on 953 * them. */ 954 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); 955 wait = 1; 956 goto unlock; 957 } 958 959 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { 960 /* lock has not been created yet. 
 */
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		/* Attach at NL first; retry the whole request from the
		 * top once the lockres exists. */
		ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
		goto again;
	}

	if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
	    !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
		/* the lock is currently blocked on behalf of another
		 * node; wait for OCFS2_LOCK_BLOCKED to clear. */
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
		wait = 1;
		goto unlock;
	}

	if (level > lockres->l_level) {
		if (lockres->l_action != OCFS2_AST_INVALID)
			mlog(ML_ERROR, "lockres %s has action %u pending\n",
			     lockres->l_name, lockres->l_action);

		lockres->l_action = OCFS2_AST_CONVERT;
		lockres->l_requested = level;
		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		BUG_ON(level == LKM_IVMODE);
		BUG_ON(level == LKM_NLMODE);

		mlog(0, "lock %s, convert from %d to level = %d\n",
		     lockres->l_name, lockres->l_level, level);

		/* call dlm_lock to upgrade lock now */
		status = dlmlock(osb->dlm,
				 level,
				 &lockres->l_lksb,
				 lkm_flags|LKM_CONVERT|LKM_VALBLK,
				 lockres->l_name,
				 lockres->l_ops->ast,
				 lockres,
				 lockres->l_ops->bast);
		if (status != DLM_NORMAL) {
			/* A NOQUEUE request that couldn't be granted
			 * immediately is reported as -EAGAIN, not an
			 * error. */
			if ((lkm_flags & LKM_NOQUEUE) &&
			    (status == DLM_NOTQUEUED))
				ret = -EAGAIN;
			else {
				ocfs2_log_dlm_error("dlmlock", status,
						    lockres);
				ret = -EINVAL;
			}
			ocfs2_recover_from_dlm_error(lockres, 1);
			goto out;
		}

		mlog(0, "lock %s, successfull return from dlmlock\n",
		     lockres->l_name);

		/* At this point we've gone inside the dlm and need to
		 * complete our work regardless. */
		catch_signals = 0;

		/* wait for busy to clear and carry on */
		goto again;
	}

	/* Ok, if we get here then we're good to go.
*/ 1030 ocfs2_inc_holders(lockres, level); 1031 1032 ret = 0; 1033 unlock: 1034 spin_unlock_irqrestore(&lockres->l_lock, flags); 1035 out: 1036 /* 1037 * This is helping work around a lock inversion between the page lock 1038 * and dlm locks. One path holds the page lock while calling aops 1039 * which block acquiring dlm locks. The voting thread holds dlm 1040 * locks while acquiring page locks while down converting data locks. 1041 * This block is helping an aop path notice the inversion and back 1042 * off to unlock its page lock before trying the dlm lock again. 1043 */ 1044 if (wait && arg_flags & OCFS2_LOCK_NONBLOCK && 1045 mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) { 1046 wait = 0; 1047 if (lockres_remove_mask_waiter(lockres, &mw)) 1048 ret = -EAGAIN; 1049 else 1050 goto again; 1051 } 1052 if (wait) { 1053 ret = ocfs2_wait_for_mask(&mw); 1054 if (ret == 0) 1055 goto again; 1056 mlog_errno(ret); 1057 } 1058 1059 mlog_exit(ret); 1060 return ret; 1061 } 1062 1063 static void ocfs2_cluster_unlock(struct ocfs2_super *osb, 1064 struct ocfs2_lock_res *lockres, 1065 int level) 1066 { 1067 unsigned long flags; 1068 1069 mlog_entry_void(); 1070 spin_lock_irqsave(&lockres->l_lock, flags); 1071 ocfs2_dec_holders(lockres, level); 1072 ocfs2_vote_on_unlock(osb, lockres); 1073 spin_unlock_irqrestore(&lockres->l_lock, flags); 1074 mlog_exit_void(); 1075 } 1076 1077 static int ocfs2_create_new_inode_lock(struct inode *inode, 1078 struct ocfs2_lock_res *lockres) 1079 { 1080 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1081 unsigned long flags; 1082 1083 spin_lock_irqsave(&lockres->l_lock, flags); 1084 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); 1085 lockres_or_flags(lockres, OCFS2_LOCK_LOCAL); 1086 spin_unlock_irqrestore(&lockres->l_lock, flags); 1087 1088 return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL); 1089 } 1090 1091 /* Grants us an EX lock on the data and metadata resources, skipping 1092 * the normal cluster directory lookup. 
Use this ONLY on newly created
 * inodes which other nodes can't possibly see, and which haven't been
 * hashed in the inode hash yet. This can give us a good performance
 * increase as it'll skip the network broadcast normally associated
 * with creating a new lock resource. */
int ocfs2_create_new_inode_locks(struct inode *inode)
{
	int ret;

	BUG_ON(!inode);
	BUG_ON(!ocfs2_inode_is_new(inode));

	mlog_entry_void();

	mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);

	/* NOTE: That we don't increment any of the holder counts, nor
	 * do we add anything to a journal handle.  Since this is
	 * supposed to be a new inode which the cluster doesn't know
	 * about yet, there is no need to.  As far as the LVB handling
	 * is concerned, this is basically like acquiring an EX lock
	 * on a resource which has an invalid one -- we'll set it
	 * valid when we release the EX. */

	ret = ocfs2_create_new_inode_lock(inode,
					  &OCFS2_I(inode)->ip_rw_lockres);
	if (ret) {
		mlog_errno(ret);
		goto bail;
	}

	ret = ocfs2_create_new_inode_lock(inode,
					  &OCFS2_I(inode)->ip_meta_lockres);
	if (ret) {
		mlog_errno(ret);
		goto bail;
	}

	ret = ocfs2_create_new_inode_lock(inode,
					  &OCFS2_I(inode)->ip_data_lockres);
	if (ret) {
		mlog_errno(ret);
		goto bail;
	}

bail:
	mlog_exit(ret);
	return ret;
}

/* Take the inode's RW cluster lock: EX for writers, PR (shared) for
 * readers.  Returns 0 or a negative errno. */
int ocfs2_rw_lock(struct inode *inode, int write)
{
	int status, level;
	struct ocfs2_lock_res *lockres;

	BUG_ON(!inode);

	mlog_entry_void();

	mlog(0, "inode %llu take %s RW lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     write ? "EXMODE" : "PRMODE");

	lockres = &OCFS2_I(inode)->ip_rw_lockres;

	level = write ? LKM_EXMODE : LKM_PRMODE;

	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
				    0);
	if (status < 0)
		mlog_errno(status);

	mlog_exit(status);
	return status;
}

/* Release the RW cluster lock taken by ocfs2_rw_lock() at the same
 * @write level. */
void ocfs2_rw_unlock(struct inode *inode, int write)
{
	int level = write ? LKM_EXMODE : LKM_PRMODE;
	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;

	mlog_entry_void();

	mlog(0, "inode %llu drop %s RW lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     write ? "EXMODE" : "PRMODE");

	ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);

	mlog_exit_void();
}

/* Take the inode's DATA cluster lock.  @arg_flags is passed through to
 * ocfs2_cluster_lock() (e.g. OCFS2_LOCK_NONBLOCK).  Returns 0, -EROFS
 * for a write on a hard-readonly device, -EAGAIN for a nonblocking
 * miss, or another negative errno. */
int ocfs2_data_lock_full(struct inode *inode,
			 int write,
			 int arg_flags)
{
	int status = 0, level;
	struct ocfs2_lock_res *lockres;

	BUG_ON(!inode);

	mlog_entry_void();

	mlog(0, "inode %llu take %s DATA lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     write ? "EXMODE" : "PRMODE");

	/* We'll allow faking a readonly data lock for
	 * rodevices. */
	if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
		if (write) {
			status = -EROFS;
			mlog_errno(status);
		}
		goto out;
	}

	lockres = &OCFS2_I(inode)->ip_data_lockres;

	level = write ? LKM_EXMODE : LKM_PRMODE;

	/* -EAGAIN is an expected outcome for OCFS2_LOCK_NONBLOCK
	 * callers, so don't log it as an error. */
	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
				    0, arg_flags);
	if (status < 0 && status != -EAGAIN)
		mlog_errno(status);

out:
	mlog_exit(status);
	return status;
}

/* see ocfs2_meta_lock_with_page() */
int ocfs2_data_lock_with_page(struct inode *inode,
			      int write,
			      struct page *page)
{
	int ret;

	ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
	if (ret == -EAGAIN) {
		unlock_page(page);
		/* Blocking lock + immediate unlock to warm the lock on
		 * this node before the VFS retries the aop. */
		if (ocfs2_data_lock(inode, write) == 0)
			ocfs2_data_unlock(inode, write);
		ret = AOP_TRUNCATED_PAGE;
	}

	return ret;
}

/* Called under l_lock from ocfs2_cluster_unlock().  Decide whether
 * dropping this holder might let a blocked remote request proceed, and
 * if so kick the vote thread. */
static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
				 struct ocfs2_lock_res *lockres)
{
	int kick = 0;

	mlog_entry_void();

	/* If we know that another node is waiting on our lock, kick
	 * the vote thread pre-emptively when we reach a release
	 * condition. */
	if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
		switch(lockres->l_blocking) {
		case LKM_EXMODE:
			/* A remote EX conflicts with any local holder. */
			if (!lockres->l_ex_holders && !lockres->l_ro_holders)
				kick = 1;
			break;
		case LKM_PRMODE:
			/* A remote PR only conflicts with local EX. */
			if (!lockres->l_ex_holders)
				kick = 1;
			break;
		default:
			BUG();
		}
	}

	if (kick)
		ocfs2_kick_vote_thread(osb);

	mlog_exit_void();
}

/* Release the DATA cluster lock taken by ocfs2_data_lock_full() at the
 * same @write level.  No-op on hard-readonly devices, mirroring the
 * faked lock in the acquire path. */
void ocfs2_data_unlock(struct inode *inode,
		       int write)
{
	int level = write ? LKM_EXMODE : LKM_PRMODE;
	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;

	mlog_entry_void();

	mlog(0, "inode %llu drop %s DATA lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     write ? "EXMODE" : "PRMODE");

	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);

	mlog_exit_void();
}

/* 34 bits of seconds in the high part of the u64, 30 bits of
 * nanoseconds in the low part (999999999 < 2^30, so nsec fits). */
#define OCFS2_SEC_BITS   34
#define OCFS2_SEC_SHIFT  (64 - 34)
#define OCFS2_NSEC_MASK  ((1ULL << OCFS2_SEC_SHIFT) - 1)

/* LVB only has room for 64 bits of time here so we pack it for
 * now. */
static u64 ocfs2_pack_timespec(struct timespec *spec)
{
	u64 res;
	u64 sec = spec->tv_sec;
	u32 nsec = spec->tv_nsec;

	res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);

	return res;
}

/* Call this with the lockres locked. I am reasonably sure we don't
 * need ip_lock in this function as anyone who would be changing those
 * values is supposed to be blocked in ocfs2_meta_lock right now. */
static void __ocfs2_stuff_meta_lvb(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
	struct ocfs2_meta_lvb *lvb;

	mlog_entry_void();

	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;

	/* All LVB fields are stored big-endian so every node decodes
	 * them identically. */
	lvb->lvb_version   = cpu_to_be32(OCFS2_LVB_VERSION);
	lvb->lvb_isize     = cpu_to_be64(i_size_read(inode));
	lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
	lvb->lvb_iuid      = cpu_to_be32(inode->i_uid);
	lvb->lvb_igid      = cpu_to_be32(inode->i_gid);
	lvb->lvb_imode     = cpu_to_be16(inode->i_mode);
	lvb->lvb_inlink    = cpu_to_be16(inode->i_nlink);
	lvb->lvb_iatime_packed  =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
	lvb->lvb_ictime_packed =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
	lvb->lvb_imtime_packed =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));

	mlog_meta_lvb(0, lockres);

	mlog_exit_void();
}

/* Inverse of ocfs2_pack_timespec(). */
static void ocfs2_unpack_timespec(struct timespec *spec,
				  u64 packed_time)
{
	spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
	spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
}

/* Copy the inode fields cached in the meta lockres' LVB back into the
 * VFS inode, avoiding a disk read.  Caller must hold the meta lock;
 * see the trust check in ocfs2_meta_lock_update(). */
static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
	struct ocfs2_meta_lvb *lvb;

	mlog_entry_void();

	mlog_meta_lvb(0, lockres);

	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;

	/* We're safe here without the lockres lock... */
	spin_lock(&oi->ip_lock);
	oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
	i_size_write(inode, be64_to_cpu(lvb->lvb_isize));

	/* fast-symlinks are a special case */
	if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
		inode->i_blocks = 0;
	else
		inode->i_blocks =
			ocfs2_align_bytes_to_sectors(i_size_read(inode));

	inode->i_uid     = be32_to_cpu(lvb->lvb_iuid);
	inode->i_gid     = be32_to_cpu(lvb->lvb_igid);
	inode->i_mode    = be16_to_cpu(lvb->lvb_imode);
	inode->i_nlink   = be16_to_cpu(lvb->lvb_inlink);
	ocfs2_unpack_timespec(&inode->i_atime,
			      be64_to_cpu(lvb->lvb_iatime_packed));
	ocfs2_unpack_timespec(&inode->i_mtime,
			      be64_to_cpu(lvb->lvb_imtime_packed));
	ocfs2_unpack_timespec(&inode->i_ctime,
			      be64_to_cpu(lvb->lvb_ictime_packed));
	spin_unlock(&oi->ip_lock);

	mlog_exit_void();
}

/* An LVB is only usable if it carries the version number we know how
 * to decode; otherwise fall back to reading the dinode from disk. */
static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres)
{
	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;

	if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION)
		return 1;
	return 0;
}

/* Determine whether a lock resource needs to be refreshed, and
 * arbitrate who gets to refresh it.
 *
 * 0 means no refresh needed.
 *
 * > 0 means you need to refresh this and you MUST call
 * ocfs2_complete_lock_res_refresh afterwards.
 */
static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
{
	unsigned long flags;
	int status = 0;

	mlog_entry_void();

refresh_check:
	spin_lock_irqsave(&lockres->l_lock, flags);
	if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		goto bail;
	}

	/* Someone else is already refreshing; wait for them to finish
	 * and then re-test from the top. */
	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		ocfs2_wait_on_refreshing_lock(lockres);
		goto refresh_check;
	}

	/* Ok, I'll be the one to refresh this lock. */
	lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	status = 1;
bail:
	mlog_exit(status);
	return status;
}

/* If status is non zero, I'll mark it as not being in refresh
 * anymore, but i won't clear the needs refresh flag. */
static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
						   int status)
{
	unsigned long flags;
	mlog_entry_void();

	spin_lock_irqsave(&lockres->l_lock, flags);
	lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
	if (!status)
		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	/* wake anyone parked in ocfs2_wait_on_refreshing_lock() */
	wake_up(&lockres->l_event);

	mlog_exit_void();
}

/* may or may not return a bh if it went to disk. */
static int ocfs2_meta_lock_update(struct inode *inode,
				  struct buffer_head **bh)
{
	int status = 0;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres;
	struct ocfs2_dinode *fe;

	mlog_entry_void();

	spin_lock(&oi->ip_lock);
	if (oi->ip_flags & OCFS2_INODE_DELETED) {
		mlog(0, "Orphaned inode %llu was deleted while we "
		     "were waiting on a lock. ip_flags = 0x%x\n",
		     (unsigned long long)oi->ip_blkno, oi->ip_flags);
		spin_unlock(&oi->ip_lock);
		status = -ENOENT;
		goto bail;
	}
	spin_unlock(&oi->ip_lock);

	lockres = &oi->ip_meta_lockres;

	if (!ocfs2_should_refresh_lock_res(lockres))
		goto bail;

	/* This will discard any caching information we might have had
	 * for the inode metadata. */
	ocfs2_metadata_cache_purge(inode);

	/* will do nothing for inode types that don't use the extent
	 * map (directories, bitmap files, etc) */
	ocfs2_extent_map_trunc(inode, 0);

	if (ocfs2_meta_lvb_is_trustable(lockres)) {
		mlog(0, "Trusting LVB on inode %llu\n",
		     (unsigned long long)oi->ip_blkno);
		ocfs2_refresh_inode_from_lvb(inode);
	} else {
		/* Boo, we have to go to disk. */
		/* read bh, cast, ocfs2_refresh_inode */
		status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
					  bh, OCFS2_BH_CACHED, inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail_refresh;
		}
		fe = (struct ocfs2_dinode *) (*bh)->b_data;

		/* This is a good chance to make sure we're not
		 * locking an invalid object.
		 *
		 * We bug on a stale inode here because we checked
		 * above whether it was wiped from disk. The wiping
		 * node provides a guarantee that we receive that
		 * message and can mark the inode before dropping any
		 * locks associated with it. */
		if (!OCFS2_IS_VALID_DINODE(fe)) {
			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
			status = -EIO;
			goto bail_refresh;
		}
		mlog_bug_on_msg(inode->i_generation !=
				le32_to_cpu(fe->i_generation),
				"Invalid dinode %llu disk generation: %u "
				"inode->i_generation: %u\n",
				(unsigned long long)oi->ip_blkno,
				le32_to_cpu(fe->i_generation),
				inode->i_generation);
		mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
				!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
				"Stale dinode %llu dtime: %llu flags: 0x%x\n",
				(unsigned long long)oi->ip_blkno,
				(unsigned long long)le64_to_cpu(fe->i_dtime),
				le32_to_cpu(fe->i_flags));

		ocfs2_refresh_inode(inode, fe);
	}

	status = 0;
bail_refresh:
	/* Always pair with ocfs2_should_refresh_lock_res() -- on error
	 * this clears REFRESHING but leaves NEEDS_REFRESH set. */
	ocfs2_complete_lock_res_refresh(lockres, status);
bail:
	mlog_exit(status);
	return status;
}

/* Hand the caller a buffer_head for the inode's dinode: reuse
 * @passed_bh (taking an extra reference) if the lock update already
 * read it, otherwise read it from the block layer now. */
static int ocfs2_assign_bh(struct inode *inode,
			   struct buffer_head **ret_bh,
			   struct buffer_head *passed_bh)
{
	int status;

	if (passed_bh) {
		/* Ok, the update went to disk for us, use the
		 * returned bh. */
		*ret_bh = passed_bh;
		get_bh(*ret_bh);

		return 0;
	}

	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				  OCFS2_I(inode)->ip_blkno,
				  ret_bh,
				  OCFS2_BH_CACHED,
				  inode);
	if (status < 0)
		mlog_errno(status);

	return status;
}

/*
 * returns < 0 error if the callback will never be called, otherwise
 * the result of the lock will be communicated via the callback.
 */
int ocfs2_meta_lock_full(struct inode *inode,
			 struct ocfs2_journal_handle *handle,
			 struct buffer_head **ret_bh,
			 int ex,
			 int arg_flags)
{
	int status, level, dlm_flags, acquired;
	struct ocfs2_lock_res *lockres;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct buffer_head *local_bh = NULL;

	BUG_ON(!inode);

	mlog_entry_void();

	mlog(0, "inode %llu, take %s META lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     ex ? "EXMODE" : "PRMODE");

	status = 0;
	acquired = 0;
	/* We'll allow faking a readonly metadata lock for
	 * rodevices. */
	if (ocfs2_is_hard_readonly(osb)) {
		if (ex)
			status = -EROFS;
		goto bail;
	}

	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
		wait_event(osb->recovery_event,
			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));

	/* NOTE(review): 'acquired' was already zeroed above; this
	 * second assignment is redundant. */
	acquired = 0;
	lockres = &OCFS2_I(inode)->ip_meta_lockres;
	level = ex ? LKM_EXMODE : LKM_PRMODE;
	dlm_flags = 0;
	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
		dlm_flags |= LKM_NOQUEUE;

	status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
	if (status < 0) {
		if (status != -EAGAIN && status != -EIOCBRETRY)
			mlog_errno(status);
		goto bail;
	}

	/* Notify the error cleanup path to drop the cluster lock. */
	acquired = 1;

	/* We wait twice because a node may have died while we were in
	 * the lower dlm layers. The second time though, we've
	 * committed to owning this lock so we don't allow signals to
	 * abort the operation. */
	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
		wait_event(osb->recovery_event,
			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));

	/* This is fun. The caller may want a bh back, or it may
	 * not. ocfs2_meta_lock_update definitely wants one in, but
	 * may or may not read one, depending on what's in the
	 * LVB. The result of all of this is that we've *only* gone to
	 * disk if we have to, so the complexity is worthwhile. */
	status = ocfs2_meta_lock_update(inode, &local_bh);
	if (status < 0) {
		/* -ENOENT here means the inode was wiped while we
		 * waited; that's an expected race, not a log-worthy
		 * error. */
		if (status != -ENOENT)
			mlog_errno(status);
		goto bail;
	}

	if (ret_bh) {
		status = ocfs2_assign_bh(inode, ret_bh, local_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	if (handle) {
		status = ocfs2_handle_add_lock(handle, inode);
		if (status < 0)
			mlog_errno(status);
	}

bail:
	if (status < 0) {
		if (ret_bh && (*ret_bh)) {
			brelse(*ret_bh);
			*ret_bh = NULL;
		}
		if (acquired)
			ocfs2_meta_unlock(inode, ex);
	}

	if (local_bh)
		brelse(local_bh);

	mlog_exit(status);
	return status;
}

/*
 * This is working around a lock inversion between tasks acquiring DLM locks
 * while holding a page lock and the vote thread which blocks dlm lock acquiry
 * while acquiring page locks.
 *
 * ** These _with_page variants are only intended to be called from aop
 * methods that hold page locks and return a very specific *positive* error
 * code that aop methods pass up to the VFS -- test for errors with != 0. **
 *
 * The DLM is called such that it returns -EAGAIN if it would have blocked
 * waiting for the vote thread. In that case we unlock our page so the vote
 * thread can make progress. Once we've done this we have to return
 * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
 * into the VFS who will then immediately retry the aop call.
 *
 * We do a blocking lock and immediate unlock before returning, though, so that
 * the lock has a great chance of being cached on this node by the time the VFS
 * calls back to retry the aop. This has a potential to livelock as nodes
 * ping locks back and forth, but that's a risk we're willing to take to avoid
 * the lock inversion simply.
 */
int ocfs2_meta_lock_with_page(struct inode *inode,
			      struct ocfs2_journal_handle *handle,
			      struct buffer_head **ret_bh,
			      int ex,
			      struct page *page)
{
	int ret;

	ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex,
				   OCFS2_LOCK_NONBLOCK);
	if (ret == -EAGAIN) {
		unlock_page(page);
		if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0)
			ocfs2_meta_unlock(inode, ex);
		ret = AOP_TRUNCATED_PAGE;
	}

	return ret;
}

/* Release the META cluster lock taken by ocfs2_meta_lock_full() at the
 * same @ex level.  No-op on hard-readonly devices, mirroring the faked
 * lock in the acquire path. */
void ocfs2_meta_unlock(struct inode *inode,
		       int ex)
{
	int level = ex ? LKM_EXMODE : LKM_PRMODE;
	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;

	mlog_entry_void();

	mlog(0, "inode %llu drop %s META lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     ex ? "EXMODE" : "PRMODE");

	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);

	mlog_exit_void();
}

/* Take the superblock cluster lock and, if this node wins the refresh
 * arbitration, re-read the slot map from disk.  Returns 0 or a
 * negative errno (-EROFS on hard-readonly devices). */
int ocfs2_super_lock(struct ocfs2_super *osb,
		     int ex)
{
	int status;
	int level = ex ? LKM_EXMODE : LKM_PRMODE;
	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
	struct buffer_head *bh;
	struct ocfs2_slot_info *si = osb->slot_info;

	mlog_entry_void();

	if (ocfs2_is_hard_readonly(osb))
		return -EROFS;

	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* The super block lock path is really in the best position to
	 * know when resources covered by the lock need to be
	 * refreshed, so we do it here.
Of course, making sense of 1752 * everything is up to the caller :) */ 1753 status = ocfs2_should_refresh_lock_res(lockres); 1754 if (status < 0) { 1755 mlog_errno(status); 1756 goto bail; 1757 } 1758 if (status) { 1759 bh = si->si_bh; 1760 status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, 1761 si->si_inode); 1762 if (status == 0) 1763 ocfs2_update_slot_info(si); 1764 1765 ocfs2_complete_lock_res_refresh(lockres, status); 1766 1767 if (status < 0) 1768 mlog_errno(status); 1769 } 1770 bail: 1771 mlog_exit(status); 1772 return status; 1773 } 1774 1775 void ocfs2_super_unlock(struct ocfs2_super *osb, 1776 int ex) 1777 { 1778 int level = ex ? LKM_EXMODE : LKM_PRMODE; 1779 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; 1780 1781 ocfs2_cluster_unlock(osb, lockres, level); 1782 } 1783 1784 int ocfs2_rename_lock(struct ocfs2_super *osb) 1785 { 1786 int status; 1787 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; 1788 1789 if (ocfs2_is_hard_readonly(osb)) 1790 return -EROFS; 1791 1792 status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0); 1793 if (status < 0) 1794 mlog_errno(status); 1795 1796 return status; 1797 } 1798 1799 void ocfs2_rename_unlock(struct ocfs2_super *osb) 1800 { 1801 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; 1802 1803 ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE); 1804 } 1805 1806 /* Reference counting of the dlm debug structure. We want this because 1807 * open references on the debug inodes can live on after a mount, so 1808 * we can't rely on the ocfs2_super to always exist. 
*/ 1809 static void ocfs2_dlm_debug_free(struct kref *kref) 1810 { 1811 struct ocfs2_dlm_debug *dlm_debug; 1812 1813 dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt); 1814 1815 kfree(dlm_debug); 1816 } 1817 1818 void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug) 1819 { 1820 if (dlm_debug) 1821 kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free); 1822 } 1823 1824 static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug) 1825 { 1826 kref_get(&debug->d_refcnt); 1827 } 1828 1829 struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void) 1830 { 1831 struct ocfs2_dlm_debug *dlm_debug; 1832 1833 dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL); 1834 if (!dlm_debug) { 1835 mlog_errno(-ENOMEM); 1836 goto out; 1837 } 1838 1839 kref_init(&dlm_debug->d_refcnt); 1840 INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); 1841 dlm_debug->d_locking_state = NULL; 1842 out: 1843 return dlm_debug; 1844 } 1845 1846 /* Access to this is arbitrated for us via seq_file->sem. */ 1847 struct ocfs2_dlm_seq_priv { 1848 struct ocfs2_dlm_debug *p_dlm_debug; 1849 struct ocfs2_lock_res p_iter_res; 1850 struct ocfs2_lock_res p_tmp_res; 1851 }; 1852 1853 static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start, 1854 struct ocfs2_dlm_seq_priv *priv) 1855 { 1856 struct ocfs2_lock_res *iter, *ret = NULL; 1857 struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug; 1858 1859 assert_spin_locked(&ocfs2_dlm_tracking_lock); 1860 1861 list_for_each_entry(iter, &start->l_debug_list, l_debug_list) { 1862 /* discover the head of the list */ 1863 if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) { 1864 mlog(0, "End of list found, %p\n", ret); 1865 break; 1866 } 1867 1868 /* We track our "dummy" iteration lockres' by a NULL 1869 * l_ops field. 
*/ 1870 if (iter->l_ops != NULL) { 1871 ret = iter; 1872 break; 1873 } 1874 } 1875 1876 return ret; 1877 } 1878 1879 static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos) 1880 { 1881 struct ocfs2_dlm_seq_priv *priv = m->private; 1882 struct ocfs2_lock_res *iter; 1883 1884 spin_lock(&ocfs2_dlm_tracking_lock); 1885 iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv); 1886 if (iter) { 1887 /* Since lockres' have the lifetime of their container 1888 * (which can be inodes, ocfs2_supers, etc) we want to 1889 * copy this out to a temporary lockres while still 1890 * under the spinlock. Obviously after this we can't 1891 * trust any pointers on the copy returned, but that's 1892 * ok as the information we want isn't typically held 1893 * in them. */ 1894 priv->p_tmp_res = *iter; 1895 iter = &priv->p_tmp_res; 1896 } 1897 spin_unlock(&ocfs2_dlm_tracking_lock); 1898 1899 return iter; 1900 } 1901 1902 static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v) 1903 { 1904 } 1905 1906 static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos) 1907 { 1908 struct ocfs2_dlm_seq_priv *priv = m->private; 1909 struct ocfs2_lock_res *iter = v; 1910 struct ocfs2_lock_res *dummy = &priv->p_iter_res; 1911 1912 spin_lock(&ocfs2_dlm_tracking_lock); 1913 iter = ocfs2_dlm_next_res(iter, priv); 1914 list_del_init(&dummy->l_debug_list); 1915 if (iter) { 1916 list_add(&dummy->l_debug_list, &iter->l_debug_list); 1917 priv->p_tmp_res = *iter; 1918 iter = &priv->p_tmp_res; 1919 } 1920 spin_unlock(&ocfs2_dlm_tracking_lock); 1921 1922 return iter; 1923 } 1924 1925 /* So that debugfs.ocfs2 can determine which format is being used */ 1926 #define OCFS2_DLM_DEBUG_STR_VERSION 1 1927 static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) 1928 { 1929 int i; 1930 char *lvb; 1931 struct ocfs2_lock_res *lockres = v; 1932 1933 if (!lockres) 1934 return -EINVAL; 1935 1936 seq_printf(m, "0x%x\t" 1937 "%.*s\t" 1938 "%d\t" 1939 "0x%lx\t" 1940 "0x%x\t" 1941 "0x%x\t" 1942 "%u\t" 1943 
"%u\t" 1944 "%d\t" 1945 "%d\t", 1946 OCFS2_DLM_DEBUG_STR_VERSION, 1947 OCFS2_LOCK_ID_MAX_LEN, lockres->l_name, 1948 lockres->l_level, 1949 lockres->l_flags, 1950 lockres->l_action, 1951 lockres->l_unlock_action, 1952 lockres->l_ro_holders, 1953 lockres->l_ex_holders, 1954 lockres->l_requested, 1955 lockres->l_blocking); 1956 1957 /* Dump the raw LVB */ 1958 lvb = lockres->l_lksb.lvb; 1959 for(i = 0; i < DLM_LVB_LEN; i++) 1960 seq_printf(m, "0x%x\t", lvb[i]); 1961 1962 /* End the line */ 1963 seq_printf(m, "\n"); 1964 return 0; 1965 } 1966 1967 static struct seq_operations ocfs2_dlm_seq_ops = { 1968 .start = ocfs2_dlm_seq_start, 1969 .stop = ocfs2_dlm_seq_stop, 1970 .next = ocfs2_dlm_seq_next, 1971 .show = ocfs2_dlm_seq_show, 1972 }; 1973 1974 static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file) 1975 { 1976 struct seq_file *seq = (struct seq_file *) file->private_data; 1977 struct ocfs2_dlm_seq_priv *priv = seq->private; 1978 struct ocfs2_lock_res *res = &priv->p_iter_res; 1979 1980 ocfs2_remove_lockres_tracking(res); 1981 ocfs2_put_dlm_debug(priv->p_dlm_debug); 1982 return seq_release_private(inode, file); 1983 } 1984 1985 static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) 1986 { 1987 int ret; 1988 struct ocfs2_dlm_seq_priv *priv; 1989 struct seq_file *seq; 1990 struct ocfs2_super *osb; 1991 1992 priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL); 1993 if (!priv) { 1994 ret = -ENOMEM; 1995 mlog_errno(ret); 1996 goto out; 1997 } 1998 osb = (struct ocfs2_super *) inode->u.generic_ip; 1999 ocfs2_get_dlm_debug(osb->osb_dlm_debug); 2000 priv->p_dlm_debug = osb->osb_dlm_debug; 2001 INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); 2002 2003 ret = seq_open(file, &ocfs2_dlm_seq_ops); 2004 if (ret) { 2005 kfree(priv); 2006 mlog_errno(ret); 2007 goto out; 2008 } 2009 2010 seq = (struct seq_file *) file->private_data; 2011 seq->private = priv; 2012 2013 ocfs2_add_lockres_tracking(&priv->p_iter_res, 2014 priv->p_dlm_debug); 

out:
	return ret;
}

static const struct file_operations ocfs2_dlm_debug_fops = {
	.open =		ocfs2_dlm_debug_open,
	.release =	ocfs2_dlm_debug_release,
	.read =		seq_read,
	.llseek =	seq_lseek,
};

/* Create the debugfs "locking_state" file and take a reference on the
 * debug structure for it.  Returns 0 or -EINVAL if the file could not
 * be created. */
static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
{
	int ret = 0;
	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;

	dlm_debug->d_locking_state = debugfs_create_file("locking_state",
							 S_IFREG|S_IRUSR,
							 osb->osb_debug_root,
							 osb,
							 &ocfs2_dlm_debug_fops);
	if (!dlm_debug->d_locking_state) {
		ret = -EINVAL;
		mlog(ML_ERROR,
		     "Unable to create locking state debugfs file.\n");
		goto out;
	}

	ocfs2_get_dlm_debug(dlm_debug);
out:
	return ret;
}

/* Remove the debugfs file and drop the reference taken by
 * ocfs2_dlm_init_debug(). */
static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
{
	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;

	if (dlm_debug) {
		debugfs_remove(dlm_debug->d_locking_state);
		ocfs2_put_dlm_debug(dlm_debug);
	}
}

/* Mount-time dlm bring-up: debugfs state, the vote thread, domain
 * registration and the per-osb lock resources.  On failure everything
 * started so far is torn down before returning. */
int ocfs2_dlm_init(struct ocfs2_super *osb)
{
	int status;
	u32 dlm_key;
	struct dlm_ctxt *dlm;

	mlog_entry_void();

	status = ocfs2_dlm_init_debug(osb);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* launch vote thread */
	osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote");
	if (IS_ERR(osb->vote_task)) {
		status = PTR_ERR(osb->vote_task);
		osb->vote_task = NULL;
		mlog_errno(status);
		goto bail;
	}

	/* used by the dlm code to make message headers unique, each
	 * node in this domain must agree on this. */
	dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));

	/* for now, uuid == domain */
	dlm = dlm_register_domain(osb->uuid_str, dlm_key);
	if (IS_ERR(dlm)) {
		status = PTR_ERR(dlm);
		mlog_errno(status);
		goto bail;
	}

	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);

	dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);

	osb->dlm = dlm;

	status = 0;
bail:
	if (status < 0) {
		ocfs2_dlm_shutdown_debug(osb);
		if (osb->vote_task)
			kthread_stop(osb->vote_task);
	}

	mlog_exit(status);
	return status;
}

/* Unmount-time teardown, in the reverse order of ocfs2_dlm_init():
 * eviction callback, osb locks, vote thread, lock resources, domain,
 * then debugfs. */
void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
{
	mlog_entry_void();

	dlm_unregister_eviction_cb(&osb->osb_eviction_cb);

	ocfs2_drop_osb_locks(osb);

	if (osb->vote_task) {
		kthread_stop(osb->vote_task);
		osb->vote_task = NULL;
	}

	ocfs2_lock_res_free(&osb->osb_super_lockres);
	ocfs2_lock_res_free(&osb->osb_rename_lockres);

	dlm_unregister_domain(osb->dlm);
	osb->dlm = NULL;

	ocfs2_dlm_shutdown_debug(osb);

	mlog_exit_void();
}

/* AST fired by the dlm when an unlock or cancel-convert we requested
 * has completed.  Runs in dlm callback context; updates the lockres
 * state machine under l_lock and wakes waiters. */
static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status)
{
	struct ocfs2_lock_res *lockres = opaque;
	unsigned long flags;

	mlog_entry_void();

	mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
	     lockres->l_unlock_action);

	spin_lock_irqsave(&lockres->l_lock, flags);
	/* We tried to cancel a convert request, but it was already
	 * granted. All we want to do here is clear our unlock
	 * state.
The wake_up call done at the bottom is redundant
	 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
	 * hurt anything anyway */
	if (status == DLM_CANCELGRANT &&
	    lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
		mlog(0, "Got cancelgrant for %s\n", lockres->l_name);

		/* We don't clear the busy flag in this case as it
		 * should have been cleared by the ast which the dlm
		 * has called. */
		goto complete_unlock;
	}

	if (status != DLM_NORMAL) {
		mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
		     "unlock_action %d\n", status, lockres->l_name,
		     lockres->l_unlock_action);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		return;
	}

	switch(lockres->l_unlock_action) {
	case OCFS2_UNLOCK_CANCEL_CONVERT:
		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
		lockres->l_action = OCFS2_AST_INVALID;
		break;
	case OCFS2_UNLOCK_DROP_LOCK:
		lockres->l_level = LKM_IVMODE;
		break;
	default:
		BUG();
	}

	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
complete_unlock:
	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	wake_up(&lockres->l_event);

	mlog_exit_void();
}

/* Optional hook run by ocfs2_drop_lock() just before the final unlock,
 * while the lockres is quiesced (not BUSY). */
typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);

struct drop_lock_cb {
	ocfs2_pre_drop_cb_t	*drop_func;	/* callback to run, may be NULL */
	void			*drop_data;	/* opaque argument for drop_func */
};

/* Tear down a lockres that is being freed: wait out any in-flight
 * dlm operation, run the pre-drop callback, then issue the final
 * dlmunlock and wait for its unlock AST.  Always returns 0. */
static int ocfs2_drop_lock(struct ocfs2_super *osb,
			   struct ocfs2_lock_res *lockres,
			   struct drop_lock_cb *dcb)
{
	enum dlm_status status;
	unsigned long flags;

	/* We didn't get anywhere near actually using this lockres. */
	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
		goto out;

	spin_lock_irqsave(&lockres->l_lock, flags);

	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
			"lockres %s, flags 0x%lx\n",
			lockres->l_name, lockres->l_flags);

	while (lockres->l_flags & OCFS2_LOCK_BUSY) {
		mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
		     "%u, unlock_action = %u\n",
		     lockres->l_name, lockres->l_flags, lockres->l_action,
		     lockres->l_unlock_action);

		spin_unlock_irqrestore(&lockres->l_lock, flags);

		/* XXX: Today we just wait on any busy
		 * locks... Perhaps we need to cancel converts in the
		 * future? */
		ocfs2_wait_on_busy_lock(lockres);

		spin_lock_irqsave(&lockres->l_lock, flags);
	}

	if (dcb)
		dcb->drop_func(lockres, dcb->drop_data);

	if (lockres->l_flags & OCFS2_LOCK_BUSY)
		mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
		     lockres->l_name);
	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
		mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);

	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		goto out;
	}

	lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);

	/* make sure we never get here while waiting for an ast to
	 * fire. */
	BUG_ON(lockres->l_action != OCFS2_AST_INVALID);

	/* is this necessary? */
	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	mlog(0, "lock %s\n", lockres->l_name);

	/* LKM_VALBLK: write our LVB back to the master on unlock. */
	status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK,
			   lockres->l_ops->unlock_ast, lockres);
	if (status != DLM_NORMAL) {
		ocfs2_log_dlm_error("dlmunlock", status, lockres);
		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
		dlm_print_one_lock(lockres->l_lksb.lockid);
		BUG();
	}
	mlog(0, "lock %s, successfull return from dlmunlock\n",
	     lockres->l_name);

	/* the unlock AST clears BUSY when it has run */
	ocfs2_wait_on_busy_lock(lockres);
out:
	mlog_exit(0);
	return 0;
}

/* Mark the lockres as being dropped. It will no longer be
 * queued if blocking, but we still may have to wait on it
 * being dequeued from the vote thread before we can consider
 * it safe to drop.
 *
 * You can *not* attempt to call cluster_lock on this lockres anymore.
 */
void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
{
	int status;
	struct ocfs2_mask_waiter mw;
	unsigned long flags;

	ocfs2_init_mask_waiter(&mw);

	spin_lock_irqsave(&lockres->l_lock, flags);
	lockres->l_flags |= OCFS2_LOCK_FREEING;
	/* Sleep until the vote thread has dequeued the lockres: wait
	 * for the QUEUED flag to reach 0 via a mask waiter, re-taking
	 * the spinlock to re-check after each wakeup. */
	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		mlog(0, "Waiting on lockres %s\n", lockres->l_name);

		status = ocfs2_wait_for_mask(&mw);
		if (status)
			mlog_errno(status);

		spin_lock_irqsave(&lockres->l_lock, flags);
	}
	spin_unlock_irqrestore(&lockres->l_lock, flags);
}

/* Release the two osb-global lock resources (super and rename),
 * marking each as freeing first. Errors are logged but not
 * propagated -- this runs on the unmount path. */
static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
{
	int status;

	mlog_entry_void();

	ocfs2_mark_lockres_freeing(&osb->osb_super_lockres);

	status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL);
	if (status < 0)
		mlog_errno(status);

	ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres);

	status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
	if (status < 0)
		mlog_errno(status);

	mlog_exit(status);
}

/* Pre-drop callback for the inode metadata lock: stuff current inode
 * values into the LVB before the lock is released, but only when we
 * hold it EX (we own the LVB contents) and it isn't stale. */
static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data)
{
	struct inode *inode = data;

	/* the metadata lock requires a bit more work as we have an
	 * LVB to worry about. */
	if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
	    lockres->l_level == LKM_EXMODE &&
	    !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
		__ocfs2_stuff_meta_lvb(inode);
}

/* Drop all three per-inode cluster locks (data, meta, rw).
 * Returns the first error seen, but always attempts all three. */
int ocfs2_drop_inode_locks(struct inode *inode)
{
	int status, err;
	struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, };

	mlog_entry_void();

	/* No need to call ocfs2_mark_lockres_freeing here -
	 * ocfs2_clear_inode has done it for us. */

	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
			      &OCFS2_I(inode)->ip_data_lockres,
			      NULL);
	if (err < 0)
		mlog_errno(err);

	status = err;

	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
			      &OCFS2_I(inode)->ip_meta_lockres,
			      &meta_dcb);
	if (err < 0)
		mlog_errno(err);
	if (err < 0 && !status)
		status = err;

	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
			      &OCFS2_I(inode)->ip_rw_lockres,
			      NULL);
	if (err < 0)
		mlog_errno(err);
	if (err < 0 && !status)
		status = err;

	mlog_exit(status);
	return status;
}

/* Set up lockres state for a downconvert: record the requested level
 * and mark the lockres BUSY. Caller holds l_lock and must follow up
 * with ocfs2_downconvert_lock(). new_level must be strictly lower
 * than the currently held level. */
static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
				      int new_level)
{
	assert_spin_locked(&lockres->l_lock);

	BUG_ON(lockres->l_blocking <= LKM_NLMODE);

	if (lockres->l_level <= new_level) {
		mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
		     lockres->l_level, new_level);
		BUG();
	}

	mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
	     lockres->l_name, new_level, lockres->l_blocking);

	lockres->l_action = OCFS2_AST_DOWNCONVERT;
	lockres->l_requested = new_level;
	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
}

/* Issue the actual DLM convert to a lower level. When lvb is set,
 * LKM_VALBLK is added so our LVB contents travel with the convert.
 * Completion arrives asynchronously via the lockres ast callback.
 * Returns 0 or -EINVAL on DLM error (after rolling back BUSY state). */
static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
				  struct ocfs2_lock_res *lockres,
				  int new_level,
				  int lvb)
{
	int ret, dlm_flags = LKM_CONVERT;
	enum dlm_status status;

	mlog_entry_void();

	if (lvb)
		dlm_flags |= LKM_VALBLK;

	status = dlmlock(osb->dlm,
			 new_level,
			 &lockres->l_lksb,
			 dlm_flags,
			 lockres->l_name,
			 lockres->l_ops->ast,
			 lockres,
			 lockres->l_ops->bast);
	if (status != DLM_NORMAL) {
		ocfs2_log_dlm_error("dlmlock", status, lockres);
		ret = -EINVAL;
		ocfs2_recover_from_dlm_error(lockres, 1);
		goto bail;
	}

	ret = 0;
bail:
	mlog_exit(ret);
	return ret;
}

/*
returns 1 when the caller should unlock and call dlmunlock */
static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres)
{
	assert_spin_locked(&lockres->l_lock);

	mlog_entry_void();
	mlog(0, "lock %s\n", lockres->l_name);

	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
		/* If we're already trying to cancel a lock conversion
		 * then just drop the spinlock and allow the caller to
		 * requeue this lock. */

		mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
		return 0;
	}

	/* were we in a convert when we got the bast fire? */
	BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
	       lockres->l_action != OCFS2_AST_DOWNCONVERT);
	/* set things up for the unlockast to know to just
	 * clear out the ast_action and unset busy, etc. */
	lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;

	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
			"lock %s, invalid flags: 0x%lx\n",
			lockres->l_name, lockres->l_flags);

	return 1;
}

/* Ask the DLM to cancel an in-flight convert (LKM_CANCEL). The result
 * is reported asynchronously through the unlock ast. Returns 0 on
 * submission, -EINVAL if dlmunlock itself failed. */
static int ocfs2_cancel_convert(struct ocfs2_super *osb,
				struct ocfs2_lock_res *lockres)
{
	int ret;
	enum dlm_status status;

	mlog_entry_void();
	mlog(0, "lock %s\n", lockres->l_name);

	ret = 0;
	status = dlmunlock(osb->dlm,
			   &lockres->l_lksb,
			   LKM_CANCEL,
			   lockres->l_ops->unlock_ast,
			   lockres);
	if (status != DLM_NORMAL) {
		ocfs2_log_dlm_error("dlmunlock", status, lockres);
		ret = -EINVAL;
		ocfs2_recover_from_dlm_error(lockres, 0);
	}

	mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);

	mlog_exit(ret);
	return ret;
}

/* Decide whether the inode metadata lock may be downconverted to
 * new_level right now. Requires no conflicting local holders, a fully
 * checkpointed inode, and no refresh in progress. Returns nonzero if
 * the downconvert may proceed. Caller holds l_lock (the flag checks
 * rely on it). */
static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
						  struct ocfs2_lock_res *lockres,
						  int new_level)
{
	int ret;

	mlog_entry_void();

	BUG_ON(new_level != LKM_NLMODE &&
	       new_level != LKM_PRMODE);

	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
		ret = 0;
		mlog(0, "lockres %s currently being refreshed -- backing "
		     "off!\n", lockres->l_name);
	} else if (new_level == LKM_PRMODE)
		ret = !lockres->l_ex_holders &&
			ocfs2_inode_fully_checkpointed(inode);
	else /* Must be NLMODE we're converting to. */
		ret = !lockres->l_ro_holders && !lockres->l_ex_holders &&
			ocfs2_inode_fully_checkpointed(inode);

	mlog_exit(ret);
	return ret;
}

/* Metadata-lock specific unblock path: cancel a busy convert, or
 * downconvert (stuffing the LVB when dropping from EX) once the inode
 * is checkpointed; otherwise kick off a checkpoint and requeue.
 * *requeue is set to 1 when the vote thread should retry later. */
static int ocfs2_do_unblock_meta(struct inode *inode,
				 int *requeue)
{
	int new_level;
	int set_lvb = 0;
	int ret = 0;
	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
	unsigned long flags;

	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog_entry_void();

	spin_lock_irqsave(&lockres->l_lock, flags);

	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));

	mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level,
	     lockres->l_blocking);

	BUG_ON(lockres->l_level != LKM_EXMODE &&
	       lockres->l_level != LKM_PRMODE);

	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
		/* A convert is in flight -- try to cancel it and let
		 * the vote thread requeue us. */
		*requeue = 1;
		ret = ocfs2_prepare_cancel_convert(osb, lockres);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		if (ret) {
			ret = ocfs2_cancel_convert(osb, lockres);
			if (ret < 0)
				mlog_errno(ret);
		}
		goto leave;
	}

	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);

	mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n",
	     lockres->l_level, lockres->l_blocking, new_level);

	if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
		if (lockres->l_level == LKM_EXMODE)
			set_lvb = 1;

		/* If the lock hasn't been refreshed yet (rare), then
		 * our memory inode values are old and we skip
		 * stuffing the lvb. There's no need to actually clear
		 * out the lvb here as it's value is still valid. */
		if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
			if (set_lvb)
				__ocfs2_stuff_meta_lvb(inode);
		} else
			mlog(0, "lockres %s: downconverting stale lock!\n",
			     lockres->l_name);

		mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
		     "l_blocking=%d, new_level=%d\n",
		     lockres->l_level, lockres->l_blocking, new_level);

		ocfs2_prepare_downconvert(lockres, new_level);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
		goto leave;
	}
	/* Can't downconvert yet: make sure a checkpoint is under way
	 * and come back later. */
	if (!ocfs2_inode_fully_checkpointed(inode))
		ocfs2_start_checkpoint(osb);

	*requeue = 1;
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	ret = 0;
leave:
	mlog_exit(ret);
	return ret;
}

/* Generic unblock path shared by most lock types: cancel a busy
 * convert, requeue while incompatible local holders remain, run the
 * optional pre-downconvert worker (which may sleep, so blocking level
 * is rechecked afterwards), then downconvert. */
static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
				      struct ocfs2_lock_res *lockres,
				      int *requeue,
				      ocfs2_convert_worker_t *worker)
{
	unsigned long flags;
	int blocking;
	int new_level;
	int ret = 0;

	mlog_entry_void();

	spin_lock_irqsave(&lockres->l_lock, flags);

	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));

recheck:
	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
		*requeue = 1;
		ret = ocfs2_prepare_cancel_convert(osb, lockres);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		if (ret) {
			ret = ocfs2_cancel_convert(osb, lockres);
			if (ret < 0)
				mlog_errno(ret);
		}
		goto leave;
	}

	/* if we're blocking an exclusive and we have *any* holders,
	 * then requeue. */
	if ((lockres->l_blocking == LKM_EXMODE)
	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		*requeue = 1;
		ret = 0;
		goto leave;
	}

	/* If it's a PR we're blocking, then only
	 * requeue if we've got any EX holders */
	if (lockres->l_blocking == LKM_PRMODE &&
	    lockres->l_ex_holders) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		*requeue = 1;
		ret = 0;
		goto leave;
	}

	/* If we get here, then we know that there are no more
	 * incompatible holders (and anyone asking for an incompatible
	 * lock is blocked). We can now downconvert the lock */
	if (!worker)
		goto downconvert;

	/* Some lockres types want to do a bit of work before
	 * downconverting a lock. Allow that here. The worker function
	 * may sleep, so we save off a copy of what we're blocking as
	 * it may change while we're not holding the spin lock. */
	blocking = lockres->l_blocking;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	worker(lockres, blocking);

	spin_lock_irqsave(&lockres->l_lock, flags);
	if (blocking != lockres->l_blocking) {
		/* If this changed underneath us, then we can't drop
		 * it just yet. */
		goto recheck;
	}

downconvert:
	*requeue = 0;
	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);

	ocfs2_prepare_downconvert(lockres, new_level);
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0);
leave:
	mlog_exit(ret);
	return ret;
}

/* Pre-downconvert worker for the inode data lock: flush dirty pages
 * and, when losing the lock entirely (another node wants EX), drop
 * our cached pages and mappings so stale data is never served. */
static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
				      int blocking)
{
	struct inode *inode;
	struct address_space *mapping;

	mlog_entry_void();

	inode = ocfs2_lock_res_inode(lockres);
	mapping = inode->i_mapping;

	if (filemap_fdatawrite(mapping)) {
		mlog(ML_ERROR, "Could not sync inode %llu for downconvert!",
		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
	}
	sync_mapping_buffers(mapping);
	if (blocking == LKM_EXMODE) {
		truncate_inode_pages(mapping, 0);
		unmap_mapping_range(mapping, 0, 0, 0);
	} else {
		/* We only need to wait on the I/O if we're not also
		 * truncating pages because truncate_inode_pages waits
		 * for us above. We don't truncate pages if we're
		 * blocking anything < EXMODE because we want to keep
		 * them around in that case. */
		filemap_fdatawait(mapping);
	}

	mlog_exit_void();
}

/* Unblock callback for the inode data lock: generic unblock with the
 * data convert worker plugged in. */
int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
		       int *requeue)
{
	int status;
	struct inode *inode;
	struct ocfs2_super *osb;

	mlog_entry_void();

	inode = ocfs2_lock_res_inode(lockres);
	osb = OCFS2_SB(inode->i_sb);

	mlog(0, "unblock inode %llu\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno);

	status = ocfs2_generic_unblock_lock(osb,
					    lockres,
					    requeue,
					    ocfs2_data_convert_worker);
	if (status < 0)
		mlog_errno(status);

	mlog(0, "inode %llu, requeue = %d\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno, *requeue);

	mlog_exit(status);
	return status;
}

/* Unblock callback for inode locks with no pre-downconvert work
 * (e.g. the rw lock): plain generic unblock, no worker. */
static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
				    int *requeue)
{
	int status;
	struct inode *inode;

	mlog_entry_void();

	mlog(0, "Unblock lockres %s\n", lockres->l_name);

	inode = ocfs2_lock_res_inode(lockres);

	status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb),
					    lockres,
					    requeue,
					    NULL);
	if (status < 0)
		mlog_errno(status);

	mlog_exit(status);
	return status;
}


/* Unblock callback for the inode metadata lock: defers to the
 * LVB-aware ocfs2_do_unblock_meta(). */
int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
		       int *requeue)
{
	int status;
	struct inode *inode;

	mlog_entry_void();

	inode = ocfs2_lock_res_inode(lockres);

	mlog(0, "unblock inode %llu\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno);

	status = ocfs2_do_unblock_meta(inode, requeue);
	if (status < 0)
		mlog_errno(status);

	mlog(0, "inode %llu, requeue = %d\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno, *requeue);

	mlog_exit(status);
	return status;
}

/* Generic unblock function for any lockres whose private data is an
 * ocfs2_super pointer.
 */
static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
				  int *requeue)
{
	int status;
	struct ocfs2_super *osb;

	mlog_entry_void();

	mlog(0, "Unblock lockres %s\n", lockres->l_name);

	osb = ocfs2_lock_res_super(lockres);

	status = ocfs2_generic_unblock_lock(osb,
					    lockres,
					    requeue,
					    NULL);
	if (status < 0)
		mlog_errno(status);

	mlog_exit(status);
	return status;
}

/* Vote-thread entry point for a blocked lockres: run its type-specific
 * unblock handler and either clear QUEUED (done or freeing) or
 * reschedule it for another pass. */
void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
				struct ocfs2_lock_res *lockres)
{
	int status;
	int requeue = 0;
	unsigned long flags;

	/* Our reference to the lockres in this function can be
	 * considered valid until we remove the OCFS2_LOCK_QUEUED
	 * flag. */

	mlog_entry_void();

	BUG_ON(!lockres);
	BUG_ON(!lockres->l_ops);
	BUG_ON(!lockres->l_ops->unblock);

	mlog(0, "lockres %s blocked.\n", lockres->l_name);

	/* Detect whether a lock has been marked as going away while
	 * the vote thread was processing other things. A lock can
	 * still be marked with OCFS2_LOCK_FREEING after this check,
	 * but short circuiting here will still save us some
	 * performance. */
	spin_lock_irqsave(&lockres->l_lock, flags);
	if (lockres->l_flags & OCFS2_LOCK_FREEING)
		goto unqueue;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	status = lockres->l_ops->unblock(lockres, &requeue);
	if (status < 0)
		mlog_errno(status);

	spin_lock_irqsave(&lockres->l_lock, flags);
unqueue:
	if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) {
		lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
	} else
		ocfs2_schedule_blocked_lock(osb, lockres);

	mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
	     requeue ? "yes" : "no");
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	mlog_exit_void();
}

/* Queue a lockres on the vote thread's blocked-lock list (unless it
 * is being freed). Caller holds l_lock; the QUEUED flag is what keeps
 * the lockres reference valid while it sits on the list. */
static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres)
{
	mlog_entry_void();

	assert_spin_locked(&lockres->l_lock);

	if (lockres->l_flags & OCFS2_LOCK_FREEING) {
		/* Do not schedule a lock for downconvert when it's on
		 * the way to destruction - any nodes wanting access
		 * to the resource will get it soon. */
		mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
		     lockres->l_name, lockres->l_flags);
		return;
	}

	lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);

	/* Only add once: an already-listed lockres keeps its place. */
	spin_lock(&osb->vote_task_lock);
	if (list_empty(&lockres->l_blocked_list)) {
		list_add_tail(&lockres->l_blocked_list,
			      &osb->blocked_lock_list);
		osb->blocked_lock_count++;
	}
	spin_unlock(&osb->vote_task_lock);

	mlog_exit_void();
}

/* This aids in debugging situations where a bad LVB might be involved.
*/ 2886 void ocfs2_dump_meta_lvb_info(u64 level, 2887 const char *function, 2888 unsigned int line, 2889 struct ocfs2_lock_res *lockres) 2890 { 2891 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; 2892 2893 mlog(level, "LVB information for %s (called from %s:%u):\n", 2894 lockres->l_name, function, line); 2895 mlog(level, "version: %u, clusters: %u\n", 2896 be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters)); 2897 mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n", 2898 (unsigned long long)be64_to_cpu(lvb->lvb_isize), 2899 be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid), 2900 be16_to_cpu(lvb->lvb_imode)); 2901 mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, " 2902 "mtime_packed 0x%llx\n", be16_to_cpu(lvb->lvb_inlink), 2903 (long long)be64_to_cpu(lvb->lvb_iatime_packed), 2904 (long long)be64_to_cpu(lvb->lvb_ictime_packed), 2905 (long long)be64_to_cpu(lvb->lvb_imtime_packed)); 2906 } 2907