1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * dlmglue.c 5 * 6 * Code which implements an OCFS2 specific interface to our DLM. 7 * 8 * Copyright (C) 2003, 2004 Oracle. All rights reserved. 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public 12 * License as published by the Free Software Foundation; either 13 * version 2 of the License, or (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * General Public License for more details. 19 * 20 * You should have received a copy of the GNU General Public 21 * License along with this program; if not, write to the 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 * Boston, MA 021110-1307, USA. 24 */ 25 26 #include <linux/types.h> 27 #include <linux/slab.h> 28 #include <linux/highmem.h> 29 #include <linux/mm.h> 30 #include <linux/smp_lock.h> 31 #include <linux/crc32.h> 32 #include <linux/kthread.h> 33 #include <linux/pagemap.h> 34 #include <linux/debugfs.h> 35 #include <linux/seq_file.h> 36 37 #include <cluster/heartbeat.h> 38 #include <cluster/nodemanager.h> 39 #include <cluster/tcp.h> 40 41 #include <dlm/dlmapi.h> 42 43 #define MLOG_MASK_PREFIX ML_DLM_GLUE 44 #include <cluster/masklog.h> 45 46 #include "ocfs2.h" 47 48 #include "alloc.h" 49 #include "dlmglue.h" 50 #include "extent_map.h" 51 #include "heartbeat.h" 52 #include "inode.h" 53 #include "journal.h" 54 #include "slot_map.h" 55 #include "super.h" 56 #include "uptodate.h" 57 #include "vote.h" 58 59 #include "buffer_head_io.h" 60 61 struct ocfs2_mask_waiter { 62 struct list_head mw_item; 63 int mw_status; 64 struct completion mw_complete; 65 unsigned long mw_mask; 66 unsigned long mw_goal; 67 }; 68 69 static void ocfs2_inode_ast_func(void 
*opaque); 70 static void ocfs2_inode_bast_func(void *opaque, 71 int level); 72 static void ocfs2_super_ast_func(void *opaque); 73 static void ocfs2_super_bast_func(void *opaque, 74 int level); 75 static void ocfs2_rename_ast_func(void *opaque); 76 static void ocfs2_rename_bast_func(void *opaque, 77 int level); 78 79 /* so far, all locks have gotten along with the same unlock ast */ 80 static void ocfs2_unlock_ast_func(void *opaque, 81 enum dlm_status status); 82 static int ocfs2_do_unblock_meta(struct inode *inode, 83 int *requeue); 84 static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres, 85 int *requeue); 86 static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres, 87 int *requeue); 88 static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres, 89 int *requeue); 90 static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres, 91 int *requeue); 92 typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int); 93 static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb, 94 struct ocfs2_lock_res *lockres, 95 int *requeue, 96 ocfs2_convert_worker_t *worker); 97 98 struct ocfs2_lock_res_ops { 99 void (*ast)(void *); 100 void (*bast)(void *, int); 101 void (*unlock_ast)(void *, enum dlm_status); 102 int (*unblock)(struct ocfs2_lock_res *, int *); 103 }; 104 105 static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = { 106 .ast = ocfs2_inode_ast_func, 107 .bast = ocfs2_inode_bast_func, 108 .unlock_ast = ocfs2_unlock_ast_func, 109 .unblock = ocfs2_unblock_inode_lock, 110 }; 111 112 static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = { 113 .ast = ocfs2_inode_ast_func, 114 .bast = ocfs2_inode_bast_func, 115 .unlock_ast = ocfs2_unlock_ast_func, 116 .unblock = ocfs2_unblock_meta, 117 }; 118 119 static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, 120 int blocking); 121 122 static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = { 123 .ast = ocfs2_inode_ast_func, 124 .bast = ocfs2_inode_bast_func, 125 .unlock_ast = 
ocfs2_unlock_ast_func, 126 .unblock = ocfs2_unblock_data, 127 }; 128 129 static struct ocfs2_lock_res_ops ocfs2_super_lops = { 130 .ast = ocfs2_super_ast_func, 131 .bast = ocfs2_super_bast_func, 132 .unlock_ast = ocfs2_unlock_ast_func, 133 .unblock = ocfs2_unblock_osb_lock, 134 }; 135 136 static struct ocfs2_lock_res_ops ocfs2_rename_lops = { 137 .ast = ocfs2_rename_ast_func, 138 .bast = ocfs2_rename_bast_func, 139 .unlock_ast = ocfs2_unlock_ast_func, 140 .unblock = ocfs2_unblock_osb_lock, 141 }; 142 143 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 144 { 145 return lockres->l_type == OCFS2_LOCK_TYPE_META || 146 lockres->l_type == OCFS2_LOCK_TYPE_DATA || 147 lockres->l_type == OCFS2_LOCK_TYPE_RW; 148 } 149 150 static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres) 151 { 152 return lockres->l_type == OCFS2_LOCK_TYPE_SUPER; 153 } 154 155 static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres) 156 { 157 return lockres->l_type == OCFS2_LOCK_TYPE_RENAME; 158 } 159 160 static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres) 161 { 162 BUG_ON(!ocfs2_is_super_lock(lockres) 163 && !ocfs2_is_rename_lock(lockres)); 164 165 return (struct ocfs2_super *) lockres->l_priv; 166 } 167 168 static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres) 169 { 170 BUG_ON(!ocfs2_is_inode_lock(lockres)); 171 172 return (struct inode *) lockres->l_priv; 173 } 174 175 static int ocfs2_lock_create(struct ocfs2_super *osb, 176 struct ocfs2_lock_res *lockres, 177 int level, 178 int dlm_flags); 179 static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, 180 int wanted); 181 static void ocfs2_cluster_unlock(struct ocfs2_super *osb, 182 struct ocfs2_lock_res *lockres, 183 int level); 184 static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres); 185 static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res 
*lockres); 186 static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres); 187 static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level); 188 static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, 189 struct ocfs2_lock_res *lockres); 190 static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, 191 int convert); 192 #define ocfs2_log_dlm_error(_func, _stat, _lockres) do { \ 193 mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ 194 "resource %s: %s\n", dlm_errname(_stat), _func, \ 195 _lockres->l_name, dlm_errmsg(_stat)); \ 196 } while (0) 197 static void ocfs2_vote_on_unlock(struct ocfs2_super *osb, 198 struct ocfs2_lock_res *lockres); 199 static int ocfs2_meta_lock_update(struct inode *inode, 200 struct buffer_head **bh); 201 static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); 202 static inline int ocfs2_highest_compat_lock_level(int level); 203 static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode, 204 struct ocfs2_lock_res *lockres, 205 int new_level); 206 207 static char *ocfs2_lock_type_strings[] = { 208 [OCFS2_LOCK_TYPE_META] = "Meta", 209 [OCFS2_LOCK_TYPE_DATA] = "Data", 210 [OCFS2_LOCK_TYPE_SUPER] = "Super", 211 [OCFS2_LOCK_TYPE_RENAME] = "Rename", 212 /* Need to differntiate from [R]ename.. serializing writes is the 213 * important job it does, anyway. 
*/ 214 [OCFS2_LOCK_TYPE_RW] = "Write/Read", 215 }; 216 217 static char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 218 { 219 mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type); 220 return ocfs2_lock_type_strings[type]; 221 } 222 223 static void ocfs2_build_lock_name(enum ocfs2_lock_type type, 224 u64 blkno, 225 u32 generation, 226 char *name) 227 { 228 int len; 229 230 mlog_entry_void(); 231 232 BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); 233 234 len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x", 235 ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD, 236 (long long)blkno, generation); 237 238 BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1)); 239 240 mlog(0, "built lock resource with name: %s\n", name); 241 242 mlog_exit_void(); 243 } 244 245 static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED; 246 247 static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res, 248 struct ocfs2_dlm_debug *dlm_debug) 249 { 250 mlog(0, "Add tracking for lockres %s\n", res->l_name); 251 252 spin_lock(&ocfs2_dlm_tracking_lock); 253 list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking); 254 spin_unlock(&ocfs2_dlm_tracking_lock); 255 } 256 257 static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res) 258 { 259 spin_lock(&ocfs2_dlm_tracking_lock); 260 if (!list_empty(&res->l_debug_list)) 261 list_del_init(&res->l_debug_list); 262 spin_unlock(&ocfs2_dlm_tracking_lock); 263 } 264 265 static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, 266 struct ocfs2_lock_res *res, 267 enum ocfs2_lock_type type, 268 u64 blkno, 269 u32 generation, 270 struct ocfs2_lock_res_ops *ops, 271 void *priv) 272 { 273 ocfs2_build_lock_name(type, blkno, generation, res->l_name); 274 275 res->l_type = type; 276 res->l_ops = ops; 277 res->l_priv = priv; 278 279 res->l_level = LKM_IVMODE; 280 res->l_requested = LKM_IVMODE; 281 res->l_blocking = LKM_IVMODE; 282 res->l_action = OCFS2_AST_INVALID; 283 res->l_unlock_action = OCFS2_UNLOCK_INVALID; 284 285 
res->l_flags = OCFS2_LOCK_INITIALIZED; 286 287 ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug); 288 } 289 290 void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) 291 { 292 /* This also clears out the lock status block */ 293 memset(res, 0, sizeof(struct ocfs2_lock_res)); 294 spin_lock_init(&res->l_lock); 295 init_waitqueue_head(&res->l_event); 296 INIT_LIST_HEAD(&res->l_blocked_list); 297 INIT_LIST_HEAD(&res->l_mask_waiters); 298 } 299 300 void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, 301 enum ocfs2_lock_type type, 302 struct inode *inode) 303 { 304 struct ocfs2_lock_res_ops *ops; 305 306 switch(type) { 307 case OCFS2_LOCK_TYPE_RW: 308 ops = &ocfs2_inode_rw_lops; 309 break; 310 case OCFS2_LOCK_TYPE_META: 311 ops = &ocfs2_inode_meta_lops; 312 break; 313 case OCFS2_LOCK_TYPE_DATA: 314 ops = &ocfs2_inode_data_lops; 315 break; 316 default: 317 mlog_bug_on_msg(1, "type: %d\n", type); 318 ops = NULL; /* thanks, gcc */ 319 break; 320 }; 321 322 ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type, 323 OCFS2_I(inode)->ip_blkno, 324 inode->i_generation, ops, inode); 325 } 326 327 static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res, 328 struct ocfs2_super *osb) 329 { 330 /* Superblock lockres doesn't come from a slab so we call init 331 * once on it manually. */ 332 ocfs2_lock_res_init_once(res); 333 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER, 334 OCFS2_SUPER_BLOCK_BLKNO, 0, 335 &ocfs2_super_lops, osb); 336 } 337 338 static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res, 339 struct ocfs2_super *osb) 340 { 341 /* Rename lockres doesn't come from a slab so we call init 342 * once on it manually. 
*/ 343 ocfs2_lock_res_init_once(res); 344 ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0, 345 &ocfs2_rename_lops, osb); 346 } 347 348 void ocfs2_lock_res_free(struct ocfs2_lock_res *res) 349 { 350 mlog_entry_void(); 351 352 if (!(res->l_flags & OCFS2_LOCK_INITIALIZED)) 353 return; 354 355 ocfs2_remove_lockres_tracking(res); 356 357 mlog_bug_on_msg(!list_empty(&res->l_blocked_list), 358 "Lockres %s is on the blocked list\n", 359 res->l_name); 360 mlog_bug_on_msg(!list_empty(&res->l_mask_waiters), 361 "Lockres %s has mask waiters pending\n", 362 res->l_name); 363 mlog_bug_on_msg(spin_is_locked(&res->l_lock), 364 "Lockres %s is locked\n", 365 res->l_name); 366 mlog_bug_on_msg(res->l_ro_holders, 367 "Lockres %s has %u ro holders\n", 368 res->l_name, res->l_ro_holders); 369 mlog_bug_on_msg(res->l_ex_holders, 370 "Lockres %s has %u ex holders\n", 371 res->l_name, res->l_ex_holders); 372 373 /* Need to clear out the lock status block for the dlm */ 374 memset(&res->l_lksb, 0, sizeof(res->l_lksb)); 375 376 res->l_flags = 0UL; 377 mlog_exit_void(); 378 } 379 380 static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, 381 int level) 382 { 383 mlog_entry_void(); 384 385 BUG_ON(!lockres); 386 387 switch(level) { 388 case LKM_EXMODE: 389 lockres->l_ex_holders++; 390 break; 391 case LKM_PRMODE: 392 lockres->l_ro_holders++; 393 break; 394 default: 395 BUG(); 396 } 397 398 mlog_exit_void(); 399 } 400 401 static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, 402 int level) 403 { 404 mlog_entry_void(); 405 406 BUG_ON(!lockres); 407 408 switch(level) { 409 case LKM_EXMODE: 410 BUG_ON(!lockres->l_ex_holders); 411 lockres->l_ex_holders--; 412 break; 413 case LKM_PRMODE: 414 BUG_ON(!lockres->l_ro_holders); 415 lockres->l_ro_holders--; 416 break; 417 default: 418 BUG(); 419 } 420 mlog_exit_void(); 421 } 422 423 /* WARNING: This function lives in a world where the only three lock 424 * levels are EX, PR, and NL. 
It *will* have to be adjusted when more 425 * lock types are added. */ 426 static inline int ocfs2_highest_compat_lock_level(int level) 427 { 428 int new_level = LKM_EXMODE; 429 430 if (level == LKM_EXMODE) 431 new_level = LKM_NLMODE; 432 else if (level == LKM_PRMODE) 433 new_level = LKM_PRMODE; 434 return new_level; 435 } 436 437 static void lockres_set_flags(struct ocfs2_lock_res *lockres, 438 unsigned long newflags) 439 { 440 struct list_head *pos, *tmp; 441 struct ocfs2_mask_waiter *mw; 442 443 assert_spin_locked(&lockres->l_lock); 444 445 lockres->l_flags = newflags; 446 447 list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) { 448 mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item); 449 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) 450 continue; 451 452 list_del_init(&mw->mw_item); 453 mw->mw_status = 0; 454 complete(&mw->mw_complete); 455 } 456 } 457 static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or) 458 { 459 lockres_set_flags(lockres, lockres->l_flags | or); 460 } 461 static void lockres_clear_flags(struct ocfs2_lock_res *lockres, 462 unsigned long clear) 463 { 464 lockres_set_flags(lockres, lockres->l_flags & ~clear); 465 } 466 467 static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres) 468 { 469 mlog_entry_void(); 470 471 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); 472 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); 473 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); 474 BUG_ON(lockres->l_blocking <= LKM_NLMODE); 475 476 lockres->l_level = lockres->l_requested; 477 if (lockres->l_level <= 478 ocfs2_highest_compat_lock_level(lockres->l_blocking)) { 479 lockres->l_blocking = LKM_NLMODE; 480 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); 481 } 482 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 483 484 mlog_exit_void(); 485 } 486 487 static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres) 488 { 489 mlog_entry_void(); 490 491 
BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); 492 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); 493 494 /* Convert from RO to EX doesn't really need anything as our 495 * information is already up to data. Convert from NL to 496 * *anything* however should mark ourselves as needing an 497 * update */ 498 if (lockres->l_level == LKM_NLMODE) 499 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 500 501 lockres->l_level = lockres->l_requested; 502 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 503 504 mlog_exit_void(); 505 } 506 507 static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres) 508 { 509 mlog_entry_void(); 510 511 BUG_ON((!lockres->l_flags & OCFS2_LOCK_BUSY)); 512 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); 513 514 if (lockres->l_requested > LKM_NLMODE && 515 !(lockres->l_flags & OCFS2_LOCK_LOCAL)) 516 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 517 518 lockres->l_level = lockres->l_requested; 519 lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED); 520 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 521 522 mlog_exit_void(); 523 } 524 525 static void ocfs2_inode_ast_func(void *opaque) 526 { 527 struct ocfs2_lock_res *lockres = opaque; 528 struct inode *inode; 529 struct dlm_lockstatus *lksb; 530 unsigned long flags; 531 532 mlog_entry_void(); 533 534 inode = ocfs2_lock_res_inode(lockres); 535 536 mlog(0, "AST fired for inode %llu, l_action = %u, type = %s\n", 537 (unsigned long long)OCFS2_I(inode)->ip_blkno, lockres->l_action, 538 ocfs2_lock_type_string(lockres->l_type)); 539 540 BUG_ON(!ocfs2_is_inode_lock(lockres)); 541 542 spin_lock_irqsave(&lockres->l_lock, flags); 543 544 lksb = &(lockres->l_lksb); 545 if (lksb->status != DLM_NORMAL) { 546 mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u " 547 "on inode %llu\n", lksb->status, 548 (unsigned long long)OCFS2_I(inode)->ip_blkno); 549 spin_unlock_irqrestore(&lockres->l_lock, flags); 550 mlog_exit_void(); 551 return; 552 } 553 554 
switch(lockres->l_action) { 555 case OCFS2_AST_ATTACH: 556 ocfs2_generic_handle_attach_action(lockres); 557 lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL); 558 break; 559 case OCFS2_AST_CONVERT: 560 ocfs2_generic_handle_convert_action(lockres); 561 break; 562 case OCFS2_AST_DOWNCONVERT: 563 ocfs2_generic_handle_downconvert_action(lockres); 564 break; 565 default: 566 mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u " 567 "lockres flags = 0x%lx, unlock action: %u\n", 568 lockres->l_name, lockres->l_action, lockres->l_flags, 569 lockres->l_unlock_action); 570 571 BUG(); 572 } 573 574 /* data and rw locking ignores refresh flag for now. */ 575 if (lockres->l_type != OCFS2_LOCK_TYPE_META) 576 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 577 578 /* set it to something invalid so if we get called again we 579 * can catch it. */ 580 lockres->l_action = OCFS2_AST_INVALID; 581 spin_unlock_irqrestore(&lockres->l_lock, flags); 582 wake_up(&lockres->l_event); 583 584 mlog_exit_void(); 585 } 586 587 static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, 588 int level) 589 { 590 int needs_downconvert = 0; 591 mlog_entry_void(); 592 593 assert_spin_locked(&lockres->l_lock); 594 595 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); 596 597 if (level > lockres->l_blocking) { 598 /* only schedule a downconvert if we haven't already scheduled 599 * one that goes low enough to satisfy the level we're 600 * blocking. 
this also catches the case where we get 601 * duplicate BASTs */ 602 if (ocfs2_highest_compat_lock_level(level) < 603 ocfs2_highest_compat_lock_level(lockres->l_blocking)) 604 needs_downconvert = 1; 605 606 lockres->l_blocking = level; 607 } 608 609 mlog_exit(needs_downconvert); 610 return needs_downconvert; 611 } 612 613 static void ocfs2_generic_bast_func(struct ocfs2_super *osb, 614 struct ocfs2_lock_res *lockres, 615 int level) 616 { 617 int needs_downconvert; 618 unsigned long flags; 619 620 mlog_entry_void(); 621 622 BUG_ON(level <= LKM_NLMODE); 623 624 spin_lock_irqsave(&lockres->l_lock, flags); 625 needs_downconvert = ocfs2_generic_handle_bast(lockres, level); 626 if (needs_downconvert) 627 ocfs2_schedule_blocked_lock(osb, lockres); 628 spin_unlock_irqrestore(&lockres->l_lock, flags); 629 630 ocfs2_kick_vote_thread(osb); 631 632 wake_up(&lockres->l_event); 633 mlog_exit_void(); 634 } 635 636 static void ocfs2_inode_bast_func(void *opaque, int level) 637 { 638 struct ocfs2_lock_res *lockres = opaque; 639 struct inode *inode; 640 struct ocfs2_super *osb; 641 642 mlog_entry_void(); 643 644 BUG_ON(!ocfs2_is_inode_lock(lockres)); 645 646 inode = ocfs2_lock_res_inode(lockres); 647 osb = OCFS2_SB(inode->i_sb); 648 649 mlog(0, "BAST fired for inode %llu, blocking %d, level %d type %s\n", 650 (unsigned long long)OCFS2_I(inode)->ip_blkno, level, 651 lockres->l_level, ocfs2_lock_type_string(lockres->l_type)); 652 653 ocfs2_generic_bast_func(osb, lockres, level); 654 655 mlog_exit_void(); 656 } 657 658 static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres, 659 int ignore_refresh) 660 { 661 struct dlm_lockstatus *lksb = &lockres->l_lksb; 662 unsigned long flags; 663 664 spin_lock_irqsave(&lockres->l_lock, flags); 665 666 if (lksb->status != DLM_NORMAL) { 667 mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n", 668 lockres->l_name, lksb->status); 669 spin_unlock_irqrestore(&lockres->l_lock, flags); 670 return; 671 } 672 673 switch(lockres->l_action) { 674 
case OCFS2_AST_ATTACH: 675 ocfs2_generic_handle_attach_action(lockres); 676 break; 677 case OCFS2_AST_CONVERT: 678 ocfs2_generic_handle_convert_action(lockres); 679 break; 680 case OCFS2_AST_DOWNCONVERT: 681 ocfs2_generic_handle_downconvert_action(lockres); 682 break; 683 default: 684 BUG(); 685 } 686 687 if (ignore_refresh) 688 lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 689 690 /* set it to something invalid so if we get called again we 691 * can catch it. */ 692 lockres->l_action = OCFS2_AST_INVALID; 693 spin_unlock_irqrestore(&lockres->l_lock, flags); 694 695 wake_up(&lockres->l_event); 696 } 697 698 static void ocfs2_super_ast_func(void *opaque) 699 { 700 struct ocfs2_lock_res *lockres = opaque; 701 702 mlog_entry_void(); 703 mlog(0, "Superblock AST fired\n"); 704 705 BUG_ON(!ocfs2_is_super_lock(lockres)); 706 ocfs2_generic_ast_func(lockres, 0); 707 708 mlog_exit_void(); 709 } 710 711 static void ocfs2_super_bast_func(void *opaque, 712 int level) 713 { 714 struct ocfs2_lock_res *lockres = opaque; 715 struct ocfs2_super *osb; 716 717 mlog_entry_void(); 718 mlog(0, "Superblock BAST fired\n"); 719 720 BUG_ON(!ocfs2_is_super_lock(lockres)); 721 osb = ocfs2_lock_res_super(lockres); 722 ocfs2_generic_bast_func(osb, lockres, level); 723 724 mlog_exit_void(); 725 } 726 727 static void ocfs2_rename_ast_func(void *opaque) 728 { 729 struct ocfs2_lock_res *lockres = opaque; 730 731 mlog_entry_void(); 732 733 mlog(0, "Rename AST fired\n"); 734 735 BUG_ON(!ocfs2_is_rename_lock(lockres)); 736 737 ocfs2_generic_ast_func(lockres, 1); 738 739 mlog_exit_void(); 740 } 741 742 static void ocfs2_rename_bast_func(void *opaque, 743 int level) 744 { 745 struct ocfs2_lock_res *lockres = opaque; 746 struct ocfs2_super *osb; 747 748 mlog_entry_void(); 749 750 mlog(0, "Rename BAST fired\n"); 751 752 BUG_ON(!ocfs2_is_rename_lock(lockres)); 753 754 osb = ocfs2_lock_res_super(lockres); 755 ocfs2_generic_bast_func(osb, lockres, level); 756 757 mlog_exit_void(); 758 } 759 760 
static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, 761 int convert) 762 { 763 unsigned long flags; 764 765 mlog_entry_void(); 766 spin_lock_irqsave(&lockres->l_lock, flags); 767 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 768 if (convert) 769 lockres->l_action = OCFS2_AST_INVALID; 770 else 771 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; 772 spin_unlock_irqrestore(&lockres->l_lock, flags); 773 774 wake_up(&lockres->l_event); 775 mlog_exit_void(); 776 } 777 778 /* Note: If we detect another process working on the lock (i.e., 779 * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller 780 * to do the right thing in that case. 781 */ 782 static int ocfs2_lock_create(struct ocfs2_super *osb, 783 struct ocfs2_lock_res *lockres, 784 int level, 785 int dlm_flags) 786 { 787 int ret = 0; 788 enum dlm_status status; 789 unsigned long flags; 790 791 mlog_entry_void(); 792 793 mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level, 794 dlm_flags); 795 796 spin_lock_irqsave(&lockres->l_lock, flags); 797 if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) || 798 (lockres->l_flags & OCFS2_LOCK_BUSY)) { 799 spin_unlock_irqrestore(&lockres->l_lock, flags); 800 goto bail; 801 } 802 803 lockres->l_action = OCFS2_AST_ATTACH; 804 lockres->l_requested = level; 805 lockres_or_flags(lockres, OCFS2_LOCK_BUSY); 806 spin_unlock_irqrestore(&lockres->l_lock, flags); 807 808 status = dlmlock(osb->dlm, 809 level, 810 &lockres->l_lksb, 811 dlm_flags, 812 lockres->l_name, 813 lockres->l_ops->ast, 814 lockres, 815 lockres->l_ops->bast); 816 if (status != DLM_NORMAL) { 817 ocfs2_log_dlm_error("dlmlock", status, lockres); 818 ret = -EINVAL; 819 ocfs2_recover_from_dlm_error(lockres, 1); 820 } 821 822 mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name); 823 824 bail: 825 mlog_exit(ret); 826 return ret; 827 } 828 829 static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres, 830 int flag) 831 { 832 unsigned long 
flags; 833 int ret; 834 835 spin_lock_irqsave(&lockres->l_lock, flags); 836 ret = lockres->l_flags & flag; 837 spin_unlock_irqrestore(&lockres->l_lock, flags); 838 839 return ret; 840 } 841 842 static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres) 843 844 { 845 wait_event(lockres->l_event, 846 !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY)); 847 } 848 849 static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres) 850 851 { 852 wait_event(lockres->l_event, 853 !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING)); 854 } 855 856 /* predict what lock level we'll be dropping down to on behalf 857 * of another node, and return true if the currently wanted 858 * level will be compatible with it. */ 859 static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, 860 int wanted) 861 { 862 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); 863 864 return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking); 865 } 866 867 static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw) 868 { 869 INIT_LIST_HEAD(&mw->mw_item); 870 init_completion(&mw->mw_complete); 871 } 872 873 static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw) 874 { 875 wait_for_completion(&mw->mw_complete); 876 /* Re-arm the completion in case we want to wait on it again */ 877 INIT_COMPLETION(mw->mw_complete); 878 return mw->mw_status; 879 } 880 881 static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres, 882 struct ocfs2_mask_waiter *mw, 883 unsigned long mask, 884 unsigned long goal) 885 { 886 BUG_ON(!list_empty(&mw->mw_item)); 887 888 assert_spin_locked(&lockres->l_lock); 889 890 list_add_tail(&mw->mw_item, &lockres->l_mask_waiters); 891 mw->mw_mask = mask; 892 mw->mw_goal = goal; 893 } 894 895 /* returns 0 if the mw that was removed was already satisfied, -EBUSY 896 * if the mask still hadn't reached its goal */ 897 static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres, 898 struct 
ocfs2_mask_waiter *mw) 899 { 900 unsigned long flags; 901 int ret = 0; 902 903 spin_lock_irqsave(&lockres->l_lock, flags); 904 if (!list_empty(&mw->mw_item)) { 905 if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) 906 ret = -EBUSY; 907 908 list_del_init(&mw->mw_item); 909 init_completion(&mw->mw_complete); 910 } 911 spin_unlock_irqrestore(&lockres->l_lock, flags); 912 913 return ret; 914 915 } 916 917 static int ocfs2_cluster_lock(struct ocfs2_super *osb, 918 struct ocfs2_lock_res *lockres, 919 int level, 920 int lkm_flags, 921 int arg_flags) 922 { 923 struct ocfs2_mask_waiter mw; 924 enum dlm_status status; 925 int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); 926 int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ 927 unsigned long flags; 928 929 mlog_entry_void(); 930 931 ocfs2_init_mask_waiter(&mw); 932 933 again: 934 wait = 0; 935 936 if (catch_signals && signal_pending(current)) { 937 ret = -ERESTARTSYS; 938 goto out; 939 } 940 941 spin_lock_irqsave(&lockres->l_lock, flags); 942 943 mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING, 944 "Cluster lock called on freeing lockres %s! flags " 945 "0x%lx\n", lockres->l_name, lockres->l_flags); 946 947 /* We only compare against the currently granted level 948 * here. If the lock is blocked waiting on a downconvert, 949 * we'll get caught below. */ 950 if (lockres->l_flags & OCFS2_LOCK_BUSY && 951 level > lockres->l_level) { 952 /* is someone sitting in dlm_lock? If so, wait on 953 * them. */ 954 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); 955 wait = 1; 956 goto unlock; 957 } 958 959 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { 960 /* lock has not been created yet. 
*/ 961 spin_unlock_irqrestore(&lockres->l_lock, flags); 962 963 ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0); 964 if (ret < 0) { 965 mlog_errno(ret); 966 goto out; 967 } 968 goto again; 969 } 970 971 if (lockres->l_flags & OCFS2_LOCK_BLOCKED && 972 !ocfs2_may_continue_on_blocked_lock(lockres, level)) { 973 /* is the lock is currently blocked on behalf of 974 * another node */ 975 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0); 976 wait = 1; 977 goto unlock; 978 } 979 980 if (level > lockres->l_level) { 981 if (lockres->l_action != OCFS2_AST_INVALID) 982 mlog(ML_ERROR, "lockres %s has action %u pending\n", 983 lockres->l_name, lockres->l_action); 984 985 lockres->l_action = OCFS2_AST_CONVERT; 986 lockres->l_requested = level; 987 lockres_or_flags(lockres, OCFS2_LOCK_BUSY); 988 spin_unlock_irqrestore(&lockres->l_lock, flags); 989 990 BUG_ON(level == LKM_IVMODE); 991 BUG_ON(level == LKM_NLMODE); 992 993 mlog(0, "lock %s, convert from %d to level = %d\n", 994 lockres->l_name, lockres->l_level, level); 995 996 /* call dlm_lock to upgrade lock now */ 997 status = dlmlock(osb->dlm, 998 level, 999 &lockres->l_lksb, 1000 lkm_flags|LKM_CONVERT|LKM_VALBLK, 1001 lockres->l_name, 1002 lockres->l_ops->ast, 1003 lockres, 1004 lockres->l_ops->bast); 1005 if (status != DLM_NORMAL) { 1006 if ((lkm_flags & LKM_NOQUEUE) && 1007 (status == DLM_NOTQUEUED)) 1008 ret = -EAGAIN; 1009 else { 1010 ocfs2_log_dlm_error("dlmlock", status, 1011 lockres); 1012 ret = -EINVAL; 1013 } 1014 ocfs2_recover_from_dlm_error(lockres, 1); 1015 goto out; 1016 } 1017 1018 mlog(0, "lock %s, successfull return from dlmlock\n", 1019 lockres->l_name); 1020 1021 /* At this point we've gone inside the dlm and need to 1022 * complete our work regardless. */ 1023 catch_signals = 0; 1024 1025 /* wait for busy to clear and carry on */ 1026 goto again; 1027 } 1028 1029 /* Ok, if we get here then we're good to go. 
*/ 1030 ocfs2_inc_holders(lockres, level); 1031 1032 ret = 0; 1033 unlock: 1034 spin_unlock_irqrestore(&lockres->l_lock, flags); 1035 out: 1036 /* 1037 * This is helping work around a lock inversion between the page lock 1038 * and dlm locks. One path holds the page lock while calling aops 1039 * which block acquiring dlm locks. The voting thread holds dlm 1040 * locks while acquiring page locks while down converting data locks. 1041 * This block is helping an aop path notice the inversion and back 1042 * off to unlock its page lock before trying the dlm lock again. 1043 */ 1044 if (wait && arg_flags & OCFS2_LOCK_NONBLOCK && 1045 mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) { 1046 wait = 0; 1047 if (lockres_remove_mask_waiter(lockres, &mw)) 1048 ret = -EAGAIN; 1049 else 1050 goto again; 1051 } 1052 if (wait) { 1053 ret = ocfs2_wait_for_mask(&mw); 1054 if (ret == 0) 1055 goto again; 1056 mlog_errno(ret); 1057 } 1058 1059 mlog_exit(ret); 1060 return ret; 1061 } 1062 1063 static void ocfs2_cluster_unlock(struct ocfs2_super *osb, 1064 struct ocfs2_lock_res *lockres, 1065 int level) 1066 { 1067 unsigned long flags; 1068 1069 mlog_entry_void(); 1070 spin_lock_irqsave(&lockres->l_lock, flags); 1071 ocfs2_dec_holders(lockres, level); 1072 ocfs2_vote_on_unlock(osb, lockres); 1073 spin_unlock_irqrestore(&lockres->l_lock, flags); 1074 mlog_exit_void(); 1075 } 1076 1077 static int ocfs2_create_new_inode_lock(struct inode *inode, 1078 struct ocfs2_lock_res *lockres) 1079 { 1080 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1081 unsigned long flags; 1082 1083 spin_lock_irqsave(&lockres->l_lock, flags); 1084 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); 1085 lockres_or_flags(lockres, OCFS2_LOCK_LOCAL); 1086 spin_unlock_irqrestore(&lockres->l_lock, flags); 1087 1088 return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL); 1089 } 1090 1091 /* Grants us an EX lock on the data and metadata resources, skipping 1092 * the normal cluster directory lookup. 
Use this ONLY on newly created
 * inodes which other nodes can't possibly see, and which haven't been
 * hashed in the inode hash yet.  This can give us a good performance
 * increase as it'll skip the network broadcast normally associated
 * with creating a new lock resource. */
int ocfs2_create_new_inode_locks(struct inode *inode)
{
	int ret;

	BUG_ON(!inode);
	BUG_ON(!ocfs2_inode_is_new(inode));

	mlog_entry_void();

	mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);

	/* NOTE: That we don't increment any of the holder counts, nor
	 * do we add anything to a journal handle.  Since this is
	 * supposed to be a new inode which the cluster doesn't know
	 * about yet, there is no need to.  As far as the LVB handling
	 * is concerned, this is basically like acquiring an EX lock
	 * on a resource which has an invalid one -- we'll set it
	 * valid when we release the EX. */

	ret = ocfs2_create_new_inode_lock(inode,
					  &OCFS2_I(inode)->ip_rw_lockres);
	if (ret) {
		mlog_errno(ret);
		goto bail;
	}

	ret = ocfs2_create_new_inode_lock(inode,
					  &OCFS2_I(inode)->ip_meta_lockres);
	if (ret) {
		mlog_errno(ret);
		goto bail;
	}

	ret = ocfs2_create_new_inode_lock(inode,
					  &OCFS2_I(inode)->ip_data_lockres);
	if (ret) {
		mlog_errno(ret);
		goto bail;
	}

bail:
	mlog_exit(ret);
	return ret;
}

/* Take the inode's RW lock: EX for writers, PR for readers. */
int ocfs2_rw_lock(struct inode *inode, int write)
{
	int status, level;
	struct ocfs2_lock_res *lockres;

	BUG_ON(!inode);

	mlog_entry_void();

	mlog(0, "inode %llu take %s RW lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     write ? "EXMODE" : "PRMODE");

	lockres = &OCFS2_I(inode)->ip_rw_lockres;

	level = write ? LKM_EXMODE : LKM_PRMODE;

	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
				    0);
	if (status < 0)
		mlog_errno(status);

	mlog_exit(status);
	return status;
}

/* Drop the RW lock taken by ocfs2_rw_lock().  'write' must match the
 * value passed when the lock was taken. */
void ocfs2_rw_unlock(struct inode *inode, int write)
{
	int level = write ? LKM_EXMODE : LKM_PRMODE;
	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;

	mlog_entry_void();

	mlog(0, "inode %llu drop %s RW lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     write ? "EXMODE" : "PRMODE");

	ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);

	mlog_exit_void();
}

int ocfs2_data_lock_full(struct inode *inode,
			 int write,
			 int arg_flags)
{
	int status = 0, level;
	struct ocfs2_lock_res *lockres;

	BUG_ON(!inode);

	mlog_entry_void();

	mlog(0, "inode %llu take %s DATA lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     write ? "EXMODE" : "PRMODE");

	/* We'll allow faking a readonly data lock for
	 * rodevices. */
	if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
		if (write) {
			status = -EROFS;
			mlog_errno(status);
		}
		goto out;
	}

	lockres = &OCFS2_I(inode)->ip_data_lockres;

	level = write ?
LKM_EXMODE : LKM_PRMODE;

	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
				    0, arg_flags);
	/* -EAGAIN is an expected result with OCFS2_LOCK_NONBLOCK, so
	 * don't log it as an error. */
	if (status < 0 && status != -EAGAIN)
		mlog_errno(status);

out:
	mlog_exit(status);
	return status;
}

/* see ocfs2_meta_lock_with_page() */
int ocfs2_data_lock_with_page(struct inode *inode,
			      int write,
			      struct page *page)
{
	int ret;

	ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
	if (ret == -EAGAIN) {
		unlock_page(page);
		if (ocfs2_data_lock(inode, write) == 0)
			ocfs2_data_unlock(inode, write);
		ret = AOP_TRUNCATED_PAGE;
	}

	return ret;
}

/* Called under lockres->l_lock; decides whether releasing a holder
 * should wake the vote thread so a queued downconvert can proceed. */
static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
				 struct ocfs2_lock_res *lockres)
{
	int kick = 0;

	mlog_entry_void();

	/* If we know that another node is waiting on our lock, kick
	 * the vote thread pre-emptively when we reach a release
	 * condition. */
	if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
		switch(lockres->l_blocking) {
		case LKM_EXMODE:
			/* blocker wants EX: all holders must be gone */
			if (!lockres->l_ex_holders && !lockres->l_ro_holders)
				kick = 1;
			break;
		case LKM_PRMODE:
			/* blocker wants PR: only EX holders conflict */
			if (!lockres->l_ex_holders)
				kick = 1;
			break;
		default:
			BUG();
		}
	}

	if (kick)
		ocfs2_kick_vote_thread(osb);

	mlog_exit_void();
}

void ocfs2_data_unlock(struct inode *inode,
		       int write)
{
	int level = write ? LKM_EXMODE : LKM_PRMODE;
	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;

	mlog_entry_void();

	mlog(0, "inode %llu drop %s DATA lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     write ? "EXMODE" : "PRMODE");

	/* on hard-readonly devices no real lock was taken, see
	 * ocfs2_data_lock_full() */
	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);

	mlog_exit_void();
}

#define OCFS2_SEC_BITS   34
#define OCFS2_SEC_SHIFT  (64 - 34)
#define OCFS2_NSEC_MASK  ((1ULL << OCFS2_SEC_SHIFT) - 1)

/* LVB only has room for 64 bits of time here so we pack it for
 * now. */
static u64 ocfs2_pack_timespec(struct timespec *spec)
{
	u64 res;
	u64 sec = spec->tv_sec;
	u32 nsec = spec->tv_nsec;

	/* seconds in the high OCFS2_SEC_BITS, nanoseconds below */
	res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);

	return res;
}

/* Call this with the lockres locked.  I am reasonably sure we don't
 * need ip_lock in this function as anyone who would be changing those
 * values is supposed to be blocked in ocfs2_meta_lock right now. */
static void __ocfs2_stuff_meta_lvb(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
	struct ocfs2_meta_lvb *lvb;

	mlog_entry_void();

	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;

	lvb->lvb_version   = cpu_to_be32(OCFS2_LVB_VERSION);
	lvb->lvb_isize     = cpu_to_be64(i_size_read(inode));
	lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
	lvb->lvb_iuid      = cpu_to_be32(inode->i_uid);
	lvb->lvb_igid      = cpu_to_be32(inode->i_gid);
	lvb->lvb_imode     = cpu_to_be16(inode->i_mode);
	lvb->lvb_inlink    = cpu_to_be16(inode->i_nlink);
	lvb->lvb_iatime_packed  =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
	lvb->lvb_ictime_packed =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
	lvb->lvb_imtime_packed =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));

	mlog_meta_lvb(0, lockres);

	mlog_exit_void();
}

/* Inverse of ocfs2_pack_timespec(). */
static void ocfs2_unpack_timespec(struct timespec *spec,
				  u64 packed_time)
{
	spec->tv_sec =
packed_time >> OCFS2_SEC_SHIFT;
	spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
}

/* Pull the inode fields cached in the metadata LVB back into the
 * in-core inode.  Caller must have arbitration via the refresh
 * machinery (ocfs2_should_refresh_lock_res). */
static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
	struct ocfs2_meta_lvb *lvb;

	mlog_entry_void();

	mlog_meta_lvb(0, lockres);

	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;

	/* We're safe here without the lockres lock... */
	spin_lock(&oi->ip_lock);
	oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
	i_size_write(inode, be64_to_cpu(lvb->lvb_isize));

	/* fast-symlinks are a special case */
	if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
		inode->i_blocks = 0;
	else
		inode->i_blocks =
			ocfs2_align_bytes_to_sectors(i_size_read(inode));

	inode->i_uid     = be32_to_cpu(lvb->lvb_iuid);
	inode->i_gid     = be32_to_cpu(lvb->lvb_igid);
	inode->i_mode    = be16_to_cpu(lvb->lvb_imode);
	inode->i_nlink   = be16_to_cpu(lvb->lvb_inlink);
	ocfs2_unpack_timespec(&inode->i_atime,
			      be64_to_cpu(lvb->lvb_iatime_packed));
	ocfs2_unpack_timespec(&inode->i_mtime,
			      be64_to_cpu(lvb->lvb_imtime_packed));
	ocfs2_unpack_timespec(&inode->i_ctime,
			      be64_to_cpu(lvb->lvb_ictime_packed));
	spin_unlock(&oi->ip_lock);

	mlog_exit_void();
}

/* An LVB is only usable if it carries the version we understand;
 * otherwise we must go to disk. */
static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres)
{
	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;

	if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION)
		return 1;
	return 0;
}

/* Determine whether a lock resource needs to be refreshed, and
 * arbitrate who gets to refresh it.
 *
 * 0 means no refresh needed.
 *
 * > 0 means you need to refresh this and you MUST call
 * ocfs2_complete_lock_res_refresh afterwards. */
static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
{
	unsigned long flags;
	int status = 0;

	mlog_entry_void();

refresh_check:
	spin_lock_irqsave(&lockres->l_lock, flags);
	if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		goto bail;
	}

	/* somebody else is already refreshing -- wait for them and
	 * then re-check whether anything is left to do */
	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		ocfs2_wait_on_refreshing_lock(lockres);
		goto refresh_check;
	}

	/* Ok, I'll be the one to refresh this lock. */
	lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	status = 1;
bail:
	mlog_exit(status);
	return status;
}

/* If status is non zero, I'll mark it as not being in refresh
 * anymore, but i won't clear the needs refresh flag. */
static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
						   int status)
{
	unsigned long flags;
	mlog_entry_void();

	spin_lock_irqsave(&lockres->l_lock, flags);
	lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
	if (!status)
		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	wake_up(&lockres->l_event);

	mlog_exit_void();
}

/* may or may not return a bh if it went to disk. */
static int ocfs2_meta_lock_update(struct inode *inode,
				  struct buffer_head **bh)
{
	int status = 0;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres;
	struct ocfs2_dinode *fe;

	mlog_entry_void();

	spin_lock(&oi->ip_lock);
	if (oi->ip_flags & OCFS2_INODE_DELETED) {
		mlog(0, "Orphaned inode %llu was deleted while we "
		     "were waiting on a lock.
ip_flags = 0x%x\n",
		     (unsigned long long)oi->ip_blkno, oi->ip_flags);
		spin_unlock(&oi->ip_lock);
		status = -ENOENT;
		goto bail;
	}
	spin_unlock(&oi->ip_lock);

	lockres = &oi->ip_meta_lockres;

	if (!ocfs2_should_refresh_lock_res(lockres))
		goto bail;

	/* This will discard any caching information we might have had
	 * for the inode metadata. */
	ocfs2_metadata_cache_purge(inode);

	/* will do nothing for inode types that don't use the extent
	 * map (directories, bitmap files, etc) */
	ocfs2_extent_map_trunc(inode, 0);

	if (ocfs2_meta_lvb_is_trustable(lockres)) {
		mlog(0, "Trusting LVB on inode %llu\n",
		     (unsigned long long)oi->ip_blkno);
		ocfs2_refresh_inode_from_lvb(inode);
	} else {
		/* Boo, we have to go to disk. */
		/* read bh, cast, ocfs2_refresh_inode */
		status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
					  bh, OCFS2_BH_CACHED, inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail_refresh;
		}
		fe = (struct ocfs2_dinode *) (*bh)->b_data;

		/* This is a good chance to make sure we're not
		 * locking an invalid object.
		 *
		 * We bug on a stale inode here because we checked
		 * above whether it was wiped from disk. The wiping
		 * node provides a guarantee that we receive that
		 * message and can mark the inode before dropping any
		 * locks associated with it. */
		if (!OCFS2_IS_VALID_DINODE(fe)) {
			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
			status = -EIO;
			goto bail_refresh;
		}
		mlog_bug_on_msg(inode->i_generation !=
				le32_to_cpu(fe->i_generation),
				"Invalid dinode %llu disk generation: %u "
				"inode->i_generation: %u\n",
				(unsigned long long)oi->ip_blkno,
				le32_to_cpu(fe->i_generation),
				inode->i_generation);
		mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
				!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
				"Stale dinode %llu dtime: %llu flags: 0x%x\n",
				(unsigned long long)oi->ip_blkno,
				(unsigned long long)le64_to_cpu(fe->i_dtime),
				le32_to_cpu(fe->i_flags));

		ocfs2_refresh_inode(inode, fe);
	}

	status = 0;
bail_refresh:
	ocfs2_complete_lock_res_refresh(lockres, status);
bail:
	mlog_exit(status);
	return status;
}

/* Hand a buffer_head back to the caller: reuse the one the update path
 * already read when possible, otherwise read the inode block fresh. */
static int ocfs2_assign_bh(struct inode *inode,
			   struct buffer_head **ret_bh,
			   struct buffer_head *passed_bh)
{
	int status;

	if (passed_bh) {
		/* Ok, the update went to disk for us, use the
		 * returned bh. */
		*ret_bh = passed_bh;
		get_bh(*ret_bh);

		return 0;
	}

	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				  OCFS2_I(inode)->ip_blkno,
				  ret_bh,
				  OCFS2_BH_CACHED,
				  inode);
	if (status < 0)
		mlog_errno(status);

	return status;
}

/*
 * returns < 0 error if the callback will never be called, otherwise
 * the result of the lock will be communicated via the callback.
 */
int ocfs2_meta_lock_full(struct inode *inode,
			 struct ocfs2_journal_handle *handle,
			 struct buffer_head **ret_bh,
			 int ex,
			 int arg_flags)
{
	int status, level, dlm_flags, acquired;
	struct ocfs2_lock_res *lockres;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct buffer_head *local_bh = NULL;

	BUG_ON(!inode);

	mlog_entry_void();

	mlog(0, "inode %llu, take %s META lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     ex ? "EXMODE" : "PRMODE");

	status = 0;
	acquired = 0;
	/* We'll allow faking a readonly metadata lock for
	 * rodevices. */
	if (ocfs2_is_hard_readonly(osb)) {
		if (ex)
			status = -EROFS;
		goto bail;
	}

	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
		wait_event(osb->recovery_event,
			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));

	/* NOTE(review): 'acquired' was already cleared above; this
	 * assignment is redundant but harmless. */
	acquired = 0;
	lockres = &OCFS2_I(inode)->ip_meta_lockres;
	level = ex ? LKM_EXMODE : LKM_PRMODE;
	dlm_flags = 0;
	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
		dlm_flags |= LKM_NOQUEUE;

	status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
	if (status < 0) {
		if (status != -EAGAIN && status != -EIOCBRETRY)
			mlog_errno(status);
		goto bail;
	}

	/* Notify the error cleanup path to drop the cluster lock. */
	acquired = 1;

	/* We wait twice because a node may have died while we were in
	 * the lower dlm layers. The second time though, we've
	 * committed to owning this lock so we don't allow signals to
	 * abort the operation. */
	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
		wait_event(osb->recovery_event,
			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));

	/* This is fun. The caller may want a bh back, or it may
	 * not. ocfs2_meta_lock_update definitely wants one in, but
	 * may or may not read one, depending on what's in the
	 * LVB. The result of all of this is that we've *only* gone to
	 * disk if we have to, so the complexity is worthwhile. */
	status = ocfs2_meta_lock_update(inode, &local_bh);
	if (status < 0) {
		if (status != -ENOENT)
			mlog_errno(status);
		goto bail;
	}

	if (ret_bh) {
		status = ocfs2_assign_bh(inode, ret_bh, local_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	if (handle) {
		status = ocfs2_handle_add_lock(handle, inode);
		if (status < 0)
			mlog_errno(status);
	}

bail:
	if (status < 0) {
		if (ret_bh && (*ret_bh)) {
			brelse(*ret_bh);
			*ret_bh = NULL;
		}
		if (acquired)
			ocfs2_meta_unlock(inode, ex);
	}

	if (local_bh)
		brelse(local_bh);

	mlog_exit(status);
	return status;
}

/*
 * This is working around a lock inversion between tasks acquiring DLM locks
 * while holding a page lock and the vote thread which blocks dlm lock acquiry
 * while acquiring page locks.
 *
 * ** These _with_page variants are only intended to be called from aop
 * methods that hold page locks and return a very specific *positive* error
 * code that aop methods pass up to the VFS -- test for errors with != 0. **
 *
 * The DLM is called such that it returns -EAGAIN if it would have blocked
 * waiting for the vote thread. In that case we unlock our page so the vote
 * thread can make progress. Once we've done this we have to return
 * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
 * into the VFS who will then immediately retry the aop call.
 *
 * We do a blocking lock and immediate unlock before returning, though, so that
 * the lock has a great chance of being cached on this node by the time the VFS
 * calls back to retry the aop.
This has a potential to livelock as nodes
 * ping locks back and forth, but that's a risk we're willing to take to avoid
 * the lock inversion simply.
 */
int ocfs2_meta_lock_with_page(struct inode *inode,
			      struct ocfs2_journal_handle *handle,
			      struct buffer_head **ret_bh,
			      int ex,
			      struct page *page)
{
	int ret;

	ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex,
				   OCFS2_LOCK_NONBLOCK);
	if (ret == -EAGAIN) {
		unlock_page(page);
		/* blocking lock/unlock to warm the lock on this node,
		 * see the comment above */
		if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0)
			ocfs2_meta_unlock(inode, ex);
		ret = AOP_TRUNCATED_PAGE;
	}

	return ret;
}

void ocfs2_meta_unlock(struct inode *inode,
		       int ex)
{
	int level = ex ? LKM_EXMODE : LKM_PRMODE;
	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;

	mlog_entry_void();

	mlog(0, "inode %llu drop %s META lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     ex ? "EXMODE" : "PRMODE");

	/* no real lock was taken on hard-readonly devices, see
	 * ocfs2_meta_lock_full() */
	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);

	mlog_exit_void();
}

int ocfs2_super_lock(struct ocfs2_super *osb,
		     int ex)
{
	int status;
	int level = ex ? LKM_EXMODE : LKM_PRMODE;
	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
	struct buffer_head *bh;
	struct ocfs2_slot_info *si = osb->slot_info;

	mlog_entry_void();

	if (ocfs2_is_hard_readonly(osb))
		return -EROFS;

	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* The super block lock path is really in the best position to
	 * know when resources covered by the lock need to be
	 * refreshed, so we do it here. Of course, making sense of
	 * everything is up to the caller :) */
	status = ocfs2_should_refresh_lock_res(lockres);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	if (status) {
		/* we won the refresh race: re-read the slot map */
		bh = si->si_bh;
		status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
					  si->si_inode);
		if (status == 0)
			ocfs2_update_slot_info(si);

		ocfs2_complete_lock_res_refresh(lockres, status);

		if (status < 0)
			mlog_errno(status);
	}
bail:
	mlog_exit(status);
	return status;
}

void ocfs2_super_unlock(struct ocfs2_super *osb,
			int ex)
{
	int level = ex ? LKM_EXMODE : LKM_PRMODE;
	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;

	ocfs2_cluster_unlock(osb, lockres, level);
}

/* Take the global rename lock at EX. */
int ocfs2_rename_lock(struct ocfs2_super *osb)
{
	int status;
	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;

	if (ocfs2_is_hard_readonly(osb))
		return -EROFS;

	status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
	if (status < 0)
		mlog_errno(status);

	return status;
}

void ocfs2_rename_unlock(struct ocfs2_super *osb)
{
	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;

	ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
}

/* Reference counting of the dlm debug structure. We want this because
 * open references on the debug inodes can live on after a mount, so
 * we can't rely on the ocfs2_super to always exist.
 */
static void ocfs2_dlm_debug_free(struct kref *kref)
{
	struct ocfs2_dlm_debug *dlm_debug;

	dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);

	kfree(dlm_debug);
}

void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
{
	if (dlm_debug)
		kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
}

static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
{
	kref_get(&debug->d_refcnt);
}

/* Allocate a dlm debug structure with an initial reference; returns
 * NULL on allocation failure. */
struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
{
	struct ocfs2_dlm_debug *dlm_debug;

	dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
	if (!dlm_debug) {
		mlog_errno(-ENOMEM);
		goto out;
	}

	kref_init(&dlm_debug->d_refcnt);
	INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
	dlm_debug->d_locking_state = NULL;
out:
	return dlm_debug;
}

/* Access to this is arbitrated for us via seq_file->sem. */
struct ocfs2_dlm_seq_priv {
	struct ocfs2_dlm_debug *p_dlm_debug;
	struct ocfs2_lock_res p_iter_res;
	struct ocfs2_lock_res p_tmp_res;
};

/* Walk forward from 'start' on the tracking list to the next real
 * lockres, skipping iterator dummies; NULL at end of list. */
static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start,
						 struct ocfs2_dlm_seq_priv *priv)
{
	struct ocfs2_lock_res *iter, *ret = NULL;
	struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;

	assert_spin_locked(&ocfs2_dlm_tracking_lock);

	list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
		/* discover the head of the list */
		if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
			mlog(0, "End of list found, %p\n", ret);
			break;
		}

		/* We track our "dummy" iteration lockres' by a NULL
		 * l_ops field. */
		if (iter->l_ops != NULL) {
			ret = iter;
			break;
		}
	}

	return ret;
}

static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos)
{
	struct ocfs2_dlm_seq_priv *priv = m->private;
	struct ocfs2_lock_res *iter;

	spin_lock(&ocfs2_dlm_tracking_lock);
	iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
	if (iter) {
		/* Since lockres' have the lifetime of their container
		 * (which can be inodes, ocfs2_supers, etc) we want to
		 * copy this out to a temporary lockres while still
		 * under the spinlock. Obviously after this we can't
		 * trust any pointers on the copy returned, but that's
		 * ok as the information we want isn't typically held
		 * in them. */
		priv->p_tmp_res = *iter;
		iter = &priv->p_tmp_res;
	}
	spin_unlock(&ocfs2_dlm_tracking_lock);

	return iter;
}

static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
{
}

static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct ocfs2_dlm_seq_priv *priv = m->private;
	struct ocfs2_lock_res *iter = v;
	struct ocfs2_lock_res *dummy = &priv->p_iter_res;

	spin_lock(&ocfs2_dlm_tracking_lock);
	iter = ocfs2_dlm_next_res(iter, priv);
	/* move our dummy marker forward so the next call resumes from
	 * the right place */
	list_del_init(&dummy->l_debug_list);
	if (iter) {
		list_add(&dummy->l_debug_list, &iter->l_debug_list);
		priv->p_tmp_res = *iter;
		iter = &priv->p_tmp_res;
	}
	spin_unlock(&ocfs2_dlm_tracking_lock);

	return iter;
}

/* So that debugfs.ocfs2 can determine which format is being used */
#define OCFS2_DLM_DEBUG_STR_VERSION 1
static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
{
	int i;
	char *lvb;
	struct ocfs2_lock_res *lockres = v;

	if (!lockres)
		return -EINVAL;

	seq_printf(m, "0x%x\t"
		   "%.*s\t"
		   "%d\t"
		   "0x%lx\t"
		   "0x%x\t"
		   "0x%x\t"
		   "%u\t"
"%u\t" 1944 "%d\t" 1945 "%d\t", 1946 OCFS2_DLM_DEBUG_STR_VERSION, 1947 OCFS2_LOCK_ID_MAX_LEN, lockres->l_name, 1948 lockres->l_level, 1949 lockres->l_flags, 1950 lockres->l_action, 1951 lockres->l_unlock_action, 1952 lockres->l_ro_holders, 1953 lockres->l_ex_holders, 1954 lockres->l_requested, 1955 lockres->l_blocking); 1956 1957 /* Dump the raw LVB */ 1958 lvb = lockres->l_lksb.lvb; 1959 for(i = 0; i < DLM_LVB_LEN; i++) 1960 seq_printf(m, "0x%x\t", lvb[i]); 1961 1962 /* End the line */ 1963 seq_printf(m, "\n"); 1964 return 0; 1965 } 1966 1967 static struct seq_operations ocfs2_dlm_seq_ops = { 1968 .start = ocfs2_dlm_seq_start, 1969 .stop = ocfs2_dlm_seq_stop, 1970 .next = ocfs2_dlm_seq_next, 1971 .show = ocfs2_dlm_seq_show, 1972 }; 1973 1974 static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file) 1975 { 1976 struct seq_file *seq = (struct seq_file *) file->private_data; 1977 struct ocfs2_dlm_seq_priv *priv = seq->private; 1978 struct ocfs2_lock_res *res = &priv->p_iter_res; 1979 1980 ocfs2_remove_lockres_tracking(res); 1981 ocfs2_put_dlm_debug(priv->p_dlm_debug); 1982 return seq_release_private(inode, file); 1983 } 1984 1985 static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) 1986 { 1987 int ret; 1988 struct ocfs2_dlm_seq_priv *priv; 1989 struct seq_file *seq; 1990 struct ocfs2_super *osb; 1991 1992 priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL); 1993 if (!priv) { 1994 ret = -ENOMEM; 1995 mlog_errno(ret); 1996 goto out; 1997 } 1998 osb = (struct ocfs2_super *) inode->u.generic_ip; 1999 ocfs2_get_dlm_debug(osb->osb_dlm_debug); 2000 priv->p_dlm_debug = osb->osb_dlm_debug; 2001 INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); 2002 2003 ret = seq_open(file, &ocfs2_dlm_seq_ops); 2004 if (ret) { 2005 kfree(priv); 2006 mlog_errno(ret); 2007 goto out; 2008 } 2009 2010 seq = (struct seq_file *) file->private_data; 2011 seq->private = priv; 2012 2013 ocfs2_add_lockres_tracking(&priv->p_iter_res, 2014 priv->p_dlm_debug); 
2015 2016 out: 2017 return ret; 2018 } 2019 2020 static const struct file_operations ocfs2_dlm_debug_fops = { 2021 .open = ocfs2_dlm_debug_open, 2022 .release = ocfs2_dlm_debug_release, 2023 .read = seq_read, 2024 .llseek = seq_lseek, 2025 }; 2026 2027 static int ocfs2_dlm_init_debug(struct ocfs2_super *osb) 2028 { 2029 int ret = 0; 2030 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; 2031 2032 dlm_debug->d_locking_state = debugfs_create_file("locking_state", 2033 S_IFREG|S_IRUSR, 2034 osb->osb_debug_root, 2035 osb, 2036 &ocfs2_dlm_debug_fops); 2037 if (!dlm_debug->d_locking_state) { 2038 ret = -EINVAL; 2039 mlog(ML_ERROR, 2040 "Unable to create locking state debugfs file.\n"); 2041 goto out; 2042 } 2043 2044 ocfs2_get_dlm_debug(dlm_debug); 2045 out: 2046 return ret; 2047 } 2048 2049 static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) 2050 { 2051 struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; 2052 2053 if (dlm_debug) { 2054 debugfs_remove(dlm_debug->d_locking_state); 2055 ocfs2_put_dlm_debug(dlm_debug); 2056 } 2057 } 2058 2059 int ocfs2_dlm_init(struct ocfs2_super *osb) 2060 { 2061 int status; 2062 u32 dlm_key; 2063 struct dlm_ctxt *dlm; 2064 2065 mlog_entry_void(); 2066 2067 status = ocfs2_dlm_init_debug(osb); 2068 if (status < 0) { 2069 mlog_errno(status); 2070 goto bail; 2071 } 2072 2073 /* launch vote thread */ 2074 osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d", 2075 osb->osb_id); 2076 if (IS_ERR(osb->vote_task)) { 2077 status = PTR_ERR(osb->vote_task); 2078 osb->vote_task = NULL; 2079 mlog_errno(status); 2080 goto bail; 2081 } 2082 2083 /* used by the dlm code to make message headers unique, each 2084 * node in this domain must agree on this. 
 */
	dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));

	/* for now, uuid == domain */
	dlm = dlm_register_domain(osb->uuid_str, dlm_key);
	if (IS_ERR(dlm)) {
		status = PTR_ERR(dlm);
		mlog_errno(status);
		goto bail;
	}

	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);

	dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);

	osb->dlm = dlm;

	status = 0;
bail:
	/* unwind the debug file and vote thread on any failure */
	if (status < 0) {
		ocfs2_dlm_shutdown_debug(osb);
		if (osb->vote_task)
			kthread_stop(osb->vote_task);
	}

	mlog_exit(status);
	return status;
}

void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
{
	mlog_entry_void();

	dlm_unregister_eviction_cb(&osb->osb_eviction_cb);

	ocfs2_drop_osb_locks(osb);

	if (osb->vote_task) {
		kthread_stop(osb->vote_task);
		osb->vote_task = NULL;
	}

	ocfs2_lock_res_free(&osb->osb_super_lockres);
	ocfs2_lock_res_free(&osb->osb_rename_lockres);

	dlm_unregister_domain(osb->dlm);
	osb->dlm = NULL;

	ocfs2_dlm_shutdown_debug(osb);

	mlog_exit_void();
}

/* AST fired by the dlm when an unlock (or convert-cancel) request
 * completes; finishes the unlock state machine and wakes waiters. */
static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status)
{
	struct ocfs2_lock_res *lockres = opaque;
	unsigned long flags;

	mlog_entry_void();

	mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
	     lockres->l_unlock_action);

	spin_lock_irqsave(&lockres->l_lock, flags);
	/* We tried to cancel a convert request, but it was already
	 * granted. All we want to do here is clear our unlock
	 * state. The wake_up call done at the bottom is redundant
	 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
	 * hurt anything anyway */
	if (status == DLM_CANCELGRANT &&
	    lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
		mlog(0, "Got cancelgrant for %s\n", lockres->l_name);

		/* We don't clear the busy flag in this case as it
		 * should have been cleared by the ast which the dlm
		 * has called. */
		goto complete_unlock;
	}

	if (status != DLM_NORMAL) {
		mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
		     "unlock_action %d\n", status, lockres->l_name,
		     lockres->l_unlock_action);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		return;
	}

	switch(lockres->l_unlock_action) {
	case OCFS2_UNLOCK_CANCEL_CONVERT:
		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
		lockres->l_action = OCFS2_AST_INVALID;
		break;
	case OCFS2_UNLOCK_DROP_LOCK:
		lockres->l_level = LKM_IVMODE;
		break;
	default:
		BUG();
	}

	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
complete_unlock:
	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	wake_up(&lockres->l_event);

	mlog_exit_void();
}

/* Optional callback invoked just before a lock is dropped, while the
 * lockres spinlock is held (see ocfs2_drop_lock). */
typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);

struct drop_lock_cb {
	ocfs2_pre_drop_cb_t	*drop_func;
	void			*drop_data;
};

static int ocfs2_drop_lock(struct ocfs2_super *osb,
			   struct ocfs2_lock_res *lockres,
			   struct drop_lock_cb *dcb)
{
	enum dlm_status status;
	unsigned long flags;

	/* We didn't get anywhere near actually using this lockres.
 */
	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
		goto out;

	spin_lock_irqsave(&lockres->l_lock, flags);

	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
			"lockres %s, flags 0x%lx\n",
			lockres->l_name, lockres->l_flags);

	while (lockres->l_flags & OCFS2_LOCK_BUSY) {
		mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
		     "%u, unlock_action = %u\n",
		     lockres->l_name, lockres->l_flags, lockres->l_action,
		     lockres->l_unlock_action);

		spin_unlock_irqrestore(&lockres->l_lock, flags);

		/* XXX: Today we just wait on any busy
		 * locks... Perhaps we need to cancel converts in the
		 * future? */
		ocfs2_wait_on_busy_lock(lockres);

		spin_lock_irqsave(&lockres->l_lock, flags);
	}

	/* Last chance for the caller to touch the lock (e.g. refresh
	 * the LVB) while we still hold l_lock and remain attached. */
	if (dcb)
		dcb->drop_func(lockres, dcb->drop_data);

	if (lockres->l_flags & OCFS2_LOCK_BUSY)
		mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
		     lockres->l_name);
	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
		mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);

	/* Nothing to unlock if we never attached to the dlm. */
	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		goto out;
	}

	lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);

	/* make sure we never get here while waiting for an ast to
	 * fire. */
	BUG_ON(lockres->l_action != OCFS2_AST_INVALID);

	/* is this necessary?
*/ 2255 lockres_or_flags(lockres, OCFS2_LOCK_BUSY); 2256 lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK; 2257 spin_unlock_irqrestore(&lockres->l_lock, flags); 2258 2259 mlog(0, "lock %s\n", lockres->l_name); 2260 2261 status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK, 2262 lockres->l_ops->unlock_ast, lockres); 2263 if (status != DLM_NORMAL) { 2264 ocfs2_log_dlm_error("dlmunlock", status, lockres); 2265 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); 2266 dlm_print_one_lock(lockres->l_lksb.lockid); 2267 BUG(); 2268 } 2269 mlog(0, "lock %s, successfull return from dlmunlock\n", 2270 lockres->l_name); 2271 2272 ocfs2_wait_on_busy_lock(lockres); 2273 out: 2274 mlog_exit(0); 2275 return 0; 2276 } 2277 2278 /* Mark the lockres as being dropped. It will no longer be 2279 * queued if blocking, but we still may have to wait on it 2280 * being dequeued from the vote thread before we can consider 2281 * it safe to drop. 2282 * 2283 * You can *not* attempt to call cluster_lock on this lockres anymore. 
 */
void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
{
	int status;
	struct ocfs2_mask_waiter mw;
	unsigned long flags;

	ocfs2_init_mask_waiter(&mw);

	spin_lock_irqsave(&lockres->l_lock, flags);
	lockres->l_flags |= OCFS2_LOCK_FREEING;
	/* Wait for the vote thread to dequeue this lockres --
	 * OCFS2_LOCK_QUEUED is cleared once it has been processed. */
	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		mlog(0, "Waiting on lockres %s\n", lockres->l_name);

		status = ocfs2_wait_for_mask(&mw);
		if (status)
			mlog_errno(status);

		spin_lock_irqsave(&lockres->l_lock, flags);
	}
	spin_unlock_irqrestore(&lockres->l_lock, flags);
}

/* Drop the two osb-global lock resources (super and rename). */
static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
{
	int status;

	mlog_entry_void();

	ocfs2_mark_lockres_freeing(&osb->osb_super_lockres);

	status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL);
	if (status < 0)
		mlog_errno(status);

	ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres);

	status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
	if (status < 0)
		mlog_errno(status);

	mlog_exit(status);
}

/* Pre-drop callback for the metadata lock: if we hold EX and our
 * cached inode values are current (no refresh pending), stuff them
 * into the LVB so other nodes see up to date values after the drop. */
static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data)
{
	struct inode *inode = data;

	/* the metadata lock requires a bit more work as we have an
	 * LVB to worry about. */
	if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
	    lockres->l_level == LKM_EXMODE &&
	    !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
		__ocfs2_stuff_meta_lvb(inode);
}

/* Drop all three per-inode cluster locks (data, meta, rw).  Always
 * attempts all three drops; returns the first error encountered. */
int ocfs2_drop_inode_locks(struct inode *inode)
{
	int status, err;
	struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, };

	mlog_entry_void();

	/* No need to call ocfs2_mark_lockres_freeing here -
	 * ocfs2_clear_inode has done it for us.
 */

	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
			      &OCFS2_I(inode)->ip_data_lockres,
			      NULL);
	if (err < 0)
		mlog_errno(err);

	status = err;

	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
			      &OCFS2_I(inode)->ip_meta_lockres,
			      &meta_dcb);
	if (err < 0)
		mlog_errno(err);
	if (err < 0 && !status)
		status = err;

	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
			      &OCFS2_I(inode)->ip_rw_lockres,
			      NULL);
	if (err < 0)
		mlog_errno(err);
	if (err < 0 && !status)
		status = err;

	mlog_exit(status);
	return status;
}

/* Record our intent to downconvert: set the requested level and mark
 * the lockres busy.  Caller must hold l_lock and follow up with
 * ocfs2_downconvert_lock(). */
static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
				      int new_level)
{
	assert_spin_locked(&lockres->l_lock);

	BUG_ON(lockres->l_blocking <= LKM_NLMODE);

	/* A downconvert must move to a strictly lower level. */
	if (lockres->l_level <= new_level) {
		mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
		     lockres->l_level, new_level);
		BUG();
	}

	mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
	     lockres->l_name, new_level, lockres->l_blocking);

	lockres->l_action = OCFS2_AST_DOWNCONVERT;
	lockres->l_requested = new_level;
	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
}

/* Issue the dlm convert for a downconvert prepared above.  "lvb"
 * selects whether our LVB contents go out with the convert
 * (LKM_VALBLK). */
static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
				  struct ocfs2_lock_res *lockres,
				  int new_level,
				  int lvb)
{
	int ret, dlm_flags = LKM_CONVERT;
	enum dlm_status status;

	mlog_entry_void();

	if (lvb)
		dlm_flags |= LKM_VALBLK;

	status = dlmlock(osb->dlm,
			 new_level,
			 &lockres->l_lksb,
			 dlm_flags,
			 lockres->l_name,
			 lockres->l_ops->ast,
			 lockres,
			 lockres->l_ops->bast);
	if (status != DLM_NORMAL) {
		ocfs2_log_dlm_error("dlmlock", status, lockres);
		ret = -EINVAL;
		ocfs2_recover_from_dlm_error(lockres, 1);
		goto bail;
	}

	ret = 0;
bail:
	mlog_exit(ret);
	return ret;
}

/*
 * returns 1 when the caller should unlock and call dlmunlock */
static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres)
{
	assert_spin_locked(&lockres->l_lock);

	mlog_entry_void();
	mlog(0, "lock %s\n", lockres->l_name);

	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
		/* If we're already trying to cancel a lock conversion
		 * then just drop the spinlock and allow the caller to
		 * requeue this lock. */

		mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
		return 0;
	}

	/* were we in a convert when we got the bast fire? */
	BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
	       lockres->l_action != OCFS2_AST_DOWNCONVERT);
	/* set things up for the unlockast to know to just
	 * clear out the ast_action and unset busy, etc. */
	lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;

	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
			"lock %s, invalid flags: 0x%lx\n",
			lockres->l_name, lockres->l_flags);

	return 1;
}

/* Ask the dlm to cancel an in-progress convert on this lockres.
 * Completion is reported through the unlock ast. */
static int ocfs2_cancel_convert(struct ocfs2_super *osb,
				struct ocfs2_lock_res *lockres)
{
	int ret;
	enum dlm_status status;

	mlog_entry_void();
	mlog(0, "lock %s\n", lockres->l_name);

	ret = 0;
	status = dlmunlock(osb->dlm,
			   &lockres->l_lksb,
			   LKM_CANCEL,
			   lockres->l_ops->unlock_ast,
			   lockres);
	if (status != DLM_NORMAL) {
		ocfs2_log_dlm_error("dlmunlock", status, lockres);
		ret = -EINVAL;
		ocfs2_recover_from_dlm_error(lockres, 0);
	}

	mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);

	mlog_exit(ret);
	return ret;
}

/* Decide whether the metadata lock may be downconverted to new_level
 * right now.  Caller holds l_lock. */
static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
						  struct ocfs2_lock_res *lockres,
						  int new_level)
{
	int ret;

	mlog_entry_void();

	BUG_ON(new_level != LKM_NLMODE &&
	       new_level != LKM_PRMODE);

	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
		/* Don't downconvert mid-refresh -- back off and let
		 * the refresh finish first. */
		ret = 0;
		mlog(0, "lockres %s currently being refreshed -- backing "
		     "off!\n", lockres->l_name);
	} else if (new_level == LKM_PRMODE)
		ret = !lockres->l_ex_holders &&
			ocfs2_inode_fully_checkpointed(inode);
	else /* Must be NLMODE we're converting to. */
		ret = !lockres->l_ro_holders && !lockres->l_ex_holders &&
			ocfs2_inode_fully_checkpointed(inode);

	mlog_exit(ret);
	return ret;
}

/* Unblock logic for the metadata lock: cancel a busy convert, or
 * downconvert (stuffing the LVB when dropping from EX) once no
 * conflicting holders remain and the inode is checkpointed.  Sets
 * *requeue when this lockres needs another pass. */
static int ocfs2_do_unblock_meta(struct inode *inode,
				 int *requeue)
{
	int new_level;
	int set_lvb = 0;
	int ret = 0;
	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
	unsigned long flags;

	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog_entry_void();

	spin_lock_irqsave(&lockres->l_lock, flags);

	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));

	mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level,
	     lockres->l_blocking);

	BUG_ON(lockres->l_level != LKM_EXMODE &&
	       lockres->l_level != LKM_PRMODE);

	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
		/* A convert is in flight -- try to cancel it and come
		 * back later. */
		*requeue = 1;
		ret = ocfs2_prepare_cancel_convert(osb, lockres);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		if (ret) {
			ret = ocfs2_cancel_convert(osb, lockres);
			if (ret < 0)
				mlog_errno(ret);
		}
		goto leave;
	}

	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);

	mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n",
	     lockres->l_level, lockres->l_blocking, new_level);

	if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
		if (lockres->l_level == LKM_EXMODE)
			set_lvb = 1;

		/* If the lock hasn't been refreshed yet (rare), then
		 * our memory inode values are old and we skip
		 * stuffing the lvb.
		 * There's no need to actually clear
		 * out the lvb here as its value is still valid. */
		if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
			if (set_lvb)
				__ocfs2_stuff_meta_lvb(inode);
		} else
			mlog(0, "lockres %s: downconverting stale lock!\n",
			     lockres->l_name);

		mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
		     "l_blocking=%d, new_level=%d\n",
		     lockres->l_level, lockres->l_blocking, new_level);

		ocfs2_prepare_downconvert(lockres, new_level);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
		goto leave;
	}
	/* Can't downconvert yet -- kick off a checkpoint so we can
	 * make progress next time around, and requeue. */
	if (!ocfs2_inode_fully_checkpointed(inode))
		ocfs2_start_checkpoint(osb);

	*requeue = 1;
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	ret = 0;
leave:
	mlog_exit(ret);
	return ret;
}

/* Generic unblock path: cancel a busy convert, requeue while
 * incompatible holders remain, otherwise run the optional worker and
 * downconvert.  Used by lock types without meta-style LVB handling. */
static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
				      struct ocfs2_lock_res *lockres,
				      int *requeue,
				      ocfs2_convert_worker_t *worker)
{
	unsigned long flags;
	int blocking;
	int new_level;
	int ret = 0;

	mlog_entry_void();

	spin_lock_irqsave(&lockres->l_lock, flags);

	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));

recheck:
	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
		/* A convert is in flight -- try to cancel it and come
		 * back later. */
		*requeue = 1;
		ret = ocfs2_prepare_cancel_convert(osb, lockres);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		if (ret) {
			ret = ocfs2_cancel_convert(osb, lockres);
			if (ret < 0)
				mlog_errno(ret);
		}
		goto leave;
	}

	/* if we're blocking an exclusive and we have *any* holders,
	 * then requeue.
 */
	if ((lockres->l_blocking == LKM_EXMODE)
	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		*requeue = 1;
		ret = 0;
		goto leave;
	}

	/* If it's a PR we're blocking, then only
	 * requeue if we've got any EX holders */
	if (lockres->l_blocking == LKM_PRMODE &&
	    lockres->l_ex_holders) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		*requeue = 1;
		ret = 0;
		goto leave;
	}

	/* If we get here, then we know that there are no more
	 * incompatible holders (and anyone asking for an incompatible
	 * lock is blocked). We can now downconvert the lock */
	if (!worker)
		goto downconvert;

	/* Some lockres types want to do a bit of work before
	 * downconverting a lock. Allow that here. The worker function
	 * may sleep, so we save off a copy of what we're blocking as
	 * it may change while we're not holding the spin lock. */
	blocking = lockres->l_blocking;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	worker(lockres, blocking);

	spin_lock_irqsave(&lockres->l_lock, flags);
	if (blocking != lockres->l_blocking) {
		/* If this changed underneath us, then we can't drop
		 * it just yet.
 */
		goto recheck;
	}

downconvert:
	*requeue = 0;
	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);

	ocfs2_prepare_downconvert(lockres, new_level);
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0);
leave:
	mlog_exit(ret);
	return ret;
}

/* Convert worker for the data lock: flush dirty pages before we give
 * up our level, and toss the page cache entirely when another node
 * wants exclusive access. */
static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
				      int blocking)
{
	struct inode *inode;
	struct address_space *mapping;

	mlog_entry_void();

	inode = ocfs2_lock_res_inode(lockres);
	mapping = inode->i_mapping;

	if (filemap_fdatawrite(mapping)) {
		mlog(ML_ERROR, "Could not sync inode %llu for downconvert!",
		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
	}
	sync_mapping_buffers(mapping);
	if (blocking == LKM_EXMODE) {
		truncate_inode_pages(mapping, 0);
		unmap_mapping_range(mapping, 0, 0, 0);
	} else {
		/* We only need to wait on the I/O if we're not also
		 * truncating pages because truncate_inode_pages waits
		 * for us above. We don't truncate pages if we're
		 * blocking anything < EXMODE because we want to keep
		 * them around in that case.
 */
		filemap_fdatawait(mapping);
	}

	mlog_exit_void();
}

/* Unblock callback for the per-inode data lock -- generic unblock
 * with the data convert worker plugged in. */
int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
		       int *requeue)
{
	int status;
	struct inode *inode;
	struct ocfs2_super *osb;

	mlog_entry_void();

	inode = ocfs2_lock_res_inode(lockres);
	osb = OCFS2_SB(inode->i_sb);

	mlog(0, "unblock inode %llu\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno);

	status = ocfs2_generic_unblock_lock(osb,
					    lockres,
					    requeue,
					    ocfs2_data_convert_worker);
	if (status < 0)
		mlog_errno(status);

	mlog(0, "inode %llu, requeue = %d\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno, *requeue);

	mlog_exit(status);
	return status;
}

/* Unblock callback for per-inode locks that need no convert worker. */
static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
				    int *requeue)
{
	int status;
	struct inode *inode;

	mlog_entry_void();

	mlog(0, "Unblock lockres %s\n", lockres->l_name);

	inode = ocfs2_lock_res_inode(lockres);

	status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb),
					    lockres,
					    requeue,
					    NULL);
	if (status < 0)
		mlog_errno(status);

	mlog_exit(status);
	return status;
}


/* Unblock callback for the per-inode metadata lock -- delegates to
 * the meta-specific path which knows about the LVB. */
int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
		       int *requeue)
{
	int status;
	struct inode *inode;

	mlog_entry_void();

	inode = ocfs2_lock_res_inode(lockres);

	mlog(0, "unblock inode %llu\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno);

	status = ocfs2_do_unblock_meta(inode, requeue);
	if (status < 0)
		mlog_errno(status);

	mlog(0, "inode %llu, requeue = %d\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno, *requeue);

	mlog_exit(status);
	return status;
}

/* Generic unblock function for any lockres whose private data is an
 * ocfs2_super pointer.
 */
static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
				  int *requeue)
{
	int status;
	struct ocfs2_super *osb;

	mlog_entry_void();

	mlog(0, "Unblock lockres %s\n", lockres->l_name);

	osb = ocfs2_lock_res_super(lockres);

	status = ocfs2_generic_unblock_lock(osb,
					    lockres,
					    requeue,
					    NULL);
	if (status < 0)
		mlog_errno(status);

	mlog_exit(status);
	return status;
}

/* Called by the vote thread for each queued blocked lockres: run its
 * unblock handler and either dequeue it or reschedule it for another
 * pass. */
void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
				struct ocfs2_lock_res *lockres)
{
	int status;
	int requeue = 0;
	unsigned long flags;

	/* Our reference to the lockres in this function can be
	 * considered valid until we remove the OCFS2_LOCK_QUEUED
	 * flag. */

	mlog_entry_void();

	BUG_ON(!lockres);
	BUG_ON(!lockres->l_ops);
	BUG_ON(!lockres->l_ops->unblock);

	mlog(0, "lockres %s blocked.\n", lockres->l_name);

	/* Detect whether a lock has been marked as going away while
	 * the vote thread was processing other things. A lock can
	 * still be marked with OCFS2_LOCK_FREEING after this check,
	 * but short circuiting here will still save us some
	 * performance. */
	spin_lock_irqsave(&lockres->l_lock, flags);
	if (lockres->l_flags & OCFS2_LOCK_FREEING)
		goto unqueue;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	status = lockres->l_ops->unblock(lockres, &requeue);
	if (status < 0)
		mlog_errno(status);

	spin_lock_irqsave(&lockres->l_lock, flags);
unqueue:
	if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) {
		lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
	} else
		ocfs2_schedule_blocked_lock(osb, lockres);

	mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
	     requeue ?
	     "yes" : "no");
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	mlog_exit_void();
}

/* Put a blocked lockres on the osb's blocked list for the vote thread
 * to process.  Caller must hold l_lock. */
static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres)
{
	mlog_entry_void();

	assert_spin_locked(&lockres->l_lock);

	if (lockres->l_flags & OCFS2_LOCK_FREEING) {
		/* Do not schedule a lock for downconvert when it's on
		 * the way to destruction - any nodes wanting access
		 * to the resource will get it soon. */
		mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
		     lockres->l_name, lockres->l_flags);
		return;
	}

	lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);

	spin_lock(&osb->vote_task_lock);
	/* Only enqueue once -- an empty list_head means we are not
	 * currently on the blocked list. */
	if (list_empty(&lockres->l_blocked_list)) {
		list_add_tail(&lockres->l_blocked_list,
			      &osb->blocked_lock_list);
		osb->blocked_lock_count++;
	}
	spin_unlock(&osb->vote_task_lock);

	mlog_exit_void();
}

/* This aids in debugging situations where a bad LVB might be involved.
 */
void ocfs2_dump_meta_lvb_info(u64 level,
			      const char *function,
			      unsigned int line,
			      struct ocfs2_lock_res *lockres)
{
	/* The meta LVB lives in the lksb attached to this lockres. */
	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;

	mlog(level, "LVB information for %s (called from %s:%u):\n",
	     lockres->l_name, function, line);
	mlog(level, "version: %u, clusters: %u\n",
	     be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters));
	mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
	     (unsigned long long)be64_to_cpu(lvb->lvb_isize),
	     be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
	     be16_to_cpu(lvb->lvb_imode));
	mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
	     "mtime_packed 0x%llx\n", be16_to_cpu(lvb->lvb_inlink),
	     (long long)be64_to_cpu(lvb->lvb_iatime_packed),
	     (long long)be64_to_cpu(lvb->lvb_ictime_packed),
	     (long long)be64_to_cpu(lvb->lvb_imtime_packed));
}