/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * dlmglue.c
 *
 * Code which implements an OCFS2 specific interface to our DLM.
 *
 * Copyright (C) 2003, 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/smp_lock.h>
#include <linux/crc32.h>
#include <linux/kthread.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

#include <cluster/heartbeat.h>
#include <cluster/nodemanager.h>
#include <cluster/tcp.h>

#include <dlm/dlmapi.h>

#define MLOG_MASK_PREFIX ML_DLM_GLUE
#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "heartbeat.h"
#include "inode.h"
#include "journal.h"
#include "slot_map.h"
#include "super.h"
#include "uptodate.h"
#include "vote.h"

#include "buffer_head_io.h"

struct ocfs2_mask_waiter {
	struct list_head	mw_item;
	int			mw_status;
	struct completion	mw_complete;
	unsigned long		mw_mask;
	unsigned long		mw_goal;
};

static void ocfs2_inode_ast_func(void *opaque);
static void ocfs2_inode_bast_func(void *opaque,
				  int level);
static void ocfs2_super_ast_func(void *opaque);
static void ocfs2_super_bast_func(void *opaque,
				  int level);
static void ocfs2_rename_ast_func(void *opaque);
static void ocfs2_rename_bast_func(void *opaque,
				   int level);

/* so far, all locks have gotten along with the same unlock ast */
static void ocfs2_unlock_ast_func(void *opaque,
				  enum dlm_status status);
static int ocfs2_do_unblock_meta(struct inode *inode,
				 int *requeue);
static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
			      int *requeue);
static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
			      int *requeue);
static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
				    int *requeue);
static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
				  int *requeue);
typedef void (ocfs2_convert_worker_t)(struct ocfs2_lock_res *, int);
static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
				      struct ocfs2_lock_res *lockres,
				      int *requeue,
				      ocfs2_convert_worker_t *worker);

struct ocfs2_lock_res_ops {
	void (*ast)(void *);
	void (*bast)(void *, int);
	void (*unlock_ast)(void *, enum dlm_status);
	int (*unblock)(struct ocfs2_lock_res *, int *);
};

static struct ocfs2_lock_res_ops ocfs2_inode_rw_lops = {
	.ast		= ocfs2_inode_ast_func,
	.bast		= ocfs2_inode_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_inode_lock,
};
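/*
 * Every lock type dispatches through a struct ocfs2_lock_res_ops:
 * ->ast runs when the dlm grants a request, ->bast when another
 * node wants a level we're blocking, ->unlock_ast when an unlock
 * or cancel completes, and ->unblock is called from the vote
 * thread to resolve a blocked lock. The tables here only differ
 * where a lock type needs its own unblock behaviour.
 */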
static struct ocfs2_lock_res_ops ocfs2_inode_meta_lops = {
	.ast		= ocfs2_inode_ast_func,
	.bast		= ocfs2_inode_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_meta,
};

static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
				      int blocking);

static struct ocfs2_lock_res_ops ocfs2_inode_data_lops = {
	.ast		= ocfs2_inode_ast_func,
	.bast		= ocfs2_inode_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_data,
};

static struct ocfs2_lock_res_ops ocfs2_super_lops = {
	.ast		= ocfs2_super_ast_func,
	.bast		= ocfs2_super_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_osb_lock,
};

static struct ocfs2_lock_res_ops ocfs2_rename_lops = {
	.ast		= ocfs2_rename_ast_func,
	.bast		= ocfs2_rename_bast_func,
	.unlock_ast	= ocfs2_unlock_ast_func,
	.unblock	= ocfs2_unblock_osb_lock,
};

static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
{
	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
		lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
		lockres->l_type == OCFS2_LOCK_TYPE_RW;
}

static inline int ocfs2_is_super_lock(struct ocfs2_lock_res *lockres)
{
	return lockres->l_type == OCFS2_LOCK_TYPE_SUPER;
}

static inline int ocfs2_is_rename_lock(struct ocfs2_lock_res *lockres)
{
	return lockres->l_type == OCFS2_LOCK_TYPE_RENAME;
}

static inline struct ocfs2_super *ocfs2_lock_res_super(struct ocfs2_lock_res *lockres)
{
	BUG_ON(!ocfs2_is_super_lock(lockres)
	       && !ocfs2_is_rename_lock(lockres));

	return (struct ocfs2_super *) lockres->l_priv;
}

static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
{
	BUG_ON(!ocfs2_is_inode_lock(lockres));

	return (struct inode *) lockres->l_priv;
}

static int ocfs2_lock_create(struct ocfs2_super *osb,
			     struct ocfs2_lock_res *lockres,
			     int level,
			     int dlm_flags);
static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
						     int wanted);
static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
				 struct ocfs2_lock_res *lockres,
				 int level);
static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres);
static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres);
static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres);
static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, int level);
static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres);
static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
						int convert);
#define ocfs2_log_dlm_error(_func, _stat, _lockres) do {	\
	mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on "	\
	     "resource %s: %s\n", dlm_errname(_stat), _func,	\
	     _lockres->l_name, dlm_errmsg(_stat));		\
} while (0)
static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
				 struct ocfs2_lock_res *lockres);
static int ocfs2_meta_lock_update(struct inode *inode,
				  struct buffer_head **bh);
static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
static inline int ocfs2_highest_compat_lock_level(int level);
static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
						  struct ocfs2_lock_res *lockres,
						  int new_level);
static char *ocfs2_lock_type_strings[] = {
	[OCFS2_LOCK_TYPE_META] = "Meta",
	[OCFS2_LOCK_TYPE_DATA] = "Data",
	[OCFS2_LOCK_TYPE_SUPER] = "Super",
	[OCFS2_LOCK_TYPE_RENAME] = "Rename",
	/* Need to differentiate from [R]ename... serializing writes is the
	 * important job it does, anyway. */
	[OCFS2_LOCK_TYPE_RW] = "Write/Read",
};

static char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
{
	mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type);
	return ocfs2_lock_type_strings[type];
}

static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
				  u64 blkno,
				  u32 generation,
				  char *name)
{
	int len;

	mlog_entry_void();

	BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);

	len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
		       ocfs2_lock_type_char(type), OCFS2_LOCK_ID_PAD,
		       (long long)blkno, generation);

	BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));

	mlog(0, "built lock resource with name: %s\n", name);

	mlog_exit_void();
}
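/*
 * For illustration: the name is the type character, the
 * OCFS2_LOCK_ID_PAD string, sixteen hex digits of block number
 * and eight of generation, always OCFS2_LOCK_ID_MAX_LEN - 1
 * characters long. Assuming a pad of "000000", a meta lock on an
 * inode at block 0x1d31 with generation 0x284a073d would be named
 * "M0000000000000000001d31284a073d" -- the numbers here are
 * hypothetical and only show the layout.
 */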
static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);

static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res,
				       struct ocfs2_dlm_debug *dlm_debug)
{
	mlog(0, "Add tracking for lockres %s\n", res->l_name);

	spin_lock(&ocfs2_dlm_tracking_lock);
	list_add(&res->l_debug_list, &dlm_debug->d_lockres_tracking);
	spin_unlock(&ocfs2_dlm_tracking_lock);
}

static void ocfs2_remove_lockres_tracking(struct ocfs2_lock_res *res)
{
	spin_lock(&ocfs2_dlm_tracking_lock);
	if (!list_empty(&res->l_debug_list))
		list_del_init(&res->l_debug_list);
	spin_unlock(&ocfs2_dlm_tracking_lock);
}

static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
				       struct ocfs2_lock_res *res,
				       enum ocfs2_lock_type type,
				       u64 blkno,
				       u32 generation,
				       struct ocfs2_lock_res_ops *ops,
				       void *priv)
{
	ocfs2_build_lock_name(type, blkno, generation, res->l_name);

	res->l_type = type;
	res->l_ops = ops;
	res->l_priv = priv;

	res->l_level = LKM_IVMODE;
	res->l_requested = LKM_IVMODE;
	res->l_blocking = LKM_IVMODE;
	res->l_action = OCFS2_AST_INVALID;
	res->l_unlock_action = OCFS2_UNLOCK_INVALID;

	res->l_flags = OCFS2_LOCK_INITIALIZED;

	ocfs2_add_lockres_tracking(res, osb->osb_dlm_debug);
}

void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res)
{
	/* This also clears out the lock status block */
	memset(res, 0, sizeof(struct ocfs2_lock_res));
	spin_lock_init(&res->l_lock);
	init_waitqueue_head(&res->l_event);
	INIT_LIST_HEAD(&res->l_blocked_list);
	INIT_LIST_HEAD(&res->l_mask_waiters);
}

void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
			       enum ocfs2_lock_type type,
			       struct inode *inode)
{
	struct ocfs2_lock_res_ops *ops;

	switch(type) {
	case OCFS2_LOCK_TYPE_RW:
		ops = &ocfs2_inode_rw_lops;
		break;
	case OCFS2_LOCK_TYPE_META:
		ops = &ocfs2_inode_meta_lops;
		break;
	case OCFS2_LOCK_TYPE_DATA:
		ops = &ocfs2_inode_data_lops;
		break;
	default:
		mlog_bug_on_msg(1, "type: %d\n", type);
		ops = NULL; /* thanks, gcc */
		break;
	}

	ocfs2_lock_res_init_common(OCFS2_SB(inode->i_sb), res, type,
				   OCFS2_I(inode)->ip_blkno,
				   inode->i_generation, ops, inode);
}

static void ocfs2_super_lock_res_init(struct ocfs2_lock_res *res,
				      struct ocfs2_super *osb)
{
	/* Superblock lockres doesn't come from a slab so we call init
	 * once on it manually. */
	ocfs2_lock_res_init_once(res);
	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_SUPER,
				   OCFS2_SUPER_BLOCK_BLKNO, 0,
				   &ocfs2_super_lops, osb);
}

static void ocfs2_rename_lock_res_init(struct ocfs2_lock_res *res,
				       struct ocfs2_super *osb)
{
	/* Rename lockres doesn't come from a slab so we call init
	 * once on it manually. */
	ocfs2_lock_res_init_once(res);
	ocfs2_lock_res_init_common(osb, res, OCFS2_LOCK_TYPE_RENAME, 0, 0,
				   &ocfs2_rename_lops, osb);
}

void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
{
	mlog_entry_void();

	if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
		return;

	ocfs2_remove_lockres_tracking(res);

	mlog_bug_on_msg(!list_empty(&res->l_blocked_list),
			"Lockres %s is on the blocked list\n",
			res->l_name);
	mlog_bug_on_msg(!list_empty(&res->l_mask_waiters),
			"Lockres %s has mask waiters pending\n",
			res->l_name);
	mlog_bug_on_msg(spin_is_locked(&res->l_lock),
			"Lockres %s is locked\n",
			res->l_name);
	mlog_bug_on_msg(res->l_ro_holders,
			"Lockres %s has %u ro holders\n",
			res->l_name, res->l_ro_holders);
	mlog_bug_on_msg(res->l_ex_holders,
			"Lockres %s has %u ex holders\n",
			res->l_name, res->l_ex_holders);

	/* Need to clear out the lock status block for the dlm */
	memset(&res->l_lksb, 0, sizeof(res->l_lksb));

	res->l_flags = 0UL;
	mlog_exit_void();
}

static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
				     int level)
{
	mlog_entry_void();

	BUG_ON(!lockres);

	switch(level) {
	case LKM_EXMODE:
		lockres->l_ex_holders++;
		break;
	case LKM_PRMODE:
		lockres->l_ro_holders++;
		break;
	default:
		BUG();
	}

	mlog_exit_void();
}

static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
				     int level)
{
	mlog_entry_void();

	BUG_ON(!lockres);

	switch(level) {
	case LKM_EXMODE:
		BUG_ON(!lockres->l_ex_holders);
		lockres->l_ex_holders--;
		break;
	case LKM_PRMODE:
		BUG_ON(!lockres->l_ro_holders);
		lockres->l_ro_holders--;
		break;
	default:
		BUG();
	}
	mlog_exit_void();
}
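/*
 * The helper below implements the usual dlm compatibility matrix
 * for the three modes this file uses:
 *
 *	blocking request	highest level we may keep
 *	EX			NL
 *	PR			PR
 *
 * i.e. a blocked EX request forces us all the way down, while a
 * blocked PR request only requires giving up write access.
 */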
/* WARNING: This function lives in a world where the only three lock
 * levels are EX, PR, and NL. It *will* have to be adjusted when more
 * lock types are added. */
static inline int ocfs2_highest_compat_lock_level(int level)
{
	int new_level = LKM_EXMODE;

	if (level == LKM_EXMODE)
		new_level = LKM_NLMODE;
	else if (level == LKM_PRMODE)
		new_level = LKM_PRMODE;
	return new_level;
}

static void lockres_set_flags(struct ocfs2_lock_res *lockres,
			      unsigned long newflags)
{
	struct list_head *pos, *tmp;
	struct ocfs2_mask_waiter *mw;

	assert_spin_locked(&lockres->l_lock);

	lockres->l_flags = newflags;

	list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) {
		mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item);
		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
			continue;

		list_del_init(&mw->mw_item);
		mw->mw_status = 0;
		complete(&mw->mw_complete);
	}
}
static void lockres_or_flags(struct ocfs2_lock_res *lockres, unsigned long or)
{
	lockres_set_flags(lockres, lockres->l_flags | or);
}
static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
				unsigned long clear)
{
	lockres_set_flags(lockres, lockres->l_flags & ~clear);
}

static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
{
	mlog_entry_void();

	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
	BUG_ON(lockres->l_blocking <= LKM_NLMODE);

	lockres->l_level = lockres->l_requested;
	if (lockres->l_level <=
	    ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
		lockres->l_blocking = LKM_NLMODE;
		lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
	}
	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);

	mlog_exit_void();
}
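/*
 * Together with the handler above, the two handlers below cover
 * the three l_action states an AST can find: OCFS2_AST_ATTACH for
 * a first attach, OCFS2_AST_CONVERT for an upconvert and
 * OCFS2_AST_DOWNCONVERT when we give up a level for another node.
 * Each moves l_level to l_requested and clears OCFS2_LOCK_BUSY,
 * which also satisfies any mask waiters queued on the busy flag.
 */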
static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
{
	mlog_entry_void();

	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));

	/* Convert from RO to EX doesn't really need anything as our
	 * information is already up to date. Convert from NL to
	 * *anything* however should mark ourselves as needing an
	 * update */
	if (lockres->l_level == LKM_NLMODE)
		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);

	lockres->l_level = lockres->l_requested;
	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);

	mlog_exit_void();
}

static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
{
	mlog_entry_void();

	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);

	if (lockres->l_requested > LKM_NLMODE &&
	    !(lockres->l_flags & OCFS2_LOCK_LOCAL))
		lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);

	lockres->l_level = lockres->l_requested;
	lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);

	mlog_exit_void();
}

static void ocfs2_inode_ast_func(void *opaque)
{
	struct ocfs2_lock_res *lockres = opaque;
	struct inode *inode;
	struct dlm_lockstatus *lksb;
	unsigned long flags;

	mlog_entry_void();

	inode = ocfs2_lock_res_inode(lockres);

	mlog(0, "AST fired for inode %llu, l_action = %u, type = %s\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno, lockres->l_action,
	     ocfs2_lock_type_string(lockres->l_type));

	BUG_ON(!ocfs2_is_inode_lock(lockres));

	spin_lock_irqsave(&lockres->l_lock, flags);

	lksb = &(lockres->l_lksb);
	if (lksb->status != DLM_NORMAL) {
		mlog(ML_ERROR, "ocfs2_inode_ast_func: lksb status value of %u "
		     "on inode %llu\n", lksb->status,
		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		mlog_exit_void();
		return;
	}

	switch(lockres->l_action) {
	case OCFS2_AST_ATTACH:
		ocfs2_generic_handle_attach_action(lockres);
		lockres_clear_flags(lockres, OCFS2_LOCK_LOCAL);
		break;
	case OCFS2_AST_CONVERT:
		ocfs2_generic_handle_convert_action(lockres);
		break;
	case OCFS2_AST_DOWNCONVERT:
		ocfs2_generic_handle_downconvert_action(lockres);
		break;
	default:
		mlog(ML_ERROR, "lockres %s: ast fired with invalid action: %u "
		     "lockres flags = 0x%lx, unlock action: %u\n",
		     lockres->l_name, lockres->l_action, lockres->l_flags,
		     lockres->l_unlock_action);

		BUG();
	}

	/* data and rw locking ignores refresh flag for now. */
	if (lockres->l_type != OCFS2_LOCK_TYPE_META)
		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);

	/* set it to something invalid so if we get called again we
	 * can catch it. */
	lockres->l_action = OCFS2_AST_INVALID;
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	wake_up(&lockres->l_event);

	mlog_exit_void();
}

static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
				     int level)
{
	int needs_downconvert = 0;
	mlog_entry_void();

	assert_spin_locked(&lockres->l_lock);

	lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);

	if (level > lockres->l_blocking) {
		/* only schedule a downconvert if we haven't already scheduled
		 * one that goes low enough to satisfy the level we're
		 * blocking. This also catches the case where we get
		 * duplicate BASTs */
		if (ocfs2_highest_compat_lock_level(level) <
		    ocfs2_highest_compat_lock_level(lockres->l_blocking))
			needs_downconvert = 1;

		lockres->l_blocking = level;
	}

	mlog_exit(needs_downconvert);
	return needs_downconvert;
}

static void ocfs2_generic_bast_func(struct ocfs2_super *osb,
				    struct ocfs2_lock_res *lockres,
				    int level)
{
	int needs_downconvert;
	unsigned long flags;

	mlog_entry_void();

	BUG_ON(level <= LKM_NLMODE);

	spin_lock_irqsave(&lockres->l_lock, flags);
	needs_downconvert = ocfs2_generic_handle_bast(lockres, level);
	if (needs_downconvert)
		ocfs2_schedule_blocked_lock(osb, lockres);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	ocfs2_kick_vote_thread(osb);

	wake_up(&lockres->l_event);
	mlog_exit_void();
}

static void ocfs2_inode_bast_func(void *opaque, int level)
{
	struct ocfs2_lock_res *lockres = opaque;
	struct inode *inode;
	struct ocfs2_super *osb;

	mlog_entry_void();

	BUG_ON(!ocfs2_is_inode_lock(lockres));

	inode = ocfs2_lock_res_inode(lockres);
	osb = OCFS2_SB(inode->i_sb);

	mlog(0, "BAST fired for inode %llu, blocking %d, level %d type %s\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno, level,
	     lockres->l_level, ocfs2_lock_type_string(lockres->l_type));

	ocfs2_generic_bast_func(osb, lockres, level);

	mlog_exit_void();
}

static void ocfs2_generic_ast_func(struct ocfs2_lock_res *lockres,
				   int ignore_refresh)
{
	struct dlm_lockstatus *lksb = &lockres->l_lksb;
	unsigned long flags;

	spin_lock_irqsave(&lockres->l_lock, flags);

	if (lksb->status != DLM_NORMAL) {
		mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n",
		     lockres->l_name, lksb->status);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		return;
	}

	switch(lockres->l_action) {
	case OCFS2_AST_ATTACH:
		ocfs2_generic_handle_attach_action(lockres);
		break;
	case OCFS2_AST_CONVERT:
		ocfs2_generic_handle_convert_action(lockres);
		break;
	case OCFS2_AST_DOWNCONVERT:
		ocfs2_generic_handle_downconvert_action(lockres);
		break;
	default:
		BUG();
	}

	if (ignore_refresh)
		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);

	/* set it to something invalid so if we get called again we
	 * can catch it. */
	lockres->l_action = OCFS2_AST_INVALID;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	wake_up(&lockres->l_event);
}

static void ocfs2_super_ast_func(void *opaque)
{
	struct ocfs2_lock_res *lockres = opaque;

	mlog_entry_void();
	mlog(0, "Superblock AST fired\n");

	BUG_ON(!ocfs2_is_super_lock(lockres));
	ocfs2_generic_ast_func(lockres, 0);

	mlog_exit_void();
}

static void ocfs2_super_bast_func(void *opaque,
				  int level)
{
	struct ocfs2_lock_res *lockres = opaque;
	struct ocfs2_super *osb;

	mlog_entry_void();
	mlog(0, "Superblock BAST fired\n");

	BUG_ON(!ocfs2_is_super_lock(lockres));
	osb = ocfs2_lock_res_super(lockres);
	ocfs2_generic_bast_func(osb, lockres, level);

	mlog_exit_void();
}

static void ocfs2_rename_ast_func(void *opaque)
{
	struct ocfs2_lock_res *lockres = opaque;

	mlog_entry_void();

	mlog(0, "Rename AST fired\n");

	BUG_ON(!ocfs2_is_rename_lock(lockres));

	ocfs2_generic_ast_func(lockres, 1);

	mlog_exit_void();
}

static void ocfs2_rename_bast_func(void *opaque,
				   int level)
{
	struct ocfs2_lock_res *lockres = opaque;
	struct ocfs2_super *osb;

	mlog_entry_void();

	mlog(0, "Rename BAST fired\n");

	BUG_ON(!ocfs2_is_rename_lock(lockres));

	osb = ocfs2_lock_res_super(lockres);
	ocfs2_generic_bast_func(osb, lockres, level);

	mlog_exit_void();
}

static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
						int convert)
{
	unsigned long flags;

	mlog_entry_void();
	spin_lock_irqsave(&lockres->l_lock, flags);
	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
	if (convert)
		lockres->l_action = OCFS2_AST_INVALID;
	else
		lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	wake_up(&lockres->l_event);
	mlog_exit_void();
}

/* Note: If we detect another process working on the lock (i.e.,
 * OCFS2_LOCK_BUSY), we'll bail out returning 0. It's up to the caller
 * to do the right thing in that case.
 */
static int ocfs2_lock_create(struct ocfs2_super *osb,
			     struct ocfs2_lock_res *lockres,
			     int level,
			     int dlm_flags)
{
	int ret = 0;
	enum dlm_status status;
	unsigned long flags;

	mlog_entry_void();

	mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level,
	     dlm_flags);

	spin_lock_irqsave(&lockres->l_lock, flags);
	if ((lockres->l_flags & OCFS2_LOCK_ATTACHED) ||
	    (lockres->l_flags & OCFS2_LOCK_BUSY)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		goto bail;
	}

	lockres->l_action = OCFS2_AST_ATTACH;
	lockres->l_requested = level;
	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	status = dlmlock(osb->dlm,
			 level,
			 &lockres->l_lksb,
			 dlm_flags,
			 lockres->l_name,
			 lockres->l_ops->ast,
			 lockres,
			 lockres->l_ops->bast);
	if (status != DLM_NORMAL) {
		ocfs2_log_dlm_error("dlmlock", status, lockres);
		ret = -EINVAL;
		ocfs2_recover_from_dlm_error(lockres, 1);
	}

	mlog(0, "lock %s, successful return from dlmlock\n", lockres->l_name);

bail:
	mlog_exit(ret);
	return ret;
}

static inline int ocfs2_check_wait_flag(struct ocfs2_lock_res *lockres,
					int flag)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&lockres->l_lock, flags);
	ret = lockres->l_flags & flag;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	return ret;
}

static inline void ocfs2_wait_on_busy_lock(struct ocfs2_lock_res *lockres)
{
	wait_event(lockres->l_event,
		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_BUSY));
}

static inline void ocfs2_wait_on_refreshing_lock(struct ocfs2_lock_res *lockres)
{
	wait_event(lockres->l_event,
		   !ocfs2_check_wait_flag(lockres, OCFS2_LOCK_REFRESHING));
}

/* predict what lock level we'll be dropping down to on behalf
 * of another node, and return true if the currently wanted
 * level will be compatible with it. */
static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
						     int wanted)
{
	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));

	return wanted <= ocfs2_highest_compat_lock_level(lockres->l_blocking);
}

static void ocfs2_init_mask_waiter(struct ocfs2_mask_waiter *mw)
{
	INIT_LIST_HEAD(&mw->mw_item);
	init_completion(&mw->mw_complete);
}

static int ocfs2_wait_for_mask(struct ocfs2_mask_waiter *mw)
{
	wait_for_completion(&mw->mw_complete);
	/* Re-arm the completion in case we want to wait on it again */
	INIT_COMPLETION(mw->mw_complete);
	return mw->mw_status;
}

static void lockres_add_mask_waiter(struct ocfs2_lock_res *lockres,
				    struct ocfs2_mask_waiter *mw,
				    unsigned long mask,
				    unsigned long goal)
{
	BUG_ON(!list_empty(&mw->mw_item));

	assert_spin_locked(&lockres->l_lock);

	list_add_tail(&mw->mw_item, &lockres->l_mask_waiters);
	mw->mw_mask = mask;
	mw->mw_goal = goal;
}

/* returns 0 if the mw that was removed was already satisfied, -EBUSY
 * if the mask still hadn't reached its goal */
static int lockres_remove_mask_waiter(struct ocfs2_lock_res *lockres,
				      struct ocfs2_mask_waiter *mw)
{
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&lockres->l_lock, flags);
	if (!list_empty(&mw->mw_item)) {
		if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal)
			ret = -EBUSY;

		list_del_init(&mw->mw_item);
		init_completion(&mw->mw_complete);
	}
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	return ret;
}

static int ocfs2_cluster_lock(struct ocfs2_super *osb,
			      struct ocfs2_lock_res *lockres,
			      int level,
			      int lkm_flags,
			      int arg_flags)
{
	struct ocfs2_mask_waiter mw;
	enum dlm_status status;
	int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
	int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
	unsigned long flags;

	mlog_entry_void();

	ocfs2_init_mask_waiter(&mw);

again:
	wait = 0;

	if (catch_signals && signal_pending(current)) {
		ret = -ERESTARTSYS;
		goto out;
	}

	spin_lock_irqsave(&lockres->l_lock, flags);

	mlog_bug_on_msg(lockres->l_flags & OCFS2_LOCK_FREEING,
			"Cluster lock called on freeing lockres %s! flags "
			"0x%lx\n", lockres->l_name, lockres->l_flags);

	/* We only compare against the currently granted level
	 * here. If the lock is blocked waiting on a downconvert,
	 * we'll get caught below. */
	if (lockres->l_flags & OCFS2_LOCK_BUSY &&
	    level > lockres->l_level) {
		/* is someone sitting in dlm_lock? If so, wait on
		 * them. */
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
		wait = 1;
		goto unlock;
	}

	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
		/* lock has not been created yet. */
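		/*
		 * First use of this lockres: drop the spinlock,
		 * attach it at NL, then retry the evaluation from
		 * the top. Attaching at NL keeps creation cheap and
		 * leaves the real work to the convert path below.
		 */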
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		ret = ocfs2_lock_create(osb, lockres, LKM_NLMODE, 0);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
		goto again;
	}

	if (lockres->l_flags & OCFS2_LOCK_BLOCKED &&
	    !ocfs2_may_continue_on_blocked_lock(lockres, level)) {
		/* the lock is currently blocked on behalf of
		 * another node */
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BLOCKED, 0);
		wait = 1;
		goto unlock;
	}

	if (level > lockres->l_level) {
		if (lockres->l_action != OCFS2_AST_INVALID)
			mlog(ML_ERROR, "lockres %s has action %u pending\n",
			     lockres->l_name, lockres->l_action);

		lockres->l_action = OCFS2_AST_CONVERT;
		lockres->l_requested = level;
		lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		BUG_ON(level == LKM_IVMODE);
		BUG_ON(level == LKM_NLMODE);

		mlog(0, "lock %s, convert from %d to level = %d\n",
		     lockres->l_name, lockres->l_level, level);

		/* call dlm_lock to upgrade lock now */
		status = dlmlock(osb->dlm,
				 level,
				 &lockres->l_lksb,
				 lkm_flags|LKM_CONVERT|LKM_VALBLK,
				 lockres->l_name,
				 lockres->l_ops->ast,
				 lockres,
				 lockres->l_ops->bast);
		if (status != DLM_NORMAL) {
			if ((lkm_flags & LKM_NOQUEUE) &&
			    (status == DLM_NOTQUEUED))
				ret = -EAGAIN;
			else {
				ocfs2_log_dlm_error("dlmlock", status,
						    lockres);
				ret = -EINVAL;
			}
			ocfs2_recover_from_dlm_error(lockres, 1);
			goto out;
		}

		mlog(0, "lock %s, successful return from dlmlock\n",
		     lockres->l_name);

		/* At this point we've gone inside the dlm and need to
		 * complete our work regardless. */
		catch_signals = 0;

		/* wait for busy to clear and carry on */
		goto again;
	}

	/* Ok, if we get here then we're good to go. */
	ocfs2_inc_holders(lockres, level);

	ret = 0;
unlock:
	spin_unlock_irqrestore(&lockres->l_lock, flags);
out:
	/*
	 * This is helping work around a lock inversion between the page lock
	 * and dlm locks. One path holds the page lock while calling aops
	 * which block acquiring dlm locks. The voting thread holds dlm
	 * locks while acquiring page locks while down converting data locks.
	 * This block is helping an aop path notice the inversion and back
	 * off to unlock its page lock before trying the dlm lock again.
	 */
	if (wait && arg_flags & OCFS2_LOCK_NONBLOCK &&
	    mw.mw_mask & (OCFS2_LOCK_BUSY|OCFS2_LOCK_BLOCKED)) {
		wait = 0;
		if (lockres_remove_mask_waiter(lockres, &mw))
			ret = -EAGAIN;
		else
			goto again;
	}
	if (wait) {
		ret = ocfs2_wait_for_mask(&mw);
		if (ret == 0)
			goto again;
		mlog_errno(ret);
	}

	mlog_exit(ret);
	return ret;
}

static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
				 struct ocfs2_lock_res *lockres,
				 int level)
{
	unsigned long flags;

	mlog_entry_void();
	spin_lock_irqsave(&lockres->l_lock, flags);
	ocfs2_dec_holders(lockres, level);
	ocfs2_vote_on_unlock(osb, lockres);
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	mlog_exit_void();
}

static int ocfs2_create_new_inode_lock(struct inode *inode,
				       struct ocfs2_lock_res *lockres)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	unsigned long flags;

	spin_lock_irqsave(&lockres->l_lock, flags);
	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
	lockres_or_flags(lockres, OCFS2_LOCK_LOCAL);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	return ocfs2_lock_create(osb, lockres, LKM_EXMODE, LKM_LOCAL);
}

/* Grants us an EX lock on the data and metadata resources, skipping
 * the normal cluster directory lookup. Use this ONLY on newly created
 * inodes which other nodes can't possibly see, and which haven't been
 * hashed in the inode hash yet. This can give us a good performance
 * increase as it'll skip the network broadcast normally associated
 * with creating a new lock resource. */
int ocfs2_create_new_inode_locks(struct inode *inode)
{
	int ret;

	BUG_ON(!inode);
	BUG_ON(!ocfs2_inode_is_new(inode));

	mlog_entry_void();

	mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);

	/* NOTE: That we don't increment any of the holder counts, nor
	 * do we add anything to a journal handle. Since this is
	 * supposed to be a new inode which the cluster doesn't know
	 * about yet, there is no need to. As far as the LVB handling
	 * is concerned, this is basically like acquiring an EX lock
	 * on a resource which has an invalid one -- we'll set it
	 * valid when we release the EX. */

	ret = ocfs2_create_new_inode_lock(inode,
					  &OCFS2_I(inode)->ip_rw_lockres);
	if (ret) {
		mlog_errno(ret);
		goto bail;
	}

	ret = ocfs2_create_new_inode_lock(inode,
					  &OCFS2_I(inode)->ip_meta_lockres);
	if (ret) {
		mlog_errno(ret);
		goto bail;
	}

	ret = ocfs2_create_new_inode_lock(inode,
					  &OCFS2_I(inode)->ip_data_lockres);
	if (ret) {
		mlog_errno(ret);
		goto bail;
	}

bail:
	mlog_exit(ret);
	return ret;
}
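/*
 * A sketch of typical RW lock usage, with error handling elided;
 * the write argument must match between lock and unlock so the
 * right holder count is dropped:
 *
 *	status = ocfs2_rw_lock(inode, 1);	(EX for a writer)
 *	if (status < 0)
 *		return status;
 *	... do the I/O ...
 *	ocfs2_rw_unlock(inode, 1);
 */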
int ocfs2_rw_lock(struct inode *inode, int write)
{
	int status, level;
	struct ocfs2_lock_res *lockres;

	BUG_ON(!inode);

	mlog_entry_void();

	mlog(0, "inode %llu take %s RW lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     write ? "EXMODE" : "PRMODE");

	lockres = &OCFS2_I(inode)->ip_rw_lockres;

	level = write ? LKM_EXMODE : LKM_PRMODE;

	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
				    0);
	if (status < 0)
		mlog_errno(status);

	mlog_exit(status);
	return status;
}

void ocfs2_rw_unlock(struct inode *inode, int write)
{
	int level = write ? LKM_EXMODE : LKM_PRMODE;
	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;

	mlog_entry_void();

	mlog(0, "inode %llu drop %s RW lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     write ? "EXMODE" : "PRMODE");

	ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);

	mlog_exit_void();
}

int ocfs2_data_lock_full(struct inode *inode,
			 int write,
			 int arg_flags)
{
	int status = 0, level;
	struct ocfs2_lock_res *lockres;

	BUG_ON(!inode);

	mlog_entry_void();

	mlog(0, "inode %llu take %s DATA lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     write ? "EXMODE" : "PRMODE");

	/* We'll allow faking a readonly data lock for
	 * readonly devices. */
	if (ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb))) {
		if (write) {
			status = -EROFS;
			mlog_errno(status);
		}
		goto out;
	}

	lockres = &OCFS2_I(inode)->ip_data_lockres;

	level = write ? LKM_EXMODE : LKM_PRMODE;

	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level,
				    0, arg_flags);
	if (status < 0 && status != -EAGAIN)
		mlog_errno(status);

out:
	mlog_exit(status);
	return status;
}

/* see ocfs2_meta_lock_with_page() */
int ocfs2_data_lock_with_page(struct inode *inode,
			      int write,
			      struct page *page)
{
	int ret;

	ret = ocfs2_data_lock_full(inode, write, OCFS2_LOCK_NONBLOCK);
	if (ret == -EAGAIN) {
		unlock_page(page);
		if (ocfs2_data_lock(inode, write) == 0)
			ocfs2_data_unlock(inode, write);
		ret = AOP_TRUNCATED_PAGE;
	}

	return ret;
}

static void ocfs2_vote_on_unlock(struct ocfs2_super *osb,
				 struct ocfs2_lock_res *lockres)
{
	int kick = 0;

	mlog_entry_void();

	/* If we know that another node is waiting on our lock, kick
	 * the vote thread pre-emptively when we reach a release
	 * condition. */
	if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
		switch(lockres->l_blocking) {
		case LKM_EXMODE:
			if (!lockres->l_ex_holders && !lockres->l_ro_holders)
				kick = 1;
			break;
		case LKM_PRMODE:
			if (!lockres->l_ex_holders)
				kick = 1;
			break;
		default:
			BUG();
		}
	}

	if (kick)
		ocfs2_kick_vote_thread(osb);

	mlog_exit_void();
}

void ocfs2_data_unlock(struct inode *inode,
		       int write)
{
	int level = write ? LKM_EXMODE : LKM_PRMODE;
	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_data_lockres;

	mlog_entry_void();

	mlog(0, "inode %llu drop %s DATA lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     write ? "EXMODE" : "PRMODE");

	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);

	mlog_exit_void();
}
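/*
 * With OCFS2_SEC_BITS at 34, OCFS2_SEC_SHIFT below works out to
 * 30, leaving the low 30 bits for nanoseconds (tv_nsec < 10^9 <
 * 2^30, so it fits). As a worked example, {tv_sec = 5,
 * tv_nsec = 7} packs to (5 << 30) | 7 = 0x140000007, and
 * unpacking shifts and masks the same fields back out.
 */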
#define OCFS2_SEC_BITS   34
#define OCFS2_SEC_SHIFT  (64 - 34)
#define OCFS2_NSEC_MASK  ((1ULL << OCFS2_SEC_SHIFT) - 1)

/* LVB only has room for 64 bits of time here so we pack it for
 * now. */
static u64 ocfs2_pack_timespec(struct timespec *spec)
{
	u64 res;
	u64 sec = spec->tv_sec;
	u32 nsec = spec->tv_nsec;

	res = (sec << OCFS2_SEC_SHIFT) | (nsec & OCFS2_NSEC_MASK);

	return res;
}

/* Call this with the lockres locked. I am reasonably sure we don't
 * need ip_lock in this function as anyone who would be changing those
 * values is supposed to be blocked in ocfs2_meta_lock right now. */
static void __ocfs2_stuff_meta_lvb(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
	struct ocfs2_meta_lvb *lvb;

	mlog_entry_void();

	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;

	lvb->lvb_version = cpu_to_be32(OCFS2_LVB_VERSION);
	lvb->lvb_isize = cpu_to_be64(i_size_read(inode));
	lvb->lvb_iclusters = cpu_to_be32(oi->ip_clusters);
	lvb->lvb_iuid = cpu_to_be32(inode->i_uid);
	lvb->lvb_igid = cpu_to_be32(inode->i_gid);
	lvb->lvb_imode = cpu_to_be16(inode->i_mode);
	lvb->lvb_inlink = cpu_to_be16(inode->i_nlink);
	lvb->lvb_iatime_packed =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime));
	lvb->lvb_ictime_packed =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime));
	lvb->lvb_imtime_packed =
		cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime));
	lvb->lvb_iattr = cpu_to_be32(oi->ip_attr);

	mlog_meta_lvb(0, lockres);

	mlog_exit_void();
}

static void ocfs2_unpack_timespec(struct timespec *spec,
				  u64 packed_time)
{
	spec->tv_sec = packed_time >> OCFS2_SEC_SHIFT;
	spec->tv_nsec = packed_time & OCFS2_NSEC_MASK;
}

static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
	struct ocfs2_meta_lvb *lvb;

	mlog_entry_void();

	mlog_meta_lvb(0, lockres);

	lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;

	/* We're safe here without the lockres lock... */
	spin_lock(&oi->ip_lock);
	oi->ip_clusters = be32_to_cpu(lvb->lvb_iclusters);
	i_size_write(inode, be64_to_cpu(lvb->lvb_isize));

	oi->ip_attr = be32_to_cpu(lvb->lvb_iattr);
	ocfs2_set_inode_flags(inode);

	/* fast-symlinks are a special case */
	if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
		inode->i_blocks = 0;
	else
		inode->i_blocks =
			ocfs2_align_bytes_to_sectors(i_size_read(inode));

	inode->i_uid = be32_to_cpu(lvb->lvb_iuid);
	inode->i_gid = be32_to_cpu(lvb->lvb_igid);
	inode->i_mode = be16_to_cpu(lvb->lvb_imode);
	inode->i_nlink = be16_to_cpu(lvb->lvb_inlink);
	ocfs2_unpack_timespec(&inode->i_atime,
			      be64_to_cpu(lvb->lvb_iatime_packed));
	ocfs2_unpack_timespec(&inode->i_mtime,
			      be64_to_cpu(lvb->lvb_imtime_packed));
	ocfs2_unpack_timespec(&inode->i_ctime,
			      be64_to_cpu(lvb->lvb_ictime_packed));
	spin_unlock(&oi->ip_lock);

	mlog_exit_void();
}

static inline int ocfs2_meta_lvb_is_trustable(struct ocfs2_lock_res *lockres)
{
	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;

	if (be32_to_cpu(lvb->lvb_version) == OCFS2_LVB_VERSION)
		return 1;
	return 0;
}
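/*
 * Callers use the next two functions as a pair, roughly:
 *
 *	if (ocfs2_should_refresh_lock_res(lockres)) {
 *		status = ...re-read state from the LVB or disk...;
 *		ocfs2_complete_lock_res_refresh(lockres, status);
 *	}
 *
 * so only one local task performs the refresh while any others
 * wait in ocfs2_wait_on_refreshing_lock().
 */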
/* Determine whether a lock resource needs to be refreshed, and
 * arbitrate who gets to refresh it.
 *
 * 0 means no refresh needed.
 *
 * > 0 means you need to refresh this and you MUST call
 * ocfs2_complete_lock_res_refresh afterwards. */
static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
{
	unsigned long flags;
	int status = 0;

	mlog_entry_void();

refresh_check:
	spin_lock_irqsave(&lockres->l_lock, flags);
	if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		goto bail;
	}

	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		ocfs2_wait_on_refreshing_lock(lockres);
		goto refresh_check;
	}

	/* Ok, I'll be the one to refresh this lock. */
	lockres_or_flags(lockres, OCFS2_LOCK_REFRESHING);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	status = 1;
bail:
	mlog_exit(status);
	return status;
}

/* If status is non-zero, I'll mark it as not being in refresh
 * anymore, but I won't clear the needs refresh flag. */
static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockres,
						   int status)
{
	unsigned long flags;
	mlog_entry_void();

	spin_lock_irqsave(&lockres->l_lock, flags);
	lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
	if (!status)
		lockres_clear_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	wake_up(&lockres->l_event);

	mlog_exit_void();
}

/* may or may not return a bh if it went to disk. */
static int ocfs2_meta_lock_update(struct inode *inode,
				  struct buffer_head **bh)
{
	int status = 0;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_lock_res *lockres;
	struct ocfs2_dinode *fe;

	mlog_entry_void();

	spin_lock(&oi->ip_lock);
	if (oi->ip_flags & OCFS2_INODE_DELETED) {
		mlog(0, "Orphaned inode %llu was deleted while we "
		     "were waiting on a lock. ip_flags = 0x%x\n",
		     (unsigned long long)oi->ip_blkno, oi->ip_flags);
		spin_unlock(&oi->ip_lock);
		status = -ENOENT;
		goto bail;
	}
	spin_unlock(&oi->ip_lock);

	lockres = &oi->ip_meta_lockres;

	if (!ocfs2_should_refresh_lock_res(lockres))
		goto bail;

	/* This will discard any caching information we might have had
	 * for the inode metadata. */
	ocfs2_metadata_cache_purge(inode);

	/* will do nothing for inode types that don't use the extent
	 * map (directories, bitmap files, etc) */
	ocfs2_extent_map_trunc(inode, 0);

	if (ocfs2_meta_lvb_is_trustable(lockres)) {
		mlog(0, "Trusting LVB on inode %llu\n",
		     (unsigned long long)oi->ip_blkno);
		ocfs2_refresh_inode_from_lvb(inode);
	} else {
		/* Boo, we have to go to disk. */
		/* read bh, cast, ocfs2_refresh_inode */
		status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno,
					  bh, OCFS2_BH_CACHED, inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail_refresh;
		}
		fe = (struct ocfs2_dinode *) (*bh)->b_data;

		/* This is a good chance to make sure we're not
		 * locking an invalid object.
		 *
		 * We bug on a stale inode here because we checked
		 * above whether it was wiped from disk. The wiping
		 * node provides a guarantee that we receive that
		 * message and can mark the inode before dropping any
		 * locks associated with it. */
		if (!OCFS2_IS_VALID_DINODE(fe)) {
			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
			status = -EIO;
			goto bail_refresh;
		}
		mlog_bug_on_msg(inode->i_generation !=
				le32_to_cpu(fe->i_generation),
				"Invalid dinode %llu disk generation: %u "
				"inode->i_generation: %u\n",
				(unsigned long long)oi->ip_blkno,
				le32_to_cpu(fe->i_generation),
				inode->i_generation);
		mlog_bug_on_msg(le64_to_cpu(fe->i_dtime) ||
				!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)),
				"Stale dinode %llu dtime: %llu flags: 0x%x\n",
				(unsigned long long)oi->ip_blkno,
				(unsigned long long)le64_to_cpu(fe->i_dtime),
				le32_to_cpu(fe->i_flags));

		ocfs2_refresh_inode(inode, fe);
	}

	status = 0;
bail_refresh:
	ocfs2_complete_lock_res_refresh(lockres, status);
bail:
	mlog_exit(status);
	return status;
}

static int ocfs2_assign_bh(struct inode *inode,
			   struct buffer_head **ret_bh,
			   struct buffer_head *passed_bh)
{
	int status;

	if (passed_bh) {
		/* Ok, the update went to disk for us, use the
		 * returned bh. */
		*ret_bh = passed_bh;
		get_bh(*ret_bh);

		return 0;
	}

	status = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				  OCFS2_I(inode)->ip_blkno,
				  ret_bh,
				  OCFS2_BH_CACHED,
				  inode);
	if (status < 0)
		mlog_errno(status);

	return status;
}

/*
 * returns < 0 error if the callback will never be called, otherwise
 * the result of the lock will be communicated via the callback.
 */
int ocfs2_meta_lock_full(struct inode *inode,
			 struct ocfs2_journal_handle *handle,
			 struct buffer_head **ret_bh,
			 int ex,
			 int arg_flags)
{
	int status, level, dlm_flags, acquired;
	struct ocfs2_lock_res *lockres;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct buffer_head *local_bh = NULL;

	BUG_ON(!inode);

	mlog_entry_void();

	mlog(0, "inode %llu, take %s META lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     ex ? "EXMODE" : "PRMODE");

	status = 0;
	acquired = 0;
	/* We'll allow faking a readonly metadata lock for
	 * readonly devices. */
	if (ocfs2_is_hard_readonly(osb)) {
		if (ex)
			status = -EROFS;
		goto bail;
	}

	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
		wait_event(osb->recovery_event,
			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));

	acquired = 0;
	lockres = &OCFS2_I(inode)->ip_meta_lockres;
	level = ex ? LKM_EXMODE : LKM_PRMODE;
	dlm_flags = 0;
	if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
		dlm_flags |= LKM_NOQUEUE;

	status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
	if (status < 0) {
		if (status != -EAGAIN && status != -EIOCBRETRY)
			mlog_errno(status);
		goto bail;
	}

	/* Notify the error cleanup path to drop the cluster lock. */
	acquired = 1;

	/* We wait twice because a node may have died while we were in
	 * the lower dlm layers. The second time though, we've
	 * committed to owning this lock so we don't allow signals to
	 * abort the operation. */
	if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
		wait_event(osb->recovery_event,
			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));

	/* This is fun. The caller may want a bh back, or it may
	 * not. ocfs2_meta_lock_update definitely wants one in, but
	 * may or may not read one, depending on what's in the
	 * LVB. The result of all of this is that we've *only* gone to
	 * disk if we have to, so the complexity is worthwhile. */
	status = ocfs2_meta_lock_update(inode, &local_bh);
	if (status < 0) {
		if (status != -ENOENT)
			mlog_errno(status);
		goto bail;
	}

	if (ret_bh) {
		status = ocfs2_assign_bh(inode, ret_bh, local_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	if (handle) {
		status = ocfs2_handle_add_lock(handle, inode);
		if (status < 0)
			mlog_errno(status);
	}

bail:
	if (status < 0) {
		if (ret_bh && (*ret_bh)) {
			brelse(*ret_bh);
			*ret_bh = NULL;
		}
		if (acquired)
			ocfs2_meta_unlock(inode, ex);
	}

	if (local_bh)
		brelse(local_bh);

	mlog_exit(status);
	return status;
}

/*
 * This is working around a lock inversion between tasks acquiring DLM locks
 * while holding a page lock and the vote thread which blocks dlm lock acquiry
 * while acquiring page locks.
 *
 * ** These _with_page variants are only intended to be called from aop
 * methods that hold page locks and return a very specific *positive* error
 * code that aop methods pass up to the VFS -- test for errors with != 0. **
 *
 * The DLM is called such that it returns -EAGAIN if it would have blocked
 * waiting for the vote thread. In that case we unlock our page so the vote
 * thread can make progress. Once we've done this we have to return
 * AOP_TRUNCATED_PAGE so the aop method that called us can bubble that back up
 * into the VFS who will then immediately retry the aop call.
 *
 * We do a blocking lock and immediate unlock before returning, though, so that
 * the lock has a great chance of being cached on this node by the time the VFS
 * calls back to retry the aop. This has a potential to livelock as nodes
 * ping locks back and forth, but that's a risk we're willing to take in order
 * to avoid the lock inversion simply.
 */
int ocfs2_meta_lock_with_page(struct inode *inode,
			      struct ocfs2_journal_handle *handle,
			      struct buffer_head **ret_bh,
			      int ex,
			      struct page *page)
{
	int ret;

	ret = ocfs2_meta_lock_full(inode, handle, ret_bh, ex,
				   OCFS2_LOCK_NONBLOCK);
	if (ret == -EAGAIN) {
		unlock_page(page);
		if (ocfs2_meta_lock(inode, handle, ret_bh, ex) == 0)
			ocfs2_meta_unlock(inode, ex);
		ret = AOP_TRUNCATED_PAGE;
	}

	return ret;
}

void ocfs2_meta_unlock(struct inode *inode,
		       int ex)
{
	int level = ex ? LKM_EXMODE : LKM_PRMODE;
	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;

	mlog_entry_void();

	mlog(0, "inode %llu drop %s META lock\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     ex ? "EXMODE" : "PRMODE");

	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)))
		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);

	mlog_exit_void();
}
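/*
 * Most callers take the meta lock through the ocfs2_meta_lock()
 * wrapper (no extra arg_flags) rather than the _full variant. A
 * sketch of the common pattern, error handling elided:
 *
 *	status = ocfs2_meta_lock(inode, handle, &bh, 1);
 *	... read or dirty the inode under the EX lock ...
 *	brelse(bh);
 *	ocfs2_meta_unlock(inode, 1);
 */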
int ocfs2_super_lock(struct ocfs2_super *osb,
		     int ex)
{
	int status;
	int level = ex ? LKM_EXMODE : LKM_PRMODE;
	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
	struct buffer_head *bh;
	struct ocfs2_slot_info *si = osb->slot_info;

	mlog_entry_void();

	if (ocfs2_is_hard_readonly(osb))
		return -EROFS;

	status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* The super block lock path is really in the best position to
	 * know when resources covered by the lock need to be
	 * refreshed, so we do it here. Of course, making sense of
	 * everything is up to the caller :) */
	status = ocfs2_should_refresh_lock_res(lockres);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	if (status) {
		bh = si->si_bh;
		status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
					  si->si_inode);
		if (status == 0)
			ocfs2_update_slot_info(si);

		ocfs2_complete_lock_res_refresh(lockres, status);

		if (status < 0)
			mlog_errno(status);
	}
bail:
	mlog_exit(status);
	return status;
}

void ocfs2_super_unlock(struct ocfs2_super *osb,
			int ex)
{
	int level = ex ? LKM_EXMODE : LKM_PRMODE;
	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;

	ocfs2_cluster_unlock(osb, lockres, level);
}

int ocfs2_rename_lock(struct ocfs2_super *osb)
{
	int status;
	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;

	if (ocfs2_is_hard_readonly(osb))
		return -EROFS;

	status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0);
	if (status < 0)
		mlog_errno(status);

	return status;
}

void ocfs2_rename_unlock(struct ocfs2_super *osb)
{
	struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;

	ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE);
}

/* Reference counting of the dlm debug structure. We want this because
 * open references on the debug inodes can live on after a mount, so
 * we can't rely on the ocfs2_super to always exist. */
static void ocfs2_dlm_debug_free(struct kref *kref)
{
	struct ocfs2_dlm_debug *dlm_debug;

	dlm_debug = container_of(kref, struct ocfs2_dlm_debug, d_refcnt);

	kfree(dlm_debug);
}

void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug)
{
	if (dlm_debug)
		kref_put(&dlm_debug->d_refcnt, ocfs2_dlm_debug_free);
}

static void ocfs2_get_dlm_debug(struct ocfs2_dlm_debug *debug)
{
	kref_get(&debug->d_refcnt);
}

struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void)
{
	struct ocfs2_dlm_debug *dlm_debug;

	dlm_debug = kmalloc(sizeof(struct ocfs2_dlm_debug), GFP_KERNEL);
	if (!dlm_debug) {
		mlog_errno(-ENOMEM);
		goto out;
	}

	kref_init(&dlm_debug->d_refcnt);
	INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking);
	dlm_debug->d_locking_state = NULL;
out:
	return dlm_debug;
}
/* Access to this is arbitrated for us via seq_file->sem. */
struct ocfs2_dlm_seq_priv {
	struct ocfs2_dlm_debug *p_dlm_debug;
	struct ocfs2_lock_res p_iter_res;
	struct ocfs2_lock_res p_tmp_res;
};

static struct ocfs2_lock_res *ocfs2_dlm_next_res(struct ocfs2_lock_res *start,
						 struct ocfs2_dlm_seq_priv *priv)
{
	struct ocfs2_lock_res *iter, *ret = NULL;
	struct ocfs2_dlm_debug *dlm_debug = priv->p_dlm_debug;

	assert_spin_locked(&ocfs2_dlm_tracking_lock);

	list_for_each_entry(iter, &start->l_debug_list, l_debug_list) {
		/* discover the head of the list */
		if (&iter->l_debug_list == &dlm_debug->d_lockres_tracking) {
			mlog(0, "End of list found, %p\n", ret);
			break;
		}

		/* We track our "dummy" iteration lockres' by a NULL
		 * l_ops field. */
		if (iter->l_ops != NULL) {
			ret = iter;
			break;
		}
	}

	return ret;
}

static void *ocfs2_dlm_seq_start(struct seq_file *m, loff_t *pos)
{
	struct ocfs2_dlm_seq_priv *priv = m->private;
	struct ocfs2_lock_res *iter;

	spin_lock(&ocfs2_dlm_tracking_lock);
	iter = ocfs2_dlm_next_res(&priv->p_iter_res, priv);
	if (iter) {
		/* Since lockres' have the lifetime of their container
		 * (which can be inodes, ocfs2_supers, etc) we want to
		 * copy this out to a temporary lockres while still
		 * under the spinlock. Obviously after this we can't
		 * trust any pointers on the copy returned, but that's
		 * ok as the information we want isn't typically held
		 * in them. */
		priv->p_tmp_res = *iter;
		iter = &priv->p_tmp_res;
	}
	spin_unlock(&ocfs2_dlm_tracking_lock);

	return iter;
}

static void ocfs2_dlm_seq_stop(struct seq_file *m, void *v)
{
}

static void *ocfs2_dlm_seq_next(struct seq_file *m, void *v, loff_t *pos)
{
	struct ocfs2_dlm_seq_priv *priv = m->private;
	struct ocfs2_lock_res *iter = v;
	struct ocfs2_lock_res *dummy = &priv->p_iter_res;

	spin_lock(&ocfs2_dlm_tracking_lock);
	iter = ocfs2_dlm_next_res(iter, priv);
	list_del_init(&dummy->l_debug_list);
	if (iter) {
		list_add(&dummy->l_debug_list, &iter->l_debug_list);
		priv->p_tmp_res = *iter;
		iter = &priv->p_tmp_res;
	}
	spin_unlock(&ocfs2_dlm_tracking_lock);

	return iter;
}
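/*
 * Each line ocfs2_dlm_seq_show() below emits is tab separated:
 * the format version, lock name, level, flags, action, unlock
 * action, ro holders, ex holders, requested level and blocking
 * level, followed by the raw LVB bytes. debugfs.ocfs2 keys off
 * the version field when parsing.
 */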
static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file)
{
	struct seq_file *seq = (struct seq_file *) file->private_data;
	struct ocfs2_dlm_seq_priv *priv = seq->private;
	struct ocfs2_lock_res *res = &priv->p_iter_res;

	ocfs2_remove_lockres_tracking(res);
	ocfs2_put_dlm_debug(priv->p_dlm_debug);
	return seq_release_private(inode, file);
}

static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file)
{
	int ret;
	struct ocfs2_dlm_seq_priv *priv;
	struct seq_file *seq;
	struct ocfs2_super *osb;

	priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL);
	if (!priv) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}
	osb = (struct ocfs2_super *) inode->u.generic_ip;
	ocfs2_get_dlm_debug(osb->osb_dlm_debug);
	priv->p_dlm_debug = osb->osb_dlm_debug;
	INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list);

	ret = seq_open(file, &ocfs2_dlm_seq_ops);
	if (ret) {
		kfree(priv);
		mlog_errno(ret);
		goto out;
	}

	seq = (struct seq_file *) file->private_data;
	seq->private = priv;

	ocfs2_add_lockres_tracking(&priv->p_iter_res,
				   priv->p_dlm_debug);

out:
	return ret;
}

static const struct file_operations ocfs2_dlm_debug_fops = {
	.open =		ocfs2_dlm_debug_open,
	.release =	ocfs2_dlm_debug_release,
	.read =		seq_read,
	.llseek =	seq_lseek,
};

static int ocfs2_dlm_init_debug(struct ocfs2_super *osb)
{
	int ret = 0;
	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;

	dlm_debug->d_locking_state = debugfs_create_file("locking_state",
							 S_IFREG|S_IRUSR,
							 osb->osb_debug_root,
							 osb,
							 &ocfs2_dlm_debug_fops);
	if (!dlm_debug->d_locking_state) {
		ret = -EINVAL;
		mlog(ML_ERROR,
		     "Unable to create locking state debugfs file.\n");
		goto out;
	}

	ocfs2_get_dlm_debug(dlm_debug);
out:
	return ret;
}

static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
{
	struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug;

	if (dlm_debug) {
		debugfs_remove(dlm_debug->d_locking_state);
		ocfs2_put_dlm_debug(dlm_debug);
	}
}
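/*
 * Illustrative sketch, guarded out of the build (userspace, not kernel
 * code): the locking_state file created above is an ordinary debugfs
 * seq_file, so a tool can simply stream it. The path assumes debugfs
 * is mounted at /sys/kernel/debug and that osb_debug_root is a
 * per-mount directory named after the volume; both are assumptions
 * made for the example.
 */
#if 0
#include <stdio.h>

int main(void)
{
	char line[4096];
	FILE *f = fopen("/sys/kernel/debug/ocfs2/<uuid>/locking_state", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
#endif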
int ocfs2_dlm_init(struct ocfs2_super *osb)
{
	int status;
	u32 dlm_key;
	struct dlm_ctxt *dlm;

	mlog_entry_void();

	status = ocfs2_dlm_init_debug(osb);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* launch vote thread */
	osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote");
	if (IS_ERR(osb->vote_task)) {
		status = PTR_ERR(osb->vote_task);
		osb->vote_task = NULL;
		mlog_errno(status);
		goto bail;
	}

	/* used by the dlm code to make message headers unique, each
	 * node in this domain must agree on this. */
	dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));

	/* for now, uuid == domain */
	dlm = dlm_register_domain(osb->uuid_str, dlm_key);
	if (IS_ERR(dlm)) {
		status = PTR_ERR(dlm);
		mlog_errno(status);
		goto bail;
	}

	ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
	ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);

	dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb);

	osb->dlm = dlm;

	status = 0;
bail:
	if (status < 0) {
		ocfs2_dlm_shutdown_debug(osb);
		if (osb->vote_task)
			kthread_stop(osb->vote_task);
	}

	mlog_exit(status);
	return status;
}

void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
{
	mlog_entry_void();

	dlm_unregister_eviction_cb(&osb->osb_eviction_cb);

	ocfs2_drop_osb_locks(osb);

	if (osb->vote_task) {
		kthread_stop(osb->vote_task);
		osb->vote_task = NULL;
	}

	ocfs2_lock_res_free(&osb->osb_super_lockres);
	ocfs2_lock_res_free(&osb->osb_rename_lockres);

	dlm_unregister_domain(osb->dlm);
	osb->dlm = NULL;

	ocfs2_dlm_shutdown_debug(osb);

	mlog_exit_void();
}

static void ocfs2_unlock_ast_func(void *opaque, enum dlm_status status)
{
	struct ocfs2_lock_res *lockres = opaque;
	unsigned long flags;

	mlog_entry_void();

	mlog(0, "UNLOCK AST called on lock %s, action = %d\n", lockres->l_name,
	     lockres->l_unlock_action);

	spin_lock_irqsave(&lockres->l_lock, flags);
	/* We tried to cancel a convert request, but it was already
	 * granted. All we want to do here is clear our unlock
	 * state. The wake_up call done at the bottom is redundant
	 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
	 * hurt anything anyway */
	if (status == DLM_CANCELGRANT &&
	    lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
		mlog(0, "Got cancelgrant for %s\n", lockres->l_name);

		/* We don't clear the busy flag in this case as it
		 * should have been cleared by the ast which the dlm
		 * has called. */
		goto complete_unlock;
	}

	if (status != DLM_NORMAL) {
		mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
		     "unlock_action %d\n", status, lockres->l_name,
		     lockres->l_unlock_action);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		return;
	}

	switch(lockres->l_unlock_action) {
	case OCFS2_UNLOCK_CANCEL_CONVERT:
		mlog(0, "Cancel convert success for %s\n", lockres->l_name);
		lockres->l_action = OCFS2_AST_INVALID;
		break;
	case OCFS2_UNLOCK_DROP_LOCK:
		lockres->l_level = LKM_IVMODE;
		break;
	default:
		BUG();
	}

	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
complete_unlock:
	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	wake_up(&lockres->l_event);

	mlog_exit_void();
}

typedef void (ocfs2_pre_drop_cb_t)(struct ocfs2_lock_res *, void *);

struct drop_lock_cb {
	ocfs2_pre_drop_cb_t *drop_func;
	void *drop_data;
};
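/*
 * Illustrative sketch, guarded out of the build: a hypothetical
 * pre-drop callback wired through struct drop_lock_cb. As
 * ocfs2_drop_lock() below shows, drop_func runs with lockres->l_lock
 * held, after the lock has stopped being busy and just before the
 * final dlmunlock; ocfs2_meta_pre_drop() further down is the one real
 * instance in this file.
 */
#if 0
static void example_pre_drop(struct ocfs2_lock_res *lockres, void *data)
{
	/* stash anything (an LVB, say) that must survive the unlock;
	 * must not sleep -- we're under the lockres spinlock */
}

static void example_drop(struct ocfs2_super *osb,
			 struct ocfs2_lock_res *lockres,
			 void *data)
{
	struct drop_lock_cb dcb = { example_pre_drop, data, };

	ocfs2_drop_lock(osb, lockres, &dcb);
}
#endif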
static int ocfs2_drop_lock(struct ocfs2_super *osb,
			   struct ocfs2_lock_res *lockres,
			   struct drop_lock_cb *dcb)
{
	enum dlm_status status;
	unsigned long flags;

	/* We didn't get anywhere near actually using this lockres. */
	if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
		goto out;

	spin_lock_irqsave(&lockres->l_lock, flags);

	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_FREEING),
			"lockres %s, flags 0x%lx\n",
			lockres->l_name, lockres->l_flags);

	while (lockres->l_flags & OCFS2_LOCK_BUSY) {
		mlog(0, "waiting on busy lock \"%s\": flags = %lx, action = "
		     "%u, unlock_action = %u\n",
		     lockres->l_name, lockres->l_flags, lockres->l_action,
		     lockres->l_unlock_action);

		spin_unlock_irqrestore(&lockres->l_lock, flags);

		/* XXX: Today we just wait on any busy
		 * locks... Perhaps we need to cancel converts in the
		 * future? */
		ocfs2_wait_on_busy_lock(lockres);

		spin_lock_irqsave(&lockres->l_lock, flags);
	}

	if (dcb)
		dcb->drop_func(lockres, dcb->drop_data);

	if (lockres->l_flags & OCFS2_LOCK_BUSY)
		mlog(ML_ERROR, "destroying busy lock: \"%s\"\n",
		     lockres->l_name);
	if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
		mlog(0, "destroying blocked lock: \"%s\"\n", lockres->l_name);

	if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		goto out;
	}

	lockres_clear_flags(lockres, OCFS2_LOCK_ATTACHED);

	/* make sure we never get here while waiting for an ast to
	 * fire. */
	BUG_ON(lockres->l_action != OCFS2_AST_INVALID);

	/* is this necessary? */
	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
	lockres->l_unlock_action = OCFS2_UNLOCK_DROP_LOCK;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	mlog(0, "lock %s\n", lockres->l_name);

	status = dlmunlock(osb->dlm, &lockres->l_lksb, LKM_VALBLK,
			   lockres->l_ops->unlock_ast, lockres);
	if (status != DLM_NORMAL) {
		ocfs2_log_dlm_error("dlmunlock", status, lockres);
		mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
		dlm_print_one_lock(lockres->l_lksb.lockid);
		BUG();
	}
	mlog(0, "lock %s, successful return from dlmunlock\n",
	     lockres->l_name);

	ocfs2_wait_on_busy_lock(lockres);
out:
	mlog_exit(0);
	return 0;
}
/* Mark the lockres as being dropped. It will no longer be
 * queued if blocking, but we still may have to wait on it
 * being dequeued from the vote thread before we can consider
 * it safe to drop.
 *
 * You can *not* attempt to call cluster_lock on this lockres anymore. */
void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
{
	int status;
	struct ocfs2_mask_waiter mw;
	unsigned long flags;

	ocfs2_init_mask_waiter(&mw);

	spin_lock_irqsave(&lockres->l_lock, flags);
	lockres->l_flags |= OCFS2_LOCK_FREEING;
	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
		lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		mlog(0, "Waiting on lockres %s\n", lockres->l_name);

		status = ocfs2_wait_for_mask(&mw);
		if (status)
			mlog_errno(status);

		spin_lock_irqsave(&lockres->l_lock, flags);
	}
	spin_unlock_irqrestore(&lockres->l_lock, flags);
}

static void ocfs2_drop_osb_locks(struct ocfs2_super *osb)
{
	int status;

	mlog_entry_void();

	ocfs2_mark_lockres_freeing(&osb->osb_super_lockres);

	status = ocfs2_drop_lock(osb, &osb->osb_super_lockres, NULL);
	if (status < 0)
		mlog_errno(status);

	ocfs2_mark_lockres_freeing(&osb->osb_rename_lockres);

	status = ocfs2_drop_lock(osb, &osb->osb_rename_lockres, NULL);
	if (status < 0)
		mlog_errno(status);

	mlog_exit(status);
}

static void ocfs2_meta_pre_drop(struct ocfs2_lock_res *lockres, void *data)
{
	struct inode *inode = data;

	/* the metadata lock requires a bit more work as we have an
	 * LVB to worry about. */
	if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
	    lockres->l_level == LKM_EXMODE &&
	    !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
		__ocfs2_stuff_meta_lvb(inode);
}
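/*
 * Illustrative sketch, guarded out of the build: the mask-waiter
 * pattern used by ocfs2_mark_lockres_freeing() above, generalized to
 * an arbitrary flag. A goal of 0 means "wake me once every bit in
 * 'mask' is clear"; the flag is always re-checked under l_lock.
 */
#if 0
static void example_wait_for_flags_clear(struct ocfs2_lock_res *lockres,
					 unsigned long mask)
{
	int status;
	struct ocfs2_mask_waiter mw;
	unsigned long flags;

	ocfs2_init_mask_waiter(&mw);

	spin_lock_irqsave(&lockres->l_lock, flags);
	while (lockres->l_flags & mask) {
		lockres_add_mask_waiter(lockres, &mw, mask, 0);
		spin_unlock_irqrestore(&lockres->l_lock, flags);

		status = ocfs2_wait_for_mask(&mw);
		if (status)
			mlog_errno(status);

		spin_lock_irqsave(&lockres->l_lock, flags);
	}
	spin_unlock_irqrestore(&lockres->l_lock, flags);
}
#endif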
int ocfs2_drop_inode_locks(struct inode *inode)
{
	int status, err;
	struct drop_lock_cb meta_dcb = { ocfs2_meta_pre_drop, inode, };

	mlog_entry_void();

	/* No need to call ocfs2_mark_lockres_freeing here -
	 * ocfs2_clear_inode has done it for us. */

	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
			      &OCFS2_I(inode)->ip_data_lockres,
			      NULL);
	if (err < 0)
		mlog_errno(err);

	status = err;

	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
			      &OCFS2_I(inode)->ip_meta_lockres,
			      &meta_dcb);
	if (err < 0)
		mlog_errno(err);
	if (err < 0 && !status)
		status = err;

	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
			      &OCFS2_I(inode)->ip_rw_lockres,
			      NULL);
	if (err < 0)
		mlog_errno(err);
	if (err < 0 && !status)
		status = err;

	mlog_exit(status);
	return status;
}

static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
				      int new_level)
{
	assert_spin_locked(&lockres->l_lock);

	BUG_ON(lockres->l_blocking <= LKM_NLMODE);

	if (lockres->l_level <= new_level) {
		mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n",
		     lockres->l_level, new_level);
		BUG();
	}

	mlog(0, "lock %s, new_level = %d, l_blocking = %d\n",
	     lockres->l_name, new_level, lockres->l_blocking);

	lockres->l_action = OCFS2_AST_DOWNCONVERT;
	lockres->l_requested = new_level;
	lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
}

static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
				  struct ocfs2_lock_res *lockres,
				  int new_level,
				  int lvb)
{
	int ret, dlm_flags = LKM_CONVERT;
	enum dlm_status status;

	mlog_entry_void();

	if (lvb)
		dlm_flags |= LKM_VALBLK;

	status = dlmlock(osb->dlm,
			 new_level,
			 &lockres->l_lksb,
			 dlm_flags,
			 lockres->l_name,
			 lockres->l_ops->ast,
			 lockres,
			 lockres->l_ops->bast);
	if (status != DLM_NORMAL) {
		ocfs2_log_dlm_error("dlmlock", status, lockres);
		ret = -EINVAL;
		ocfs2_recover_from_dlm_error(lockres, 1);
		goto bail;
	}

	ret = 0;
bail:
	mlog_exit(ret);
	return ret;
}
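/*
 * Illustrative sketch, guarded out of the build: the two-step
 * downconvert protocol the unblock paths below follow -- record the
 * requested level and mark the lockres busy under l_lock, then issue
 * the actual convert with the spinlock dropped. The helper name is
 * hypothetical.
 */
#if 0
static int example_downconvert(struct ocfs2_super *osb,
			       struct ocfs2_lock_res *lockres)
{
	int new_level;
	unsigned long flags;

	spin_lock_irqsave(&lockres->l_lock, flags);
	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);
	ocfs2_prepare_downconvert(lockres, new_level);
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	/* lvb == 0: no lock value block to publish on the way down */
	return ocfs2_downconvert_lock(osb, lockres, new_level, 0);
}
#endif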
/* returns 1 when the caller should unlock and call dlmunlock */
static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres)
{
	assert_spin_locked(&lockres->l_lock);

	mlog_entry_void();
	mlog(0, "lock %s\n", lockres->l_name);

	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
		/* If we're already trying to cancel a lock conversion
		 * then just drop the spinlock and allow the caller to
		 * requeue this lock. */

		mlog(0, "Lockres %s, skip convert\n", lockres->l_name);
		return 0;
	}

	/* were we in a convert when we got the bast fire? */
	BUG_ON(lockres->l_action != OCFS2_AST_CONVERT &&
	       lockres->l_action != OCFS2_AST_DOWNCONVERT);
	/* set things up for the unlockast to know to just
	 * clear out the ast_action and unset busy, etc. */
	lockres->l_unlock_action = OCFS2_UNLOCK_CANCEL_CONVERT;

	mlog_bug_on_msg(!(lockres->l_flags & OCFS2_LOCK_BUSY),
			"lock %s, invalid flags: 0x%lx\n",
			lockres->l_name, lockres->l_flags);

	return 1;
}

static int ocfs2_cancel_convert(struct ocfs2_super *osb,
				struct ocfs2_lock_res *lockres)
{
	int ret;
	enum dlm_status status;

	mlog_entry_void();
	mlog(0, "lock %s\n", lockres->l_name);

	ret = 0;
	status = dlmunlock(osb->dlm,
			   &lockres->l_lksb,
			   LKM_CANCEL,
			   lockres->l_ops->unlock_ast,
			   lockres);
	if (status != DLM_NORMAL) {
		ocfs2_log_dlm_error("dlmunlock", status, lockres);
		ret = -EINVAL;
		ocfs2_recover_from_dlm_error(lockres, 0);
	}

	mlog(0, "lock %s return from dlmunlock\n", lockres->l_name);

	mlog_exit(ret);
	return ret;
}

static inline int ocfs2_can_downconvert_meta_lock(struct inode *inode,
						  struct ocfs2_lock_res *lockres,
						  int new_level)
{
	int ret;

	mlog_entry_void();

	BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE);

	if (lockres->l_flags & OCFS2_LOCK_REFRESHING) {
		ret = 0;
		mlog(0, "lockres %s currently being refreshed -- backing "
		     "off!\n", lockres->l_name);
	} else if (new_level == LKM_PRMODE)
		ret = !lockres->l_ex_holders &&
			ocfs2_inode_fully_checkpointed(inode);
	else /* Must be NLMODE we're converting to. */
		ret = !lockres->l_ro_holders && !lockres->l_ex_holders &&
			ocfs2_inode_fully_checkpointed(inode);

	mlog_exit(ret);
	return ret;
}
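/*
 * Illustrative sketch, guarded out of the build: the cancel-convert
 * dance driven by ocfs2_prepare_cancel_convert()'s return value. The
 * decision is made under l_lock, but the LKM_CANCEL request itself
 * must go out with the spinlock dropped; both unblock paths below do
 * exactly this on a busy lockres. The helper name is hypothetical.
 */
#if 0
/* called with lockres->l_lock held; always releases it */
static int example_cancel_busy_convert(struct ocfs2_super *osb,
				       struct ocfs2_lock_res *lockres,
				       unsigned long flags)
{
	int ret;

	ret = ocfs2_prepare_cancel_convert(osb, lockres);
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	if (ret) {
		ret = ocfs2_cancel_convert(osb, lockres);
		if (ret < 0)
			mlog_errno(ret);
	}
	return ret;
}
#endif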
static int ocfs2_do_unblock_meta(struct inode *inode,
				 int *requeue)
{
	int new_level;
	int set_lvb = 0;
	int ret = 0;
	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;
	unsigned long flags;

	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	mlog_entry_void();

	spin_lock_irqsave(&lockres->l_lock, flags);

	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));

	mlog(0, "l_level=%d, l_blocking=%d\n", lockres->l_level,
	     lockres->l_blocking);

	BUG_ON(lockres->l_level != LKM_EXMODE &&
	       lockres->l_level != LKM_PRMODE);

	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
		*requeue = 1;
		ret = ocfs2_prepare_cancel_convert(osb, lockres);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		if (ret) {
			ret = ocfs2_cancel_convert(osb, lockres);
			if (ret < 0)
				mlog_errno(ret);
		}
		goto leave;
	}

	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);

	mlog(0, "l_level=%d, l_blocking=%d, new_level=%d\n",
	     lockres->l_level, lockres->l_blocking, new_level);

	if (ocfs2_can_downconvert_meta_lock(inode, lockres, new_level)) {
		if (lockres->l_level == LKM_EXMODE)
			set_lvb = 1;

		/* If the lock hasn't been refreshed yet (rare), then
		 * our memory inode values are old and we skip
		 * stuffing the lvb. There's no need to actually clear
		 * out the lvb here as its value is still valid. */
		if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
			if (set_lvb)
				__ocfs2_stuff_meta_lvb(inode);
		} else
			mlog(0, "lockres %s: downconverting stale lock!\n",
			     lockres->l_name);

		mlog(0, "calling ocfs2_downconvert_lock with l_level=%d, "
		     "l_blocking=%d, new_level=%d\n",
		     lockres->l_level, lockres->l_blocking, new_level);

		ocfs2_prepare_downconvert(lockres, new_level);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb);
		goto leave;
	}
	if (!ocfs2_inode_fully_checkpointed(inode))
		ocfs2_start_checkpoint(osb);

	*requeue = 1;
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	ret = 0;
leave:
	mlog_exit(ret);
	return ret;
}
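/*
 * Illustrative sketch, guarded out of the build: a minimal
 * ocfs2_convert_worker_t for ocfs2_generic_unblock_lock() below.
 * Workers run with l_lock dropped and may sleep; 'blocking' is the
 * snapshot of l_blocking taken before the spinlock was released. The
 * one real worker in this file is ocfs2_data_convert_worker().
 */
#if 0
static void example_convert_worker(struct ocfs2_lock_res *lockres,
				   int blocking)
{
	if (blocking == LKM_EXMODE) {
		/* another node wants exclusive access -- write back
		 * and invalidate whatever this lock protects */
	} else {
		/* blocking < EXMODE: flushing is enough, cached
		 * read-only state may be kept */
	}
}
#endif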
static int ocfs2_generic_unblock_lock(struct ocfs2_super *osb,
				      struct ocfs2_lock_res *lockres,
				      int *requeue,
				      ocfs2_convert_worker_t *worker)
{
	unsigned long flags;
	int blocking;
	int new_level;
	int ret = 0;

	mlog_entry_void();

	spin_lock_irqsave(&lockres->l_lock, flags);

	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));

recheck:
	if (lockres->l_flags & OCFS2_LOCK_BUSY) {
		*requeue = 1;
		ret = ocfs2_prepare_cancel_convert(osb, lockres);
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		if (ret) {
			ret = ocfs2_cancel_convert(osb, lockres);
			if (ret < 0)
				mlog_errno(ret);
		}
		goto leave;
	}

	/* if we're blocking an exclusive and we have *any* holders,
	 * then requeue. */
	if ((lockres->l_blocking == LKM_EXMODE)
	    && (lockres->l_ex_holders || lockres->l_ro_holders)) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		*requeue = 1;
		ret = 0;
		goto leave;
	}

	/* If it's a PR we're blocking, then only
	 * requeue if we've got any EX holders */
	if (lockres->l_blocking == LKM_PRMODE &&
	    lockres->l_ex_holders) {
		spin_unlock_irqrestore(&lockres->l_lock, flags);
		*requeue = 1;
		ret = 0;
		goto leave;
	}

	/* If we get here, then we know that there are no more
	 * incompatible holders (and anyone asking for an incompatible
	 * lock is blocked). We can now downconvert the lock */
	if (!worker)
		goto downconvert;

	/* Some lockres types want to do a bit of work before
	 * downconverting a lock. Allow that here. The worker function
	 * may sleep, so we save off a copy of what we're blocking as
	 * it may change while we're not holding the spin lock. */
	blocking = lockres->l_blocking;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	worker(lockres, blocking);

	spin_lock_irqsave(&lockres->l_lock, flags);
	if (blocking != lockres->l_blocking) {
		/* If this changed underneath us, then we can't drop
		 * it just yet. */
		goto recheck;
	}

downconvert:
	*requeue = 0;
	new_level = ocfs2_highest_compat_lock_level(lockres->l_blocking);

	ocfs2_prepare_downconvert(lockres, new_level);
	spin_unlock_irqrestore(&lockres->l_lock, flags);
	ret = ocfs2_downconvert_lock(osb, lockres, new_level, 0);
leave:
	mlog_exit(ret);
	return ret;
}

static void ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
				      int blocking)
{
	struct inode *inode;
	struct address_space *mapping;

	mlog_entry_void();

	inode = ocfs2_lock_res_inode(lockres);
	mapping = inode->i_mapping;

	if (filemap_fdatawrite(mapping)) {
		mlog(ML_ERROR, "Could not sync inode %llu for downconvert!",
		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
	}
	sync_mapping_buffers(mapping);
	if (blocking == LKM_EXMODE) {
		truncate_inode_pages(mapping, 0);
		unmap_mapping_range(mapping, 0, 0, 0);
	} else {
		/* We only need to wait on the I/O if we're not also
		 * truncating pages because truncate_inode_pages waits
		 * for us above. We don't truncate pages if we're
		 * blocking anything < EXMODE because we want to keep
		 * them around in that case. */
		filemap_fdatawait(mapping);
	}

	mlog_exit_void();
}

static int ocfs2_unblock_data(struct ocfs2_lock_res *lockres,
			      int *requeue)
{
	int status;
	struct inode *inode;
	struct ocfs2_super *osb;

	mlog_entry_void();

	inode = ocfs2_lock_res_inode(lockres);
	osb = OCFS2_SB(inode->i_sb);

	mlog(0, "unblock inode %llu\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno);

	status = ocfs2_generic_unblock_lock(osb,
					    lockres,
					    requeue,
					    ocfs2_data_convert_worker);
	if (status < 0)
		mlog_errno(status);

	mlog(0, "inode %llu, requeue = %d\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno, *requeue);

	mlog_exit(status);
	return status;
}

static int ocfs2_unblock_inode_lock(struct ocfs2_lock_res *lockres,
				    int *requeue)
{
	int status;
	struct inode *inode;

	mlog_entry_void();

	mlog(0, "Unblock lockres %s\n", lockres->l_name);

	inode = ocfs2_lock_res_inode(lockres);

	status = ocfs2_generic_unblock_lock(OCFS2_SB(inode->i_sb),
					    lockres,
					    requeue,
					    NULL);
	if (status < 0)
		mlog_errno(status);

	mlog_exit(status);
	return status;
}

static int ocfs2_unblock_meta(struct ocfs2_lock_res *lockres,
			      int *requeue)
{
	int status;
	struct inode *inode;

	mlog_entry_void();

	inode = ocfs2_lock_res_inode(lockres);

	mlog(0, "unblock inode %llu\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno);

	status = ocfs2_do_unblock_meta(inode, requeue);
	if (status < 0)
		mlog_errno(status);

	mlog(0, "inode %llu, requeue = %d\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno, *requeue);

	mlog_exit(status);
	return status;
}
/* Generic unblock function for any lockres whose private data is an
 * ocfs2_super pointer. */
static int ocfs2_unblock_osb_lock(struct ocfs2_lock_res *lockres,
				  int *requeue)
{
	int status;
	struct ocfs2_super *osb;

	mlog_entry_void();

	mlog(0, "Unblock lockres %s\n", lockres->l_name);

	osb = ocfs2_lock_res_super(lockres);

	status = ocfs2_generic_unblock_lock(osb,
					    lockres,
					    requeue,
					    NULL);
	if (status < 0)
		mlog_errno(status);

	mlog_exit(status);
	return status;
}

void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
				struct ocfs2_lock_res *lockres)
{
	int status;
	int requeue = 0;
	unsigned long flags;

	/* Our reference to the lockres in this function can be
	 * considered valid until we remove the OCFS2_LOCK_QUEUED
	 * flag. */

	mlog_entry_void();

	BUG_ON(!lockres);
	BUG_ON(!lockres->l_ops);
	BUG_ON(!lockres->l_ops->unblock);

	mlog(0, "lockres %s blocked.\n", lockres->l_name);

	/* Detect whether a lock has been marked as going away while
	 * the vote thread was processing other things. A lock can
	 * still be marked with OCFS2_LOCK_FREEING after this check,
	 * but short circuiting here will still save us some work. */
	spin_lock_irqsave(&lockres->l_lock, flags);
	if (lockres->l_flags & OCFS2_LOCK_FREEING)
		goto unqueue;
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	status = lockres->l_ops->unblock(lockres, &requeue);
	if (status < 0)
		mlog_errno(status);

	spin_lock_irqsave(&lockres->l_lock, flags);
unqueue:
	if (lockres->l_flags & OCFS2_LOCK_FREEING || !requeue) {
		lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
	} else
		ocfs2_schedule_blocked_lock(osb, lockres);

	mlog(0, "lockres %s, requeue = %s.\n", lockres->l_name,
	     requeue ? "yes" : "no");
	spin_unlock_irqrestore(&lockres->l_lock, flags);

	mlog_exit_void();
}

static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
					struct ocfs2_lock_res *lockres)
{
	mlog_entry_void();

	assert_spin_locked(&lockres->l_lock);

	if (lockres->l_flags & OCFS2_LOCK_FREEING) {
		/* Do not schedule a lock for downconvert when it's on
		 * the way to destruction - any nodes wanting access
		 * to the resource will get it soon. */
		mlog(0, "Lockres %s won't be scheduled: flags 0x%lx\n",
		     lockres->l_name, lockres->l_flags);
		return;
	}

	lockres_or_flags(lockres, OCFS2_LOCK_QUEUED);

	spin_lock(&osb->vote_task_lock);
	if (list_empty(&lockres->l_blocked_list)) {
		list_add_tail(&lockres->l_blocked_list,
			      &osb->blocked_lock_list);
		osb->blocked_lock_count++;
	}
	spin_unlock(&osb->vote_task_lock);

	mlog_exit_void();
}
/* This aids in debugging situations where a bad LVB might be involved. */
void ocfs2_dump_meta_lvb_info(u64 level,
			      const char *function,
			      unsigned int line,
			      struct ocfs2_lock_res *lockres)
{
	struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb;

	mlog(level, "LVB information for %s (called from %s:%u):\n",
	     lockres->l_name, function, line);
	mlog(level, "version: %u, clusters: %u\n",
	     be32_to_cpu(lvb->lvb_version), be32_to_cpu(lvb->lvb_iclusters));
	mlog(level, "size: %llu, uid %u, gid %u, mode 0x%x\n",
	     (unsigned long long)be64_to_cpu(lvb->lvb_isize),
	     be32_to_cpu(lvb->lvb_iuid), be32_to_cpu(lvb->lvb_igid),
	     be16_to_cpu(lvb->lvb_imode));
	mlog(level, "nlink %u, atime_packed 0x%llx, ctime_packed 0x%llx, "
	     "mtime_packed 0x%llx iattr 0x%x\n", be16_to_cpu(lvb->lvb_inlink),
	     (long long)be64_to_cpu(lvb->lvb_iatime_packed),
	     (long long)be64_to_cpu(lvb->lvb_ictime_packed),
	     (long long)be64_to_cpu(lvb->lvb_imtime_packed),
	     be32_to_cpu(lvb->lvb_iattr));
}
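/*
 * Illustrative sketch, guarded out of the build: invoking the dump
 * helper above on an inode's metadata lockres. A wrapper macro that
 * fills in __PRETTY_FUNCTION__ and __LINE__ is assumed to live in
 * dlmglue.h; calling the function directly, as here, works either way.
 */
#if 0
static void example_debug_meta_lock(struct inode *inode)
{
	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_meta_lockres;

	ocfs2_dump_meta_lvb_info(0, __PRETTY_FUNCTION__, __LINE__, lockres);
}
#endif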