1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * inode.c 5 * 6 * vfs' aops, fops, dops and iops 7 * 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public 12 * License as published by the Free Software Foundation; either 13 * version 2 of the License, or (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * General Public License for more details. 19 * 20 * You should have received a copy of the GNU General Public 21 * License along with this program; if not, write to the 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 * Boston, MA 021110-1307, USA. 24 */ 25 26 #include <linux/fs.h> 27 #include <linux/types.h> 28 #include <linux/slab.h> 29 #include <linux/highmem.h> 30 #include <linux/pagemap.h> 31 #include <linux/smp_lock.h> 32 33 #include <asm/byteorder.h> 34 35 #define MLOG_MASK_PREFIX ML_INODE 36 #include <cluster/masklog.h> 37 38 #include "ocfs2.h" 39 40 #include "alloc.h" 41 #include "dlmglue.h" 42 #include "extent_map.h" 43 #include "file.h" 44 #include "heartbeat.h" 45 #include "inode.h" 46 #include "journal.h" 47 #include "namei.h" 48 #include "suballoc.h" 49 #include "super.h" 50 #include "symlink.h" 51 #include "sysfile.h" 52 #include "uptodate.h" 53 #include "vote.h" 54 55 #include "buffer_head_io.h" 56 57 #define OCFS2_FI_FLAG_NOWAIT 0x1 58 #define OCFS2_FI_FLAG_DELETE 0x2 59 struct ocfs2_find_inode_args 60 { 61 u64 fi_blkno; 62 unsigned long fi_ino; 63 unsigned int fi_flags; 64 }; 65 66 static int ocfs2_read_locked_inode(struct inode *inode, 67 struct ocfs2_find_inode_args *args); 68 static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); 69 static int ocfs2_find_actor(struct inode *inode, void *opaque); 70 static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, 71 struct inode *inode, 72 struct buffer_head *fe_bh); 73 74 struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb, 75 u64 blkno, 76 int delete_vote) 77 { 78 struct ocfs2_find_inode_args args; 79 80 /* ocfs2_ilookup_for_vote should *only* be called from the 81 * vote thread */ 82 BUG_ON(current != osb->vote_task); 83 84 args.fi_blkno = blkno; 85 args.fi_flags = OCFS2_FI_FLAG_NOWAIT; 86 if (delete_vote) 87 args.fi_flags |= OCFS2_FI_FLAG_DELETE; 88 args.fi_ino = ino_from_blkno(osb->sb, blkno); 89 return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args); 90 } 91 92 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno) 93 { 94 struct inode *inode = NULL; 95 struct super_block *sb = osb->sb; 96 struct ocfs2_find_inode_args args; 97 98 mlog_entry("(blkno = %llu)\n", (unsigned long long)blkno); 99 100 /* Ok. By now we've either got the offsets passed to us by the 101 * caller, or we just pulled them off the bh. Lets do some 102 * sanity checks to make sure they're OK. */ 103 if (blkno == 0) { 104 inode = ERR_PTR(-EINVAL); 105 mlog_errno(PTR_ERR(inode)); 106 goto bail; 107 } 108 109 args.fi_blkno = blkno; 110 args.fi_flags = 0; 111 args.fi_ino = ino_from_blkno(sb, blkno); 112 113 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, 114 ocfs2_init_locked_inode, &args); 115 /* inode was *not* in the inode cache. 2.6.x requires 116 * us to do our own read_inode call and unlock it 117 * afterwards. */ 118 if (inode && inode->i_state & I_NEW) { 119 mlog(0, "Inode was not in inode cache, reading it.\n"); 120 ocfs2_read_locked_inode(inode, &args); 121 unlock_new_inode(inode); 122 } 123 if (inode == NULL) { 124 inode = ERR_PTR(-ENOMEM); 125 mlog_errno(PTR_ERR(inode)); 126 goto bail; 127 } 128 if (is_bad_inode(inode)) { 129 iput(inode); 130 inode = ERR_PTR(-ESTALE); 131 mlog_errno(PTR_ERR(inode)); 132 goto bail; 133 } 134 135 bail: 136 if (!IS_ERR(inode)) { 137 mlog(0, "returning inode with number %llu\n", 138 (unsigned long long)OCFS2_I(inode)->ip_blkno); 139 mlog_exit_ptr(inode); 140 } else 141 mlog_errno(PTR_ERR(inode)); 142 143 return inode; 144 } 145 146 147 /* 148 * here's how inodes get read from disk: 149 * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR 150 * found? : return the in-memory inode 151 * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE 152 */ 153 154 static int ocfs2_find_actor(struct inode *inode, void *opaque) 155 { 156 struct ocfs2_find_inode_args *args = NULL; 157 struct ocfs2_inode_info *oi = OCFS2_I(inode); 158 int ret = 0; 159 160 mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque); 161 162 args = opaque; 163 164 mlog_bug_on_msg(!inode, "No inode in find actor!\n"); 165 166 if (oi->ip_blkno != args->fi_blkno) 167 goto bail; 168 169 /* OCFS2_FI_FLAG_NOWAIT is *only* set from 170 * ocfs2_ilookup_for_vote which won't create an inode for one 171 * that isn't found. The vote thread which doesn't want to get 172 * an inode which is in the process of going away - otherwise 173 * the call to __wait_on_freeing_inode in find_inode_fast will 174 * cause it to deadlock on an inode which may be waiting on a 175 * vote (or lock release) in delete_inode */ 176 if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) && 177 (inode->i_state & (I_FREEING|I_CLEAR))) { 178 /* As stated above, we're not going to return an 179 * inode. In the case of a delete vote, the voting 180 * code is going to signal the other node to go 181 * ahead. Mark that state here, so this freeing inode 182 * has the state when it gets to delete_inode. */ 183 if (args->fi_flags & OCFS2_FI_FLAG_DELETE) { 184 spin_lock(&oi->ip_lock); 185 ocfs2_mark_inode_remotely_deleted(inode); 186 spin_unlock(&oi->ip_lock); 187 } 188 goto bail; 189 } 190 191 ret = 1; 192 bail: 193 mlog_exit(ret); 194 return ret; 195 } 196 197 /* 198 * initialize the new inode, but don't do anything that would cause 199 * us to sleep. 200 * return 0 on success, 1 on failure 201 */ 202 static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) 203 { 204 struct ocfs2_find_inode_args *args = opaque; 205 206 mlog_entry("inode = %p, opaque = %p\n", inode, opaque); 207 208 inode->i_ino = args->fi_ino; 209 OCFS2_I(inode)->ip_blkno = args->fi_blkno; 210 211 mlog_exit(0); 212 return 0; 213 } 214 215 int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 216 int create_ino) 217 { 218 struct super_block *sb; 219 struct ocfs2_super *osb; 220 int status = -EINVAL; 221 222 mlog_entry("(0x%p, size:%llu)\n", inode, 223 (unsigned long long)fe->i_size); 224 225 sb = inode->i_sb; 226 osb = OCFS2_SB(sb); 227 228 /* this means that read_inode cannot create a superblock inode 229 * today. change if needed. */ 230 if (!OCFS2_IS_VALID_DINODE(fe) || 231 !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { 232 mlog(ML_ERROR, "Invalid dinode: i_ino=%lu, i_blkno=%llu, " 233 "signature = %.*s, flags = 0x%x\n", 234 inode->i_ino, 235 (unsigned long long)le64_to_cpu(fe->i_blkno), 7, 236 fe->i_signature, le32_to_cpu(fe->i_flags)); 237 goto bail; 238 } 239 240 if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) { 241 mlog(ML_ERROR, "file entry generation does not match " 242 "superblock! osb->fs_generation=%x, " 243 "fe->i_fs_generation=%x\n", 244 osb->fs_generation, le32_to_cpu(fe->i_fs_generation)); 245 goto bail; 246 } 247 248 inode->i_version = 1; 249 inode->i_generation = le32_to_cpu(fe->i_generation); 250 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); 251 inode->i_mode = le16_to_cpu(fe->i_mode); 252 inode->i_uid = le32_to_cpu(fe->i_uid); 253 inode->i_gid = le32_to_cpu(fe->i_gid); 254 inode->i_blksize = (u32)osb->s_clustersize; 255 256 /* Fast symlinks will have i_size but no allocated clusters. */ 257 if (S_ISLNK(inode->i_mode) && !fe->i_clusters) 258 inode->i_blocks = 0; 259 else 260 inode->i_blocks = 261 ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size)); 262 inode->i_mapping->a_ops = &ocfs2_aops; 263 inode->i_flags |= S_NOATIME; 264 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); 265 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); 266 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); 267 inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); 268 inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); 269 inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); 270 271 if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno)) 272 mlog(ML_ERROR, 273 "ip_blkno %llu != i_blkno %llu!\n", 274 (unsigned long long)OCFS2_I(inode)->ip_blkno, 275 (unsigned long long)fe->i_blkno); 276 277 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 278 OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT; 279 280 if (create_ino) 281 inode->i_ino = ino_from_blkno(inode->i_sb, 282 le64_to_cpu(fe->i_blkno)); 283 284 mlog(0, "blkno = %llu, ino = %lu, create_ino = %s\n", 285 (unsigned long long)fe->i_blkno, inode->i_ino, create_ino ? "true" : "false"); 286 287 inode->i_nlink = le16_to_cpu(fe->i_links_count); 288 289 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { 290 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 291 mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); 292 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { 293 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 294 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { 295 mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino); 296 /* we can't actually hit this as read_inode can't 297 * handle superblocks today ;-) */ 298 BUG(); 299 } 300 301 switch (inode->i_mode & S_IFMT) { 302 case S_IFREG: 303 inode->i_fop = &ocfs2_fops; 304 inode->i_op = &ocfs2_file_iops; 305 i_size_write(inode, le64_to_cpu(fe->i_size)); 306 break; 307 case S_IFDIR: 308 inode->i_op = &ocfs2_dir_iops; 309 inode->i_fop = &ocfs2_dops; 310 i_size_write(inode, le64_to_cpu(fe->i_size)); 311 break; 312 case S_IFLNK: 313 if (ocfs2_inode_is_fast_symlink(inode)) 314 inode->i_op = &ocfs2_fast_symlink_inode_operations; 315 else 316 inode->i_op = &ocfs2_symlink_inode_operations; 317 i_size_write(inode, le64_to_cpu(fe->i_size)); 318 break; 319 default: 320 inode->i_op = &ocfs2_special_file_iops; 321 init_special_inode(inode, inode->i_mode, 322 inode->i_rdev); 323 break; 324 } 325 326 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, 327 OCFS2_LOCK_TYPE_RW, inode); 328 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres, 329 OCFS2_LOCK_TYPE_META, inode); 330 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_data_lockres, 331 OCFS2_LOCK_TYPE_DATA, inode); 332 333 status = 0; 334 bail: 335 mlog_exit(status); 336 return status; 337 } 338 339 static int ocfs2_read_locked_inode(struct inode *inode, 340 struct ocfs2_find_inode_args *args) 341 { 342 struct super_block *sb; 343 struct ocfs2_super *osb; 344 struct ocfs2_dinode *fe; 345 struct buffer_head *bh = NULL; 346 int status; 347 int sysfile = 0; 348 349 mlog_entry("(0x%p, 0x%p)\n", inode, args); 350 351 status = -EINVAL; 352 if (inode == NULL || inode->i_sb == NULL) { 353 mlog(ML_ERROR, "bad inode\n"); 354 goto bail; 355 } 356 sb = inode->i_sb; 357 osb = OCFS2_SB(sb); 358 359 if (!args) { 360 mlog(ML_ERROR, "bad inode args\n"); 361 make_bad_inode(inode); 362 goto bail; 363 } 364 365 /* Read the FE off disk. This is safe because the kernel only 366 * does one read_inode2 for a new inode, and if it doesn't 367 * exist yet then nobody can be working on it! */ 368 status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, NULL); 369 if (status < 0) { 370 mlog_errno(status); 371 make_bad_inode(inode); 372 goto bail; 373 } 374 375 fe = (struct ocfs2_dinode *) bh->b_data; 376 if (!OCFS2_IS_VALID_DINODE(fe)) { 377 mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", 378 (unsigned long long)fe->i_blkno, 7, fe->i_signature); 379 make_bad_inode(inode); 380 goto bail; 381 } 382 383 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) 384 sysfile = 1; 385 386 if (S_ISCHR(le16_to_cpu(fe->i_mode)) || 387 S_ISBLK(le16_to_cpu(fe->i_mode))) 388 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); 389 390 status = -EINVAL; 391 if (ocfs2_populate_inode(inode, fe, 0) < 0) { 392 mlog(ML_ERROR, "populate failed! i_blkno=%llu, i_ino=%lu\n", 393 (unsigned long long)fe->i_blkno, inode->i_ino); 394 make_bad_inode(inode); 395 goto bail; 396 } 397 398 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); 399 400 if (sysfile) 401 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; 402 403 status = 0; 404 405 bail: 406 if (args && bh) 407 brelse(bh); 408 409 mlog_exit(status); 410 return status; 411 } 412 413 void ocfs2_sync_blockdev(struct super_block *sb) 414 { 415 sync_blockdev(sb->s_bdev); 416 } 417 418 static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, 419 struct inode *inode, 420 struct buffer_head *fe_bh) 421 { 422 int status = 0; 423 struct ocfs2_journal_handle *handle = NULL; 424 struct ocfs2_truncate_context *tc = NULL; 425 struct ocfs2_dinode *fe; 426 427 mlog_entry_void(); 428 429 fe = (struct ocfs2_dinode *) fe_bh->b_data; 430 431 /* zero allocation, zero truncate :) */ 432 if (!fe->i_clusters) 433 goto bail; 434 435 handle = ocfs2_start_trans(osb, handle, OCFS2_INODE_UPDATE_CREDITS); 436 if (IS_ERR(handle)) { 437 status = PTR_ERR(handle); 438 handle = NULL; 439 mlog_errno(status); 440 goto bail; 441 } 442 443 status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL); 444 if (status < 0) { 445 mlog_errno(status); 446 goto bail; 447 } 448 449 ocfs2_commit_trans(handle); 450 handle = NULL; 451 452 status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc); 453 if (status < 0) { 454 mlog_errno(status); 455 goto bail; 456 } 457 458 status = ocfs2_commit_truncate(osb, inode, fe_bh, tc); 459 if (status < 0) { 460 mlog_errno(status); 461 goto bail; 462 } 463 bail: 464 if (handle) 465 ocfs2_commit_trans(handle); 466 467 mlog_exit(status); 468 return status; 469 } 470 471 static int ocfs2_remove_inode(struct inode *inode, 472 struct buffer_head *di_bh, 473 struct inode *orphan_dir_inode, 474 struct buffer_head *orphan_dir_bh) 475 { 476 int status; 477 struct inode *inode_alloc_inode = NULL; 478 struct buffer_head *inode_alloc_bh = NULL; 479 struct ocfs2_journal_handle *handle; 480 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 481 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 482 483 inode_alloc_inode = 484 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, 485 le16_to_cpu(di->i_suballoc_slot)); 486 if (!inode_alloc_inode) { 487 status = -EEXIST; 488 mlog_errno(status); 489 goto bail; 490 } 491 492 mutex_lock(&inode_alloc_inode->i_mutex); 493 status = ocfs2_meta_lock(inode_alloc_inode, NULL, &inode_alloc_bh, 1); 494 if (status < 0) { 495 mutex_unlock(&inode_alloc_inode->i_mutex); 496 497 mlog_errno(status); 498 goto bail; 499 } 500 501 handle = ocfs2_start_trans(osb, NULL, OCFS2_DELETE_INODE_CREDITS); 502 if (IS_ERR(handle)) { 503 status = PTR_ERR(handle); 504 mlog_errno(status); 505 goto bail_unlock; 506 } 507 508 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, 509 orphan_dir_bh); 510 if (status < 0) { 511 mlog_errno(status); 512 goto bail_commit; 513 } 514 515 /* set the inodes dtime */ 516 status = ocfs2_journal_access(handle, inode, di_bh, 517 OCFS2_JOURNAL_ACCESS_WRITE); 518 if (status < 0) { 519 mlog_errno(status); 520 goto bail_commit; 521 } 522 523 di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); 524 le32_and_cpu(&di->i_flags, ~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); 525 526 status = ocfs2_journal_dirty(handle, di_bh); 527 if (status < 0) { 528 mlog_errno(status); 529 goto bail_commit; 530 } 531 532 ocfs2_remove_from_cache(inode, di_bh); 533 534 status = ocfs2_free_dinode(handle, inode_alloc_inode, 535 inode_alloc_bh, di); 536 if (status < 0) 537 mlog_errno(status); 538 539 bail_commit: 540 ocfs2_commit_trans(handle); 541 bail_unlock: 542 ocfs2_meta_unlock(inode_alloc_inode, 1); 543 mutex_unlock(&inode_alloc_inode->i_mutex); 544 brelse(inode_alloc_bh); 545 bail: 546 iput(inode_alloc_inode); 547 548 return status; 549 } 550 551 /* 552 * Serialize with orphan dir recovery. If the process doing 553 * recovery on this orphan dir does an iget() with the dir 554 * i_mutex held, we'll deadlock here. Instead we detect this 555 * and exit early - recovery will wipe this inode for us. 556 */ 557 static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb, 558 int slot) 559 { 560 int ret = 0; 561 562 spin_lock(&osb->osb_lock); 563 if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) { 564 mlog(0, "Recovery is happening on orphan dir %d, will skip " 565 "this inode\n", slot); 566 ret = -EDEADLK; 567 goto out; 568 } 569 /* This signals to the orphan recovery process that it should 570 * wait for us to handle the wipe. */ 571 osb->osb_orphan_wipes[slot]++; 572 out: 573 spin_unlock(&osb->osb_lock); 574 return ret; 575 } 576 577 static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb, 578 int slot) 579 { 580 spin_lock(&osb->osb_lock); 581 osb->osb_orphan_wipes[slot]--; 582 spin_unlock(&osb->osb_lock); 583 584 wake_up(&osb->osb_wipe_event); 585 } 586 587 static int ocfs2_wipe_inode(struct inode *inode, 588 struct buffer_head *di_bh) 589 { 590 int status, orphaned_slot; 591 struct inode *orphan_dir_inode = NULL; 592 struct buffer_head *orphan_dir_bh = NULL; 593 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 594 595 /* We've already voted on this so it should be readonly - no 596 * spinlock needed. */ 597 orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; 598 599 status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); 600 if (status) 601 return status; 602 603 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 604 ORPHAN_DIR_SYSTEM_INODE, 605 orphaned_slot); 606 if (!orphan_dir_inode) { 607 status = -EEXIST; 608 mlog_errno(status); 609 goto bail; 610 } 611 612 /* Lock the orphan dir. The lock will be held for the entire 613 * delete_inode operation. We do this now to avoid races with 614 * recovery completion on other nodes. */ 615 mutex_lock(&orphan_dir_inode->i_mutex); 616 status = ocfs2_meta_lock(orphan_dir_inode, NULL, &orphan_dir_bh, 1); 617 if (status < 0) { 618 mutex_unlock(&orphan_dir_inode->i_mutex); 619 620 mlog_errno(status); 621 goto bail; 622 } 623 624 /* we do this while holding the orphan dir lock because we 625 * don't want recovery being run from another node to vote for 626 * an inode delete on us -- this will result in two nodes 627 * truncating the same file! */ 628 status = ocfs2_truncate_for_delete(osb, inode, di_bh); 629 if (status < 0) { 630 mlog_errno(status); 631 goto bail_unlock_dir; 632 } 633 634 status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, 635 orphan_dir_bh); 636 if (status < 0) 637 mlog_errno(status); 638 639 bail_unlock_dir: 640 ocfs2_meta_unlock(orphan_dir_inode, 1); 641 mutex_unlock(&orphan_dir_inode->i_mutex); 642 brelse(orphan_dir_bh); 643 bail: 644 iput(orphan_dir_inode); 645 ocfs2_signal_wipe_completion(osb, orphaned_slot); 646 647 return status; 648 } 649 650 /* There is a series of simple checks that should be done before a 651 * vote is even considered. Encapsulate those in this function. */ 652 static int ocfs2_inode_is_valid_to_delete(struct inode *inode) 653 { 654 int ret = 0; 655 struct ocfs2_inode_info *oi = OCFS2_I(inode); 656 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 657 658 /* We shouldn't be getting here for the root directory 659 * inode.. */ 660 if (inode == osb->root_inode) { 661 mlog(ML_ERROR, "Skipping delete of root inode.\n"); 662 goto bail; 663 } 664 665 /* If we're coming from process_vote we can't go into our own 666 * voting [hello, deadlock city!], so unforuntately we just 667 * have to skip deleting this guy. That's OK though because 668 * the node who's doing the actual deleting should handle it 669 * anyway. */ 670 if (current == osb->vote_task) { 671 mlog(0, "Skipping delete of %lu because we're currently " 672 "in process_vote\n", inode->i_ino); 673 goto bail; 674 } 675 676 spin_lock(&oi->ip_lock); 677 /* OCFS2 *never* deletes system files. This should technically 678 * never get here as system file inodes should always have a 679 * positive link count. */ 680 if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) { 681 mlog(ML_ERROR, "Skipping delete of system file %llu\n", 682 (unsigned long long)oi->ip_blkno); 683 goto bail_unlock; 684 } 685 686 /* If we have voted "yes" on the wipe of this inode for 687 * another node, it will be marked here so we can safely skip 688 * it. Recovery will cleanup any inodes we might inadvertantly 689 * skip here. */ 690 if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) { 691 mlog(0, "Skipping delete of %lu because another node " 692 "has done this for us.\n", inode->i_ino); 693 goto bail_unlock; 694 } 695 696 ret = 1; 697 bail_unlock: 698 spin_unlock(&oi->ip_lock); 699 bail: 700 return ret; 701 } 702 703 /* Query the cluster to determine whether we should wipe an inode from 704 * disk or not. 705 * 706 * Requires the inode to have the cluster lock. */ 707 static int ocfs2_query_inode_wipe(struct inode *inode, 708 struct buffer_head *di_bh, 709 int *wipe) 710 { 711 int status = 0; 712 struct ocfs2_inode_info *oi = OCFS2_I(inode); 713 struct ocfs2_dinode *di; 714 715 *wipe = 0; 716 717 /* While we were waiting for the cluster lock in 718 * ocfs2_delete_inode, another node might have asked to delete 719 * the inode. Recheck our flags to catch this. */ 720 if (!ocfs2_inode_is_valid_to_delete(inode)) { 721 mlog(0, "Skipping delete of %llu because flags changed\n", 722 (unsigned long long)oi->ip_blkno); 723 goto bail; 724 } 725 726 /* Now that we have an up to date inode, we can double check 727 * the link count. */ 728 if (inode->i_nlink) { 729 mlog(0, "Skipping delete of %llu because nlink = %u\n", 730 (unsigned long long)oi->ip_blkno, inode->i_nlink); 731 goto bail; 732 } 733 734 /* Do some basic inode verification... */ 735 di = (struct ocfs2_dinode *) di_bh->b_data; 736 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) { 737 /* for lack of a better error? */ 738 status = -EEXIST; 739 mlog(ML_ERROR, 740 "Inode %llu (on-disk %llu) not orphaned! " 741 "Disk flags 0x%x, inode flags 0x%x\n", 742 (unsigned long long)oi->ip_blkno, 743 (unsigned long long)di->i_blkno, di->i_flags, 744 oi->ip_flags); 745 goto bail; 746 } 747 748 /* has someone already deleted us?! baaad... */ 749 if (di->i_dtime) { 750 status = -EEXIST; 751 mlog_errno(status); 752 goto bail; 753 } 754 755 status = ocfs2_request_delete_vote(inode); 756 /* -EBUSY means that other nodes are still using the 757 * inode. We're done here though, so avoid doing anything on 758 * disk and let them worry about deleting it. */ 759 if (status == -EBUSY) { 760 status = 0; 761 mlog(0, "Skipping delete of %llu because it is in use on" 762 "other nodes\n", (unsigned long long)oi->ip_blkno); 763 goto bail; 764 } 765 if (status < 0) { 766 mlog_errno(status); 767 goto bail; 768 } 769 770 spin_lock(&oi->ip_lock); 771 if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) { 772 /* Nobody knew which slot this inode was orphaned 773 * into. This may happen during node death and 774 * recovery knows how to clean it up so we can safely 775 * ignore this inode for now on. */ 776 mlog(0, "Nobody knew where inode %llu was orphaned!\n", 777 (unsigned long long)oi->ip_blkno); 778 } else { 779 *wipe = 1; 780 781 mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n", 782 (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot); 783 } 784 spin_unlock(&oi->ip_lock); 785 786 bail: 787 return status; 788 } 789 790 /* Support function for ocfs2_delete_inode. Will help us keep the 791 * inode data in a consistent state for clear_inode. Always truncates 792 * pages, optionally sync's them first. */ 793 static void ocfs2_cleanup_delete_inode(struct inode *inode, 794 int sync_data) 795 { 796 mlog(0, "Cleanup inode %llu, sync = %d\n", 797 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); 798 if (sync_data) 799 write_inode_now(inode, 1); 800 truncate_inode_pages(&inode->i_data, 0); 801 } 802 803 void ocfs2_delete_inode(struct inode *inode) 804 { 805 int wipe, status; 806 sigset_t blocked, oldset; 807 struct buffer_head *di_bh = NULL; 808 809 mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); 810 811 if (is_bad_inode(inode)) { 812 mlog(0, "Skipping delete of bad inode\n"); 813 goto bail; 814 } 815 816 if (!ocfs2_inode_is_valid_to_delete(inode)) { 817 /* It's probably not necessary to truncate_inode_pages 818 * here but we do it for safety anyway (it will most 819 * likely be a no-op anyway) */ 820 ocfs2_cleanup_delete_inode(inode, 0); 821 goto bail; 822 } 823 824 /* We want to block signals in delete_inode as the lock and 825 * messaging paths may return us -ERESTARTSYS. Which would 826 * cause us to exit early, resulting in inodes being orphaned 827 * forever. */ 828 sigfillset(&blocked); 829 status = sigprocmask(SIG_BLOCK, &blocked, &oldset); 830 if (status < 0) { 831 mlog_errno(status); 832 ocfs2_cleanup_delete_inode(inode, 1); 833 goto bail; 834 } 835 836 /* Lock down the inode. This gives us an up to date view of 837 * it's metadata (for verification), and allows us to 838 * serialize delete_inode votes. 839 * 840 * Even though we might be doing a truncate, we don't take the 841 * allocation lock here as it won't be needed - nobody will 842 * have the file open. 843 */ 844 status = ocfs2_meta_lock(inode, NULL, &di_bh, 1); 845 if (status < 0) { 846 if (status != -ENOENT) 847 mlog_errno(status); 848 ocfs2_cleanup_delete_inode(inode, 0); 849 goto bail_unblock; 850 } 851 852 /* Query the cluster. This will be the final decision made 853 * before we go ahead and wipe the inode. */ 854 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); 855 if (!wipe || status < 0) { 856 /* Error and inode busy vote both mean we won't be 857 * removing the inode, so they take almost the same 858 * path. */ 859 if (status < 0) 860 mlog_errno(status); 861 862 /* Someone in the cluster has voted to not wipe this 863 * inode, or it was never completely orphaned. Write 864 * out the pages and exit now. */ 865 ocfs2_cleanup_delete_inode(inode, 1); 866 goto bail_unlock_inode; 867 } 868 869 ocfs2_cleanup_delete_inode(inode, 0); 870 871 status = ocfs2_wipe_inode(inode, di_bh); 872 if (status < 0) { 873 if (status != -EDEADLK) 874 mlog_errno(status); 875 goto bail_unlock_inode; 876 } 877 878 /* Mark the inode as successfully deleted. This is important 879 * for ocfs2_clear_inode as it will check this flag and skip 880 * any checkpointing work */ 881 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; 882 883 bail_unlock_inode: 884 ocfs2_meta_unlock(inode, 1); 885 brelse(di_bh); 886 bail_unblock: 887 status = sigprocmask(SIG_SETMASK, &oldset, NULL); 888 if (status < 0) 889 mlog_errno(status); 890 bail: 891 clear_inode(inode); 892 mlog_exit_void(); 893 } 894 895 void ocfs2_clear_inode(struct inode *inode) 896 { 897 int status; 898 struct ocfs2_inode_info *oi = OCFS2_I(inode); 899 900 mlog_entry_void(); 901 902 if (!inode) 903 goto bail; 904 905 mlog(0, "Clearing inode: %llu, nlink = %u\n", 906 (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink); 907 908 mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL, 909 "Inode=%lu\n", inode->i_ino); 910 911 /* Do these before all the other work so that we don't bounce 912 * the vote thread while waiting to destroy the locks. */ 913 ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres); 914 ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres); 915 ocfs2_mark_lockres_freeing(&oi->ip_data_lockres); 916 917 /* We very well may get a clear_inode before all an inodes 918 * metadata has hit disk. Of course, we can't drop any cluster 919 * locks until the journal has finished with it. The only 920 * exception here are successfully wiped inodes - their 921 * metadata can now be considered to be part of the system 922 * inodes from which it came. */ 923 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED)) 924 ocfs2_checkpoint_inode(inode); 925 926 mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), 927 "Clear inode of %llu, inode has io markers\n", 928 (unsigned long long)oi->ip_blkno); 929 930 ocfs2_extent_map_drop(inode, 0); 931 ocfs2_extent_map_init(inode); 932 933 status = ocfs2_drop_inode_locks(inode); 934 if (status < 0) 935 mlog_errno(status); 936 937 ocfs2_lock_res_free(&oi->ip_rw_lockres); 938 ocfs2_lock_res_free(&oi->ip_meta_lockres); 939 ocfs2_lock_res_free(&oi->ip_data_lockres); 940 941 ocfs2_metadata_cache_purge(inode); 942 943 mlog_bug_on_msg(oi->ip_metadata_cache.ci_num_cached, 944 "Clear inode of %llu, inode has %u cache items\n", 945 (unsigned long long)oi->ip_blkno, oi->ip_metadata_cache.ci_num_cached); 946 947 mlog_bug_on_msg(!(oi->ip_flags & OCFS2_INODE_CACHE_INLINE), 948 "Clear inode of %llu, inode has a bad flag\n", 949 (unsigned long long)oi->ip_blkno); 950 951 mlog_bug_on_msg(spin_is_locked(&oi->ip_lock), 952 "Clear inode of %llu, inode is locked\n", 953 (unsigned long long)oi->ip_blkno); 954 955 mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex), 956 "Clear inode of %llu, io_mutex is locked\n", 957 (unsigned long long)oi->ip_blkno); 958 mutex_unlock(&oi->ip_io_mutex); 959 960 /* 961 * down_trylock() returns 0, down_write_trylock() returns 1 962 * kernel 1, world 0 963 */ 964 mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem), 965 "Clear inode of %llu, alloc_sem is locked\n", 966 (unsigned long long)oi->ip_blkno); 967 up_write(&oi->ip_alloc_sem); 968 969 mlog_bug_on_msg(oi->ip_open_count, 970 "Clear inode of %llu has open count %d\n", 971 (unsigned long long)oi->ip_blkno, oi->ip_open_count); 972 mlog_bug_on_msg(!list_empty(&oi->ip_handle_list), 973 "Clear inode of %llu has non empty handle list\n", 974 (unsigned long long)oi->ip_blkno); 975 mlog_bug_on_msg(oi->ip_handle, 976 "Clear inode of %llu has non empty handle pointer\n", 977 (unsigned long long)oi->ip_blkno); 978 979 /* Clear all other flags. */ 980 oi->ip_flags = OCFS2_INODE_CACHE_INLINE; 981 oi->ip_created_trans = 0; 982 oi->ip_last_trans = 0; 983 oi->ip_dir_start_lookup = 0; 984 oi->ip_blkno = 0ULL; 985 986 bail: 987 mlog_exit_void(); 988 } 989 990 /* Called under inode_lock, with no more references on the 991 * struct inode, so it's safe here to check the flags field 992 * and to manipulate i_nlink without any other locks. */ 993 void ocfs2_drop_inode(struct inode *inode) 994 { 995 struct ocfs2_inode_info *oi = OCFS2_I(inode); 996 997 mlog_entry_void(); 998 999 mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n", 1000 (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); 1001 1002 /* Testing ip_orphaned_slot here wouldn't work because we may 1003 * not have gotten a delete_inode vote from any other nodes 1004 * yet. */ 1005 if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) { 1006 mlog(0, "Inode was orphaned on another node, clearing nlink.\n"); 1007 inode->i_nlink = 0; 1008 } 1009 1010 generic_drop_inode(inode); 1011 1012 mlog_exit_void(); 1013 } 1014 1015 /* 1016 * TODO: this should probably be merged into ocfs2_get_block 1017 * 1018 * However, you now need to pay attention to the cont_prepare_write() 1019 * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much 1020 * expects never to extend). 1021 */ 1022 struct buffer_head *ocfs2_bread(struct inode *inode, 1023 int block, int *err, int reada) 1024 { 1025 struct buffer_head *bh = NULL; 1026 int tmperr; 1027 u64 p_blkno; 1028 int readflags = OCFS2_BH_CACHED; 1029 1030 #if 0 1031 /* only turn this on if we know we can deal with read_block 1032 * returning nothing */ 1033 if (reada) 1034 readflags |= OCFS2_BH_READAHEAD; 1035 #endif 1036 1037 if (((u64)block << inode->i_sb->s_blocksize_bits) >= 1038 i_size_read(inode)) { 1039 BUG_ON(!reada); 1040 return NULL; 1041 } 1042 1043 tmperr = ocfs2_extent_map_get_blocks(inode, block, 1, 1044 &p_blkno, NULL); 1045 if (tmperr < 0) { 1046 mlog_errno(tmperr); 1047 goto fail; 1048 } 1049 1050 tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh, 1051 readflags, inode); 1052 if (tmperr < 0) 1053 goto fail; 1054 1055 tmperr = 0; 1056 1057 *err = 0; 1058 return bh; 1059 1060 fail: 1061 if (bh) { 1062 brelse(bh); 1063 bh = NULL; 1064 } 1065 *err = -EIO; 1066 return NULL; 1067 } 1068 1069 /* 1070 * This is called from our getattr. 1071 */ 1072 int ocfs2_inode_revalidate(struct dentry *dentry) 1073 { 1074 struct inode *inode = dentry->d_inode; 1075 int status = 0; 1076 1077 mlog_entry("(inode = 0x%p, ino = %llu)\n", inode, 1078 inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL); 1079 1080 if (!inode) { 1081 mlog(0, "eep, no inode!\n"); 1082 status = -ENOENT; 1083 goto bail; 1084 } 1085 1086 spin_lock(&OCFS2_I(inode)->ip_lock); 1087 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 1088 spin_unlock(&OCFS2_I(inode)->ip_lock); 1089 mlog(0, "inode deleted!\n"); 1090 status = -ENOENT; 1091 goto bail; 1092 } 1093 spin_unlock(&OCFS2_I(inode)->ip_lock); 1094 1095 /* Let ocfs2_meta_lock do the work of updating our struct 1096 * inode for us. */ 1097 status = ocfs2_meta_lock(inode, NULL, NULL, 0); 1098 if (status < 0) { 1099 if (status != -ENOENT) 1100 mlog_errno(status); 1101 goto bail; 1102 } 1103 ocfs2_meta_unlock(inode, 0); 1104 bail: 1105 mlog_exit(status); 1106 1107 return status; 1108 } 1109 1110 /* 1111 * Updates a disk inode from a 1112 * struct inode. 1113 * Only takes ip_lock. 1114 */ 1115 int ocfs2_mark_inode_dirty(struct ocfs2_journal_handle *handle, 1116 struct inode *inode, 1117 struct buffer_head *bh) 1118 { 1119 int status; 1120 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; 1121 1122 mlog_entry("(inode %llu)\n", 1123 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1124 1125 status = ocfs2_journal_access(handle, inode, bh, 1126 OCFS2_JOURNAL_ACCESS_WRITE); 1127 if (status < 0) { 1128 mlog_errno(status); 1129 goto leave; 1130 } 1131 1132 spin_lock(&OCFS2_I(inode)->ip_lock); 1133 fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); 1134 spin_unlock(&OCFS2_I(inode)->ip_lock); 1135 1136 fe->i_size = cpu_to_le64(i_size_read(inode)); 1137 fe->i_links_count = cpu_to_le16(inode->i_nlink); 1138 fe->i_uid = cpu_to_le32(inode->i_uid); 1139 fe->i_gid = cpu_to_le32(inode->i_gid); 1140 fe->i_mode = cpu_to_le16(inode->i_mode); 1141 fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec); 1142 fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); 1143 fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 1144 fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 1145 fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); 1146 fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); 1147 1148 status = ocfs2_journal_dirty(handle, bh); 1149 if (status < 0) 1150 mlog_errno(status); 1151 1152 status = 0; 1153 leave: 1154 1155 mlog_exit(status); 1156 return status; 1157 } 1158 1159 /* 1160 * 1161 * Updates a struct inode from a disk inode. 1162 * does no i/o, only takes ip_lock. 1163 */ 1164 void ocfs2_refresh_inode(struct inode *inode, 1165 struct ocfs2_dinode *fe) 1166 { 1167 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1168 1169 spin_lock(&OCFS2_I(inode)->ip_lock); 1170 1171 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 1172 i_size_write(inode, le64_to_cpu(fe->i_size)); 1173 inode->i_nlink = le16_to_cpu(fe->i_links_count); 1174 inode->i_uid = le32_to_cpu(fe->i_uid); 1175 inode->i_gid = le32_to_cpu(fe->i_gid); 1176 inode->i_mode = le16_to_cpu(fe->i_mode); 1177 inode->i_blksize = (u32) osb->s_clustersize; 1178 if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) 1179 inode->i_blocks = 0; 1180 else 1181 inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode)); 1182 inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); 1183 inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); 1184 inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); 1185 inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); 1186 inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); 1187 inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); 1188 1189 spin_unlock(&OCFS2_I(inode)->ip_lock); 1190 } 1191