1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * inode.c 4 * 5 * vfs' aops, fops, dops and iops 6 * 7 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 8 */ 9 10 #include <linux/fs.h> 11 #include <linux/types.h> 12 #include <linux/highmem.h> 13 #include <linux/pagemap.h> 14 #include <linux/quotaops.h> 15 #include <linux/iversion.h> 16 #include <linux/fs_dirent.h> 17 18 #include <asm/byteorder.h> 19 20 #include <cluster/masklog.h> 21 22 #include "ocfs2.h" 23 24 #include "alloc.h" 25 #include "dir.h" 26 #include "blockcheck.h" 27 #include "dlmglue.h" 28 #include "extent_map.h" 29 #include "file.h" 30 #include "heartbeat.h" 31 #include "inode.h" 32 #include "journal.h" 33 #include "namei.h" 34 #include "suballoc.h" 35 #include "super.h" 36 #include "symlink.h" 37 #include "sysfile.h" 38 #include "uptodate.h" 39 #include "xattr.h" 40 #include "refcounttree.h" 41 #include "ocfs2_trace.h" 42 #include "filecheck.h" 43 44 #include "buffer_head_io.h" 45 46 struct ocfs2_find_inode_args 47 { 48 u64 fi_blkno; 49 unsigned long fi_ino; 50 unsigned int fi_flags; 51 unsigned int fi_sysfile_type; 52 }; 53 54 static int ocfs2_read_locked_inode(struct inode *inode, 55 struct ocfs2_find_inode_args *args); 56 static int ocfs2_init_locked_inode(struct inode *inode, void *opaque); 57 static int ocfs2_find_actor(struct inode *inode, void *opaque); 58 static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, 59 struct inode *inode, 60 struct buffer_head *fe_bh); 61 62 static int ocfs2_filecheck_read_inode_block_full(struct inode *inode, 63 struct buffer_head **bh, 64 int flags, int type); 65 static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, 66 struct buffer_head *bh); 67 static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, 68 struct buffer_head *bh); 69 70 static bool ocfs2_valid_inode_mode(umode_t mode) 71 { 72 return fs_umode_to_ftype(mode) != FT_UNKNOWN; 73 } 74 75 static bool ocfs2_dinode_has_unexpected_rdev(struct ocfs2_dinode *di) 76 { 77 umode_t mode = le16_to_cpu(di->i_mode); 78 79 if (le32_to_cpu(di->i_flags) & OCFS2_SYSTEM_FL) 80 return false; 81 82 return !S_ISCHR(mode) && !S_ISBLK(mode) && di->id1.dev1.i_rdev != 0; 83 } 84 85 static bool ocfs2_dinode_has_size_without_clusters(struct super_block *sb, 86 struct ocfs2_dinode *di) 87 { 88 umode_t mode = le16_to_cpu(di->i_mode); 89 90 if (le32_to_cpu(di->i_flags) & OCFS2_SYSTEM_FL) 91 return false; 92 if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) 93 return false; 94 if (!le64_to_cpu(di->i_size) || le32_to_cpu(di->i_clusters)) 95 return false; 96 97 if (S_ISDIR(mode)) 98 return true; 99 100 return !ocfs2_sparse_alloc(OCFS2_SB(sb)) && S_ISREG(mode); 101 } 102 103 void ocfs2_set_inode_flags(struct inode *inode) 104 { 105 unsigned int flags = OCFS2_I(inode)->ip_attr; 106 107 inode->i_flags &= ~(S_IMMUTABLE | 108 S_SYNC | S_APPEND | S_NOATIME | S_DIRSYNC); 109 110 if (flags & OCFS2_IMMUTABLE_FL) 111 inode->i_flags |= S_IMMUTABLE; 112 113 if (flags & OCFS2_SYNC_FL) 114 inode->i_flags |= S_SYNC; 115 if (flags & OCFS2_APPEND_FL) 116 inode->i_flags |= S_APPEND; 117 if (flags & OCFS2_NOATIME_FL) 118 inode->i_flags |= S_NOATIME; 119 if (flags & OCFS2_DIRSYNC_FL) 120 inode->i_flags |= S_DIRSYNC; 121 } 122 123 /* Propagate flags from i_flags to OCFS2_I(inode)->ip_attr */ 124 void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi) 125 { 126 unsigned int flags = oi->vfs_inode.i_flags; 127 128 oi->ip_attr &= ~(OCFS2_SYNC_FL|OCFS2_APPEND_FL| 129 OCFS2_IMMUTABLE_FL|OCFS2_NOATIME_FL|OCFS2_DIRSYNC_FL); 130 if (flags & S_SYNC) 131 oi->ip_attr |= OCFS2_SYNC_FL; 132 if (flags & S_APPEND) 133 oi->ip_attr |= OCFS2_APPEND_FL; 134 if (flags & S_IMMUTABLE) 135 oi->ip_attr |= OCFS2_IMMUTABLE_FL; 136 if (flags & S_NOATIME) 137 oi->ip_attr |= OCFS2_NOATIME_FL; 138 if (flags & S_DIRSYNC) 139 oi->ip_attr |= OCFS2_DIRSYNC_FL; 140 } 141 142 struct inode *ocfs2_ilookup(struct super_block *sb, u64 blkno) 143 { 144 struct ocfs2_find_inode_args args; 145 146 args.fi_blkno = blkno; 147 args.fi_flags = 0; 148 args.fi_ino = ino_from_blkno(sb, blkno); 149 args.fi_sysfile_type = 0; 150 151 return ilookup5(sb, blkno, ocfs2_find_actor, &args); 152 } 153 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, 154 int sysfile_type) 155 { 156 int rc = -ESTALE; 157 struct inode *inode = NULL; 158 struct super_block *sb = osb->sb; 159 struct ocfs2_find_inode_args args; 160 journal_t *journal = osb->journal->j_journal; 161 162 trace_ocfs2_iget_begin((unsigned long long)blkno, flags, 163 sysfile_type); 164 165 /* Ok. By now we've either got the offsets passed to us by the 166 * caller, or we just pulled them off the bh. Lets do some 167 * sanity checks to make sure they're OK. */ 168 if (blkno == 0) { 169 inode = ERR_PTR(-EINVAL); 170 mlog_errno(PTR_ERR(inode)); 171 goto bail; 172 } 173 174 args.fi_blkno = blkno; 175 args.fi_flags = flags; 176 args.fi_ino = ino_from_blkno(sb, blkno); 177 args.fi_sysfile_type = sysfile_type; 178 179 inode = iget5_locked(sb, args.fi_ino, ocfs2_find_actor, 180 ocfs2_init_locked_inode, &args); 181 /* inode was *not* in the inode cache. 2.6.x requires 182 * us to do our own read_inode call and unlock it 183 * afterwards. */ 184 if (inode == NULL) { 185 inode = ERR_PTR(-ENOMEM); 186 mlog_errno(PTR_ERR(inode)); 187 goto bail; 188 } 189 trace_ocfs2_iget5_locked(inode_state_read_once(inode)); 190 if (inode_state_read_once(inode) & I_NEW) { 191 rc = ocfs2_read_locked_inode(inode, &args); 192 unlock_new_inode(inode); 193 } 194 if (is_bad_inode(inode)) { 195 iput(inode); 196 inode = ERR_PTR(rc); 197 goto bail; 198 } 199 200 /* 201 * Set transaction id's of transactions that have to be committed 202 * to finish f[data]sync. We set them to currently running transaction 203 * as we cannot be sure that the inode or some of its metadata isn't 204 * part of the transaction - the inode could have been reclaimed and 205 * now it is reread from disk. 206 */ 207 if (journal) { 208 transaction_t *transaction; 209 tid_t tid; 210 struct ocfs2_inode_info *oi = OCFS2_I(inode); 211 212 read_lock(&journal->j_state_lock); 213 if (journal->j_running_transaction) 214 transaction = journal->j_running_transaction; 215 else 216 transaction = journal->j_committing_transaction; 217 if (transaction) 218 tid = transaction->t_tid; 219 else 220 tid = journal->j_commit_sequence; 221 read_unlock(&journal->j_state_lock); 222 oi->i_sync_tid = tid; 223 oi->i_datasync_tid = tid; 224 } 225 226 bail: 227 if (!IS_ERR(inode)) { 228 trace_ocfs2_iget_end(inode, 229 (unsigned long long)OCFS2_I(inode)->ip_blkno); 230 } 231 232 return inode; 233 } 234 235 static int ocfs2_dinode_has_extents(struct ocfs2_dinode *di) 236 { 237 /* inodes flagged with other stuff in id2 */ 238 if (le32_to_cpu(di->i_flags) & 239 (OCFS2_SUPER_BLOCK_FL | OCFS2_LOCAL_ALLOC_FL | OCFS2_CHAIN_FL | 240 OCFS2_DEALLOC_FL)) 241 return 0; 242 /* i_flags doesn't indicate when id2 is a fast symlink */ 243 if (S_ISLNK(le16_to_cpu(di->i_mode)) && le64_to_cpu(di->i_size) && 244 !le32_to_cpu(di->i_clusters)) 245 return 0; 246 if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) 247 return 0; 248 249 return 1; 250 } 251 252 /* 253 * here's how inodes get read from disk: 254 * iget5_locked -> find_actor -> OCFS2_FIND_ACTOR 255 * found? : return the in-memory inode 256 * not found? : get_new_inode -> OCFS2_INIT_LOCKED_INODE 257 */ 258 259 static int ocfs2_find_actor(struct inode *inode, void *opaque) 260 { 261 struct ocfs2_find_inode_args *args = NULL; 262 struct ocfs2_inode_info *oi = OCFS2_I(inode); 263 int ret = 0; 264 265 args = opaque; 266 267 mlog_bug_on_msg(!inode, "No inode in find actor!\n"); 268 269 trace_ocfs2_find_actor(inode, inode->i_ino, opaque, args->fi_blkno); 270 271 if (oi->ip_blkno != args->fi_blkno) 272 goto bail; 273 274 ret = 1; 275 bail: 276 return ret; 277 } 278 279 /* 280 * initialize the new inode, but don't do anything that would cause 281 * us to sleep. 282 * return 0 on success, 1 on failure 283 */ 284 static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) 285 { 286 struct ocfs2_find_inode_args *args = opaque; 287 #ifdef CONFIG_LOCKDEP 288 static struct lock_class_key ocfs2_sysfile_lock_key[NUM_SYSTEM_INODES]; 289 static struct lock_class_key ocfs2_quota_ip_alloc_sem_key, 290 ocfs2_file_ip_alloc_sem_key; 291 #endif 292 293 inode->i_ino = args->fi_ino; 294 OCFS2_I(inode)->ip_blkno = args->fi_blkno; 295 #ifdef CONFIG_LOCKDEP 296 switch (args->fi_sysfile_type) { 297 case BAD_BLOCK_SYSTEM_INODE: 298 break; 299 case GLOBAL_INODE_ALLOC_SYSTEM_INODE: 300 lockdep_set_class(&inode->i_rwsem, 301 &ocfs2_sysfile_lock_key[GLOBAL_INODE_ALLOC_SYSTEM_INODE]); 302 break; 303 case SLOT_MAP_SYSTEM_INODE: 304 lockdep_set_class(&inode->i_rwsem, 305 &ocfs2_sysfile_lock_key[SLOT_MAP_SYSTEM_INODE]); 306 break; 307 case HEARTBEAT_SYSTEM_INODE: 308 lockdep_set_class(&inode->i_rwsem, 309 &ocfs2_sysfile_lock_key[HEARTBEAT_SYSTEM_INODE]); 310 break; 311 case GLOBAL_BITMAP_SYSTEM_INODE: 312 lockdep_set_class(&inode->i_rwsem, 313 &ocfs2_sysfile_lock_key[GLOBAL_BITMAP_SYSTEM_INODE]); 314 break; 315 case USER_QUOTA_SYSTEM_INODE: 316 lockdep_set_class(&inode->i_rwsem, 317 &ocfs2_sysfile_lock_key[USER_QUOTA_SYSTEM_INODE]); 318 break; 319 case GROUP_QUOTA_SYSTEM_INODE: 320 lockdep_set_class(&inode->i_rwsem, 321 &ocfs2_sysfile_lock_key[GROUP_QUOTA_SYSTEM_INODE]); 322 break; 323 case ORPHAN_DIR_SYSTEM_INODE: 324 lockdep_set_class(&inode->i_rwsem, 325 &ocfs2_sysfile_lock_key[ORPHAN_DIR_SYSTEM_INODE]); 326 break; 327 case EXTENT_ALLOC_SYSTEM_INODE: 328 lockdep_set_class(&inode->i_rwsem, 329 &ocfs2_sysfile_lock_key[EXTENT_ALLOC_SYSTEM_INODE]); 330 break; 331 case INODE_ALLOC_SYSTEM_INODE: 332 lockdep_set_class(&inode->i_rwsem, 333 &ocfs2_sysfile_lock_key[INODE_ALLOC_SYSTEM_INODE]); 334 break; 335 case JOURNAL_SYSTEM_INODE: 336 lockdep_set_class(&inode->i_rwsem, 337 &ocfs2_sysfile_lock_key[JOURNAL_SYSTEM_INODE]); 338 break; 339 case LOCAL_ALLOC_SYSTEM_INODE: 340 lockdep_set_class(&inode->i_rwsem, 341 &ocfs2_sysfile_lock_key[LOCAL_ALLOC_SYSTEM_INODE]); 342 break; 343 case TRUNCATE_LOG_SYSTEM_INODE: 344 lockdep_set_class(&inode->i_rwsem, 345 &ocfs2_sysfile_lock_key[TRUNCATE_LOG_SYSTEM_INODE]); 346 break; 347 case LOCAL_USER_QUOTA_SYSTEM_INODE: 348 lockdep_set_class(&inode->i_rwsem, 349 &ocfs2_sysfile_lock_key[LOCAL_USER_QUOTA_SYSTEM_INODE]); 350 break; 351 case LOCAL_GROUP_QUOTA_SYSTEM_INODE: 352 lockdep_set_class(&inode->i_rwsem, 353 &ocfs2_sysfile_lock_key[LOCAL_GROUP_QUOTA_SYSTEM_INODE]); 354 break; 355 default: 356 WARN_ONCE(1, "Unknown sysfile type %d\n", args->fi_sysfile_type); 357 } 358 if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE || 359 args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE || 360 args->fi_sysfile_type == LOCAL_USER_QUOTA_SYSTEM_INODE || 361 args->fi_sysfile_type == LOCAL_GROUP_QUOTA_SYSTEM_INODE) 362 lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, 363 &ocfs2_quota_ip_alloc_sem_key); 364 else 365 lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem, 366 &ocfs2_file_ip_alloc_sem_key); 367 #endif 368 369 return 0; 370 } 371 372 void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, 373 int create_ino) 374 { 375 struct super_block *sb; 376 struct ocfs2_super *osb; 377 int use_plocks = 1; 378 379 sb = inode->i_sb; 380 osb = OCFS2_SB(sb); 381 382 if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || 383 ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) 384 use_plocks = 0; 385 386 /* 387 * These have all been checked by ocfs2_read_inode_block() or set 388 * by ocfs2_mknod_locked(), so a failure is a code bug. 389 */ 390 BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); /* This means that read_inode 391 cannot create a superblock 392 inode today. change if 393 that is needed. */ 394 BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))); 395 BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation); 396 397 398 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 399 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); 400 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); 401 402 inode_set_iversion(inode, 1); 403 inode->i_generation = le32_to_cpu(fe->i_generation); 404 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); 405 inode->i_mode = le16_to_cpu(fe->i_mode); 406 i_uid_write(inode, le32_to_cpu(fe->i_uid)); 407 i_gid_write(inode, le32_to_cpu(fe->i_gid)); 408 409 /* Fast symlinks will have i_size but no allocated clusters. */ 410 if (S_ISLNK(inode->i_mode) && !fe->i_clusters) { 411 inode->i_blocks = 0; 412 inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops; 413 } else { 414 inode->i_blocks = ocfs2_inode_sector_count(inode); 415 inode->i_mapping->a_ops = &ocfs2_aops; 416 } 417 inode_set_atime(inode, le64_to_cpu(fe->i_atime), 418 le32_to_cpu(fe->i_atime_nsec)); 419 inode_set_mtime(inode, le64_to_cpu(fe->i_mtime), 420 le32_to_cpu(fe->i_mtime_nsec)); 421 inode_set_ctime(inode, le64_to_cpu(fe->i_ctime), 422 le32_to_cpu(fe->i_ctime_nsec)); 423 424 if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno)) 425 mlog(ML_ERROR, 426 "ip_blkno %llu != i_blkno %llu!\n", 427 (unsigned long long)OCFS2_I(inode)->ip_blkno, 428 (unsigned long long)le64_to_cpu(fe->i_blkno)); 429 430 set_nlink(inode, ocfs2_read_links_count(fe)); 431 432 trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno, 433 le32_to_cpu(fe->i_flags)); 434 if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { 435 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; 436 inode->i_flags |= S_NOQUOTA; 437 } 438 439 if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { 440 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 441 } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { 442 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; 443 } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) { 444 inode->i_flags |= S_NOQUOTA; 445 } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { 446 /* we can't actually hit this as read_inode can't 447 * handle superblocks today ;-) */ 448 BUG(); 449 } 450 451 switch (inode->i_mode & S_IFMT) { 452 case S_IFREG: 453 if (use_plocks) 454 inode->i_fop = &ocfs2_fops; 455 else 456 inode->i_fop = &ocfs2_fops_no_plocks; 457 inode->i_op = &ocfs2_file_iops; 458 i_size_write(inode, le64_to_cpu(fe->i_size)); 459 break; 460 case S_IFDIR: 461 inode->i_op = &ocfs2_dir_iops; 462 if (use_plocks) 463 inode->i_fop = &ocfs2_dops; 464 else 465 inode->i_fop = &ocfs2_dops_no_plocks; 466 i_size_write(inode, le64_to_cpu(fe->i_size)); 467 OCFS2_I(inode)->ip_dir_lock_gen = 1; 468 break; 469 case S_IFLNK: 470 inode->i_op = &ocfs2_symlink_inode_operations; 471 inode_nohighmem(inode); 472 i_size_write(inode, le64_to_cpu(fe->i_size)); 473 break; 474 default: 475 inode->i_op = &ocfs2_special_file_iops; 476 init_special_inode(inode, inode->i_mode, 477 inode->i_rdev); 478 break; 479 } 480 481 if (create_ino) { 482 inode->i_ino = ino_from_blkno(inode->i_sb, 483 le64_to_cpu(fe->i_blkno)); 484 485 /* 486 * If we ever want to create system files from kernel, 487 * the generation argument to 488 * ocfs2_inode_lock_res_init() will have to change. 489 */ 490 BUG_ON(le32_to_cpu(fe->i_flags) & OCFS2_SYSTEM_FL); 491 492 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres, 493 OCFS2_LOCK_TYPE_META, 0, inode); 494 495 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, 496 OCFS2_LOCK_TYPE_OPEN, 0, inode); 497 } 498 499 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres, 500 OCFS2_LOCK_TYPE_RW, inode->i_generation, 501 inode); 502 503 ocfs2_set_inode_flags(inode); 504 505 OCFS2_I(inode)->ip_last_used_slot = 0; 506 OCFS2_I(inode)->ip_last_used_group = 0; 507 508 if (S_ISDIR(inode->i_mode)) 509 ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv, 510 OCFS2_RESV_FLAG_DIR); 511 } 512 513 static int ocfs2_read_locked_inode(struct inode *inode, 514 struct ocfs2_find_inode_args *args) 515 { 516 struct super_block *sb; 517 struct ocfs2_super *osb; 518 struct ocfs2_dinode *fe; 519 struct buffer_head *bh = NULL; 520 int status, can_lock, lock_level = 0; 521 u32 generation = 0; 522 523 status = -EINVAL; 524 sb = inode->i_sb; 525 osb = OCFS2_SB(sb); 526 527 /* 528 * To improve performance of cold-cache inode stats, we take 529 * the cluster lock here if possible. 530 * 531 * Generally, OCFS2 never trusts the contents of an inode 532 * unless it's holding a cluster lock, so taking it here isn't 533 * a correctness issue as much as it is a performance 534 * improvement. 535 * 536 * There are three times when taking the lock is not a good idea: 537 * 538 * 1) During startup, before we have initialized the DLM. 539 * 540 * 2) If we are reading certain system files which never get 541 * cluster locks (local alloc, truncate log). 542 * 543 * 3) If the process doing the iget() is responsible for 544 * orphan dir recovery. We're holding the orphan dir lock and 545 * can get into a deadlock with another process on another 546 * node in ->delete_inode(). 547 * 548 * #1 and #2 can be simply solved by never taking the lock 549 * here for system files (which are the only type we read 550 * during mount). It's a heavier approach, but our main 551 * concern is user-accessible files anyway. 552 * 553 * #3 works itself out because we'll eventually take the 554 * cluster lock before trusting anything anyway. 555 */ 556 can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE) 557 && !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) 558 && !ocfs2_mount_local(osb); 559 560 trace_ocfs2_read_locked_inode( 561 (unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock); 562 563 /* 564 * To maintain backwards compatibility with older versions of 565 * ocfs2-tools, we still store the generation value for system 566 * files. The only ones that actually matter to userspace are 567 * the journals, but it's easier and inexpensive to just flag 568 * all system files similarly. 569 */ 570 if (args->fi_flags & OCFS2_FI_FLAG_SYSFILE) 571 generation = osb->fs_generation; 572 573 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_inode_lockres, 574 OCFS2_LOCK_TYPE_META, 575 generation, inode); 576 577 ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres, 578 OCFS2_LOCK_TYPE_OPEN, 579 0, inode); 580 581 if (can_lock) { 582 status = ocfs2_open_lock(inode); 583 if (status) { 584 make_bad_inode(inode); 585 mlog_errno(status); 586 return status; 587 } 588 status = ocfs2_inode_lock(inode, NULL, lock_level); 589 if (status) { 590 make_bad_inode(inode); 591 mlog_errno(status); 592 return status; 593 } 594 } 595 596 if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) { 597 status = ocfs2_try_open_lock(inode, 0); 598 if (status) { 599 make_bad_inode(inode); 600 return status; 601 } 602 } 603 604 if (can_lock) { 605 if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) 606 status = ocfs2_filecheck_read_inode_block_full(inode, 607 &bh, OCFS2_BH_IGNORE_CACHE, 0); 608 else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) 609 status = ocfs2_filecheck_read_inode_block_full(inode, 610 &bh, OCFS2_BH_IGNORE_CACHE, 1); 611 else 612 status = ocfs2_read_inode_block_full(inode, 613 &bh, OCFS2_BH_IGNORE_CACHE); 614 } else { 615 status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); 616 /* 617 * If buffer is in jbd, then its checksum may not have been 618 * computed as yet. 619 */ 620 if (!status && !buffer_jbd(bh)) { 621 if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_CHK) 622 status = ocfs2_filecheck_validate_inode_block( 623 osb->sb, bh); 624 else if (args->fi_flags & OCFS2_FI_FLAG_FILECHECK_FIX) 625 status = ocfs2_filecheck_repair_inode_block( 626 osb->sb, bh); 627 else 628 status = ocfs2_validate_inode_block( 629 osb->sb, bh); 630 } 631 } 632 if (status < 0) { 633 mlog_errno(status); 634 goto bail; 635 } 636 637 status = -EINVAL; 638 fe = (struct ocfs2_dinode *) bh->b_data; 639 640 /* 641 * This is a code bug. Right now the caller needs to 642 * understand whether it is asking for a system file inode or 643 * not so the proper lock names can be built. 644 */ 645 mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != 646 !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), 647 "Inode %llu: system file state is ambiguous\n", 648 (unsigned long long)args->fi_blkno); 649 650 if (S_ISCHR(le16_to_cpu(fe->i_mode)) || 651 S_ISBLK(le16_to_cpu(fe->i_mode))) 652 inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); 653 654 ocfs2_populate_inode(inode, fe, 0); 655 656 BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); 657 658 if (buffer_dirty(bh) && !buffer_jbd(bh)) { 659 if (can_lock) { 660 ocfs2_inode_unlock(inode, lock_level); 661 lock_level = 1; 662 ocfs2_inode_lock(inode, NULL, lock_level); 663 } 664 status = ocfs2_write_block(osb, bh, INODE_CACHE(inode)); 665 if (status < 0) { 666 mlog_errno(status); 667 goto bail; 668 } 669 } 670 671 status = 0; 672 673 bail: 674 if (can_lock) 675 ocfs2_inode_unlock(inode, lock_level); 676 677 if (status < 0) 678 make_bad_inode(inode); 679 680 brelse(bh); 681 682 return status; 683 } 684 685 void ocfs2_sync_blockdev(struct super_block *sb) 686 { 687 sync_blockdev(sb->s_bdev); 688 } 689 690 static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, 691 struct inode *inode, 692 struct buffer_head *fe_bh) 693 { 694 int status = 0; 695 struct ocfs2_dinode *fe; 696 handle_t *handle = NULL; 697 698 fe = (struct ocfs2_dinode *) fe_bh->b_data; 699 700 /* 701 * This check will also skip truncate of inodes with inline 702 * data and fast symlinks. 703 */ 704 if (fe->i_clusters) { 705 if (ocfs2_should_order_data(inode)) 706 ocfs2_begin_ordered_truncate(inode, 0); 707 708 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 709 if (IS_ERR(handle)) { 710 status = PTR_ERR(handle); 711 handle = NULL; 712 mlog_errno(status); 713 goto out; 714 } 715 716 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), 717 fe_bh, 718 OCFS2_JOURNAL_ACCESS_WRITE); 719 if (status < 0) { 720 mlog_errno(status); 721 goto out; 722 } 723 724 i_size_write(inode, 0); 725 726 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 727 if (status < 0) { 728 mlog_errno(status); 729 goto out; 730 } 731 732 ocfs2_commit_trans(osb, handle); 733 handle = NULL; 734 735 status = ocfs2_commit_truncate(osb, inode, fe_bh); 736 if (status < 0) 737 mlog_errno(status); 738 } 739 740 out: 741 if (handle) 742 ocfs2_commit_trans(osb, handle); 743 return status; 744 } 745 746 static int ocfs2_remove_inode(struct inode *inode, 747 struct buffer_head *di_bh, 748 struct inode *orphan_dir_inode, 749 struct buffer_head *orphan_dir_bh) 750 { 751 int status; 752 struct inode *inode_alloc_inode = NULL; 753 struct buffer_head *inode_alloc_bh = NULL; 754 handle_t *handle; 755 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 756 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 757 758 inode_alloc_inode = 759 ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, 760 le16_to_cpu(di->i_suballoc_slot)); 761 if (!inode_alloc_inode) { 762 status = -ENOENT; 763 mlog_errno(status); 764 goto bail; 765 } 766 767 inode_lock(inode_alloc_inode); 768 status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1); 769 if (status < 0) { 770 inode_unlock(inode_alloc_inode); 771 772 mlog_errno(status); 773 goto bail; 774 } 775 776 handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS + 777 ocfs2_quota_trans_credits(inode->i_sb)); 778 if (IS_ERR(handle)) { 779 status = PTR_ERR(handle); 780 mlog_errno(status); 781 goto bail_unlock; 782 } 783 784 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { 785 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, 786 orphan_dir_bh, false); 787 if (status < 0) { 788 mlog_errno(status); 789 goto bail_commit; 790 } 791 } 792 793 /* set the inodes dtime */ 794 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 795 OCFS2_JOURNAL_ACCESS_WRITE); 796 if (status < 0) { 797 mlog_errno(status); 798 goto bail_commit; 799 } 800 801 di->i_dtime = cpu_to_le64(ktime_get_real_seconds()); 802 di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); 803 ocfs2_journal_dirty(handle, di_bh); 804 805 ocfs2_remove_from_cache(INODE_CACHE(inode), di_bh); 806 dquot_free_inode(inode); 807 808 status = ocfs2_free_dinode(handle, inode_alloc_inode, 809 inode_alloc_bh, di); 810 if (status < 0) 811 mlog_errno(status); 812 813 bail_commit: 814 ocfs2_commit_trans(osb, handle); 815 bail_unlock: 816 ocfs2_inode_unlock(inode_alloc_inode, 1); 817 inode_unlock(inode_alloc_inode); 818 brelse(inode_alloc_bh); 819 bail: 820 iput(inode_alloc_inode); 821 822 return status; 823 } 824 825 /* 826 * Serialize with orphan dir recovery. If the process doing 827 * recovery on this orphan dir does an iget() with the dir 828 * i_rwsem held, we'll deadlock here. Instead we detect this 829 * and exit early - recovery will wipe this inode for us. 830 */ 831 static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb, 832 int slot) 833 { 834 int ret = 0; 835 836 spin_lock(&osb->osb_lock); 837 if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) { 838 ret = -EDEADLK; 839 goto out; 840 } 841 /* This signals to the orphan recovery process that it should 842 * wait for us to handle the wipe. */ 843 osb->osb_orphan_wipes[slot]++; 844 out: 845 spin_unlock(&osb->osb_lock); 846 trace_ocfs2_check_orphan_recovery_state(slot, ret); 847 return ret; 848 } 849 850 static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb, 851 int slot) 852 { 853 spin_lock(&osb->osb_lock); 854 osb->osb_orphan_wipes[slot]--; 855 spin_unlock(&osb->osb_lock); 856 857 wake_up(&osb->osb_wipe_event); 858 } 859 860 static int ocfs2_wipe_inode(struct inode *inode, 861 struct buffer_head *di_bh) 862 { 863 int status, orphaned_slot = -1; 864 struct inode *orphan_dir_inode = NULL; 865 struct buffer_head *orphan_dir_bh = NULL; 866 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 867 struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 868 869 if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { 870 orphaned_slot = le16_to_cpu(di->i_orphaned_slot); 871 872 status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); 873 if (status) 874 return status; 875 876 orphan_dir_inode = ocfs2_get_system_file_inode(osb, 877 ORPHAN_DIR_SYSTEM_INODE, 878 orphaned_slot); 879 if (!orphan_dir_inode) { 880 status = -ENOENT; 881 mlog_errno(status); 882 goto bail; 883 } 884 885 /* Lock the orphan dir. The lock will be held for the entire 886 * delete_inode operation. We do this now to avoid races with 887 * recovery completion on other nodes. */ 888 inode_lock(orphan_dir_inode); 889 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); 890 if (status < 0) { 891 inode_unlock(orphan_dir_inode); 892 893 mlog_errno(status); 894 goto bail; 895 } 896 } 897 898 /* we do this while holding the orphan dir lock because we 899 * don't want recovery being run from another node to try an 900 * inode delete underneath us -- this will result in two nodes 901 * truncating the same file! */ 902 status = ocfs2_truncate_for_delete(osb, inode, di_bh); 903 if (status < 0) { 904 mlog_errno(status); 905 goto bail_unlock_dir; 906 } 907 908 /* Remove any dir index tree */ 909 if (S_ISDIR(inode->i_mode)) { 910 status = ocfs2_dx_dir_truncate(inode, di_bh); 911 if (status) { 912 mlog_errno(status); 913 goto bail_unlock_dir; 914 } 915 } 916 917 /*Free extended attribute resources associated with this inode.*/ 918 status = ocfs2_xattr_remove(inode, di_bh); 919 if (status < 0) { 920 mlog_errno(status); 921 goto bail_unlock_dir; 922 } 923 924 status = ocfs2_remove_refcount_tree(inode, di_bh); 925 if (status < 0) { 926 mlog_errno(status); 927 goto bail_unlock_dir; 928 } 929 930 status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, 931 orphan_dir_bh); 932 if (status < 0) 933 mlog_errno(status); 934 935 bail_unlock_dir: 936 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR) 937 return status; 938 939 ocfs2_inode_unlock(orphan_dir_inode, 1); 940 inode_unlock(orphan_dir_inode); 941 brelse(orphan_dir_bh); 942 bail: 943 iput(orphan_dir_inode); 944 ocfs2_signal_wipe_completion(osb, orphaned_slot); 945 946 return status; 947 } 948 949 /* There is a series of simple checks that should be done before a 950 * trylock is even considered. Encapsulate those in this function. */ 951 static int ocfs2_inode_is_valid_to_delete(struct inode *inode) 952 { 953 int ret = 0; 954 struct ocfs2_inode_info *oi = OCFS2_I(inode); 955 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 956 957 trace_ocfs2_inode_is_valid_to_delete(current, osb->dc_task, 958 (unsigned long long)oi->ip_blkno, 959 oi->ip_flags); 960 961 /* We shouldn't be getting here for the root directory 962 * inode.. */ 963 if (inode == osb->root_inode) { 964 mlog(ML_ERROR, "Skipping delete of root inode.\n"); 965 goto bail; 966 } 967 968 /* 969 * If we're coming from downconvert_thread we can't go into our own 970 * voting [hello, deadlock city!] so we cannot delete the inode. But 971 * since we dropped last inode ref when downconverting dentry lock, 972 * we cannot have the file open and thus the node doing unlink will 973 * take care of deleting the inode. 974 */ 975 if (current == osb->dc_task) 976 goto bail; 977 978 spin_lock(&oi->ip_lock); 979 /* OCFS2 *never* deletes system files. This should technically 980 * never get here as system file inodes should always have a 981 * positive link count. */ 982 if (oi->ip_flags & OCFS2_INODE_SYSTEM_FILE) { 983 mlog(ML_ERROR, "Skipping delete of system file %llu\n", 984 (unsigned long long)oi->ip_blkno); 985 goto bail_unlock; 986 } 987 988 ret = 1; 989 bail_unlock: 990 spin_unlock(&oi->ip_lock); 991 bail: 992 return ret; 993 } 994 995 /* Query the cluster to determine whether we should wipe an inode from 996 * disk or not. 997 * 998 * Requires the inode to have the cluster lock. */ 999 static int ocfs2_query_inode_wipe(struct inode *inode, 1000 struct buffer_head *di_bh, 1001 int *wipe) 1002 { 1003 int status = 0, reason = 0; 1004 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1005 struct ocfs2_dinode *di; 1006 1007 *wipe = 0; 1008 1009 trace_ocfs2_query_inode_wipe_begin((unsigned long long)oi->ip_blkno, 1010 inode->i_nlink); 1011 1012 /* While we were waiting for the cluster lock in 1013 * ocfs2_delete_inode, another node might have asked to delete 1014 * the inode. Recheck our flags to catch this. */ 1015 if (!ocfs2_inode_is_valid_to_delete(inode)) { 1016 reason = 1; 1017 goto bail; 1018 } 1019 1020 /* Now that we have an up to date inode, we can double check 1021 * the link count. */ 1022 if (inode->i_nlink) 1023 goto bail; 1024 1025 /* Do some basic inode verification... */ 1026 di = (struct ocfs2_dinode *) di_bh->b_data; 1027 if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) && 1028 !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { 1029 /* 1030 * Inodes in the orphan dir must have ORPHANED_FL. The only 1031 * inodes that come back out of the orphan dir are reflink 1032 * targets. A reflink target may be moved out of the orphan 1033 * dir between the time we scan the directory and the time we 1034 * process it. This would lead to HAS_REFCOUNT_FL being set but 1035 * ORPHANED_FL not. 1036 */ 1037 if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) { 1038 reason = 2; 1039 goto bail; 1040 } 1041 1042 /* for lack of a better error? */ 1043 status = -EEXIST; 1044 mlog(ML_ERROR, 1045 "Inode %llu (on-disk %llu) not orphaned! " 1046 "Disk flags 0x%x, inode flags 0x%x\n", 1047 (unsigned long long)oi->ip_blkno, 1048 (unsigned long long)le64_to_cpu(di->i_blkno), 1049 le32_to_cpu(di->i_flags), oi->ip_flags); 1050 goto bail; 1051 } 1052 1053 /* has someone already deleted us?! baaad... */ 1054 if (di->i_dtime) { 1055 status = -EEXIST; 1056 mlog_errno(status); 1057 goto bail; 1058 } 1059 1060 /* 1061 * This is how ocfs2 determines whether an inode is still live 1062 * within the cluster. Every node takes a shared read lock on 1063 * the inode open lock in ocfs2_read_locked_inode(). When we 1064 * get to ->delete_inode(), each node tries to convert it's 1065 * lock to an exclusive. Trylocks are serialized by the inode 1066 * meta data lock. If the upconvert succeeds, we know the inode 1067 * is no longer live and can be deleted. 1068 * 1069 * Though we call this with the meta data lock held, the 1070 * trylock keeps us from ABBA deadlock. 1071 */ 1072 status = ocfs2_try_open_lock(inode, 1); 1073 if (status == -EAGAIN) { 1074 status = 0; 1075 reason = 3; 1076 goto bail; 1077 } 1078 if (status < 0) { 1079 mlog_errno(status); 1080 goto bail; 1081 } 1082 1083 *wipe = 1; 1084 trace_ocfs2_query_inode_wipe_succ(le16_to_cpu(di->i_orphaned_slot)); 1085 1086 bail: 1087 trace_ocfs2_query_inode_wipe_end(status, reason); 1088 return status; 1089 } 1090 1091 /* Support function for ocfs2_delete_inode. Will help us keep the 1092 * inode data in a consistent state for clear_inode. Always truncates 1093 * pages, optionally sync's them first. */ 1094 static void ocfs2_cleanup_delete_inode(struct inode *inode, 1095 int sync_data) 1096 { 1097 trace_ocfs2_cleanup_delete_inode( 1098 (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data); 1099 if (sync_data) 1100 filemap_write_and_wait(inode->i_mapping); 1101 truncate_inode_pages_final(&inode->i_data); 1102 } 1103 1104 static void ocfs2_delete_inode(struct inode *inode) 1105 { 1106 int wipe, status; 1107 sigset_t oldset; 1108 struct buffer_head *di_bh = NULL; 1109 struct ocfs2_dinode *di = NULL; 1110 1111 trace_ocfs2_delete_inode(inode->i_ino, 1112 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1113 is_bad_inode(inode)); 1114 1115 /* When we fail in read_inode() we mark inode as bad. The second test 1116 * catches the case when inode allocation fails before allocating 1117 * a block for inode. */ 1118 if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) 1119 goto bail; 1120 1121 if (!ocfs2_inode_is_valid_to_delete(inode)) { 1122 /* It's probably not necessary to truncate_inode_pages 1123 * here but we do it for safety anyway (it will most 1124 * likely be a no-op anyway) */ 1125 ocfs2_cleanup_delete_inode(inode, 0); 1126 goto bail; 1127 } 1128 1129 dquot_initialize(inode); 1130 1131 /* We want to block signals in delete_inode as the lock and 1132 * messaging paths may return us -ERESTARTSYS. Which would 1133 * cause us to exit early, resulting in inodes being orphaned 1134 * forever. */ 1135 ocfs2_block_signals(&oldset); 1136 1137 /* 1138 * Synchronize us against ocfs2_get_dentry. We take this in 1139 * shared mode so that all nodes can still concurrently 1140 * process deletes. 1141 */ 1142 status = ocfs2_nfs_sync_lock(OCFS2_SB(inode->i_sb), 0); 1143 if (status < 0) { 1144 mlog(ML_ERROR, "getting nfs sync lock(PR) failed %d\n", status); 1145 ocfs2_cleanup_delete_inode(inode, 0); 1146 goto bail_unblock; 1147 } 1148 /* Lock down the inode. This gives us an up to date view of 1149 * it's metadata (for verification), and allows us to 1150 * serialize delete_inode on multiple nodes. 1151 * 1152 * Even though we might be doing a truncate, we don't take the 1153 * allocation lock here as it won't be needed - nobody will 1154 * have the file open. 1155 */ 1156 status = ocfs2_inode_lock(inode, &di_bh, 1); 1157 if (status < 0) { 1158 if (status != -ENOENT) 1159 mlog_errno(status); 1160 ocfs2_cleanup_delete_inode(inode, 0); 1161 goto bail_unlock_nfs_sync; 1162 } 1163 1164 di = (struct ocfs2_dinode *)di_bh->b_data; 1165 /* Skip inode deletion and wait for dio orphan entry recovered 1166 * first */ 1167 if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { 1168 ocfs2_cleanup_delete_inode(inode, 0); 1169 goto bail_unlock_inode; 1170 } 1171 1172 /* Query the cluster. This will be the final decision made 1173 * before we go ahead and wipe the inode. */ 1174 status = ocfs2_query_inode_wipe(inode, di_bh, &wipe); 1175 if (!wipe || status < 0) { 1176 /* Error and remote inode busy both mean we won't be 1177 * removing the inode, so they take almost the same 1178 * path. */ 1179 if (status < 0) 1180 mlog_errno(status); 1181 1182 /* Someone in the cluster has disallowed a wipe of 1183 * this inode, or it was never completely 1184 * orphaned. Write out the pages and exit now. */ 1185 ocfs2_cleanup_delete_inode(inode, 1); 1186 goto bail_unlock_inode; 1187 } 1188 1189 ocfs2_cleanup_delete_inode(inode, 0); 1190 1191 status = ocfs2_wipe_inode(inode, di_bh); 1192 if (status < 0) { 1193 if (status != -EDEADLK) 1194 mlog_errno(status); 1195 goto bail_unlock_inode; 1196 } 1197 1198 /* 1199 * Mark the inode as successfully deleted. 1200 * 1201 * This is important for ocfs2_clear_inode() as it will check 1202 * this flag and skip any checkpointing work 1203 * 1204 * ocfs2_stuff_meta_lvb() also uses this flag to invalidate 1205 * the LVB for other nodes. 1206 */ 1207 OCFS2_I(inode)->ip_flags |= OCFS2_INODE_DELETED; 1208 1209 bail_unlock_inode: 1210 ocfs2_inode_unlock(inode, 1); 1211 brelse(di_bh); 1212 1213 bail_unlock_nfs_sync: 1214 ocfs2_nfs_sync_unlock(OCFS2_SB(inode->i_sb), 0); 1215 1216 bail_unblock: 1217 ocfs2_unblock_signals(&oldset); 1218 bail: 1219 return; 1220 } 1221 1222 static void ocfs2_clear_inode(struct inode *inode) 1223 { 1224 int status; 1225 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1226 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1227 1228 clear_inode(inode); 1229 trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno, 1230 inode->i_nlink); 1231 1232 mlog_bug_on_msg(osb == NULL, 1233 "Inode=%llu\n", inode->i_ino); 1234 1235 dquot_drop(inode); 1236 1237 /* To prevent remote deletes we hold open lock before, now it 1238 * is time to unlock PR and EX open locks. */ 1239 ocfs2_open_unlock(inode); 1240 1241 /* Do these before all the other work so that we don't bounce 1242 * the downconvert thread while waiting to destroy the locks. */ 1243 ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres); 1244 ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres); 1245 ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres); 1246 1247 ocfs2_resv_discard(&osb->osb_la_resmap, 1248 &oi->ip_la_data_resv); 1249 ocfs2_resv_init_once(&oi->ip_la_data_resv); 1250 1251 /* We very well may get a clear_inode before all an inodes 1252 * metadata has hit disk. Of course, we can't drop any cluster 1253 * locks until the journal has finished with it. The only 1254 * exception here are successfully wiped inodes - their 1255 * metadata can now be considered to be part of the system 1256 * inodes from which it came. */ 1257 if (!(oi->ip_flags & OCFS2_INODE_DELETED)) 1258 ocfs2_checkpoint_inode(inode); 1259 1260 mlog_bug_on_msg(!list_empty(&oi->ip_io_markers), 1261 "Clear inode of %llu, inode has io markers\n", 1262 (unsigned long long)oi->ip_blkno); 1263 mlog_bug_on_msg(!list_empty(&oi->ip_unwritten_list), 1264 "Clear inode of %llu, inode has unwritten extents\n", 1265 (unsigned long long)oi->ip_blkno); 1266 1267 ocfs2_extent_map_trunc(inode, 0); 1268 1269 status = ocfs2_drop_inode_locks(inode); 1270 if (status < 0) 1271 mlog_errno(status); 1272 1273 ocfs2_lock_res_free(&oi->ip_rw_lockres); 1274 ocfs2_lock_res_free(&oi->ip_inode_lockres); 1275 ocfs2_lock_res_free(&oi->ip_open_lockres); 1276 1277 ocfs2_metadata_cache_exit(INODE_CACHE(inode)); 1278 1279 mlog_bug_on_msg(INODE_CACHE(inode)->ci_num_cached, 1280 "Clear inode of %llu, inode has %u cache items\n", 1281 (unsigned long long)oi->ip_blkno, 1282 INODE_CACHE(inode)->ci_num_cached); 1283 1284 mlog_bug_on_msg(!(INODE_CACHE(inode)->ci_flags & OCFS2_CACHE_FL_INLINE), 1285 "Clear inode of %llu, inode has a bad flag\n", 1286 (unsigned long long)oi->ip_blkno); 1287 1288 mlog_bug_on_msg(spin_is_locked(&oi->ip_lock), 1289 "Clear inode of %llu, inode is locked\n", 1290 (unsigned long long)oi->ip_blkno); 1291 1292 mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex), 1293 "Clear inode of %llu, io_mutex is locked\n", 1294 (unsigned long long)oi->ip_blkno); 1295 mutex_unlock(&oi->ip_io_mutex); 1296 1297 /* 1298 * down_trylock() returns 0, down_write_trylock() returns 1 1299 * kernel 1, world 0 1300 */ 1301 mlog_bug_on_msg(!down_write_trylock(&oi->ip_alloc_sem), 1302 "Clear inode of %llu, alloc_sem is locked\n", 1303 (unsigned long long)oi->ip_blkno); 1304 up_write(&oi->ip_alloc_sem); 1305 1306 mlog_bug_on_msg(oi->ip_open_count, 1307 "Clear inode of %llu has open count %d\n", 1308 (unsigned long long)oi->ip_blkno, oi->ip_open_count); 1309 1310 /* Clear all other flags. */ 1311 oi->ip_flags = 0; 1312 oi->ip_dir_start_lookup = 0; 1313 oi->ip_blkno = 0ULL; 1314 1315 /* 1316 * ip_jinode is used to track txns against this inode. We ensure that 1317 * the journal is flushed before journal shutdown. Thus it is safe to 1318 * have inodes get cleaned up after journal shutdown. 1319 */ 1320 if (!osb->journal) 1321 return; 1322 1323 jbd2_journal_release_jbd_inode(osb->journal->j_journal, 1324 &oi->ip_jinode); 1325 } 1326 1327 void ocfs2_evict_inode(struct inode *inode) 1328 { 1329 write_inode_now(inode, 1); 1330 1331 if (!inode->i_nlink || 1332 (OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) { 1333 ocfs2_delete_inode(inode); 1334 } else { 1335 truncate_inode_pages_final(&inode->i_data); 1336 } 1337 ocfs2_clear_inode(inode); 1338 } 1339 1340 /* 1341 * This is called from our getattr. 1342 */ 1343 int ocfs2_inode_revalidate(struct dentry *dentry) 1344 { 1345 struct inode *inode = d_inode(dentry); 1346 int status = 0; 1347 1348 trace_ocfs2_inode_revalidate(inode, 1349 inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL, 1350 inode ? (unsigned long long)OCFS2_I(inode)->ip_flags : 0); 1351 1352 if (!inode) { 1353 status = -ENOENT; 1354 goto bail; 1355 } 1356 1357 spin_lock(&OCFS2_I(inode)->ip_lock); 1358 if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { 1359 spin_unlock(&OCFS2_I(inode)->ip_lock); 1360 status = -ENOENT; 1361 goto bail; 1362 } 1363 spin_unlock(&OCFS2_I(inode)->ip_lock); 1364 1365 /* Let ocfs2_inode_lock do the work of updating our struct 1366 * inode for us. */ 1367 status = ocfs2_inode_lock(inode, NULL, 0); 1368 if (status < 0) { 1369 if (status != -ENOENT) 1370 mlog_errno(status); 1371 goto bail; 1372 } 1373 ocfs2_inode_unlock(inode, 0); 1374 bail: 1375 return status; 1376 } 1377 1378 /* 1379 * Updates a disk inode from a 1380 * struct inode. 1381 * Only takes ip_lock. 1382 */ 1383 int ocfs2_mark_inode_dirty(handle_t *handle, 1384 struct inode *inode, 1385 struct buffer_head *bh) 1386 { 1387 int status; 1388 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; 1389 1390 trace_ocfs2_mark_inode_dirty((unsigned long long)OCFS2_I(inode)->ip_blkno); 1391 1392 status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh, 1393 OCFS2_JOURNAL_ACCESS_WRITE); 1394 if (status < 0) { 1395 mlog_errno(status); 1396 goto leave; 1397 } 1398 1399 spin_lock(&OCFS2_I(inode)->ip_lock); 1400 fe->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); 1401 ocfs2_get_inode_flags(OCFS2_I(inode)); 1402 fe->i_attr = cpu_to_le32(OCFS2_I(inode)->ip_attr); 1403 fe->i_dyn_features = cpu_to_le16(OCFS2_I(inode)->ip_dyn_features); 1404 spin_unlock(&OCFS2_I(inode)->ip_lock); 1405 1406 fe->i_size = cpu_to_le64(i_size_read(inode)); 1407 ocfs2_set_links_count(fe, inode->i_nlink); 1408 fe->i_uid = cpu_to_le32(i_uid_read(inode)); 1409 fe->i_gid = cpu_to_le32(i_gid_read(inode)); 1410 fe->i_mode = cpu_to_le16(inode->i_mode); 1411 fe->i_atime = cpu_to_le64(inode_get_atime_sec(inode)); 1412 fe->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode)); 1413 fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); 1414 fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); 1415 fe->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode)); 1416 fe->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); 1417 1418 ocfs2_journal_dirty(handle, bh); 1419 ocfs2_update_inode_fsync_trans(handle, inode, 1); 1420 leave: 1421 return status; 1422 } 1423 1424 /* 1425 * 1426 * Updates a struct inode from a disk inode. 1427 * does no i/o, only takes ip_lock. 1428 */ 1429 void ocfs2_refresh_inode(struct inode *inode, 1430 struct ocfs2_dinode *fe) 1431 { 1432 spin_lock(&OCFS2_I(inode)->ip_lock); 1433 1434 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 1435 OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); 1436 OCFS2_I(inode)->ip_dyn_features = le16_to_cpu(fe->i_dyn_features); 1437 ocfs2_set_inode_flags(inode); 1438 i_size_write(inode, le64_to_cpu(fe->i_size)); 1439 set_nlink(inode, ocfs2_read_links_count(fe)); 1440 i_uid_write(inode, le32_to_cpu(fe->i_uid)); 1441 i_gid_write(inode, le32_to_cpu(fe->i_gid)); 1442 inode->i_mode = le16_to_cpu(fe->i_mode); 1443 if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0) 1444 inode->i_blocks = 0; 1445 else 1446 inode->i_blocks = ocfs2_inode_sector_count(inode); 1447 inode_set_atime(inode, le64_to_cpu(fe->i_atime), 1448 le32_to_cpu(fe->i_atime_nsec)); 1449 inode_set_mtime(inode, le64_to_cpu(fe->i_mtime), 1450 le32_to_cpu(fe->i_mtime_nsec)); 1451 inode_set_ctime(inode, le64_to_cpu(fe->i_ctime), 1452 le32_to_cpu(fe->i_ctime_nsec)); 1453 1454 spin_unlock(&OCFS2_I(inode)->ip_lock); 1455 } 1456 1457 int ocfs2_validate_inode_block(struct super_block *sb, 1458 struct buffer_head *bh) 1459 { 1460 int rc; 1461 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 1462 1463 trace_ocfs2_validate_inode_block((unsigned long long)bh->b_blocknr); 1464 1465 BUG_ON(!buffer_uptodate(bh)); 1466 1467 /* 1468 * If the ecc fails, we return the error but otherwise 1469 * leave the filesystem running. We know any error is 1470 * local to this block. 1471 */ 1472 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); 1473 if (rc) { 1474 mlog(ML_ERROR, "Checksum failed for dinode %llu\n", 1475 (unsigned long long)bh->b_blocknr); 1476 goto bail; 1477 } 1478 1479 if ((!di->i_links_count && !di->i_links_count_hi) || !di->i_mode) { 1480 mlog(ML_ERROR, "Invalid dinode #%llu: " 1481 "Corrupt state (nlink = %u or mode = %u) detected!\n", 1482 (unsigned long long)bh->b_blocknr, 1483 ocfs2_read_links_count(di), le16_to_cpu(di->i_mode)); 1484 rc = -EFSCORRUPTED; 1485 goto bail; 1486 } 1487 /* 1488 * Errors after here are fatal. 1489 */ 1490 1491 rc = -EINVAL; 1492 1493 if (!OCFS2_IS_VALID_DINODE(di)) { 1494 rc = ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", 1495 (unsigned long long)bh->b_blocknr, 7, 1496 di->i_signature); 1497 goto bail; 1498 } 1499 1500 if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { 1501 rc = ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", 1502 (unsigned long long)bh->b_blocknr, 1503 (unsigned long long)le64_to_cpu(di->i_blkno)); 1504 goto bail; 1505 } 1506 1507 if (!(le32_to_cpu(di->i_flags) & OCFS2_VALID_FL)) { 1508 rc = ocfs2_error(sb, 1509 "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", 1510 (unsigned long long)bh->b_blocknr); 1511 goto bail; 1512 } 1513 1514 if (le32_to_cpu(di->i_fs_generation) != 1515 OCFS2_SB(sb)->fs_generation) { 1516 rc = ocfs2_error(sb, 1517 "Invalid dinode #%llu: fs_generation is %u\n", 1518 (unsigned long long)bh->b_blocknr, 1519 le32_to_cpu(di->i_fs_generation)); 1520 goto bail; 1521 } 1522 1523 if (le16_to_cpu(di->i_suballoc_slot) != (u16)OCFS2_INVALID_SLOT && 1524 (u32)le16_to_cpu(di->i_suballoc_slot) > OCFS2_SB(sb)->max_slots - 1) { 1525 rc = ocfs2_error(sb, "Invalid dinode %llu: suballoc slot %u\n", 1526 (unsigned long long)bh->b_blocknr, 1527 le16_to_cpu(di->i_suballoc_slot)); 1528 goto bail; 1529 } 1530 1531 /* 1532 * Reject dinodes whose i_mode does not name one of the seven 1533 * canonical POSIX file types. ocfs2_populate_inode() copies 1534 * i_mode verbatim into inode->i_mode and then dispatches via 1535 * switch (mode & S_IFMT) to file/dir/symlink/special_file iops; 1536 * an unrecognised type falls into ocfs2_special_file_iops with 1537 * init_special_inode(), which interprets i_rdev. Constrain the 1538 * type here so the dispatch only ever sees a value mkfs.ocfs2 / 1539 * VFS can produce. 1540 */ 1541 if (!ocfs2_valid_inode_mode(le16_to_cpu(di->i_mode))) { 1542 rc = ocfs2_error(sb, 1543 "Invalid dinode #%llu: mode 0%o has unknown file type\n", 1544 (unsigned long long)bh->b_blocknr, 1545 le16_to_cpu(di->i_mode)); 1546 goto bail; 1547 } 1548 1549 /* 1550 * id1.dev1.i_rdev is the device-number arm of the id1 union and 1551 * is only meaningful for character and block device inodes. For 1552 * any other regular user-visible file type the on-disk value 1553 * must be zero. ocfs2_populate_inode() currently runs 1554 * 1555 * inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); 1556 * 1557 * unconditionally, before the S_IFMT switch decides whether the 1558 * inode is a special file. As a result, an i_rdev value present 1559 * on a non-device inode is silently published into the in-core 1560 * inode; a subsequent forced re-read or in-core mode mutation 1561 * (cluster peer with raw write access to the shared LUN, 1562 * on-disk corruption, or a separately forged dinode) can then 1563 * expose the attacker-controlled device number to 1564 * init_special_inode() without ever showing an unusual i_mode 1565 * at validation time. 1566 * 1567 * System inodes (OCFS2_SYSTEM_FL) legitimately use the bitmap1 1568 * and journal1 arms of the same union (allocator i_used / 1569 * i_total counters and the journal ij_flags / 1570 * ij_recovery_generation pair); those bytes are not an i_rdev 1571 * and must not be checked here. Restrict the cross-check to 1572 * non-system inodes, which is the full attacker-controllable 1573 * surface. 1574 */ 1575 if (ocfs2_dinode_has_unexpected_rdev(di)) { 1576 rc = ocfs2_error(sb, 1577 "Invalid dinode #%llu: non-device mode 0%o with i_rdev %llu\n", 1578 (unsigned long long)bh->b_blocknr, 1579 le16_to_cpu(di->i_mode), 1580 (unsigned long long)le64_to_cpu(di->id1.dev1.i_rdev)); 1581 goto bail; 1582 } 1583 1584 /* 1585 * Non-inline directories must not have i_size without allocated 1586 * clusters: directory growth adds storage before advancing i_size, 1587 * and readdir walks i_size block-by-block. A forged directory 1588 * with zero clusters and a huge i_size would repeatedly fault on 1589 * holes while advancing through the claimed size. 1590 * 1591 * Non-inline regular files have the same invariant on non-sparse 1592 * volumes. Sparse regular files are different: truncate can 1593 * legitimately grow i_size without allocating clusters, so keep 1594 * the sparse-alloc carveout for S_IFREG only. System inodes and 1595 * inline-data dinodes have their own storage rules. 1596 */ 1597 if (ocfs2_dinode_has_size_without_clusters(sb, di)) { 1598 if (S_ISDIR(le16_to_cpu(di->i_mode))) 1599 rc = ocfs2_error(sb, 1600 "Invalid dinode #%llu: directory i_size %llu with i_clusters 0 and no inline-data flag\n", 1601 (unsigned long long)bh->b_blocknr, 1602 (unsigned long long)le64_to_cpu(di->i_size)); 1603 else 1604 rc = ocfs2_error(sb, 1605 "Invalid dinode #%llu: regular file i_size %llu with i_clusters 0 and no inline-data flag on non-sparse volume\n", 1606 (unsigned long long)bh->b_blocknr, 1607 (unsigned long long)le64_to_cpu(di->i_size)); 1608 goto bail; 1609 } 1610 1611 if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) { 1612 struct ocfs2_inline_data *data = &di->id2.i_data; 1613 1614 if (le32_to_cpu(di->i_clusters)) { 1615 rc = ocfs2_error(sb, 1616 "Invalid dinode %llu: %u clusters\n", 1617 (unsigned long long)bh->b_blocknr, 1618 le32_to_cpu(di->i_clusters)); 1619 goto bail; 1620 } 1621 1622 if (le16_to_cpu(data->id_count) > 1623 ocfs2_max_inline_data_with_xattr(sb, di)) { 1624 rc = ocfs2_error(sb, 1625 "Invalid dinode #%llu: inline data id_count %u exceeds max %d\n", 1626 (unsigned long long)bh->b_blocknr, 1627 le16_to_cpu(data->id_count), 1628 ocfs2_max_inline_data_with_xattr(sb, di)); 1629 goto bail; 1630 } 1631 1632 if (le64_to_cpu(di->i_size) > le16_to_cpu(data->id_count)) { 1633 rc = ocfs2_error(sb, 1634 "Invalid dinode #%llu: inline data i_size %llu exceeds id_count %u\n", 1635 (unsigned long long)bh->b_blocknr, 1636 (unsigned long long)le64_to_cpu(di->i_size), 1637 le16_to_cpu(data->id_count)); 1638 goto bail; 1639 } 1640 } 1641 1642 if (S_ISLNK(le16_to_cpu(di->i_mode)) && 1643 !le32_to_cpu(di->i_clusters)) { 1644 int max_inline = ocfs2_fast_symlink_chars(sb); 1645 u64 i_size = le64_to_cpu(di->i_size); 1646 1647 if (i_size >= max_inline) { 1648 rc = ocfs2_error(sb, 1649 "Invalid dinode #%llu: fast symlink i_size %llu exceeds max %d\n", 1650 (unsigned long long)bh->b_blocknr, 1651 (unsigned long long)i_size, 1652 max_inline - 1); 1653 goto bail; 1654 } 1655 1656 if (strnlen((char *)di->id2.i_symlink, i_size + 1) != i_size) { 1657 rc = ocfs2_error(sb, 1658 "Invalid dinode #%llu: fast symlink is not NUL-terminated at i_size %llu\n", 1659 (unsigned long long)bh->b_blocknr, 1660 (unsigned long long)i_size); 1661 goto bail; 1662 } 1663 } 1664 1665 if (le32_to_cpu(di->i_flags) & OCFS2_CHAIN_FL) { 1666 struct ocfs2_chain_list *cl = &di->id2.i_chain; 1667 u16 bpc = 1 << (OCFS2_SB(sb)->s_clustersize_bits - 1668 sb->s_blocksize_bits); 1669 1670 if (le16_to_cpu(cl->cl_count) != ocfs2_chain_recs_per_inode(sb)) { 1671 rc = ocfs2_error(sb, "Invalid dinode %llu: chain list count %u\n", 1672 (unsigned long long)bh->b_blocknr, 1673 le16_to_cpu(cl->cl_count)); 1674 goto bail; 1675 } 1676 if (le16_to_cpu(cl->cl_next_free_rec) > le16_to_cpu(cl->cl_count)) { 1677 rc = ocfs2_error(sb, "Invalid dinode %llu: chain list index %u\n", 1678 (unsigned long long)bh->b_blocknr, 1679 le16_to_cpu(cl->cl_next_free_rec)); 1680 goto bail; 1681 } 1682 if (OCFS2_SB(sb)->bitmap_blkno && 1683 OCFS2_SB(sb)->bitmap_blkno != le64_to_cpu(di->i_blkno) && 1684 le16_to_cpu(cl->cl_bpc) != bpc) { 1685 rc = ocfs2_error(sb, "Invalid dinode %llu: bits per cluster %u\n", 1686 (unsigned long long)bh->b_blocknr, 1687 le16_to_cpu(cl->cl_bpc)); 1688 goto bail; 1689 } 1690 } 1691 1692 if ((le16_to_cpu(di->i_dyn_features) & OCFS2_HAS_REFCOUNT_FL) && 1693 !di->i_refcount_loc) { 1694 rc = ocfs2_error(sb, "Inode #%llu has refcount flag but no i_refcount_loc\n", 1695 (unsigned long long)bh->b_blocknr); 1696 goto bail; 1697 } 1698 1699 if (ocfs2_dinode_has_extents(di)) { 1700 struct ocfs2_extent_list *el = &di->id2.i_list; 1701 u16 count = le16_to_cpu(el->l_count); 1702 u16 next_free = le16_to_cpu(el->l_next_free_rec); 1703 1704 if (count == 0) { 1705 rc = ocfs2_error(sb, 1706 "Invalid dinode %llu: extent list l_count is zero\n", 1707 (unsigned long long)bh->b_blocknr); 1708 goto bail; 1709 } 1710 /* 1711 * The exact capacity depends on i_xattr_inline_size, another 1712 * unvalidated on-disk field. Inline xattrs only shrink the 1713 * list, so the no-xattr maximum is a safe upper bound that a 1714 * valid l_count never exceeds. 1715 */ 1716 if (count > ocfs2_extent_recs_per_inode(sb)) { 1717 rc = ocfs2_error(sb, 1718 "Invalid dinode %llu: extent list l_count %u exceeds max %u\n", 1719 (unsigned long long)bh->b_blocknr, count, 1720 ocfs2_extent_recs_per_inode(sb)); 1721 goto bail; 1722 } 1723 if (next_free > count) { 1724 rc = ocfs2_error(sb, 1725 "Invalid dinode %llu: extent list l_next_free_rec %u exceeds l_count %u\n", 1726 (unsigned long long)bh->b_blocknr, next_free, count); 1727 goto bail; 1728 } 1729 } 1730 1731 rc = 0; 1732 1733 bail: 1734 return rc; 1735 } 1736 1737 static int ocfs2_filecheck_validate_inode_block(struct super_block *sb, 1738 struct buffer_head *bh) 1739 { 1740 int rc = 0; 1741 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 1742 1743 trace_ocfs2_filecheck_validate_inode_block( 1744 (unsigned long long)bh->b_blocknr); 1745 1746 BUG_ON(!buffer_uptodate(bh)); 1747 1748 /* 1749 * Call ocfs2_validate_meta_ecc() first since it has ecc repair 1750 * function, but we should not return error immediately when ecc 1751 * validation fails, because the reason is quite likely the invalid 1752 * inode number inputted. 1753 */ 1754 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); 1755 if (rc) { 1756 mlog(ML_ERROR, 1757 "Filecheck: checksum failed for dinode %llu\n", 1758 (unsigned long long)bh->b_blocknr); 1759 rc = -OCFS2_FILECHECK_ERR_BLOCKECC; 1760 } 1761 1762 if (!OCFS2_IS_VALID_DINODE(di)) { 1763 mlog(ML_ERROR, 1764 "Filecheck: invalid dinode #%llu: signature = %.*s\n", 1765 (unsigned long long)bh->b_blocknr, 7, di->i_signature); 1766 rc = -OCFS2_FILECHECK_ERR_INVALIDINO; 1767 goto bail; 1768 } else if (rc) 1769 goto bail; 1770 1771 if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { 1772 mlog(ML_ERROR, 1773 "Filecheck: invalid dinode #%llu: i_blkno is %llu\n", 1774 (unsigned long long)bh->b_blocknr, 1775 (unsigned long long)le64_to_cpu(di->i_blkno)); 1776 rc = -OCFS2_FILECHECK_ERR_BLOCKNO; 1777 goto bail; 1778 } 1779 1780 if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { 1781 mlog(ML_ERROR, 1782 "Filecheck: invalid dinode #%llu: OCFS2_VALID_FL " 1783 "not set\n", 1784 (unsigned long long)bh->b_blocknr); 1785 rc = -OCFS2_FILECHECK_ERR_VALIDFLAG; 1786 goto bail; 1787 } 1788 1789 if (le32_to_cpu(di->i_fs_generation) != 1790 OCFS2_SB(sb)->fs_generation) { 1791 mlog(ML_ERROR, 1792 "Filecheck: invalid dinode #%llu: fs_generation is %u\n", 1793 (unsigned long long)bh->b_blocknr, 1794 le32_to_cpu(di->i_fs_generation)); 1795 rc = -OCFS2_FILECHECK_ERR_GENERATION; 1796 goto bail; 1797 } 1798 1799 if (!ocfs2_valid_inode_mode(le16_to_cpu(di->i_mode))) { 1800 mlog(ML_ERROR, 1801 "Filecheck: invalid dinode #%llu: mode 0%o has unknown file type\n", 1802 (unsigned long long)bh->b_blocknr, 1803 le16_to_cpu(di->i_mode)); 1804 rc = -OCFS2_FILECHECK_ERR_INVALIDINO; 1805 goto bail; 1806 } 1807 1808 if (ocfs2_dinode_has_unexpected_rdev(di)) { 1809 mlog(ML_ERROR, 1810 "Filecheck: invalid dinode #%llu: non-device mode 0%o with i_rdev %llu\n", 1811 (unsigned long long)bh->b_blocknr, 1812 le16_to_cpu(di->i_mode), 1813 (unsigned long long)le64_to_cpu(di->id1.dev1.i_rdev)); 1814 rc = -OCFS2_FILECHECK_ERR_INVALIDINO; 1815 goto bail; 1816 } 1817 1818 if (ocfs2_dinode_has_size_without_clusters(sb, di)) { 1819 if (S_ISDIR(le16_to_cpu(di->i_mode))) 1820 mlog(ML_ERROR, 1821 "Filecheck: invalid dinode #%llu: directory i_size %llu with i_clusters 0 and no inline-data flag\n", 1822 (unsigned long long)bh->b_blocknr, 1823 (unsigned long long)le64_to_cpu(di->i_size)); 1824 else 1825 mlog(ML_ERROR, 1826 "Filecheck: invalid dinode #%llu: regular file i_size %llu with i_clusters 0 and no inline-data flag on non-sparse volume\n", 1827 (unsigned long long)bh->b_blocknr, 1828 (unsigned long long)le64_to_cpu(di->i_size)); 1829 rc = -OCFS2_FILECHECK_ERR_INVALIDINO; 1830 } 1831 1832 bail: 1833 return rc; 1834 } 1835 1836 static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, 1837 struct buffer_head *bh) 1838 { 1839 int changed = 0; 1840 struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; 1841 1842 if (!ocfs2_filecheck_validate_inode_block(sb, bh)) 1843 return 0; 1844 1845 trace_ocfs2_filecheck_repair_inode_block( 1846 (unsigned long long)bh->b_blocknr); 1847 1848 if (unlikely(ocfs2_emergency_state(OCFS2_SB(sb)))) { 1849 mlog(ML_ERROR, 1850 "Filecheck: cannot repair dinode #%llu " 1851 "on readonly filesystem\n", 1852 (unsigned long long)bh->b_blocknr); 1853 return -OCFS2_FILECHECK_ERR_READONLY; 1854 } 1855 1856 if (buffer_jbd(bh)) { 1857 mlog(ML_ERROR, 1858 "Filecheck: cannot repair dinode #%llu, " 1859 "its buffer is in jbd\n", 1860 (unsigned long long)bh->b_blocknr); 1861 return -OCFS2_FILECHECK_ERR_INJBD; 1862 } 1863 1864 if (!OCFS2_IS_VALID_DINODE(di)) { 1865 /* Cannot fix invalid inode block */ 1866 return -OCFS2_FILECHECK_ERR_INVALIDINO; 1867 } 1868 1869 if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { 1870 /* Cannot just add VALID_FL flag back as a fix, 1871 * need more things to check here. 1872 */ 1873 return -OCFS2_FILECHECK_ERR_VALIDFLAG; 1874 } 1875 1876 if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { 1877 di->i_blkno = cpu_to_le64(bh->b_blocknr); 1878 changed = 1; 1879 mlog(ML_ERROR, 1880 "Filecheck: reset dinode #%llu: i_blkno to %llu\n", 1881 (unsigned long long)bh->b_blocknr, 1882 (unsigned long long)le64_to_cpu(di->i_blkno)); 1883 } 1884 1885 if (le32_to_cpu(di->i_fs_generation) != 1886 OCFS2_SB(sb)->fs_generation) { 1887 di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); 1888 changed = 1; 1889 mlog(ML_ERROR, 1890 "Filecheck: reset dinode #%llu: fs_generation to %u\n", 1891 (unsigned long long)bh->b_blocknr, 1892 le32_to_cpu(di->i_fs_generation)); 1893 } 1894 1895 if (ocfs2_dinode_has_extents(di) && 1896 le16_to_cpu(di->id2.i_list.l_next_free_rec) > le16_to_cpu(di->id2.i_list.l_count)) { 1897 di->id2.i_list.l_next_free_rec = di->id2.i_list.l_count; 1898 changed = 1; 1899 mlog(ML_ERROR, 1900 "Filecheck: reset dinode #%llu: l_next_free_rec to %u\n", 1901 (unsigned long long)bh->b_blocknr, 1902 le16_to_cpu(di->id2.i_list.l_next_free_rec)); 1903 } 1904 1905 if (changed || ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check)) { 1906 ocfs2_compute_meta_ecc(sb, bh->b_data, &di->i_check); 1907 mark_buffer_dirty(bh); 1908 mlog(ML_ERROR, 1909 "Filecheck: reset dinode #%llu: compute meta ecc\n", 1910 (unsigned long long)bh->b_blocknr); 1911 } 1912 1913 return 0; 1914 } 1915 1916 static int 1917 ocfs2_filecheck_read_inode_block_full(struct inode *inode, 1918 struct buffer_head **bh, 1919 int flags, int type) 1920 { 1921 int rc; 1922 struct buffer_head *tmp = *bh; 1923 1924 if (!type) /* Check inode block */ 1925 rc = ocfs2_read_blocks(INODE_CACHE(inode), 1926 OCFS2_I(inode)->ip_blkno, 1927 1, &tmp, flags, 1928 ocfs2_filecheck_validate_inode_block); 1929 else /* Repair inode block */ 1930 rc = ocfs2_read_blocks(INODE_CACHE(inode), 1931 OCFS2_I(inode)->ip_blkno, 1932 1, &tmp, flags, 1933 ocfs2_filecheck_repair_inode_block); 1934 1935 /* If ocfs2_read_blocks() got us a new bh, pass it up. */ 1936 if (!rc && !*bh) 1937 *bh = tmp; 1938 1939 return rc; 1940 } 1941 1942 int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, 1943 int flags) 1944 { 1945 int rc; 1946 struct buffer_head *tmp = *bh; 1947 1948 rc = ocfs2_read_blocks(INODE_CACHE(inode), OCFS2_I(inode)->ip_blkno, 1949 1, &tmp, flags, ocfs2_validate_inode_block); 1950 1951 if (rc < 0) 1952 make_bad_inode(inode); 1953 /* If ocfs2_read_blocks() got us a new bh, pass it up. */ 1954 if (!rc && !*bh) 1955 *bh = tmp; 1956 1957 return rc; 1958 } 1959 1960 int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh) 1961 { 1962 return ocfs2_read_inode_block_full(inode, bh, 0); 1963 } 1964 1965 1966 static u64 ocfs2_inode_cache_owner(struct ocfs2_caching_info *ci) 1967 { 1968 struct ocfs2_inode_info *oi = cache_info_to_inode(ci); 1969 1970 return oi->ip_blkno; 1971 } 1972 1973 static struct super_block *ocfs2_inode_cache_get_super(struct ocfs2_caching_info *ci) 1974 { 1975 struct ocfs2_inode_info *oi = cache_info_to_inode(ci); 1976 1977 return oi->vfs_inode.i_sb; 1978 } 1979 1980 static void ocfs2_inode_cache_lock(struct ocfs2_caching_info *ci) 1981 __acquires(&oi->ip_lock) 1982 { 1983 struct ocfs2_inode_info *oi = cache_info_to_inode(ci); 1984 1985 spin_lock(&oi->ip_lock); 1986 } 1987 1988 static void ocfs2_inode_cache_unlock(struct ocfs2_caching_info *ci) 1989 __releases(&oi->ip_lock) 1990 { 1991 struct ocfs2_inode_info *oi = cache_info_to_inode(ci); 1992 1993 spin_unlock(&oi->ip_lock); 1994 } 1995 1996 static void ocfs2_inode_cache_io_lock(struct ocfs2_caching_info *ci) 1997 { 1998 struct ocfs2_inode_info *oi = cache_info_to_inode(ci); 1999 2000 mutex_lock(&oi->ip_io_mutex); 2001 } 2002 2003 static void ocfs2_inode_cache_io_unlock(struct ocfs2_caching_info *ci) 2004 { 2005 struct ocfs2_inode_info *oi = cache_info_to_inode(ci); 2006 2007 mutex_unlock(&oi->ip_io_mutex); 2008 } 2009 2010 const struct ocfs2_caching_operations ocfs2_inode_caching_ops = { 2011 .co_owner = ocfs2_inode_cache_owner, 2012 .co_get_super = ocfs2_inode_cache_get_super, 2013 .co_cache_lock = ocfs2_inode_cache_lock, 2014 .co_cache_unlock = ocfs2_inode_cache_unlock, 2015 .co_io_lock = ocfs2_inode_cache_io_lock, 2016 .co_io_unlock = ocfs2_inode_cache_io_unlock, 2017 }; 2018