// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include <linux/iversion.h>

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_attr.h"
#include "xfs_trans_space.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_iunlink_item.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
#include "xfs_log_priv.h"
#include "xfs_health.h"

struct kmem_cache *xfs_inode_cache;

STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
	struct xfs_inode *);

/*
 * helper function to extract extent size hint from inode
 */
xfs_extlen_t
xfs_get_extsz_hint(
	struct xfs_inode	*ip)
{
	/*
	 * No point in aligning allocations if we need to COW to actually
	 * write to them.
	 */
	if (xfs_is_always_cow_inode(ip))
		return 0;
	if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
		return ip->i_extsize;
	if (XFS_IS_REALTIME_INODE(ip))
		return ip->i_mount->m_sb.sb_rextsize;
	return 0;
}

/*
 * Helper function to extract CoW extent size hint from inode.
 * Between the extent size hint and the CoW extent size hint, we
 * return the greater of the two.  If the value is zero (automatic),
 * use the default size.
 */
xfs_extlen_t
xfs_get_cowextsz_hint(
	struct xfs_inode	*ip)
{
	xfs_extlen_t		a, b;

	a = 0;
	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
		a = ip->i_cowextsize;
	b = xfs_get_extsz_hint(ip);

	a = max(a, b);
	if (a == 0)
		return XFS_DEFAULT_COWEXTSZ_HINT;
	return a;
}
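
/*
 * Worked example (illustrative only, not part of the original source): an
 * inode carrying XFS_DIFLAG2_COWEXTSIZE with i_cowextsize = 64 blocks and an
 * ordinary extent size hint of 16 blocks gets max(64, 16) = 64 back from
 * xfs_get_cowextsz_hint().  If neither hint is set on a non-realtime file,
 * both values are zero and the CoW helper falls back to
 * XFS_DEFAULT_COWEXTSZ_HINT.
 */
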
/*
 * These two are wrapper routines around the xfs_ilock() routine used to
 * centralize some grungy code.  They are used in places that wish to lock the
 * inode solely for reading the extents.  The reason these places can't just
 * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
 * bringing in of the extents from disk for a file in b-tree format.  If the
 * inode is in b-tree format, then we need to lock the inode exclusively until
 * the extents are read in.  Locking it exclusively all the time would limit
 * our parallelism unnecessarily, though.  What we do instead is check to see
 * if the extents have been read in yet, and only lock the inode exclusively
 * if they have not.
 *
 * The functions return a value which should be given to the corresponding
 * xfs_iunlock() call.
 */
uint
xfs_ilock_data_map_shared(
	struct xfs_inode	*ip)
{
	uint			lock_mode = XFS_ILOCK_SHARED;

	if (xfs_need_iread_extents(&ip->i_df))
		lock_mode = XFS_ILOCK_EXCL;
	xfs_ilock(ip, lock_mode);
	return lock_mode;
}

uint
xfs_ilock_attr_map_shared(
	struct xfs_inode	*ip)
{
	uint			lock_mode = XFS_ILOCK_SHARED;

	if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
		lock_mode = XFS_ILOCK_EXCL;
	xfs_ilock(ip, lock_mode);
	return lock_mode;
}

/*
 * You can't set both SHARED and EXCL for the same lock,
 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
 * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values
 * to set in lock_flags.
 */
static inline void
xfs_lock_flags_assert(
	uint		lock_flags)
{
	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
		(XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
		(XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
		(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
	ASSERT(lock_flags != 0);
}

/*
 * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
 * multi-reader locks: invalidate_lock and the i_lock.  This routine allows
 * various combinations of the locks to be obtained.
 *
 * The 3 locks should always be ordered so that the IO lock is obtained first,
 * the mmap lock second and the ilock last in order to prevent deadlock.
 *
 * Basic locking order:
 *
 * i_rwsem -> invalidate_lock -> page_lock -> i_ilock
 *
 * mmap_lock locking order:
 *
 * i_rwsem -> page lock -> mmap_lock
 * mmap_lock -> invalidate_lock -> page_lock
 *
 * The difference in mmap_lock locking order means that we cannot hold the
 * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
 * can fault in pages during copy in/out (for buffered IO) or require the
 * mmap_lock in get_user_pages() to map the user pages into the kernel address
 * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
 * fault because page faults already hold the mmap_lock.
 *
 * Hence to serialise fully against both syscall and mmap based IO, we need to
 * take both the i_rwsem and the invalidate_lock. These locks should *only* be
 * both taken in places where we need to invalidate the page cache in a race
 * free manner (e.g. truncate, hole punch and other extent manipulation
 * functions).
 */
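
/*
 * Illustrative sketch (not taken from the code below): a path that must
 * invalidate the page cache in a race free manner takes both locks in the
 * documented order and drops them in reverse, for example:
 *
 *	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
 *	... invalidate the page cache, then take XFS_ILOCK_EXCL inside the
 *	    transaction that modifies the extent map ...
 *	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
 */
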
void
xfs_ilock(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	trace_xfs_ilock(ip, lock_flags, _RET_IP_);

	xfs_lock_flags_assert(lock_flags);

	if (lock_flags & XFS_IOLOCK_EXCL) {
		down_write_nested(&VFS_I(ip)->i_rwsem,
				  XFS_IOLOCK_DEP(lock_flags));
	} else if (lock_flags & XFS_IOLOCK_SHARED) {
		down_read_nested(&VFS_I(ip)->i_rwsem,
				 XFS_IOLOCK_DEP(lock_flags));
	}

	if (lock_flags & XFS_MMAPLOCK_EXCL) {
		down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
				  XFS_MMAPLOCK_DEP(lock_flags));
	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
		down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
				 XFS_MMAPLOCK_DEP(lock_flags));
	}

	if (lock_flags & XFS_ILOCK_EXCL)
		down_write_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
	else if (lock_flags & XFS_ILOCK_SHARED)
		down_read_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
}

/*
 * This is just like xfs_ilock(), except that the caller
 * is guaranteed not to sleep.  It returns 1 if it gets
 * the requested locks and 0 otherwise.  If the IO lock is
 * obtained but the inode lock cannot be, then the IO lock
 * is dropped before returning.
 *
 * ip -- the inode being locked
 * lock_flags -- this parameter indicates the inode's locks to be
 *	 locked.  See the comment for xfs_ilock() for a list
 *	 of valid values.
 */
int
xfs_ilock_nowait(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);

	xfs_lock_flags_assert(lock_flags);

	if (lock_flags & XFS_IOLOCK_EXCL) {
		if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
			goto out;
	} else if (lock_flags & XFS_IOLOCK_SHARED) {
		if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
			goto out;
	}

	if (lock_flags & XFS_MMAPLOCK_EXCL) {
		if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
			goto out_undo_iolock;
	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
		if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
			goto out_undo_iolock;
	}

	if (lock_flags & XFS_ILOCK_EXCL) {
		if (!down_write_trylock(&ip->i_lock))
			goto out_undo_mmaplock;
	} else if (lock_flags & XFS_ILOCK_SHARED) {
		if (!down_read_trylock(&ip->i_lock))
			goto out_undo_mmaplock;
	}
	return 1;

out_undo_mmaplock:
	if (lock_flags & XFS_MMAPLOCK_EXCL)
		up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
	else if (lock_flags & XFS_MMAPLOCK_SHARED)
		up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
out_undo_iolock:
	if (lock_flags & XFS_IOLOCK_EXCL)
		up_write(&VFS_I(ip)->i_rwsem);
	else if (lock_flags & XFS_IOLOCK_SHARED)
		up_read(&VFS_I(ip)->i_rwsem);
out:
	return 0;
}
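
/*
 * Usage note (illustrative, mirrors xfs_release() further down in this file):
 * callers that cannot afford to block simply skip the optional work when the
 * trylock fails, e.g.:
 *
 *	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
 *		return 0;
 *	... do the optional work ...
 *	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
 */
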
/*
 * xfs_iunlock() is used to drop the inode locks acquired with
 * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 * that we know which locks to drop.
 *
 * ip -- the inode being unlocked
 * lock_flags -- this parameter indicates the inode's locks to be
 *	 unlocked.  See the comment for xfs_ilock() for a list
 *	 of valid values for this parameter.
 *
 */
void
xfs_iunlock(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	xfs_lock_flags_assert(lock_flags);

	if (lock_flags & XFS_IOLOCK_EXCL)
		up_write(&VFS_I(ip)->i_rwsem);
	else if (lock_flags & XFS_IOLOCK_SHARED)
		up_read(&VFS_I(ip)->i_rwsem);

	if (lock_flags & XFS_MMAPLOCK_EXCL)
		up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
	else if (lock_flags & XFS_MMAPLOCK_SHARED)
		up_read(&VFS_I(ip)->i_mapping->invalidate_lock);

	if (lock_flags & XFS_ILOCK_EXCL)
		up_write(&ip->i_lock);
	else if (lock_flags & XFS_ILOCK_SHARED)
		up_read(&ip->i_lock);

	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}

/*
 * give up write locks.  the i/o lock cannot be held nested
 * if it is being demoted.
 */
void
xfs_ilock_demote(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
	ASSERT((lock_flags &
		~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);

	if (lock_flags & XFS_ILOCK_EXCL)
		downgrade_write(&ip->i_lock);
	if (lock_flags & XFS_MMAPLOCK_EXCL)
		downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
	if (lock_flags & XFS_IOLOCK_EXCL)
		downgrade_write(&VFS_I(ip)->i_rwsem);

	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}
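
/*
 * Illustrative sketch (not taken from this file): demotion lets a caller
 * finish a long operation under a shared lock once the exclusive phase is
 * done, without ever dropping the lock:
 *
 *	xfs_ilock(ip, XFS_IOLOCK_EXCL);
 *	... exclusive-only setup ...
 *	xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
 *	... continue under the shared lock ...
 *	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 */
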
void
xfs_assert_ilocked(
	struct xfs_inode	*ip,
	uint			lock_flags)
{
	/*
	 * Sometimes we assert the ILOCK is held exclusively, but we're in
	 * a workqueue, so lockdep doesn't know we're the owner.
	 */
	if (lock_flags & XFS_ILOCK_SHARED)
		rwsem_assert_held(&ip->i_lock);
	else if (lock_flags & XFS_ILOCK_EXCL)
		rwsem_assert_held_write_nolockdep(&ip->i_lock);

	if (lock_flags & XFS_MMAPLOCK_SHARED)
		rwsem_assert_held(&VFS_I(ip)->i_mapping->invalidate_lock);
	else if (lock_flags & XFS_MMAPLOCK_EXCL)
		rwsem_assert_held_write(&VFS_I(ip)->i_mapping->invalidate_lock);

	if (lock_flags & XFS_IOLOCK_SHARED)
		rwsem_assert_held(&VFS_I(ip)->i_rwsem);
	else if (lock_flags & XFS_IOLOCK_EXCL)
		rwsem_assert_held_write(&VFS_I(ip)->i_rwsem);
}

/*
 * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
 * DEBUG or XFS_WARN is set. And MAX_LOCKDEP_SUBCLASSES is then only defined
 * when CONFIG_LOCKDEP is set. Hence the complex define below to avoid build
 * errors and warnings.
 */
#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
static bool
xfs_lockdep_subclass_ok(
	int subclass)
{
	return subclass < MAX_LOCKDEP_SUBCLASSES;
}
#else
#define xfs_lockdep_subclass_ok(subclass)	(true)
#endif

/*
 * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
 * value. This can be called for any type of inode lock combination, including
 * parent locking. Care must be taken to ensure we don't overrun the subclass
 * storage fields in the class mask we build.
 */
static inline uint
xfs_lock_inumorder(
	uint	lock_mode,
	uint	subclass)
{
	uint	class = 0;

	ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
			      XFS_ILOCK_RTSUM)));
	ASSERT(xfs_lockdep_subclass_ok(subclass));

	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
		ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
		class += subclass << XFS_IOLOCK_SHIFT;
	}

	if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
		ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
		class += subclass << XFS_MMAPLOCK_SHIFT;
	}

	if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) {
		ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
		class += subclass << XFS_ILOCK_SHIFT;
	}

	return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
}

/*
 * The following routine will lock n inodes in exclusive mode.  We assume the
 * caller calls us with the inodes in i_ino order.
 *
 * We need to detect deadlock where an inode that we lock is in the AIL and we
 * start waiting for another inode that is locked by a thread in a long running
 * transaction (such as truncate). This can result in deadlock since the long
 * running trans might need to wait for the inode we just locked in order to
 * push the tail and free space in the log.
 *
 * xfs_lock_inodes() can only be used to lock one type of lock at a time -
 * the iolock, the mmaplock or the ilock, but not more than one at a time. If we
 * lock more than one at a time, lockdep will report false positives saying we
 * have violated locking orders.
 */
static void
xfs_lock_inodes(
	struct xfs_inode	**ips,
	int			inodes,
	uint			lock_mode)
{
	int			attempts = 0;
	uint			i;
	int			j;
	bool			try_lock;
	struct xfs_log_item	*lp;

	/*
	 * Currently supports between 2 and 5 inodes with exclusive locking. We
	 * support an arbitrary depth of locking here, but absolute limits on
	 * inodes depend on the type of locking and the limits placed by
	 * lockdep annotations in xfs_lock_inumorder.  These are all checked by
	 * the asserts.
	 */
	ASSERT(ips && inodes >= 2 && inodes <= 5);
	ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
			    XFS_ILOCK_EXCL));
	ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
			      XFS_ILOCK_SHARED)));
	ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
		inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
	ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
		inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);

	if (lock_mode & XFS_IOLOCK_EXCL) {
		ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
	} else if (lock_mode & XFS_MMAPLOCK_EXCL)
		ASSERT(!(lock_mode & XFS_ILOCK_EXCL));

again:
	try_lock = false;
	i = 0;
	for (; i < inodes; i++) {
		ASSERT(ips[i]);

		if (i && (ips[i] == ips[i - 1]))	/* Already locked */
			continue;

		/*
		 * If try_lock is not set yet, make sure all locked inodes are
		 * not in the AIL.  If any are, set try_lock to be used later.
		 */
		if (!try_lock) {
			for (j = (i - 1); j >= 0 && !try_lock; j--) {
				lp = &ips[j]->i_itemp->ili_item;
				if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
					try_lock = true;
			}
		}

		/*
		 * If any of the previous locks we have locked is in the AIL,
		 * we must TRY to get the second and subsequent locks. If
		 * we can't get any, we must release all we have
		 * and try again.
		 */
		if (!try_lock) {
			xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
			continue;
		}

		/* try_lock means we have an inode locked that is in the AIL. */
		ASSERT(i != 0);
		if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
			continue;

		/*
		 * Unlock all previous guys and try again.  xfs_iunlock will try
		 * to push the tail if the inode is in the AIL.
		 */
		attempts++;
		for (j = i - 1; j >= 0; j--) {
			/*
			 * Check to see if we've already unlocked this one.  Not
			 * the first one going back, and the inode ptr is the
			 * same.
			 */
			if (j != (i - 1) && ips[j] == ips[j + 1])
				continue;

			xfs_iunlock(ips[j], lock_mode);
		}

		if ((attempts % 5) == 0) {
			delay(1); /* Don't just spin the CPU */
		}
		goto again;
	}
}

/*
 * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and
 * mmaplock must be double-locked separately since we use i_rwsem and
 * invalidate_lock for that. We now support taking one lock EXCL and the
 * other SHARED.
 */
void
xfs_lock_two_inodes(
	struct xfs_inode	*ip0,
	uint			ip0_mode,
	struct xfs_inode	*ip1,
	uint			ip1_mode)
{
	int			attempts = 0;
	struct xfs_log_item	*lp;

	ASSERT(hweight32(ip0_mode) == 1);
	ASSERT(hweight32(ip1_mode) == 1);
	ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
	ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
	ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
	ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
	ASSERT(ip0->i_ino != ip1->i_ino);

	if (ip0->i_ino > ip1->i_ino) {
		swap(ip0, ip1);
		swap(ip0_mode, ip1_mode);
	}

again:
	xfs_ilock(ip0, xfs_lock_inumorder(ip0_mode, 0));

	/*
	 * If the first lock we have locked is in the AIL, we must TRY to get
	 * the second lock. If we can't get it, we must release the first one
	 * and try again.
	 */
	lp = &ip0->i_itemp->ili_item;
	if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
		if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
			xfs_iunlock(ip0, ip0_mode);
			if ((++attempts % 5) == 0)
				delay(1); /* Don't just spin the CPU */
			goto again;
		}
	} else {
		xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1));
	}
}
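
/*
 * Illustrative call (hedged, not taken from this file): a caller that needs
 * the ILOCK on two distinct inodes, for instance when moving data between a
 * source and a target file, would do something like:
 *
 *	xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL);
 *
 * The helper orders the inodes by i_ino internally, so the caller does not
 * need to sort its arguments.
 */
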
uint
xfs_ip2xflags(
	struct xfs_inode	*ip)
{
	uint			flags = 0;

	if (ip->i_diflags & XFS_DIFLAG_ANY) {
		if (ip->i_diflags & XFS_DIFLAG_REALTIME)
			flags |= FS_XFLAG_REALTIME;
		if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
			flags |= FS_XFLAG_PREALLOC;
		if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
			flags |= FS_XFLAG_IMMUTABLE;
		if (ip->i_diflags & XFS_DIFLAG_APPEND)
			flags |= FS_XFLAG_APPEND;
		if (ip->i_diflags & XFS_DIFLAG_SYNC)
			flags |= FS_XFLAG_SYNC;
		if (ip->i_diflags & XFS_DIFLAG_NOATIME)
			flags |= FS_XFLAG_NOATIME;
		if (ip->i_diflags & XFS_DIFLAG_NODUMP)
			flags |= FS_XFLAG_NODUMP;
		if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
			flags |= FS_XFLAG_RTINHERIT;
		if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
			flags |= FS_XFLAG_PROJINHERIT;
		if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
			flags |= FS_XFLAG_NOSYMLINKS;
		if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
			flags |= FS_XFLAG_EXTSIZE;
		if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
			flags |= FS_XFLAG_EXTSZINHERIT;
		if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
			flags |= FS_XFLAG_NODEFRAG;
		if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
			flags |= FS_XFLAG_FILESTREAM;
	}

	if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
		if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
			flags |= FS_XFLAG_DAX;
		if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
			flags |= FS_XFLAG_COWEXTSIZE;
	}

	if (xfs_inode_has_attr_fork(ip))
		flags |= FS_XFLAG_HASATTR;
	return flags;
}

/*
 * Looks up an inode from "name".  If ci_name is not NULL, then a CI match
 * is allowed, otherwise it has to be an exact match.  If a CI match is found,
 * ci_name->name will point to the actual name (caller must free) or
 * will be set to NULL if an exact match is found.
 */
int
xfs_lookup(
	struct xfs_inode	*dp,
	const struct xfs_name	*name,
	struct xfs_inode	**ipp,
	struct xfs_name		*ci_name)
{
	xfs_ino_t		inum;
	int			error;

	trace_xfs_lookup(dp, name);

	if (xfs_is_shutdown(dp->i_mount))
		return -EIO;
	if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
		return -EIO;

	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
	if (error)
		goto out_unlock;

	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
	if (error)
		goto out_free_name;

	return 0;

out_free_name:
	if (ci_name)
		kfree(ci_name->name);
out_unlock:
	*ipp = NULL;
	return error;
}
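
/*
 * Usage sketch (illustrative, hypothetical caller): a case-insensitive lookup
 * passes a ci_name buffer and frees the returned name once it is done with it:
 *
 *	struct xfs_name	ci_name = { };
 *
 *	error = xfs_lookup(dp, &xname, &ip, &ci_name);
 *	if (!error && ci_name.name) {
 *		... use the exact on-disk name held in ci_name.name ...
 *		kfree(ci_name.name);
 *	}
 */
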
/*
 * Propagate di_flags from a parent inode to a child inode.
 */
static void
xfs_inode_inherit_flags(
	struct xfs_inode	*ip,
	const struct xfs_inode	*pip)
{
	unsigned int		di_flags = 0;
	xfs_failaddr_t		failaddr;
	umode_t			mode = VFS_I(ip)->i_mode;

	if (S_ISDIR(mode)) {
		if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
			di_flags |= XFS_DIFLAG_RTINHERIT;
		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
			ip->i_extsize = pip->i_extsize;
		}
		if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
			di_flags |= XFS_DIFLAG_PROJINHERIT;
	} else if (S_ISREG(mode)) {
		if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
		    xfs_has_realtime(ip->i_mount))
			di_flags |= XFS_DIFLAG_REALTIME;
		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
			di_flags |= XFS_DIFLAG_EXTSIZE;
			ip->i_extsize = pip->i_extsize;
		}
	}
	if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
	    xfs_inherit_noatime)
		di_flags |= XFS_DIFLAG_NOATIME;
	if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
	    xfs_inherit_nodump)
		di_flags |= XFS_DIFLAG_NODUMP;
	if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
	    xfs_inherit_sync)
		di_flags |= XFS_DIFLAG_SYNC;
	if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
	    xfs_inherit_nosymlinks)
		di_flags |= XFS_DIFLAG_NOSYMLINKS;
	if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
	    xfs_inherit_nodefrag)
		di_flags |= XFS_DIFLAG_NODEFRAG;
	if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
		di_flags |= XFS_DIFLAG_FILESTREAM;

	ip->i_diflags |= di_flags;

	/*
	 * Inode verifiers on older kernels only check that the extent size
	 * hint is an integer multiple of the rt extent size on realtime files.
	 * They did not check the hint alignment on a directory with both
	 * rtinherit and extszinherit flags set.  If the misaligned hint is
	 * propagated from a directory into a new realtime file, new file
	 * allocations will fail due to math errors in the rt allocator and/or
	 * trip the verifiers.  Validate the hint settings in the new file so
	 * that we don't let broken hints propagate.
	 */
	failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
			VFS_I(ip)->i_mode, ip->i_diflags);
	if (failaddr) {
		ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
				   XFS_DIFLAG_EXTSZINHERIT);
		ip->i_extsize = 0;
	}
}

/* Propagate di_flags2 from a parent inode to a child inode. */
static void
xfs_inode_inherit_flags2(
	struct xfs_inode	*ip,
	const struct xfs_inode	*pip)
{
	xfs_failaddr_t		failaddr;

	if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
		ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
		ip->i_cowextsize = pip->i_cowextsize;
	}
	if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
		ip->i_diflags2 |= XFS_DIFLAG2_DAX;

	/* Don't let invalid cowextsize hints propagate. */
	failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
			VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
	if (failaddr) {
		ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
		ip->i_cowextsize = 0;
	}
}
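
/*
 * Worked example (illustrative only): a directory carrying
 * XFS_DIFLAG_RTINHERIT | XFS_DIFLAG_EXTSZINHERIT with i_extsize = 8 blocks
 * produces a regular-file child with XFS_DIFLAG_REALTIME | XFS_DIFLAG_EXTSIZE
 * and i_extsize = 8 (the realtime flag only when the filesystem actually has
 * a realtime device).  If the inherited hint fails
 * xfs_inode_validate_extsize(), it is dropped as described above.
 */
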
/*
 * Initialise a newly allocated inode and return the in-core inode to the
 * caller locked exclusively.
 */
int
xfs_init_new_inode(
	struct mnt_idmap	*idmap,
	struct xfs_trans	*tp,
	struct xfs_inode	*pip,
	xfs_ino_t		ino,
	umode_t			mode,
	xfs_nlink_t		nlink,
	dev_t			rdev,
	prid_t			prid,
	bool			init_xattrs,
	struct xfs_inode	**ipp)
{
	struct inode		*dir = pip ? VFS_I(pip) : NULL;
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_inode	*ip;
	unsigned int		flags;
	int			error;
	struct timespec64	tv;
	struct inode		*inode;

	/*
	 * Protect against obviously corrupt allocation btree records. Later
	 * xfs_iget checks will catch re-allocation of other active in-memory
	 * and on-disk inodes. If we don't catch reallocating the parent inode
	 * here we will deadlock in xfs_iget() so we have to do these checks
	 * first.
	 */
	if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
		xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
		xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
				XFS_SICK_AG_INOBT);
		return -EFSCORRUPTED;
	}

	/*
	 * Get the in-core inode with the lock held exclusively to prevent
	 * others from looking at until we're done.
	 */
	error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
	if (error)
		return error;

	ASSERT(ip != NULL);
	inode = VFS_I(ip);
	set_nlink(inode, nlink);
	inode->i_rdev = rdev;
	ip->i_projid = prid;

	if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
		inode_fsuid_set(inode, idmap);
		inode->i_gid = dir->i_gid;
		inode->i_mode = mode;
	} else {
		inode_init_owner(idmap, inode, dir, mode);
	}

	/*
	 * If the group ID of the new file does not match the effective group
	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
	 * (and only if the irix_sgid_inherit compatibility variable is set).
	 */
	if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
	    !vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)))
		inode->i_mode &= ~S_ISGID;

	ip->i_disk_size = 0;
	ip->i_df.if_nextents = 0;
	ASSERT(ip->i_nblocks == 0);

	tv = inode_set_ctime_current(inode);
	inode_set_mtime_to_ts(inode, tv);
	inode_set_atime_to_ts(inode, tv);

	ip->i_extsize = 0;
	ip->i_diflags = 0;

	if (xfs_has_v3inodes(mp)) {
		inode_set_iversion(inode, 1);
		ip->i_cowextsize = 0;
		ip->i_crtime = tv;
	}

	flags = XFS_ILOG_CORE;
	switch (mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		ip->i_df.if_format = XFS_DINODE_FMT_DEV;
		flags |= XFS_ILOG_DEV;
		break;
	case S_IFREG:
	case S_IFDIR:
		if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
			xfs_inode_inherit_flags(ip, pip);
		if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
			xfs_inode_inherit_flags2(ip, pip);
		fallthrough;
	case S_IFLNK:
		ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
		ip->i_df.if_bytes = 0;
		ip->i_df.if_data = NULL;
		break;
	default:
		ASSERT(0);
	}

	/*
	 * If we need to create attributes immediately after allocating the
	 * inode, initialise an empty attribute fork right now. We use the
	 * default fork offset for attributes here as we don't know exactly what
	 * size or how many attributes we might be adding. We can do this
	 * safely here because we know the data fork is completely empty and
	 * this saves us from needing to run a separate transaction to set the
	 * fork offset in the immediate future.
	 */
	if (init_xattrs && xfs_has_attr(mp)) {
		ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
		xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
	}

	/*
	 * Log the new values stuffed into the inode.
	 */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, flags);

	/* now that we have an i_mode we can setup the inode structure */
	xfs_setup_inode(ip);

	*ipp = ip;
	return 0;
}

/*
 * Decrement the link count on an inode & log the change.  If this causes the
 * link count to go to zero, move the inode to AGI unlinked list so that it can
 * be freed when the last active reference goes away via xfs_inactive().
 */
static int			/* error */
xfs_droplink(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip)
{
	if (VFS_I(ip)->i_nlink == 0) {
		xfs_alert(ip->i_mount,
			  "%s: Attempt to drop inode (%llu) with nlink zero.",
			  __func__, ip->i_ino);
		return -EFSCORRUPTED;
	}

	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

	drop_nlink(VFS_I(ip));
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	if (VFS_I(ip)->i_nlink)
		return 0;

	return xfs_iunlink(tp, ip);
}

/*
 * Increment the link count on an inode & log the change.
 */
static void
xfs_bumplink(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip)
{
	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

	inc_nlink(VFS_I(ip));
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}

#ifdef CONFIG_XFS_LIVE_HOOKS
/*
 * Use a static key here to reduce the overhead of directory live update hooks.
 * If the compiler supports jump labels, the static branch will be replaced by
 * a nop sled when there are no hook users.  Online fsck is currently the only
 * caller, so this is a reasonable tradeoff.
 *
 * Note: Patching the kernel code requires taking the cpu hotplug lock.  Other
 * parts of the kernel allocate memory with that lock held, which means that
 * XFS callers cannot hold any locks that might be used by memory reclaim or
 * writeback when calling the static_branch_{inc,dec} functions.
 */
DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch);

void
xfs_dir_hook_disable(void)
{
	xfs_hooks_switch_off(&xfs_dir_hooks_switch);
}

void
xfs_dir_hook_enable(void)
{
	xfs_hooks_switch_on(&xfs_dir_hooks_switch);
}

/* Call hooks for a directory update relating to a child dirent update. */
inline void
xfs_dir_update_hook(
	struct xfs_inode	*dp,
	struct xfs_inode	*ip,
	int			delta,
	const struct xfs_name	*name)
{
	if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) {
		struct xfs_dir_update_params	p = {
			.dp		= dp,
			.ip		= ip,
			.delta		= delta,
			.name		= name,
		};
		struct xfs_mount	*mp = ip->i_mount;

		xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p);
	}
}

/* Call the specified function during a directory update. */
int
xfs_dir_hook_add(
	struct xfs_mount	*mp,
	struct xfs_dir_hook	*hook)
{
	return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook);
}

/* Stop calling the specified function during a directory update. */
void
xfs_dir_hook_del(
	struct xfs_mount	*mp,
	struct xfs_dir_hook	*hook)
{
	xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook);
}

/* Configure directory update hook functions. */
void
xfs_dir_hook_setup(
	struct xfs_dir_hook	*hook,
	notifier_fn_t		mod_fn)
{
	xfs_hook_setup(&hook->dirent_hook, mod_fn);
}
#endif /* CONFIG_XFS_LIVE_HOOKS */
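
/*
 * Registration sketch (illustrative; the notifier function name below is
 * hypothetical): an online fsck component that wants dirent update
 * notifications would wire up a hook roughly like this:
 *
 *	struct xfs_dir_hook	hook;
 *
 *	xfs_dir_hook_setup(&hook, xchk_dir_update_fn);
 *	error = xfs_dir_hook_add(mp, &hook);
 *	...
 *	xfs_dir_hook_del(mp, &hook);
 */
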
int
xfs_create(
	struct mnt_idmap	*idmap,
	xfs_inode_t		*dp,
	struct xfs_name		*name,
	umode_t			mode,
	dev_t			rdev,
	bool			init_xattrs,
	xfs_inode_t		**ipp)
{
	int			is_dir = S_ISDIR(mode);
	struct xfs_mount	*mp = dp->i_mount;
	struct xfs_inode	*ip = NULL;
	struct xfs_trans	*tp = NULL;
	int			error;
	bool			unlock_dp_on_error = false;
	prid_t			prid;
	struct xfs_dquot	*udqp = NULL;
	struct xfs_dquot	*gdqp = NULL;
	struct xfs_dquot	*pdqp = NULL;
	struct xfs_trans_res	*tres;
	uint			resblks;
	xfs_ino_t		ino;

	trace_xfs_create(dp, name);

	if (xfs_is_shutdown(mp))
		return -EIO;
	if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
		return -EIO;

	prid = xfs_get_initial_prid(dp);

	/*
	 * Make sure that we have allocated dquot(s) on disk.
	 */
	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
			mapped_fsgid(idmap, &init_user_ns), prid,
			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
			&udqp, &gdqp, &pdqp);
	if (error)
		return error;

	if (is_dir) {
		resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
		tres = &M_RES(mp)->tr_mkdir;
	} else {
		resblks = XFS_CREATE_SPACE_RES(mp, name->len);
		tres = &M_RES(mp)->tr_create;
	}

	/*
	 * Initially assume that the file does not exist and
	 * reserve the resources for that case.  If that is not
	 * the case we'll drop the one we have and get a more
	 * appropriate transaction later.
	 */
	error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
			&tp);
	if (error == -ENOSPC) {
		/* flush outstanding delalloc blocks and retry */
		xfs_flush_inodes(mp);
		error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
				resblks, &tp);
	}
	if (error)
		goto out_release_dquots;

	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
	unlock_dp_on_error = true;

	/*
	 * A newly created regular or special file just has one directory
	 * entry pointing to them, but a directory also has the "." entry
	 * pointing to itself.
	 */
	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
	if (!error)
		error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
				is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip);
	if (error)
		goto out_trans_cancel;

	/*
	 * Now we join the directory inode to the transaction.  We do not do it
	 * earlier because xfs_dialloc might commit the previous transaction
	 * (and release all the locks).  An error from here on will result in
	 * the transaction cancel unlocking dp so don't do it explicitly in the
	 * error path.
	 */
	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
	unlock_dp_on_error = false;

	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
				   resblks - XFS_IALLOC_SPACE_RES(mp));
	if (error) {
		ASSERT(error != -ENOSPC);
		goto out_trans_cancel;
	}
	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

	if (is_dir) {
		error = xfs_dir_init(tp, ip, dp);
		if (error)
			goto out_trans_cancel;

		xfs_bumplink(tp, dp);
	}

	/*
	 * Create ip with a reference from dp, and add '.' and '..'
	 * references if it's a directory.
	 */
	xfs_dir_update_hook(dp, ip, 1, name);

	/*
	 * If this is a synchronous mount, make sure that the
	 * create transaction goes to disk before returning to
	 * the user.
	 */
	if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
		xfs_trans_set_sync(tp);

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * These ids of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);

	error = xfs_trans_commit(tp);
	if (error)
		goto out_release_inode;

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	*ipp = ip;
	return 0;

out_trans_cancel:
	xfs_trans_cancel(tp);
out_release_inode:
	/*
	 * Wait until after the current transaction is aborted to finish the
	 * setup of the inode and release the inode.  This prevents recursive
	 * transactions and deadlocks from xfs_inactive.
	 */
	if (ip) {
		xfs_finish_inode_setup(ip);
		xfs_irele(ip);
	}
out_release_dquots:
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	if (unlock_dp_on_error)
		xfs_iunlock(dp, XFS_ILOCK_EXCL);
	return error;
}

int
xfs_create_tmpfile(
	struct mnt_idmap	*idmap,
	struct xfs_inode	*dp,
	umode_t			mode,
	struct xfs_inode	**ipp)
{
	struct xfs_mount	*mp = dp->i_mount;
	struct xfs_inode	*ip = NULL;
	struct xfs_trans	*tp = NULL;
	int			error;
	prid_t			prid;
	struct xfs_dquot	*udqp = NULL;
	struct xfs_dquot	*gdqp = NULL;
	struct xfs_dquot	*pdqp = NULL;
	struct xfs_trans_res	*tres;
	uint			resblks;
	xfs_ino_t		ino;

	if (xfs_is_shutdown(mp))
		return -EIO;

	prid = xfs_get_initial_prid(dp);

	/*
	 * Make sure that we have allocated dquot(s) on disk.
	 */
	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
			mapped_fsgid(idmap, &init_user_ns), prid,
			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
			&udqp, &gdqp, &pdqp);
	if (error)
		return error;

	resblks = XFS_IALLOC_SPACE_RES(mp);
	tres = &M_RES(mp)->tr_create_tmpfile;

	error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
			&tp);
	if (error)
		goto out_release_dquots;

	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
	if (!error)
		error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
				0, 0, prid, false, &ip);
	if (error)
		goto out_trans_cancel;

	if (xfs_has_wsync(mp))
		xfs_trans_set_sync(tp);

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * These ids of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);

	error = xfs_iunlink(tp, ip);
	if (error)
		goto out_trans_cancel;

	error = xfs_trans_commit(tp);
	if (error)
		goto out_release_inode;

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	*ipp = ip;
	return 0;

out_trans_cancel:
	xfs_trans_cancel(tp);
out_release_inode:
	/*
	 * Wait until after the current transaction is aborted to finish the
	 * setup of the inode and release the inode.
	 * This prevents recursive transactions and deadlocks from xfs_inactive.
	 */
	if (ip) {
		xfs_finish_inode_setup(ip);
		xfs_irele(ip);
	}
out_release_dquots:
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	return error;
}

int
xfs_link(
	xfs_inode_t		*tdp,
	xfs_inode_t		*sip,
	struct xfs_name		*target_name)
{
	xfs_mount_t		*mp = tdp->i_mount;
	xfs_trans_t		*tp;
	int			error, nospace_error = 0;
	int			resblks;

	trace_xfs_link(tdp, target_name);

	ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));

	if (xfs_is_shutdown(mp))
		return -EIO;
	if (xfs_ifork_zapped(tdp, XFS_DATA_FORK))
		return -EIO;

	error = xfs_qm_dqattach(sip);
	if (error)
		goto std_return;

	error = xfs_qm_dqattach(tdp);
	if (error)
		goto std_return;

	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
	error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
			&tp, &nospace_error);
	if (error)
		goto std_return;

	/*
	 * If we are using project inheritance, we only allow hard link
	 * creation in our tree when the project IDs are the same; else
	 * the tree quota mechanism could be circumvented.
	 */
	if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
		     tdp->i_projid != sip->i_projid)) {
		error = -EXDEV;
		goto error_return;
	}

	if (!resblks) {
		error = xfs_dir_canenter(tp, tdp, target_name);
		if (error)
			goto error_return;
	}

	/*
	 * Handle initial link state of O_TMPFILE inode
	 */
	if (VFS_I(sip)->i_nlink == 0) {
		struct xfs_perag	*pag;

		pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino));
		error = xfs_iunlink_remove(tp, pag, sip);
		xfs_perag_put(pag);
		if (error)
			goto error_return;
	}

	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
				   resblks);
	if (error)
		goto error_return;
	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);

	xfs_bumplink(tp, sip);
	xfs_dir_update_hook(tdp, sip, 1, target_name);

	/*
	 * If this is a synchronous mount, make sure that the
	 * link transaction goes to disk before returning to
	 * the user.
	 */
	if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
		xfs_trans_set_sync(tp);

	return xfs_trans_commit(tp);

error_return:
	xfs_trans_cancel(tp);
std_return:
	if (error == -ENOSPC && nospace_error)
		error = nospace_error;
	return error;
}

/* Clear the reflink flag and the cowblocks tag if possible. */
static void
xfs_itruncate_clear_reflink_flags(
	struct xfs_inode	*ip)
{
	struct xfs_ifork	*dfork;
	struct xfs_ifork	*cfork;

	if (!xfs_is_reflink_inode(ip))
		return;
	dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK);
	cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
	if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
		ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
	if (cfork->if_bytes == 0)
		xfs_inode_clear_cowblocks_tag(ip);
}

/*
 * Free up the underlying blocks past new_size.  The new size must be smaller
 * than the current size.  This routine can be used both for the attribute and
 * data fork, and does not modify the inode size, which is left to the caller.
 *
 * The transaction passed to this routine must have made a permanent log
 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
 * given transaction and start new ones, so make sure everything involved in
 * the transaction is tidy before calling here.  Some transaction will be
 * returned to the caller to be committed.  The incoming transaction must
 * already include the inode, and both inode locks must be held exclusively.
 * The inode must also be "held" within the transaction.  On return the inode
 * will be "held" within the returned transaction.  This routine does NOT
 * require any disk space to be reserved for it within the transaction.
 *
 * If we get an error, we must return with the inode locked and linked into the
 * current transaction. This keeps things simple for the higher level code,
 * because it always knows that the inode is locked and held in the transaction
 * that returns to it whether errors occur or not.  We don't mark the inode
 * dirty on error so that transactions can be easily aborted if possible.
 */
int
xfs_itruncate_extents_flags(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_fsize_t		new_size,
	int			flags)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp = *tpp;
	xfs_fileoff_t		first_unmap_block;
	int			error = 0;

	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
	if (atomic_read(&VFS_I(ip)->i_count))
		xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
	ASSERT(new_size <= XFS_ISIZE(ip));
	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
	ASSERT(ip->i_itemp != NULL);
	ASSERT(ip->i_itemp->ili_lock_flags == 0);
	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));

	trace_xfs_itruncate_extents_start(ip, new_size);

	flags |= xfs_bmapi_aflag(whichfork);

	/*
	 * Since it is possible for space to become allocated beyond
	 * the end of the file (in a crash where the space is allocated
	 * but the inode size is not yet updated), simply remove any
	 * blocks which show up between the new EOF and the maximum
	 * possible file size.
	 *
	 * We have to free all the blocks to the bmbt maximum offset, even if
	 * the page cache can't scale that far.
	 */
	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
	if (!xfs_verify_fileoff(mp, first_unmap_block)) {
		WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
		return 0;
	}

	error = xfs_bunmapi_range(&tp, ip, flags, first_unmap_block,
			XFS_MAX_FILEOFF);
	if (error)
		goto out;

	if (whichfork == XFS_DATA_FORK) {
		/* Remove all pending CoW reservations. */
		error = xfs_reflink_cancel_cow_blocks(ip, &tp,
				first_unmap_block, XFS_MAX_FILEOFF, true);
		if (error)
			goto out;

		xfs_itruncate_clear_reflink_flags(ip);
	}

	/*
	 * Always re-log the inode so that our permanent transaction can keep
	 * on rolling it forward in the log.
	 */
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	trace_xfs_itruncate_extents_end(ip, new_size);

out:
	*tpp = tp;
	return error;
}
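
/*
 * Caller pattern (illustrative; compare xfs_inactive_truncate() below): the
 * inode is joined to a permanent transaction, the extents are unmapped, and
 * whatever transaction comes back out is the one that gets committed:
 *
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	xfs_trans_ijoin(tp, ip, 0);
 *	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, new_size);
 *	if (!error)
 *		error = xfs_trans_commit(tp);
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 */
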
int
xfs_release(
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp = ip->i_mount;
	int		error = 0;

	if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
		return 0;

	/* If this is a read-only mount, don't do this (would generate I/O) */
	if (xfs_is_readonly(mp))
		return 0;

	if (!xfs_is_shutdown(mp)) {
		int truncated;

		/*
		 * If we previously truncated this file and removed old data
		 * in the process, we want to initiate "early" writeout on
		 * the last close.  This is an attempt to combat the notorious
		 * NULL files problem which is particularly noticeable from a
		 * truncate down, buffered (re-)write (delalloc), followed by
		 * a crash.  What we are effectively doing here is
		 * significantly reducing the time window where we'd otherwise
		 * be exposed to that problem.
		 */
		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
		if (truncated) {
			xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
			if (ip->i_delayed_blks > 0) {
				error = filemap_flush(VFS_I(ip)->i_mapping);
				if (error)
					return error;
			}
		}
	}

	if (VFS_I(ip)->i_nlink == 0)
		return 0;

	/*
	 * If we can't get the iolock just skip truncating the blocks past EOF
	 * because we could deadlock with the mmap_lock otherwise. We'll get
	 * another chance to drop them once the last reference to the inode is
	 * dropped, so we'll never leak blocks permanently.
	 */
	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
		return 0;

	if (xfs_can_free_eofblocks(ip, false)) {
		/*
		 * If the inode is being opened, written and closed frequently
		 * and we have delayed allocation blocks outstanding
		 * (e.g. streaming writes from the NFS server), truncating the
		 * blocks past EOF will cause fragmentation to occur.
		 *
		 * In this case don't do the truncation, but we have to be
		 * careful how we detect this case. Blocks beyond EOF show up as
		 * i_delayed_blks even when the inode is clean, so we need to
		 * truncate them away first before checking for a dirty release.
		 * Hence on the first dirty close we will still remove the
		 * speculative allocation, but after that we will leave it in
		 * place.
		 */
		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
			goto out_unlock;

		error = xfs_free_eofblocks(ip);
		if (error)
			goto out_unlock;

		/* delalloc blocks after truncation means it really is dirty */
		if (ip->i_delayed_blks)
			xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
	}

out_unlock:
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	return error;
}

/*
 * xfs_inactive_truncate
 *
 * Called to perform a truncate when an inode becomes unlinked.
 */
STATIC int
xfs_inactive_truncate(
	struct xfs_inode *ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
	if (error) {
		ASSERT(xfs_is_shutdown(mp));
		return error;
	}
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/*
	 * Log the inode size first to prevent stale data exposure in the event
	 * of a system crash before the truncate completes.
	 * See the related comment in xfs_vn_setattr_size() for details.
	 */
	ip->i_disk_size = 0;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
	if (error)
		goto error_trans_cancel;

	ASSERT(ip->i_df.if_nextents == 0);

	error = xfs_trans_commit(tp);
	if (error)
		goto error_unlock;

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return 0;

error_trans_cancel:
	xfs_trans_cancel(tp);
error_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}

/*
 * xfs_inactive_ifree()
 *
 * Perform the inode free when an inode is unlinked.
 */
STATIC int
xfs_inactive_ifree(
	struct xfs_inode *ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;

	/*
	 * We try to use a per-AG reservation for any block needed by the finobt
	 * tree, but as the finobt feature predates the per-AG reservation
	 * support a degraded file system might not have enough space for the
	 * reservation at mount time.  In that case try to dip into the reserved
	 * pool and pray.
	 *
	 * Send a warning if the reservation does happen to fail, as the inode
	 * now remains allocated and sits on the unlinked list until the fs is
	 * repaired.
	 */
	if (unlikely(mp->m_finobt_nores)) {
		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
				XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
				&tp);
	} else {
		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
	}
	if (error) {
		if (error == -ENOSPC) {
			xfs_warn_ratelimited(mp,
			"Failed to remove inode(s) from unlinked list. "
			"Please free space, unmount and run xfs_repair.");
		} else {
			ASSERT(xfs_is_shutdown(mp));
		}
		return error;
	}

	/*
	 * We do not hold the inode locked across the entire rolling transaction
	 * here. We only need to hold it for the first transaction that
	 * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
	 * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
	 * here breaks the relationship between cluster buffer invalidation and
	 * stale inode invalidation on cluster buffer item journal commit
	 * completion, and can result in leaving dirty stale inodes hanging
	 * around in memory.
	 *
	 * We have no need for serialising this inode operation against other
	 * operations - we freed the inode and hence reallocation is required
	 * and that will serialise on reallocating the space the deferops need
	 * to free. Hence we can unlock the inode on the first commit of
	 * the transaction rather than roll it right through the deferops. This
	 * avoids relogging the XFS_ISTALE inode.
	 *
	 * We check that xfs_ifree() hasn't grown an internal transaction roll
	 * by asserting that the inode is still locked when it returns.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	error = xfs_ifree(tp, ip);
	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
	if (error) {
		/*
		 * If we fail to free the inode, shut down.  The cancel
		 * might do that, we need to make sure.  Otherwise the
		 * inode might be lost for a long time or forever.
		 */
		if (!xfs_is_shutdown(mp)) {
			xfs_notice(mp, "%s: xfs_ifree returned error %d",
				__func__, error);
			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		}
		xfs_trans_cancel(tp);
		return error;
	}

	/*
	 * Credit the quota account(s). The inode is gone.
	 */
	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);

	return xfs_trans_commit(tp);
}

/*
 * Returns true if we need to update the on-disk metadata before we can free
 * the memory used by this inode.  Updates include freeing post-eof
 * preallocations; freeing COW staging extents; and marking the inode free in
 * the inobt if it is on the unlinked list.
 */
bool
xfs_inode_needs_inactive(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);

	/*
	 * If the inode is already free, then there can be nothing
	 * to clean up here.
	 */
	if (VFS_I(ip)->i_mode == 0)
		return false;

	/*
	 * If this is a read-only mount, don't do this (would generate I/O)
	 * unless we're in log recovery and cleaning the iunlinked list.
	 */
	if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
		return false;

	/* If the log isn't running, push inodes straight to reclaim. */
	if (xfs_is_shutdown(mp) || xfs_has_norecovery(mp))
		return false;

	/* Metadata inodes require explicit resource cleanup. */
	if (xfs_is_metadata_inode(ip))
		return false;

	/* Want to clean out the cow blocks if there are any. */
	if (cow_ifp && cow_ifp->if_bytes > 0)
		return true;

	/* Unlinked files must be freed. */
	if (VFS_I(ip)->i_nlink == 0)
		return true;

	/*
	 * This file isn't being freed, so check if there are post-eof blocks
	 * to free.  @force is true because we are evicting an inode from the
	 * cache.  Post-eof blocks must be freed, lest we end up with broken
	 * free space accounting.
	 *
	 * Note: don't bother with iolock here since lockdep complains about
	 * acquiring it in reclaim context. We have the only reference to the
	 * inode at this point anyways.
	 */
	return xfs_can_free_eofblocks(ip, true);
}

/*
 * Save health status somewhere, if we're dumping an inode with uncorrected
 * errors and online repair isn't running.
 */
static inline void
xfs_inactive_health(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;
	unsigned int		sick;
	unsigned int		checked;

	xfs_inode_measure_sickness(ip, &sick, &checked);
	if (!sick)
		return;

	trace_xfs_inode_unfixed_corruption(ip, sick);

	if (sick & XFS_SICK_INO_FORGET)
		return;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	if (!pag) {
		/* There had better still be a perag structure! */
		ASSERT(0);
		return;
	}

	xfs_ag_mark_sick(pag, XFS_SICK_AG_INODES);
	xfs_perag_put(pag);
}

/*
 * xfs_inactive
 *
 * This is called when the vnode reference count for the vnode
 * goes to zero.  If the file has been unlinked, then it must
 * now be truncated.  Also, we clear all of the read-ahead state
 * kept for the inode here since the file is now closed.
 */
int
xfs_inactive(
	xfs_inode_t	*ip)
{
	struct xfs_mount	*mp;
	int			error = 0;
	int			truncate = 0;

	/*
	 * If the inode is already free, then there can be nothing
	 * to clean up here.
	 */
	if (VFS_I(ip)->i_mode == 0) {
		ASSERT(ip->i_df.if_broot_bytes == 0);
		goto out;
	}

	mp = ip->i_mount;
	ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));

	xfs_inactive_health(ip);

	/*
	 * If this is a read-only mount, don't do this (would generate I/O)
	 * unless we're in log recovery and cleaning the iunlinked list.
	 */
	if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
		goto out;

	/* Metadata inodes require explicit resource cleanup. */
	if (xfs_is_metadata_inode(ip))
		goto out;

	/* Try to clean out the cow blocks if there are any. */
	if (xfs_inode_has_cow_data(ip))
		xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);

	if (VFS_I(ip)->i_nlink != 0) {
		/*
		 * force is true because we are evicting an inode from the
		 * cache. Post-eof blocks must be freed, lest we end up with
		 * broken free space accounting.
		 *
		 * Note: don't bother with iolock here since lockdep complains
		 * about acquiring it in reclaim context. We have the only
		 * reference to the inode at this point anyways.
		 */
		if (xfs_can_free_eofblocks(ip, true))
			error = xfs_free_eofblocks(ip);

		goto out;
	}

	if (S_ISREG(VFS_I(ip)->i_mode) &&
	    (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 ||
	     ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
		truncate = 1;

	if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) {
		/*
		 * If this inode is being inactivated during a quotacheck and
		 * has not yet been scanned by quotacheck, we /must/ remove
		 * the dquots from the inode before inactivation changes the
		 * block and inode counts.  Most probably this is a result of
		 * reloading the incore iunlinked list to purge unrecovered
		 * unlinked inodes.
		 */
		xfs_qm_dqdetach(ip);
	} else {
		error = xfs_qm_dqattach(ip);
		if (error)
			goto out;
	}

	if (S_ISLNK(VFS_I(ip)->i_mode))
		error = xfs_inactive_symlink(ip);
	else if (truncate)
		error = xfs_inactive_truncate(ip);
	if (error)
		goto out;

	/*
	 * If there are attributes associated with the file then blow them away
	 * now.  The code calls a routine that recursively deconstructs the
	 * attribute fork.  It also blows away the in-core attribute fork.
	 */
	if (xfs_inode_has_attr_fork(ip)) {
		error = xfs_attr_inactive(ip);
		if (error)
			goto out;
	}

	ASSERT(ip->i_forkoff == 0);

	/*
	 * Free the inode.
	 */
	error = xfs_inactive_ifree(ip);

out:
	/*
	 * We're done making metadata updates for this inode, so we can release
	 * the attached dquots.
	 */
	xfs_qm_dqdetach(ip);
	return error;
}

/*
 * In-Core Unlinked List Lookups
 * =============================
 *
 * Every inode is supposed to be reachable from some other piece of metadata
 * with the exception of the root directory.  Inodes with a connection to a
 * file descriptor but not linked from anywhere in the on-disk directory tree
 * are collectively known as unlinked inodes, though the filesystem itself
 * maintains links to these inodes so that on-disk metadata are consistent.
1899 * 1900 * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI 1901 * header contains a number of buckets that point to an inode, and each inode 1902 * record has a pointer to the next inode in the hash chain. This 1903 * singly-linked list causes scaling problems in the iunlink remove function 1904 * because we must walk that list to find the inode that points to the inode 1905 * being removed from the unlinked hash bucket list. 1906 * 1907 * Hence we keep an in-memory double linked list to link each inode on an 1908 * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer 1909 * based lists would require having 64 list heads in the perag, one for each 1910 * list. This is expensive in terms of memory (think millions of AGs) and cache 1911 * misses on lookups. Instead, use the fact that inodes on the unlinked list 1912 * must be referenced at the VFS level to keep them on the list and hence we 1913 * have an existence guarantee for inodes on the unlinked list. 1914 * 1915 * Given we have an existence guarantee, we can use lockless inode cache lookups 1916 * to resolve aginos to xfs inodes. This means we only need 8 bytes per inode 1917 * for the double linked unlinked list, and we don't need any extra locking to 1918 * keep the list safe as all manipulations are done under the AGI buffer lock. 1919 * Keeping the list up to date does not require memory allocation, just finding 1920 * the XFS inode and updating the next/prev unlinked list aginos. 1921 */ 1922 1923 /* 1924 * Find an inode on the unlinked list. This does not take references to the 1925 * inode as we have existence guarantees by holding the AGI buffer lock and that 1926 * only unlinked, referenced inodes can be on the unlinked inode list. If we 1927 * don't find the inode in cache, then let the caller handle the situation. 1928 */ 1929 static struct xfs_inode * 1930 xfs_iunlink_lookup( 1931 struct xfs_perag *pag, 1932 xfs_agino_t agino) 1933 { 1934 struct xfs_inode *ip; 1935 1936 rcu_read_lock(); 1937 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 1938 if (!ip) { 1939 /* Caller can handle inode not being in memory. */ 1940 rcu_read_unlock(); 1941 return NULL; 1942 } 1943 1944 /* 1945 * Inode in RCU freeing limbo should not happen. Warn about this and 1946 * let the caller handle the failure. 1947 */ 1948 if (WARN_ON_ONCE(!ip->i_ino)) { 1949 rcu_read_unlock(); 1950 return NULL; 1951 } 1952 ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)); 1953 rcu_read_unlock(); 1954 return ip; 1955 } 1956 1957 /* 1958 * Update the prev pointer of the next agino. Returns -ENOLINK if the inode 1959 * is not in cache. 1960 */ 1961 static int 1962 xfs_iunlink_update_backref( 1963 struct xfs_perag *pag, 1964 xfs_agino_t prev_agino, 1965 xfs_agino_t next_agino) 1966 { 1967 struct xfs_inode *ip; 1968 1969 /* No update necessary if we are at the end of the list. */ 1970 if (next_agino == NULLAGINO) 1971 return 0; 1972 1973 ip = xfs_iunlink_lookup(pag, next_agino); 1974 if (!ip) 1975 return -ENOLINK; 1976 1977 ip->i_prev_unlinked = prev_agino; 1978 return 0; 1979 } 1980 1981 /* 1982 * Point the AGI unlinked bucket at an inode and log the results. The caller 1983 * is responsible for validating the old value. 
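 *
 * For reference, callers pick the bucket as
 *
 *	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
 *
 * so with the 64 buckets described above, an agino of 131 would hash to
 * bucket 3 (131 % 64 == 3). The numbers here are purely illustrative.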
1984 */ 1985 STATIC int 1986 xfs_iunlink_update_bucket( 1987 struct xfs_trans *tp, 1988 struct xfs_perag *pag, 1989 struct xfs_buf *agibp, 1990 unsigned int bucket_index, 1991 xfs_agino_t new_agino) 1992 { 1993 struct xfs_agi *agi = agibp->b_addr; 1994 xfs_agino_t old_value; 1995 int offset; 1996 1997 ASSERT(xfs_verify_agino_or_null(pag, new_agino)); 1998 1999 old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2000 trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index, 2001 old_value, new_agino); 2002 2003 /* 2004 * We should never find the head of the list already set to the value 2005 * passed in because either we're adding or removing ourselves from the 2006 * head of the list. 2007 */ 2008 if (old_value == new_agino) { 2009 xfs_buf_mark_corrupt(agibp); 2010 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2011 return -EFSCORRUPTED; 2012 } 2013 2014 agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); 2015 offset = offsetof(struct xfs_agi, agi_unlinked) + 2016 (sizeof(xfs_agino_t) * bucket_index); 2017 xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); 2018 return 0; 2019 } 2020 2021 /* 2022 * Load the inode @next_agino into the cache and set its prev_unlinked pointer 2023 * to @prev_agino. Caller must hold the AGI to synchronize with other changes 2024 * to the unlinked list. 2025 */ 2026 STATIC int 2027 xfs_iunlink_reload_next( 2028 struct xfs_trans *tp, 2029 struct xfs_buf *agibp, 2030 xfs_agino_t prev_agino, 2031 xfs_agino_t next_agino) 2032 { 2033 struct xfs_perag *pag = agibp->b_pag; 2034 struct xfs_mount *mp = pag->pag_mount; 2035 struct xfs_inode *next_ip = NULL; 2036 xfs_ino_t ino; 2037 int error; 2038 2039 ASSERT(next_agino != NULLAGINO); 2040 2041 #ifdef DEBUG 2042 rcu_read_lock(); 2043 next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino); 2044 ASSERT(next_ip == NULL); 2045 rcu_read_unlock(); 2046 #endif 2047 2048 xfs_info_ratelimited(mp, 2049 "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.", 2050 next_agino, pag->pag_agno); 2051 2052 /* 2053 * Use an untrusted lookup just to be cautious in case the AGI has been 2054 * corrupted and now points at a free inode. That shouldn't happen, 2055 * but we'd rather shut down now since we're already running in a weird 2056 * situation. 2057 */ 2058 ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino); 2059 error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip); 2060 if (error) { 2061 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2062 return error; 2063 } 2064 2065 /* If this is not an unlinked inode, something is very wrong. 
*/ 2066 if (VFS_I(next_ip)->i_nlink != 0) { 2067 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2068 error = -EFSCORRUPTED; 2069 goto rele; 2070 } 2071 2072 next_ip->i_prev_unlinked = prev_agino; 2073 trace_xfs_iunlink_reload_next(next_ip); 2074 rele: 2075 ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE)); 2076 if (xfs_is_quotacheck_running(mp) && next_ip) 2077 xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED); 2078 xfs_irele(next_ip); 2079 return error; 2080 } 2081 2082 static int 2083 xfs_iunlink_insert_inode( 2084 struct xfs_trans *tp, 2085 struct xfs_perag *pag, 2086 struct xfs_buf *agibp, 2087 struct xfs_inode *ip) 2088 { 2089 struct xfs_mount *mp = tp->t_mountp; 2090 struct xfs_agi *agi = agibp->b_addr; 2091 xfs_agino_t next_agino; 2092 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 2093 short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 2094 int error; 2095 2096 /* 2097 * Get the index into the agi hash table for the list this inode will 2098 * go on. Make sure the pointer isn't garbage and that this inode 2099 * isn't already on the list. 2100 */ 2101 next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2102 if (next_agino == agino || 2103 !xfs_verify_agino_or_null(pag, next_agino)) { 2104 xfs_buf_mark_corrupt(agibp); 2105 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2106 return -EFSCORRUPTED; 2107 } 2108 2109 /* 2110 * Update the prev pointer in the next inode to point back to this 2111 * inode. 2112 */ 2113 error = xfs_iunlink_update_backref(pag, agino, next_agino); 2114 if (error == -ENOLINK) 2115 error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino); 2116 if (error) 2117 return error; 2118 2119 if (next_agino != NULLAGINO) { 2120 /* 2121 * There is already another inode in the bucket, so point this 2122 * inode to the current head of the list. 2123 */ 2124 error = xfs_iunlink_log_inode(tp, ip, pag, next_agino); 2125 if (error) 2126 return error; 2127 ip->i_next_unlinked = next_agino; 2128 } 2129 2130 /* Point the head of the list to point to this inode. */ 2131 ip->i_prev_unlinked = NULLAGINO; 2132 return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); 2133 } 2134 2135 /* 2136 * This is called when the inode's link count has gone to 0 or we are creating 2137 * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. 2138 * 2139 * We place the on-disk inode on a list in the AGI. It will be pulled from this 2140 * list when the inode is freed. 2141 */ 2142 STATIC int 2143 xfs_iunlink( 2144 struct xfs_trans *tp, 2145 struct xfs_inode *ip) 2146 { 2147 struct xfs_mount *mp = tp->t_mountp; 2148 struct xfs_perag *pag; 2149 struct xfs_buf *agibp; 2150 int error; 2151 2152 ASSERT(VFS_I(ip)->i_nlink == 0); 2153 ASSERT(VFS_I(ip)->i_mode != 0); 2154 trace_xfs_iunlink(ip); 2155 2156 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2157 2158 /* Get the agi buffer first. It ensures lock ordering on the list. 
*/ 2159 error = xfs_read_agi(pag, tp, &agibp); 2160 if (error) 2161 goto out; 2162 2163 error = xfs_iunlink_insert_inode(tp, pag, agibp, ip); 2164 out: 2165 xfs_perag_put(pag); 2166 return error; 2167 } 2168 2169 static int 2170 xfs_iunlink_remove_inode( 2171 struct xfs_trans *tp, 2172 struct xfs_perag *pag, 2173 struct xfs_buf *agibp, 2174 struct xfs_inode *ip) 2175 { 2176 struct xfs_mount *mp = tp->t_mountp; 2177 struct xfs_agi *agi = agibp->b_addr; 2178 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 2179 xfs_agino_t head_agino; 2180 short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 2181 int error; 2182 2183 trace_xfs_iunlink_remove(ip); 2184 2185 /* 2186 * Get the index into the agi hash table for the list this inode will 2187 * go on. Make sure the head pointer isn't garbage. 2188 */ 2189 head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2190 if (!xfs_verify_agino(pag, head_agino)) { 2191 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, 2192 agi, sizeof(*agi)); 2193 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2194 return -EFSCORRUPTED; 2195 } 2196 2197 /* 2198 * Set our inode's next_unlinked pointer to NULL and then return 2199 * the old pointer value so that we can update whatever was previous 2200 * to us in the list to point to whatever was next in the list. 2201 */ 2202 error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO); 2203 if (error) 2204 return error; 2205 2206 /* 2207 * Update the prev pointer in the next inode to point back to previous 2208 * inode in the chain. 2209 */ 2210 error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, 2211 ip->i_next_unlinked); 2212 if (error == -ENOLINK) 2213 error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked, 2214 ip->i_next_unlinked); 2215 if (error) 2216 return error; 2217 2218 if (head_agino != agino) { 2219 struct xfs_inode *prev_ip; 2220 2221 prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked); 2222 if (!prev_ip) { 2223 xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); 2224 return -EFSCORRUPTED; 2225 } 2226 2227 error = xfs_iunlink_log_inode(tp, prev_ip, pag, 2228 ip->i_next_unlinked); 2229 prev_ip->i_next_unlinked = ip->i_next_unlinked; 2230 } else { 2231 /* Point the head of the list to the next unlinked inode. */ 2232 error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, 2233 ip->i_next_unlinked); 2234 } 2235 2236 ip->i_next_unlinked = NULLAGINO; 2237 ip->i_prev_unlinked = 0; 2238 return error; 2239 } 2240 2241 /* 2242 * Pull the on-disk inode from the AGI unlinked list. 2243 */ 2244 STATIC int 2245 xfs_iunlink_remove( 2246 struct xfs_trans *tp, 2247 struct xfs_perag *pag, 2248 struct xfs_inode *ip) 2249 { 2250 struct xfs_buf *agibp; 2251 int error; 2252 2253 trace_xfs_iunlink_remove(ip); 2254 2255 /* Get the agi buffer first. It ensures lock ordering on the list. */ 2256 error = xfs_read_agi(pag, tp, &agibp); 2257 if (error) 2258 return error; 2259 2260 return xfs_iunlink_remove_inode(tp, pag, agibp, ip); 2261 } 2262 2263 /* 2264 * Look up the inode number specified and if it is not already marked XFS_ISTALE 2265 * mark it stale. We should only find clean inodes in this lookup that aren't 2266 * already stale. 
2267 */ 2268 static void 2269 xfs_ifree_mark_inode_stale( 2270 struct xfs_perag *pag, 2271 struct xfs_inode *free_ip, 2272 xfs_ino_t inum) 2273 { 2274 struct xfs_mount *mp = pag->pag_mount; 2275 struct xfs_inode_log_item *iip; 2276 struct xfs_inode *ip; 2277 2278 retry: 2279 rcu_read_lock(); 2280 ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); 2281 2282 /* Inode not in memory, nothing to do */ 2283 if (!ip) { 2284 rcu_read_unlock(); 2285 return; 2286 } 2287 2288 /* 2289 * because this is an RCU protected lookup, we could find a recently 2290 * freed or even reallocated inode during the lookup. We need to check 2291 * under the i_flags_lock for a valid inode here. Skip it if it is not 2292 * valid, the wrong inode or stale. 2293 */ 2294 spin_lock(&ip->i_flags_lock); 2295 if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) 2296 goto out_iflags_unlock; 2297 2298 /* 2299 * Don't try to lock/unlock the current inode, but we _cannot_ skip the 2300 * other inodes that we did not find in the list attached to the buffer 2301 * and are not already marked stale. If we can't lock it, back off and 2302 * retry. 2303 */ 2304 if (ip != free_ip) { 2305 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2306 spin_unlock(&ip->i_flags_lock); 2307 rcu_read_unlock(); 2308 delay(1); 2309 goto retry; 2310 } 2311 } 2312 ip->i_flags |= XFS_ISTALE; 2313 2314 /* 2315 * If the inode is flushing, it is already attached to the buffer. All 2316 * we needed to do here is mark the inode stale so buffer IO completion 2317 * will remove it from the AIL. 2318 */ 2319 iip = ip->i_itemp; 2320 if (__xfs_iflags_test(ip, XFS_IFLUSHING)) { 2321 ASSERT(!list_empty(&iip->ili_item.li_bio_list)); 2322 ASSERT(iip->ili_last_fields); 2323 goto out_iunlock; 2324 } 2325 2326 /* 2327 * Inodes not attached to the buffer can be released immediately. 2328 * Everything else has to go through xfs_iflush_abort() on journal 2329 * commit as the flock synchronises removal of the inode from the 2330 * cluster buffer against inode reclaim. 2331 */ 2332 if (!iip || list_empty(&iip->ili_item.li_bio_list)) 2333 goto out_iunlock; 2334 2335 __xfs_iflags_set(ip, XFS_IFLUSHING); 2336 spin_unlock(&ip->i_flags_lock); 2337 rcu_read_unlock(); 2338 2339 /* we have a dirty inode in memory that has not yet been flushed. */ 2340 spin_lock(&iip->ili_lock); 2341 iip->ili_last_fields = iip->ili_fields; 2342 iip->ili_fields = 0; 2343 iip->ili_fsync_fields = 0; 2344 spin_unlock(&iip->ili_lock); 2345 ASSERT(iip->ili_last_fields); 2346 2347 if (ip != free_ip) 2348 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2349 return; 2350 2351 out_iunlock: 2352 if (ip != free_ip) 2353 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2354 out_iflags_unlock: 2355 spin_unlock(&ip->i_flags_lock); 2356 rcu_read_unlock(); 2357 } 2358 2359 /* 2360 * A big issue when freeing the inode cluster is that we _cannot_ skip any 2361 * inodes that are in memory - they all must be marked stale and attached to 2362 * the cluster buffer. 
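 *
 * The walk below therefore goes buffer by buffer: the inode chunk is
 * covered by nbufs = ialloc_blks / blocks_per_cluster cluster buffers,
 * and for each one we lock the buffer first and only then mark the
 * in-memory inodes that map to it stale. As a purely illustrative
 * example, a 64-inode chunk backed by two cluster buffers would be
 * processed as two such lock-then-mark passes.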
2363 */ 2364 static int 2365 xfs_ifree_cluster( 2366 struct xfs_trans *tp, 2367 struct xfs_perag *pag, 2368 struct xfs_inode *free_ip, 2369 struct xfs_icluster *xic) 2370 { 2371 struct xfs_mount *mp = free_ip->i_mount; 2372 struct xfs_ino_geometry *igeo = M_IGEO(mp); 2373 struct xfs_buf *bp; 2374 xfs_daddr_t blkno; 2375 xfs_ino_t inum = xic->first_ino; 2376 int nbufs; 2377 int i, j; 2378 int ioffset; 2379 int error; 2380 2381 nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster; 2382 2383 for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) { 2384 /* 2385 * The allocation bitmap tells us which inodes of the chunk were 2386 * physically allocated. Skip the cluster if an inode falls into 2387 * a sparse region. 2388 */ 2389 ioffset = inum - xic->first_ino; 2390 if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { 2391 ASSERT(ioffset % igeo->inodes_per_cluster == 0); 2392 continue; 2393 } 2394 2395 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 2396 XFS_INO_TO_AGBNO(mp, inum)); 2397 2398 /* 2399 * We obtain and lock the backing buffer first in the process 2400 * here to ensure dirty inodes attached to the buffer remain in 2401 * the flushing state while we mark them stale. 2402 * 2403 * If we scan the in-memory inodes first, then buffer IO can 2404 * complete before we get a lock on it, and hence we may fail 2405 * to mark all the active inodes on the buffer stale. 2406 */ 2407 error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2408 mp->m_bsize * igeo->blocks_per_cluster, 2409 XBF_UNMAPPED, &bp); 2410 if (error) 2411 return error; 2412 2413 /* 2414 * This buffer may not have been correctly initialised as we 2415 * didn't read it from disk. That's not important because we are 2416 * only using it to mark the buffer as stale in the log, and to 2417 * attach stale cached inodes on it. That means it will never be 2418 * dispatched for IO. If it is, we want to know about it, and we 2419 * want it to fail. We can achieve this by adding a write 2420 * verifier to the buffer. 2421 */ 2422 bp->b_ops = &xfs_inode_buf_ops; 2423 2424 /* 2425 * Now we need to set all the cached clean inodes as XFS_ISTALE, 2426 * too. This requires lookups, and will skip inodes that we've 2427 * already marked XFS_ISTALE. 2428 */ 2429 for (i = 0; i < igeo->inodes_per_cluster; i++) 2430 xfs_ifree_mark_inode_stale(pag, free_ip, inum + i); 2431 2432 xfs_trans_stale_inode_buf(tp, bp); 2433 xfs_trans_binval(tp, bp); 2434 } 2435 return 0; 2436 } 2437 2438 /* 2439 * This is called to return an inode to the inode free list. The inode should 2440 * already be truncated to 0 length and have no pages associated with it. This 2441 * routine also assumes that the inode is already a part of the transaction. 2442 * 2443 * The on-disk copy of the inode will have been added to the list of unlinked 2444 * inodes in the AGI. We need to remove the inode from that list atomically with 2445 * respect to freeing it here.
2446 */ 2447 int 2448 xfs_ifree( 2449 struct xfs_trans *tp, 2450 struct xfs_inode *ip) 2451 { 2452 struct xfs_mount *mp = ip->i_mount; 2453 struct xfs_perag *pag; 2454 struct xfs_icluster xic = { 0 }; 2455 struct xfs_inode_log_item *iip = ip->i_itemp; 2456 int error; 2457 2458 xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); 2459 ASSERT(VFS_I(ip)->i_nlink == 0); 2460 ASSERT(ip->i_df.if_nextents == 0); 2461 ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); 2462 ASSERT(ip->i_nblocks == 0); 2463 2464 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2465 2466 /* 2467 * Free the inode first so that we guarantee that the AGI lock is going 2468 * to be taken before we remove the inode from the unlinked list. This 2469 * makes the AGI lock -> unlinked list modification order the same as 2470 * used in O_TMPFILE creation. 2471 */ 2472 error = xfs_difree(tp, pag, ip->i_ino, &xic); 2473 if (error) 2474 goto out; 2475 2476 error = xfs_iunlink_remove(tp, pag, ip); 2477 if (error) 2478 goto out; 2479 2480 /* 2481 * Free any local-format data sitting around before we reset the 2482 * data fork to extents format. Note that the attr fork data has 2483 * already been freed by xfs_attr_inactive. 2484 */ 2485 if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { 2486 kfree(ip->i_df.if_data); 2487 ip->i_df.if_data = NULL; 2488 ip->i_df.if_bytes = 0; 2489 } 2490 2491 VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ 2492 ip->i_diflags = 0; 2493 ip->i_diflags2 = mp->m_ino_geo.new_diflags2; 2494 ip->i_forkoff = 0; /* mark the attr fork not in use */ 2495 ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; 2496 if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS)) 2497 xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS); 2498 2499 /* Don't attempt to replay owner changes for a deleted inode */ 2500 spin_lock(&iip->ili_lock); 2501 iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER); 2502 spin_unlock(&iip->ili_lock); 2503 2504 /* 2505 * Bump the generation count so no one will be confused 2506 * by reincarnations of this inode. 2507 */ 2508 VFS_I(ip)->i_generation++; 2509 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2510 2511 if (xic.deleted) 2512 error = xfs_ifree_cluster(tp, pag, ip, &xic); 2513 out: 2514 xfs_perag_put(pag); 2515 return error; 2516 } 2517 2518 /* 2519 * This is called to unpin an inode. The caller must have the inode locked 2520 * in at least shared mode so that the buffer cannot be subsequently pinned 2521 * once someone is waiting for it to be unpinned. 
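 *
 * Note that "unpinning" is not done directly here; the helper below just
 * kicks a log force up to the inode's ili_commit_seq. Once that log I/O
 * completes the pin count can drop to zero, which is what the wait loop
 * in __xfs_iunpin_wait() is watching for.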
2522 */ 2523 static void 2524 xfs_iunpin( 2525 struct xfs_inode *ip) 2526 { 2527 xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); 2528 2529 trace_xfs_inode_unpin_nowait(ip, _RET_IP_); 2530 2531 /* Give the log a push to start the unpinning I/O */ 2532 xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL); 2533 2534 } 2535 2536 static void 2537 __xfs_iunpin_wait( 2538 struct xfs_inode *ip) 2539 { 2540 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); 2541 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); 2542 2543 xfs_iunpin(ip); 2544 2545 do { 2546 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 2547 if (xfs_ipincount(ip)) 2548 io_schedule(); 2549 } while (xfs_ipincount(ip)); 2550 finish_wait(wq, &wait.wq_entry); 2551 } 2552 2553 void 2554 xfs_iunpin_wait( 2555 struct xfs_inode *ip) 2556 { 2557 if (xfs_ipincount(ip)) 2558 __xfs_iunpin_wait(ip); 2559 } 2560 2561 /* 2562 * Removing an inode from the namespace involves removing the directory entry 2563 * and dropping the link count on the inode. Removing the directory entry can 2564 * result in locking an AGF (directory blocks were freed) and removing a link 2565 * count can result in placing the inode on an unlinked list which results in 2566 * locking an AGI. 2567 * 2568 * The big problem here is that we have an ordering constraint on AGF and AGI 2569 * locking - inode allocation locks the AGI, then can allocate a new extent for 2570 * new inodes, locking the AGF after the AGI. Similarly, freeing the inode 2571 * removes the inode from the unlinked list, requiring that we lock the AGI 2572 * first, and then freeing the inode can result in an inode chunk being freed 2573 * and hence freeing disk space requiring that we lock an AGF. 2574 * 2575 * Hence the ordering that is imposed by other parts of the code is AGI before 2576 * AGF. This means we cannot remove the directory entry before we drop the inode 2577 * reference count and put it on the unlinked list as this results in a lock 2578 * order of AGF then AGI, and this can deadlock against inode allocation and 2579 * freeing. Therefore we must drop the link counts before we remove the 2580 * directory entry. 2581 * 2582 * This is still safe from a transactional point of view - it is not until we 2583 * get to xfs_defer_finish() that we have the possibility of multiple 2584 * transactions in this operation. Hence as long as we remove the directory 2585 * entry and drop the link count in the first transaction of the remove 2586 * operation, there are no transactional constraints on the ordering here. 2587 */ 2588 int 2589 xfs_remove( 2590 xfs_inode_t *dp, 2591 struct xfs_name *name, 2592 xfs_inode_t *ip) 2593 { 2594 xfs_mount_t *mp = dp->i_mount; 2595 xfs_trans_t *tp = NULL; 2596 int is_dir = S_ISDIR(VFS_I(ip)->i_mode); 2597 int dontcare; 2598 int error = 0; 2599 uint resblks; 2600 2601 trace_xfs_remove(dp, name); 2602 2603 if (xfs_is_shutdown(mp)) 2604 return -EIO; 2605 if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) 2606 return -EIO; 2607 2608 error = xfs_qm_dqattach(dp); 2609 if (error) 2610 goto std_return; 2611 2612 error = xfs_qm_dqattach(ip); 2613 if (error) 2614 goto std_return; 2615 2616 /* 2617 * We try to get the real space reservation first, allowing for 2618 * directory btree deletion(s) implying possible bmap insert(s). 
If we 2619 * can't get the space reservation then we use 0 instead, and avoid the 2620 * bmap btree insert(s) in the directory code by, if the bmap insert 2621 * tries to happen, instead trimming the LAST block from the directory. 2622 * 2623 * Ignore EDQUOT and ENOSPC being returned via nospace_error because 2624 * the directory code can handle a reservationless update and we don't 2625 * want to prevent a user from trying to free space by deleting things. 2626 */ 2627 resblks = XFS_REMOVE_SPACE_RES(mp); 2628 error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks, 2629 &tp, &dontcare); 2630 if (error) { 2631 ASSERT(error != -ENOSPC); 2632 goto std_return; 2633 } 2634 2635 /* 2636 * If we're removing a directory perform some additional validation. 2637 */ 2638 if (is_dir) { 2639 ASSERT(VFS_I(ip)->i_nlink >= 2); 2640 if (VFS_I(ip)->i_nlink != 2) { 2641 error = -ENOTEMPTY; 2642 goto out_trans_cancel; 2643 } 2644 if (!xfs_dir_isempty(ip)) { 2645 error = -ENOTEMPTY; 2646 goto out_trans_cancel; 2647 } 2648 2649 /* Drop the link from ip's "..". */ 2650 error = xfs_droplink(tp, dp); 2651 if (error) 2652 goto out_trans_cancel; 2653 2654 /* Drop the "." link from ip to self. */ 2655 error = xfs_droplink(tp, ip); 2656 if (error) 2657 goto out_trans_cancel; 2658 2659 /* 2660 * Point the unlinked child directory's ".." entry to the root 2661 * directory to eliminate back-references to inodes that may 2662 * get freed before the child directory is closed. If the fs 2663 * gets shrunk, this can lead to dirent inode validation errors. 2664 */ 2665 if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) { 2666 error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, 2667 tp->t_mountp->m_sb.sb_rootino, 0); 2668 if (error) 2669 goto out_trans_cancel; 2670 } 2671 } else { 2672 /* 2673 * When removing a non-directory we need to log the parent 2674 * inode here. For a directory this is done implicitly 2675 * by the xfs_droplink call for the ".." entry. 2676 */ 2677 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2678 } 2679 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2680 2681 /* Drop the link from dp to ip. */ 2682 error = xfs_droplink(tp, ip); 2683 if (error) 2684 goto out_trans_cancel; 2685 2686 error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks); 2687 if (error) { 2688 ASSERT(error != -ENOENT); 2689 goto out_trans_cancel; 2690 } 2691 2692 /* 2693 * Drop the link from dp to ip, and if ip was a directory, remove the 2694 * '.' and '..' references since we freed the directory. 2695 */ 2696 xfs_dir_update_hook(dp, ip, -1, name); 2697 2698 /* 2699 * If this is a synchronous mount, make sure that the 2700 * remove transaction goes to disk before returning to 2701 * the user. 2702 */ 2703 if (xfs_has_wsync(mp) || xfs_has_dirsync(mp)) 2704 xfs_trans_set_sync(tp); 2705 2706 error = xfs_trans_commit(tp); 2707 if (error) 2708 goto std_return; 2709 2710 if (is_dir && xfs_inode_is_filestream(ip)) 2711 xfs_filestream_deassociate(ip); 2712 2713 return 0; 2714 2715 out_trans_cancel: 2716 xfs_trans_cancel(tp); 2717 std_return: 2718 return error; 2719 } 2720 2721 /* 2722 * Enter all inodes for a rename transaction into a sorted array. 
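 *
 * A small, made-up example: for a rename where dp1->i_ino = 17,
 * dp2->i_ino = 5, ip1->i_ino = 23 and there is no ip2 or wip, the table
 * comes out as { 5, 17, 23 } and *num_inodes is 3. The inode locks are
 * later taken in that ascending inode number order, which keeps
 * concurrent renames from deadlocking against each other.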
2723 */ 2724 #define __XFS_SORT_INODES 5 2725 STATIC void 2726 xfs_sort_for_rename( 2727 struct xfs_inode *dp1, /* in: old (source) directory inode */ 2728 struct xfs_inode *dp2, /* in: new (target) directory inode */ 2729 struct xfs_inode *ip1, /* in: inode of old entry */ 2730 struct xfs_inode *ip2, /* in: inode of new entry */ 2731 struct xfs_inode *wip, /* in: whiteout inode */ 2732 struct xfs_inode **i_tab,/* out: sorted array of inodes */ 2733 int *num_inodes) /* in/out: inodes in array */ 2734 { 2735 int i, j; 2736 2737 ASSERT(*num_inodes == __XFS_SORT_INODES); 2738 memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *)); 2739 2740 /* 2741 * i_tab contains a list of pointers to inodes. We initialize 2742 * the table here & we'll sort it. We will then use it to 2743 * order the acquisition of the inode locks. 2744 * 2745 * Note that the table may contain duplicates. e.g., dp1 == dp2. 2746 */ 2747 i = 0; 2748 i_tab[i++] = dp1; 2749 i_tab[i++] = dp2; 2750 i_tab[i++] = ip1; 2751 if (ip2) 2752 i_tab[i++] = ip2; 2753 if (wip) 2754 i_tab[i++] = wip; 2755 *num_inodes = i; 2756 2757 /* 2758 * Sort the elements via bubble sort. (Remember, there are at 2759 * most 5 elements to sort, so this is adequate.) 2760 */ 2761 for (i = 0; i < *num_inodes; i++) { 2762 for (j = 1; j < *num_inodes; j++) { 2763 if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { 2764 struct xfs_inode *temp = i_tab[j]; 2765 i_tab[j] = i_tab[j-1]; 2766 i_tab[j-1] = temp; 2767 } 2768 } 2769 } 2770 } 2771 2772 static int 2773 xfs_finish_rename( 2774 struct xfs_trans *tp) 2775 { 2776 /* 2777 * If this is a synchronous mount, make sure that the rename transaction 2778 * goes to disk before returning to the user. 2779 */ 2780 if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp)) 2781 xfs_trans_set_sync(tp); 2782 2783 return xfs_trans_commit(tp); 2784 } 2785 2786 /* 2787 * xfs_cross_rename() 2788 * 2789 * responsible for handling RENAME_EXCHANGE flag in renameat2() syscall 2790 */ 2791 STATIC int 2792 xfs_cross_rename( 2793 struct xfs_trans *tp, 2794 struct xfs_inode *dp1, 2795 struct xfs_name *name1, 2796 struct xfs_inode *ip1, 2797 struct xfs_inode *dp2, 2798 struct xfs_name *name2, 2799 struct xfs_inode *ip2, 2800 int spaceres) 2801 { 2802 int error = 0; 2803 int ip1_flags = 0; 2804 int ip2_flags = 0; 2805 int dp2_flags = 0; 2806 2807 /* Swap inode number for dirent in first parent */ 2808 error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres); 2809 if (error) 2810 goto out_trans_abort; 2811 2812 /* Swap inode number for dirent in second parent */ 2813 error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres); 2814 if (error) 2815 goto out_trans_abort; 2816 2817 /* 2818 * If we're renaming one or more directories across different parents, 2819 * update the respective ".." entries (and link counts) to match the new 2820 * parents. 2821 */ 2822 if (dp1 != dp2) { 2823 dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2824 2825 if (S_ISDIR(VFS_I(ip2)->i_mode)) { 2826 error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, 2827 dp1->i_ino, spaceres); 2828 if (error) 2829 goto out_trans_abort; 2830 2831 /* transfer ip2 ".." 
reference to dp1 */ 2832 if (!S_ISDIR(VFS_I(ip1)->i_mode)) { 2833 error = xfs_droplink(tp, dp2); 2834 if (error) 2835 goto out_trans_abort; 2836 xfs_bumplink(tp, dp1); 2837 } 2838 2839 /* 2840 * Although ip1 isn't changed here, userspace needs 2841 * to be warned about the change, so that applications 2842 * relying on it (like backup ones), will properly 2843 * notify the change 2844 */ 2845 ip1_flags |= XFS_ICHGTIME_CHG; 2846 ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2847 } 2848 2849 if (S_ISDIR(VFS_I(ip1)->i_mode)) { 2850 error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, 2851 dp2->i_ino, spaceres); 2852 if (error) 2853 goto out_trans_abort; 2854 2855 /* transfer ip1 ".." reference to dp2 */ 2856 if (!S_ISDIR(VFS_I(ip2)->i_mode)) { 2857 error = xfs_droplink(tp, dp1); 2858 if (error) 2859 goto out_trans_abort; 2860 xfs_bumplink(tp, dp2); 2861 } 2862 2863 /* 2864 * Although ip2 isn't changed here, userspace needs 2865 * to be warned about the change, so that applications 2866 * relying on it (like backup ones), will properly 2867 * notify the change 2868 */ 2869 ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2870 ip2_flags |= XFS_ICHGTIME_CHG; 2871 } 2872 } 2873 2874 if (ip1_flags) { 2875 xfs_trans_ichgtime(tp, ip1, ip1_flags); 2876 xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); 2877 } 2878 if (ip2_flags) { 2879 xfs_trans_ichgtime(tp, ip2, ip2_flags); 2880 xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE); 2881 } 2882 if (dp2_flags) { 2883 xfs_trans_ichgtime(tp, dp2, dp2_flags); 2884 xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE); 2885 } 2886 xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2887 xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); 2888 2889 /* 2890 * Inform our hook clients that we've finished an exchange operation as 2891 * follows: removed the source and target files from their directories; 2892 * added the target to the source directory; and added the source to 2893 * the target directory. All inodes are locked, so it's ok to model a 2894 * rename this way so long as we say we deleted entries before we add 2895 * new ones. 2896 */ 2897 xfs_dir_update_hook(dp1, ip1, -1, name1); 2898 xfs_dir_update_hook(dp2, ip2, -1, name2); 2899 xfs_dir_update_hook(dp1, ip2, 1, name1); 2900 xfs_dir_update_hook(dp2, ip1, 1, name2); 2901 2902 return xfs_finish_rename(tp); 2903 2904 out_trans_abort: 2905 xfs_trans_cancel(tp); 2906 return error; 2907 } 2908 2909 /* 2910 * xfs_rename_alloc_whiteout() 2911 * 2912 * Return a referenced, unlinked, unlocked inode that can be used as a 2913 * whiteout in a rename transaction. We use a tmpfile inode here so that if we 2914 * crash between allocating the inode and linking it into the rename transaction 2915 * recovery will free the inode and we won't leak it. 2916 */ 2917 static int 2918 xfs_rename_alloc_whiteout( 2919 struct mnt_idmap *idmap, 2920 struct xfs_name *src_name, 2921 struct xfs_inode *dp, 2922 struct xfs_inode **wip) 2923 { 2924 struct xfs_inode *tmpfile; 2925 struct qstr name; 2926 int error; 2927 2928 error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE, 2929 &tmpfile); 2930 if (error) 2931 return error; 2932 2933 name.name = src_name->name; 2934 name.len = src_name->len; 2935 error = xfs_inode_init_security(VFS_I(tmpfile), VFS_I(dp), &name); 2936 if (error) { 2937 xfs_finish_inode_setup(tmpfile); 2938 xfs_irele(tmpfile); 2939 return error; 2940 } 2941 2942 /* 2943 * Prepare the tmpfile inode as if it were created through the VFS. 2944 * Complete the inode setup and flag it as linkable. 
nlink is already 2945 * zero, so we can skip the drop_nlink. 2946 */ 2947 xfs_setup_iops(tmpfile); 2948 xfs_finish_inode_setup(tmpfile); 2949 VFS_I(tmpfile)->i_state |= I_LINKABLE; 2950 2951 *wip = tmpfile; 2952 return 0; 2953 } 2954 2955 /* 2956 * xfs_rename 2957 */ 2958 int 2959 xfs_rename( 2960 struct mnt_idmap *idmap, 2961 struct xfs_inode *src_dp, 2962 struct xfs_name *src_name, 2963 struct xfs_inode *src_ip, 2964 struct xfs_inode *target_dp, 2965 struct xfs_name *target_name, 2966 struct xfs_inode *target_ip, 2967 unsigned int flags) 2968 { 2969 struct xfs_mount *mp = src_dp->i_mount; 2970 struct xfs_trans *tp; 2971 struct xfs_inode *wip = NULL; /* whiteout inode */ 2972 struct xfs_inode *inodes[__XFS_SORT_INODES]; 2973 int i; 2974 int num_inodes = __XFS_SORT_INODES; 2975 bool new_parent = (src_dp != target_dp); 2976 bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); 2977 int spaceres; 2978 bool retried = false; 2979 int error, nospace_error = 0; 2980 2981 trace_xfs_rename(src_dp, target_dp, src_name, target_name); 2982 2983 if ((flags & RENAME_EXCHANGE) && !target_ip) 2984 return -EINVAL; 2985 2986 /* 2987 * If we are doing a whiteout operation, allocate the whiteout inode 2988 * we will be placing at the target and ensure the type is set 2989 * appropriately. 2990 */ 2991 if (flags & RENAME_WHITEOUT) { 2992 error = xfs_rename_alloc_whiteout(idmap, src_name, 2993 target_dp, &wip); 2994 if (error) 2995 return error; 2996 2997 /* setup target dirent info as whiteout */ 2998 src_name->type = XFS_DIR3_FT_CHRDEV; 2999 } 3000 3001 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, 3002 inodes, &num_inodes); 3003 3004 retry: 3005 nospace_error = 0; 3006 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); 3007 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp); 3008 if (error == -ENOSPC) { 3009 nospace_error = error; 3010 spaceres = 0; 3011 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0, 3012 &tp); 3013 } 3014 if (error) 3015 goto out_release_wip; 3016 3017 /* 3018 * Attach the dquots to the inodes 3019 */ 3020 error = xfs_qm_vop_rename_dqattach(inodes); 3021 if (error) 3022 goto out_trans_cancel; 3023 3024 /* 3025 * Lock all the participating inodes. Depending upon whether 3026 * the target_name exists in the target directory, and 3027 * whether the target directory is the same as the source 3028 * directory, we can lock from 2 to 5 inodes. 3029 */ 3030 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); 3031 3032 /* 3033 * Join all the inodes to the transaction. From this point on, 3034 * we can rely on either trans_commit or trans_cancel to unlock 3035 * them. 3036 */ 3037 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); 3038 if (new_parent) 3039 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); 3040 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); 3041 if (target_ip) 3042 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); 3043 if (wip) 3044 xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL); 3045 3046 /* 3047 * If we are using project inheritance, we only allow renames 3048 * into our tree when the project IDs are the same; else the 3049 * tree quota mechanism would be circumvented. 3050 */ 3051 if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) && 3052 target_dp->i_projid != src_ip->i_projid)) { 3053 error = -EXDEV; 3054 goto out_trans_cancel; 3055 } 3056 3057 /* RENAME_EXCHANGE is unique from here on. 
*/ 3058 if (flags & RENAME_EXCHANGE) 3059 return xfs_cross_rename(tp, src_dp, src_name, src_ip, 3060 target_dp, target_name, target_ip, 3061 spaceres); 3062 3063 /* 3064 * Try to reserve quota to handle an expansion of the target directory. 3065 * We'll allow the rename to continue in reservationless mode if we hit 3066 * a space usage constraint. If we trigger reservationless mode, save 3067 * the errno if there isn't any free space in the target directory. 3068 */ 3069 if (spaceres != 0) { 3070 error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres, 3071 0, false); 3072 if (error == -EDQUOT || error == -ENOSPC) { 3073 if (!retried) { 3074 xfs_trans_cancel(tp); 3075 xfs_blockgc_free_quota(target_dp, 0); 3076 retried = true; 3077 goto retry; 3078 } 3079 3080 nospace_error = error; 3081 spaceres = 0; 3082 error = 0; 3083 } 3084 if (error) 3085 goto out_trans_cancel; 3086 } 3087 3088 /* 3089 * Check for expected errors before we dirty the transaction 3090 * so we can return an error without a transaction abort. 3091 */ 3092 if (target_ip == NULL) { 3093 /* 3094 * If there's no space reservation, check the entry will 3095 * fit before actually inserting it. 3096 */ 3097 if (!spaceres) { 3098 error = xfs_dir_canenter(tp, target_dp, target_name); 3099 if (error) 3100 goto out_trans_cancel; 3101 } 3102 } else { 3103 /* 3104 * If target exists and it's a directory, check that whether 3105 * it can be destroyed. 3106 */ 3107 if (S_ISDIR(VFS_I(target_ip)->i_mode) && 3108 (!xfs_dir_isempty(target_ip) || 3109 (VFS_I(target_ip)->i_nlink > 2))) { 3110 error = -EEXIST; 3111 goto out_trans_cancel; 3112 } 3113 } 3114 3115 /* 3116 * Lock the AGI buffers we need to handle bumping the nlink of the 3117 * whiteout inode off the unlinked list and to handle dropping the 3118 * nlink of the target inode. Per locking order rules, do this in 3119 * increasing AG order and before directory block allocation tries to 3120 * grab AGFs because we grab AGIs before AGFs. 3121 * 3122 * The (vfs) caller must ensure that if src is a directory then 3123 * target_ip is either null or an empty directory. 3124 */ 3125 for (i = 0; i < num_inodes && inodes[i] != NULL; i++) { 3126 if (inodes[i] == wip || 3127 (inodes[i] == target_ip && 3128 (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) { 3129 struct xfs_perag *pag; 3130 struct xfs_buf *bp; 3131 3132 pag = xfs_perag_get(mp, 3133 XFS_INO_TO_AGNO(mp, inodes[i]->i_ino)); 3134 error = xfs_read_agi(pag, tp, &bp); 3135 xfs_perag_put(pag); 3136 if (error) 3137 goto out_trans_cancel; 3138 } 3139 } 3140 3141 /* 3142 * Directory entry creation below may acquire the AGF. Remove 3143 * the whiteout from the unlinked list first to preserve correct 3144 * AGI/AGF locking order. This dirties the transaction so failures 3145 * after this point will abort and log recovery will clean up the 3146 * mess. 3147 * 3148 * For whiteouts, we need to bump the link count on the whiteout 3149 * inode. After this point, we have a real link, clear the tmpfile 3150 * state flag from the inode so it doesn't accidentally get misused 3151 * in future. 3152 */ 3153 if (wip) { 3154 struct xfs_perag *pag; 3155 3156 ASSERT(VFS_I(wip)->i_nlink == 0); 3157 3158 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino)); 3159 error = xfs_iunlink_remove(tp, pag, wip); 3160 xfs_perag_put(pag); 3161 if (error) 3162 goto out_trans_cancel; 3163 3164 xfs_bumplink(tp, wip); 3165 VFS_I(wip)->i_state &= ~I_LINKABLE; 3166 } 3167 3168 /* 3169 * Set up the target. 
3170 */ 3171 if (target_ip == NULL) { 3172 /* 3173 * If target does not exist and the rename crosses 3174 * directories, adjust the target directory link count 3175 * to account for the ".." reference from the new entry. 3176 */ 3177 error = xfs_dir_createname(tp, target_dp, target_name, 3178 src_ip->i_ino, spaceres); 3179 if (error) 3180 goto out_trans_cancel; 3181 3182 xfs_trans_ichgtime(tp, target_dp, 3183 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3184 3185 if (new_parent && src_is_directory) { 3186 xfs_bumplink(tp, target_dp); 3187 } 3188 } else { /* target_ip != NULL */ 3189 /* 3190 * Link the source inode under the target name. 3191 * If the source inode is a directory and we are moving 3192 * it across directories, its ".." entry will be 3193 * inconsistent until we replace that down below. 3194 * 3195 * In case there is already an entry with the same 3196 * name at the destination directory, remove it first. 3197 */ 3198 error = xfs_dir_replace(tp, target_dp, target_name, 3199 src_ip->i_ino, spaceres); 3200 if (error) 3201 goto out_trans_cancel; 3202 3203 xfs_trans_ichgtime(tp, target_dp, 3204 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3205 3206 /* 3207 * Decrement the link count on the target since the target 3208 * dir no longer points to it. 3209 */ 3210 error = xfs_droplink(tp, target_ip); 3211 if (error) 3212 goto out_trans_cancel; 3213 3214 if (src_is_directory) { 3215 /* 3216 * Drop the link from the old "." entry. 3217 */ 3218 error = xfs_droplink(tp, target_ip); 3219 if (error) 3220 goto out_trans_cancel; 3221 } 3222 } /* target_ip != NULL */ 3223 3224 /* 3225 * Remove the source. 3226 */ 3227 if (new_parent && src_is_directory) { 3228 /* 3229 * Rewrite the ".." entry to point to the new 3230 * directory. 3231 */ 3232 error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, 3233 target_dp->i_ino, spaceres); 3234 ASSERT(error != -EEXIST); 3235 if (error) 3236 goto out_trans_cancel; 3237 } 3238 3239 /* 3240 * We always want to hit the ctime on the source inode. 3241 * 3242 * This isn't strictly required by the standards since the source 3243 * inode isn't really being changed, but old unix file systems did 3244 * it and some incremental backup programs won't work without it. 3245 */ 3246 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); 3247 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); 3248 3249 /* 3250 * Adjust the link count on src_dp. This is necessary when 3251 * renaming a directory, either within one parent when 3252 * the target existed, or across two parent directories. 3253 */ 3254 if (src_is_directory && (new_parent || target_ip != NULL)) { 3255 3256 /* 3257 * Decrement link count on src_directory since the 3258 * entry that's moved no longer points to it. 3259 */ 3260 error = xfs_droplink(tp, src_dp); 3261 if (error) 3262 goto out_trans_cancel; 3263 } 3264 3265 /* 3266 * For whiteouts, we only need to update the source dirent with the 3267 * inode number of the whiteout inode rather than removing it 3268 * altogether. 
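 *
 * (The whiteout itself was prepared earlier as a char-special tmpfile
 * with S_IFCHR | WHITEOUT_MODE and src_name->type set to
 * XFS_DIR3_FT_CHRDEV, so replacing the dirent is all that is needed to
 * expose it in the namespace.)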
3269 */ 3270 if (wip) 3271 error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, 3272 spaceres); 3273 else 3274 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, 3275 spaceres); 3276 3277 if (error) 3278 goto out_trans_cancel; 3279 3280 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3281 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 3282 if (new_parent) 3283 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 3284 3285 /* 3286 * Inform our hook clients that we've finished a rename operation as 3287 * follows: removed the source and target files from their directories; 3288 * that we've added the source to the target directory; and finally 3289 * that we've added the whiteout, if there was one. All inodes are 3290 * locked, so it's ok to model a rename this way so long as we say we 3291 * deleted entries before we add new ones. 3292 */ 3293 if (target_ip) 3294 xfs_dir_update_hook(target_dp, target_ip, -1, target_name); 3295 xfs_dir_update_hook(src_dp, src_ip, -1, src_name); 3296 xfs_dir_update_hook(target_dp, src_ip, 1, target_name); 3297 if (wip) 3298 xfs_dir_update_hook(src_dp, wip, 1, src_name); 3299 3300 error = xfs_finish_rename(tp); 3301 if (wip) 3302 xfs_irele(wip); 3303 return error; 3304 3305 out_trans_cancel: 3306 xfs_trans_cancel(tp); 3307 out_release_wip: 3308 if (wip) 3309 xfs_irele(wip); 3310 if (error == -ENOSPC && nospace_error) 3311 error = nospace_error; 3312 return error; 3313 } 3314 3315 static int 3316 xfs_iflush( 3317 struct xfs_inode *ip, 3318 struct xfs_buf *bp) 3319 { 3320 struct xfs_inode_log_item *iip = ip->i_itemp; 3321 struct xfs_dinode *dip; 3322 struct xfs_mount *mp = ip->i_mount; 3323 int error; 3324 3325 xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); 3326 ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING)); 3327 ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || 3328 ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); 3329 ASSERT(iip->ili_item.li_buf == bp); 3330 3331 dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); 3332 3333 /* 3334 * We don't flush the inode if any of the following checks fail, but we 3335 * do still update the log item and attach to the backing buffer as if 3336 * the flush happened. This is a formality to facilitate predictable 3337 * error handling as the caller will shutdown and fail the buffer. 
3338 */ 3339 error = -EFSCORRUPTED; 3340 if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), 3341 mp, XFS_ERRTAG_IFLUSH_1)) { 3342 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3343 "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT, 3344 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); 3345 goto flush_out; 3346 } 3347 if (S_ISREG(VFS_I(ip)->i_mode)) { 3348 if (XFS_TEST_ERROR( 3349 ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && 3350 ip->i_df.if_format != XFS_DINODE_FMT_BTREE, 3351 mp, XFS_ERRTAG_IFLUSH_3)) { 3352 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3353 "%s: Bad regular inode %llu, ptr "PTR_FMT, 3354 __func__, ip->i_ino, ip); 3355 goto flush_out; 3356 } 3357 } else if (S_ISDIR(VFS_I(ip)->i_mode)) { 3358 if (XFS_TEST_ERROR( 3359 ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && 3360 ip->i_df.if_format != XFS_DINODE_FMT_BTREE && 3361 ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, 3362 mp, XFS_ERRTAG_IFLUSH_4)) { 3363 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3364 "%s: Bad directory inode %llu, ptr "PTR_FMT, 3365 __func__, ip->i_ino, ip); 3366 goto flush_out; 3367 } 3368 } 3369 if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > 3370 ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { 3371 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3372 "%s: detected corrupt incore inode %llu, " 3373 "total extents = %llu nblocks = %lld, ptr "PTR_FMT, 3374 __func__, ip->i_ino, 3375 ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af), 3376 ip->i_nblocks, ip); 3377 goto flush_out; 3378 } 3379 if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize, 3380 mp, XFS_ERRTAG_IFLUSH_6)) { 3381 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3382 "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT, 3383 __func__, ip->i_ino, ip->i_forkoff, ip); 3384 goto flush_out; 3385 } 3386 3387 /* 3388 * Inode item log recovery for v2 inodes are dependent on the flushiter 3389 * count for correct sequencing. We bump the flush iteration count so 3390 * we can detect flushes which postdate a log record during recovery. 3391 * This is redundant as we now log every change and hence this can't 3392 * happen but we need to still do it to ensure backwards compatibility 3393 * with old kernels that predate logging all inode changes. 3394 */ 3395 if (!xfs_has_v3inodes(mp)) 3396 ip->i_flushiter++; 3397 3398 /* 3399 * If there are inline format data / attr forks attached to this inode, 3400 * make sure they are not corrupt. 3401 */ 3402 if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL && 3403 xfs_ifork_verify_local_data(ip)) 3404 goto flush_out; 3405 if (xfs_inode_has_attr_fork(ip) && 3406 ip->i_af.if_format == XFS_DINODE_FMT_LOCAL && 3407 xfs_ifork_verify_local_attr(ip)) 3408 goto flush_out; 3409 3410 /* 3411 * Copy the dirty parts of the inode into the on-disk inode. We always 3412 * copy out the core of the inode, because if the inode is dirty at all 3413 * the core must be. 3414 */ 3415 xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn); 3416 3417 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 3418 if (!xfs_has_v3inodes(mp)) { 3419 if (ip->i_flushiter == DI_MAX_FLUSH) 3420 ip->i_flushiter = 0; 3421 } 3422 3423 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); 3424 if (xfs_inode_has_attr_fork(ip)) 3425 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); 3426 3427 /* 3428 * We've recorded everything logged in the inode, so we'd like to clear 3429 * the ili_fields bits so we don't log and flush things unnecessarily. 3430 * However, we can't stop logging all this information until the data 3431 * we've copied into the disk buffer is written to disk. 
If we did we 3432 * might overwrite the copy of the inode in the log with all the data 3433 * after re-logging only part of it, and in the face of a crash we 3434 * wouldn't have all the data we need to recover. 3435 * 3436 * What we do is move the bits to the ili_last_fields field. When 3437 * logging the inode, these bits are moved back to the ili_fields field. 3438 * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since 3439 * we know that the information those bits represent is permanently on 3440 * disk. As long as the flush completes before the inode is logged 3441 * again, then both ili_fields and ili_last_fields will be cleared. 3442 */ 3443 error = 0; 3444 flush_out: 3445 spin_lock(&iip->ili_lock); 3446 iip->ili_last_fields = iip->ili_fields; 3447 iip->ili_fields = 0; 3448 iip->ili_fsync_fields = 0; 3449 spin_unlock(&iip->ili_lock); 3450 3451 /* 3452 * Store the current LSN of the inode so that we can tell whether the 3453 * item has moved in the AIL from xfs_buf_inode_iodone(). 3454 */ 3455 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 3456 &iip->ili_item.li_lsn); 3457 3458 /* generate the checksum. */ 3459 xfs_dinode_calc_crc(mp, dip); 3460 if (error) 3461 xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); 3462 return error; 3463 } 3464 3465 /* 3466 * Non-blocking flush of dirty inode metadata into the backing buffer. 3467 * 3468 * The caller must have a reference to the inode and hold the cluster buffer 3469 * locked. The function will walk across all the inodes on the cluster buffer it 3470 * can find and lock without blocking, and flush them to the cluster buffer. 3471 * 3472 * On successful flushing of at least one inode, the caller must write out the 3473 * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and 3474 * the caller needs to release the buffer. On failure, the filesystem will be 3475 * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED 3476 * will be returned. 3477 */ 3478 int 3479 xfs_iflush_cluster( 3480 struct xfs_buf *bp) 3481 { 3482 struct xfs_mount *mp = bp->b_mount; 3483 struct xfs_log_item *lip, *n; 3484 struct xfs_inode *ip; 3485 struct xfs_inode_log_item *iip; 3486 int clcount = 0; 3487 int error = 0; 3488 3489 /* 3490 * We must use the safe variant here as on shutdown xfs_iflush_abort() 3491 * will remove itself from the list. 3492 */ 3493 list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { 3494 iip = (struct xfs_inode_log_item *)lip; 3495 ip = iip->ili_inode; 3496 3497 /* 3498 * Quick and dirty check to avoid locks if possible. 3499 */ 3500 if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) 3501 continue; 3502 if (xfs_ipincount(ip)) 3503 continue; 3504 3505 /* 3506 * The inode is still attached to the buffer, which means it is 3507 * dirty but reclaim might try to grab it. Check carefully for 3508 * that, and grab the ilock while still holding the i_flags_lock 3509 * to guarantee reclaim will not be able to reclaim this inode 3510 * once we drop the i_flags_lock. 3511 */ 3512 spin_lock(&ip->i_flags_lock); 3513 ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); 3514 if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) { 3515 spin_unlock(&ip->i_flags_lock); 3516 continue; 3517 } 3518 3519 /* 3520 * ILOCK will pin the inode against reclaim and prevent 3521 * concurrent transactions modifying the inode while we are 3522 * flushing the inode. If we get the lock, set the flushing 3523 * state before we drop the i_flags_lock. 
3524 */ 3525 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 3526 spin_unlock(&ip->i_flags_lock); 3527 continue; 3528 } 3529 __xfs_iflags_set(ip, XFS_IFLUSHING); 3530 spin_unlock(&ip->i_flags_lock); 3531 3532 /* 3533 * Abort flushing this inode if we are shut down because the 3534 * inode may not currently be in the AIL. This can occur when 3535 * log I/O failure unpins the inode without inserting into the 3536 * AIL, leaving a dirty/unpinned inode attached to the buffer 3537 * that otherwise looks like it should be flushed. 3538 */ 3539 if (xlog_is_shutdown(mp->m_log)) { 3540 xfs_iunpin_wait(ip); 3541 xfs_iflush_abort(ip); 3542 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3543 error = -EIO; 3544 continue; 3545 } 3546 3547 /* don't block waiting on a log force to unpin dirty inodes */ 3548 if (xfs_ipincount(ip)) { 3549 xfs_iflags_clear(ip, XFS_IFLUSHING); 3550 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3551 continue; 3552 } 3553 3554 if (!xfs_inode_clean(ip)) 3555 error = xfs_iflush(ip, bp); 3556 else 3557 xfs_iflags_clear(ip, XFS_IFLUSHING); 3558 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3559 if (error) 3560 break; 3561 clcount++; 3562 } 3563 3564 if (error) { 3565 /* 3566 * Shutdown first so we kill the log before we release this 3567 * buffer. If it is an INODE_ALLOC buffer and pins the tail 3568 * of the log, failing it before the _log_ is shut down can 3569 * result in the log tail being moved forward in the journal 3570 * on disk because log writes can still be taking place. Hence 3571 * unpinning the tail will allow the ICREATE intent to be 3572 * removed from the log and recovery will fail with uninitialised 3573 * inode cluster buffers. 3574 */ 3575 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 3576 bp->b_flags |= XBF_ASYNC; 3577 xfs_buf_ioend_fail(bp); 3578 return error; 3579 } 3580 3581 if (!clcount) 3582 return -EAGAIN; 3583 3584 XFS_STATS_INC(mp, xs_icluster_flushcnt); 3585 XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); 3586 return 0; 3587 3588 } 3589 3590 /* Release an inode. */ 3591 void 3592 xfs_irele( 3593 struct xfs_inode *ip) 3594 { 3595 trace_xfs_irele(ip, _RET_IP_); 3596 iput(VFS_I(ip)); 3597 } 3598 3599 /* 3600 * Ensure all committed transactions touching the inode are written to the log. 3601 */ 3602 int 3603 xfs_log_force_inode( 3604 struct xfs_inode *ip) 3605 { 3606 xfs_csn_t seq = 0; 3607 3608 xfs_ilock(ip, XFS_ILOCK_SHARED); 3609 if (xfs_ipincount(ip)) 3610 seq = ip->i_itemp->ili_commit_seq; 3611 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3612 3613 if (!seq) 3614 return 0; 3615 return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL); 3616 } 3617 3618 /* 3619 * Grab the exclusive iolock for a data copy from src to dest, making sure to 3620 * abide by the vfs locking order (lowest pointer value goes first) and breaking 3621 * the layout leases before proceeding. The loop is needed because we cannot call 3622 * the blocking break_layout() with the iolocks held, and therefore have to 3623 * back out both locks. 3624 */ 3625 static int 3626 xfs_iolock_two_inodes_and_break_layout( 3627 struct inode *src, 3628 struct inode *dest) 3629 { 3630 int error; 3631 3632 if (src > dest) 3633 swap(src, dest); 3634 3635 retry: 3636 /* Wait to break both inodes' layouts before we start locking. */ 3637 error = break_layout(src, true); 3638 if (error) 3639 return error; 3640 if (src != dest) { 3641 error = break_layout(dest, true); 3642 if (error) 3643 return error; 3644 } 3645 3646 /* Lock one inode and make sure nobody got in and leased it.
*/ 3647 inode_lock(src); 3648 error = break_layout(src, false); 3649 if (error) { 3650 inode_unlock(src); 3651 if (error == -EWOULDBLOCK) 3652 goto retry; 3653 return error; 3654 } 3655 3656 if (src == dest) 3657 return 0; 3658 3659 /* Lock the other inode and make sure nobody got in and leased it. */ 3660 inode_lock_nested(dest, I_MUTEX_NONDIR2); 3661 error = break_layout(dest, false); 3662 if (error) { 3663 inode_unlock(src); 3664 inode_unlock(dest); 3665 if (error == -EWOULDBLOCK) 3666 goto retry; 3667 return error; 3668 } 3669 3670 return 0; 3671 } 3672 3673 static int 3674 xfs_mmaplock_two_inodes_and_break_dax_layout( 3675 struct xfs_inode *ip1, 3676 struct xfs_inode *ip2) 3677 { 3678 int error; 3679 bool retry; 3680 struct page *page; 3681 3682 if (ip1->i_ino > ip2->i_ino) 3683 swap(ip1, ip2); 3684 3685 again: 3686 retry = false; 3687 /* Lock the first inode */ 3688 xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); 3689 error = xfs_break_dax_layouts(VFS_I(ip1), &retry); 3690 if (error || retry) { 3691 xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); 3692 if (error == 0 && retry) 3693 goto again; 3694 return error; 3695 } 3696 3697 if (ip1 == ip2) 3698 return 0; 3699 3700 /* Nested lock the second inode */ 3701 xfs_ilock(ip2, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1)); 3702 /* 3703 * We cannot use xfs_break_dax_layouts() directly here because it may 3704 * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable 3705 * for this nested lock case. 3706 */ 3707 page = dax_layout_busy_page(VFS_I(ip2)->i_mapping); 3708 if (page && page_ref_count(page) != 1) { 3709 xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); 3710 xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); 3711 goto again; 3712 } 3713 3714 return 0; 3715 } 3716 3717 /* 3718 * Lock two inodes so that userspace cannot initiate I/O via file syscalls or 3719 * mmap activity. 3720 */ 3721 int 3722 xfs_ilock2_io_mmap( 3723 struct xfs_inode *ip1, 3724 struct xfs_inode *ip2) 3725 { 3726 int ret; 3727 3728 ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); 3729 if (ret) 3730 return ret; 3731 3732 if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) { 3733 ret = xfs_mmaplock_two_inodes_and_break_dax_layout(ip1, ip2); 3734 if (ret) { 3735 inode_unlock(VFS_I(ip2)); 3736 if (ip1 != ip2) 3737 inode_unlock(VFS_I(ip1)); 3738 return ret; 3739 } 3740 } else 3741 filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping, 3742 VFS_I(ip2)->i_mapping); 3743 3744 return 0; 3745 } 3746 3747 /* Unlock both inodes to allow IO and mmap activity. */ 3748 void 3749 xfs_iunlock2_io_mmap( 3750 struct xfs_inode *ip1, 3751 struct xfs_inode *ip2) 3752 { 3753 if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) { 3754 xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); 3755 if (ip1 != ip2) 3756 xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); 3757 } else 3758 filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping, 3759 VFS_I(ip2)->i_mapping); 3760 3761 inode_unlock(VFS_I(ip2)); 3762 if (ip1 != ip2) 3763 inode_unlock(VFS_I(ip1)); 3764 } 3765 3766 /* Drop the MMAPLOCK and the IOLOCK after a remap completes. */ 3767 void 3768 xfs_iunlock2_remapping( 3769 struct xfs_inode *ip1, 3770 struct xfs_inode *ip2) 3771 { 3772 xfs_iflags_clear(ip1, XFS_IREMAPPING); 3773 3774 if (ip1 != ip2) 3775 xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED); 3776 xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); 3777 3778 if (ip1 != ip2) 3779 inode_unlock_shared(VFS_I(ip1)); 3780 inode_unlock(VFS_I(ip2)); 3781 } 3782 3783 /* 3784 * Reload the incore inode list for this inode. 

/*
 * Reload the incore unlinked list for this inode.  Caller should ensure that
 * the link count cannot change, either by taking ILOCK_SHARED or otherwise
 * preventing other threads from executing.
 */
int
xfs_inode_reload_unlinked_bucket(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_buf		*agibp;
	struct xfs_agi		*agi;
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
	xfs_agino_t		prev_agino, next_agino;
	unsigned int		bucket;
	bool			foundit = false;
	int			error;

	/* Grab the first inode in the list */
	pag = xfs_perag_get(mp, agno);
	error = xfs_ialloc_read_agi(pag, tp, &agibp);
	xfs_perag_put(pag);
	if (error)
		return error;

	/*
	 * We've taken ILOCK_SHARED and the AGI buffer lock to stabilize the
	 * incore unlinked list pointers for this inode.  Check once more to
	 * see if we raced with anyone else to reload the unlinked list.
	 */
	if (!xfs_inode_unlinked_incomplete(ip)) {
		foundit = true;
		goto out_agibp;
	}

	bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
	agi = agibp->b_addr;

	trace_xfs_inode_reload_unlinked_bucket(ip);

	xfs_info_ratelimited(mp,
 "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating list recovery.",
			agino, agno);

	prev_agino = NULLAGINO;
	next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
	while (next_agino != NULLAGINO) {
		struct xfs_inode	*next_ip = NULL;

		/* Found this caller's inode, set its backlink. */
		if (next_agino == agino) {
			next_ip = ip;
			next_ip->i_prev_unlinked = prev_agino;
			foundit = true;
			goto next_inode;
		}

		/* Try in-memory lookup first. */
		next_ip = xfs_iunlink_lookup(pag, next_agino);
		if (next_ip)
			goto next_inode;

		/* Inode not in memory, try reloading it. */
		error = xfs_iunlink_reload_next(tp, agibp, prev_agino,
				next_agino);
		if (error)
			break;

		/* Grab the reloaded inode. */
		next_ip = xfs_iunlink_lookup(pag, next_agino);
		if (!next_ip) {
			/* No incore inode at all?  We reloaded it... */
			ASSERT(next_ip != NULL);
			error = -EFSCORRUPTED;
			break;
		}

next_inode:
		prev_agino = next_agino;
		next_agino = next_ip->i_next_unlinked;
	}

out_agibp:
	xfs_trans_brelse(tp, agibp);
	/* Should have found this inode somewhere in the iunlinked bucket. */
	if (!error && !foundit)
		error = -EFSCORRUPTED;
	return error;
}

/* Decide if this inode is missing its unlinked list and reload it. */
int
xfs_inode_reload_unlinked(
	struct xfs_inode	*ip)
{
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc_empty(ip->i_mount, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_inode_unlinked_incomplete(ip))
		error = xfs_inode_reload_unlinked_bucket(tp, ip);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	xfs_trans_cancel(tp);

	return error;
}

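/*
 * Illustrative sketch (not part of XFS, not compiled): a hypothetical caller
 * that must have valid incore unlinked-list back pointers before walking or
 * modifying the list.  Only xfs_inode_unlinked_incomplete() and
 * xfs_inode_reload_unlinked() above are real; the wrapper name is made up.
 */
#if 0
static int
example_ensure_unlinked_list_loaded(
	struct xfs_inode	*ip)
{
	/*
	 * If log recovery left this inode on the on-disk unlinked list but
	 * the incore back pointer was never established, rebuild the whole
	 * AGI bucket before relying on i_prev_unlinked.
	 */
	if (xfs_inode_unlinked_incomplete(ip))
		return xfs_inode_reload_unlinked(ip);
	return 0;
}
#endif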

/* Has this inode fork been zapped by repair? */
bool
xfs_ifork_zapped(
	const struct xfs_inode	*ip,
	int			whichfork)
{
	unsigned int		datamask = 0;

	switch (whichfork) {
	case XFS_DATA_FORK:
		switch (ip->i_vnode.i_mode & S_IFMT) {
		case S_IFDIR:
			datamask = XFS_SICK_INO_DIR_ZAPPED;
			break;
		case S_IFLNK:
			datamask = XFS_SICK_INO_SYMLINK_ZAPPED;
			break;
		}
		return ip->i_sick & (XFS_SICK_INO_BMBTD_ZAPPED | datamask);
	case XFS_ATTR_FORK:
		return ip->i_sick & XFS_SICK_INO_BMBTA_ZAPPED;
	default:
		return false;
	}
}

/* Compute the number of data and realtime blocks used by a file. */
void
xfs_inode_count_blocks(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	xfs_filblks_t		*dblocks,
	xfs_filblks_t		*rblocks)
{
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);

	*rblocks = 0;
	if (XFS_IS_REALTIME_INODE(ip))
		xfs_bmap_count_leaves(ifp, rblocks);
	*dblocks = ip->i_nblocks - *rblocks;
}
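
/*
 * Illustrative sketch (not part of XFS, not compiled): how a hypothetical
 * caller could use xfs_inode_count_blocks() to account data-device and
 * realtime-device blocks separately, e.g. when charging per-device usage.
 * The wrapper and the trace output are assumptions made for the example.
 */
#if 0
static void
example_report_block_usage(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	xfs_filblks_t		dblocks, rblocks;

	/*
	 * For a realtime file the data fork extents live on the realtime
	 * device; they are counted as rblocks and subtracted from i_nblocks
	 * to leave the blocks (bmbt, attr fork, etc.) on the data device.
	 */
	xfs_inode_count_blocks(tp, ip, &dblocks, &rblocks);

	trace_printk("ino 0x%llx: %llu data blocks, %llu rt blocks\n",
			(unsigned long long)ip->i_ino,
			(unsigned long long)dblocks,
			(unsigned long long)rblocks);
}
#endif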