// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include <linux/iversion.h>

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_attr.h"
#include "xfs_bit.h"
#include "xfs_trans_space.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_iunlink_item.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
#include "xfs_log_priv.h"
#include "xfs_health.h"
#include "xfs_pnfs.h"

struct kmem_cache *xfs_inode_cache;

/*
 * Helper function to extract the extent size hint from an inode.
 */
xfs_extlen_t
xfs_get_extsz_hint(
	struct xfs_inode	*ip)
{
	/*
	 * No point in aligning allocations if we need to COW to actually
	 * write to them.
	 */
	if (xfs_is_always_cow_inode(ip))
		return 0;
	if ((ip->i_diflags & XFS_DIFLAG_EXTSIZE) && ip->i_extsize)
		return ip->i_extsize;
	if (XFS_IS_REALTIME_INODE(ip))
		return ip->i_mount->m_sb.sb_rextsize;
	return 0;
}

/*
 * Helper function to extract CoW extent size hint from inode.
 * Between the extent size hint and the CoW extent size hint, we
 * return the greater of the two.  If the value is zero (automatic),
 * use the default size.
 */
xfs_extlen_t
xfs_get_cowextsz_hint(
	struct xfs_inode	*ip)
{
	xfs_extlen_t		a, b;

	a = 0;
	if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
		a = ip->i_cowextsize;
	b = xfs_get_extsz_hint(ip);

	a = max(a, b);
	if (a == 0)
		return XFS_DEFAULT_COWEXTSZ_HINT;
	return a;
}

/*
 * These two are wrapper routines around the xfs_ilock() routine used to
 * centralize some grungy code.  They are used in places that wish to lock the
 * inode solely for reading the extents.  The reason these places can't just
 * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
 * reading in of the extents from disk for a file in b-tree format.  If the
 * inode is in b-tree format, then we need to lock the inode exclusively until
 * the extents are read in.  Locking it exclusively all the time would limit
 * our parallelism unnecessarily, though.  What we do instead is check to see
 * if the extents have been read in yet, and only lock the inode exclusively
 * if they have not.
 *
 * The functions return a value which should be given to the corresponding
 * xfs_iunlock() call.
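 *
 * For illustration only, a typical caller that just needs to walk the data
 * fork extent map might pair the calls roughly like this (the extent walk
 * itself is hypothetical):
 *
 *	uint	lock_mode;
 *
 *	lock_mode = xfs_ilock_data_map_shared(ip);
 *	... e.g. xfs_bmapi_read() to walk the data fork extents ...
 *	xfs_iunlock(ip, lock_mode);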
 */
uint
xfs_ilock_data_map_shared(
	struct xfs_inode	*ip)
{
	uint			lock_mode = XFS_ILOCK_SHARED;

	if (xfs_need_iread_extents(&ip->i_df))
		lock_mode = XFS_ILOCK_EXCL;
	xfs_ilock(ip, lock_mode);
	return lock_mode;
}

uint
xfs_ilock_attr_map_shared(
	struct xfs_inode	*ip)
{
	uint			lock_mode = XFS_ILOCK_SHARED;

	if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
		lock_mode = XFS_ILOCK_EXCL;
	xfs_ilock(ip, lock_mode);
	return lock_mode;
}

/*
 * You can't set both SHARED and EXCL for the same lock,
 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_MMAPLOCK_SHARED,
 * XFS_MMAPLOCK_EXCL, XFS_ILOCK_SHARED, XFS_ILOCK_EXCL are valid values
 * to set in lock_flags.
 */
static inline void
xfs_lock_flags_assert(
	uint		lock_flags)
{
	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
		(XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
	ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) !=
		(XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL));
	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
		(XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0);
	ASSERT(lock_flags != 0);
}

/*
 * In addition to i_rwsem in the VFS inode, the xfs inode contains 2
 * multi-reader locks: invalidate_lock and the i_lock.  This routine allows
 * various combinations of the locks to be obtained.
 *
 * The 3 locks should always be ordered so that the IO lock is obtained first,
 * the mmap lock second and the ilock last in order to prevent deadlock.
 *
 * Basic locking order:
 *
 * i_rwsem -> invalidate_lock -> page_lock -> i_ilock
 *
 * mmap_lock locking order:
 *
 * i_rwsem -> page lock -> mmap_lock
 * mmap_lock -> invalidate_lock -> page_lock
 *
 * The difference in mmap_lock locking order means that we cannot hold the
 * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths
 * can fault in pages during copy in/out (for buffered IO) or require the
 * mmap_lock in get_user_pages() to map the user pages into the kernel address
 * space for direct IO. Similarly the i_rwsem cannot be taken inside a page
 * fault because page faults already hold the mmap_lock.
 *
 * Hence to serialise fully against both syscall and mmap based IO, we need to
 * take both the i_rwsem and the invalidate_lock. These locks should *only* be
 * both taken in places where we need to invalidate the page cache in a race
 * free manner (e.g. truncate, hole punch and other extent manipulation
 * functions).
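 *
 * As an illustrative sketch of that pattern (the flag combination follows
 * the rules above, the surrounding steps are hypothetical):
 *
 *	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
 *	... invalidate the page cache and manipulate extents ...
 *	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);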
 */
void
xfs_ilock(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	trace_xfs_ilock(ip, lock_flags, _RET_IP_);

	xfs_lock_flags_assert(lock_flags);

	if (lock_flags & XFS_IOLOCK_EXCL) {
		down_write_nested(&VFS_I(ip)->i_rwsem,
				  XFS_IOLOCK_DEP(lock_flags));
	} else if (lock_flags & XFS_IOLOCK_SHARED) {
		down_read_nested(&VFS_I(ip)->i_rwsem,
				 XFS_IOLOCK_DEP(lock_flags));
	}

	if (lock_flags & XFS_MMAPLOCK_EXCL) {
		down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
				  XFS_MMAPLOCK_DEP(lock_flags));
	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
		down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock,
				 XFS_MMAPLOCK_DEP(lock_flags));
	}

	if (lock_flags & XFS_ILOCK_EXCL)
		down_write_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
	else if (lock_flags & XFS_ILOCK_SHARED)
		down_read_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
}

/*
 * This is just like xfs_ilock(), except that the caller
 * is guaranteed not to sleep.  It returns 1 if it gets
 * the requested locks and 0 otherwise.  If the IO lock is
 * obtained but the inode lock cannot be, then the IO lock
 * is dropped before returning.
 *
 * ip -- the inode being locked
 * lock_flags -- this parameter indicates the inode's locks to be locked.
 *	 See the comment for xfs_ilock() for a list of valid values.
 */
int
xfs_ilock_nowait(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);

	xfs_lock_flags_assert(lock_flags);

	if (lock_flags & XFS_IOLOCK_EXCL) {
		if (!down_write_trylock(&VFS_I(ip)->i_rwsem))
			goto out;
	} else if (lock_flags & XFS_IOLOCK_SHARED) {
		if (!down_read_trylock(&VFS_I(ip)->i_rwsem))
			goto out;
	}

	if (lock_flags & XFS_MMAPLOCK_EXCL) {
		if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
			goto out_undo_iolock;
	} else if (lock_flags & XFS_MMAPLOCK_SHARED) {
		if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock))
			goto out_undo_iolock;
	}

	if (lock_flags & XFS_ILOCK_EXCL) {
		if (!down_write_trylock(&ip->i_lock))
			goto out_undo_mmaplock;
	} else if (lock_flags & XFS_ILOCK_SHARED) {
		if (!down_read_trylock(&ip->i_lock))
			goto out_undo_mmaplock;
	}
	return 1;

out_undo_mmaplock:
	if (lock_flags & XFS_MMAPLOCK_EXCL)
		up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
	else if (lock_flags & XFS_MMAPLOCK_SHARED)
		up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
out_undo_iolock:
	if (lock_flags & XFS_IOLOCK_EXCL)
		up_write(&VFS_I(ip)->i_rwsem);
	else if (lock_flags & XFS_IOLOCK_SHARED)
		up_read(&VFS_I(ip)->i_rwsem);
out:
	return 0;
}

/*
 * xfs_iunlock() is used to drop the inode locks acquired with
 * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 * that we know which locks to drop.
 *
 * ip -- the inode being unlocked
 * lock_flags -- this parameter indicates the inode's locks to be unlocked.
 *	 See the comment for xfs_ilock() for a list of valid values
 *	 for this parameter.
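 *
 * For illustration, the flags passed here must match the earlier lock call
 * (the code around the pair is hypothetical):
 *
 *	xfs_ilock(ip, XFS_ILOCK_SHARED);
 *	... read inode metadata ...
 *	xfs_iunlock(ip, XFS_ILOCK_SHARED);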
 *
 */
void
xfs_iunlock(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	xfs_lock_flags_assert(lock_flags);

	if (lock_flags & XFS_IOLOCK_EXCL)
		up_write(&VFS_I(ip)->i_rwsem);
	else if (lock_flags & XFS_IOLOCK_SHARED)
		up_read(&VFS_I(ip)->i_rwsem);

	if (lock_flags & XFS_MMAPLOCK_EXCL)
		up_write(&VFS_I(ip)->i_mapping->invalidate_lock);
	else if (lock_flags & XFS_MMAPLOCK_SHARED)
		up_read(&VFS_I(ip)->i_mapping->invalidate_lock);

	if (lock_flags & XFS_ILOCK_EXCL)
		up_write(&ip->i_lock);
	else if (lock_flags & XFS_ILOCK_SHARED)
		up_read(&ip->i_lock);

	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}

/*
 * give up write locks.  the i/o lock cannot be held nested
 * if it is being demoted.
 */
void
xfs_ilock_demote(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL));
	ASSERT((lock_flags &
		~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);

	if (lock_flags & XFS_ILOCK_EXCL)
		downgrade_write(&ip->i_lock);
	if (lock_flags & XFS_MMAPLOCK_EXCL)
		downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
	if (lock_flags & XFS_IOLOCK_EXCL)
		downgrade_write(&VFS_I(ip)->i_rwsem);

	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}

void
xfs_assert_ilocked(
	struct xfs_inode	*ip,
	uint			lock_flags)
{
	/*
	 * Sometimes we assert the ILOCK is held exclusively, but we're in
	 * a workqueue, so lockdep doesn't know we're the owner.
	 */
	if (lock_flags & XFS_ILOCK_SHARED)
		rwsem_assert_held(&ip->i_lock);
	else if (lock_flags & XFS_ILOCK_EXCL)
		rwsem_assert_held_write_nolockdep(&ip->i_lock);

	if (lock_flags & XFS_MMAPLOCK_SHARED)
		rwsem_assert_held(&VFS_I(ip)->i_mapping->invalidate_lock);
	else if (lock_flags & XFS_MMAPLOCK_EXCL)
		rwsem_assert_held_write(&VFS_I(ip)->i_mapping->invalidate_lock);

	if (lock_flags & XFS_IOLOCK_SHARED)
		rwsem_assert_held(&VFS_I(ip)->i_rwsem);
	else if (lock_flags & XFS_IOLOCK_EXCL)
		rwsem_assert_held_write(&VFS_I(ip)->i_rwsem);
}

/*
 * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
 * DEBUG or XFS_WARN is set.  And MAX_LOCKDEP_SUBCLASSES is then only defined
 * when CONFIG_LOCKDEP is set.  Hence the complex define below to avoid build
 * errors and warnings.
 */
#if (defined(DEBUG) || defined(XFS_WARN)) && defined(CONFIG_LOCKDEP)
static bool
xfs_lockdep_subclass_ok(
	int subclass)
{
	return subclass < MAX_LOCKDEP_SUBCLASSES;
}
#else
#define xfs_lockdep_subclass_ok(subclass)	(true)
#endif

/*
 * Bump the subclass so xfs_lock_inodes() acquires each lock with a different
 * value. This can be called for any type of inode lock combination, including
 * parent locking. Care must be taken to ensure we don't overrun the subclass
 * storage fields in the class mask we build.
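 *
 * For illustration, xfs_lock_inumorder(XFS_ILOCK_EXCL, 2) keeps the
 * XFS_ILOCK_EXCL mode bits and shifts the subclass value 2 into the
 * XFS_ILOCK_SHIFT bits of the returned lock flags, so lockdep sees a
 * distinct subclass for the third inode locked by xfs_lock_inodes().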
 */
static inline uint
xfs_lock_inumorder(
	uint			lock_mode,
	uint			subclass)
{
	uint			class = 0;

	ASSERT(!(lock_mode & (XFS_ILOCK_PARENT | XFS_ILOCK_RTBITMAP |
			      XFS_ILOCK_RTSUM)));
	ASSERT(xfs_lockdep_subclass_ok(subclass));

	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) {
		ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS);
		class += subclass << XFS_IOLOCK_SHIFT;
	}

	if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) {
		ASSERT(subclass <= XFS_MMAPLOCK_MAX_SUBCLASS);
		class += subclass << XFS_MMAPLOCK_SHIFT;
	}

	if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) {
		ASSERT(subclass <= XFS_ILOCK_MAX_SUBCLASS);
		class += subclass << XFS_ILOCK_SHIFT;
	}

	return (lock_mode & ~XFS_LOCK_SUBCLASS_MASK) | class;
}

/*
 * The following routine will lock n inodes in exclusive mode.  We assume the
 * caller calls us with the inodes in i_ino order.
 *
 * We need to detect deadlock where an inode that we lock is in the AIL and we
 * start waiting for another inode that is locked by a thread in a long running
 * transaction (such as truncate). This can result in deadlock since the long
 * running trans might need to wait for the inode we just locked in order to
 * push the tail and free space in the log.
 *
 * xfs_lock_inodes() can only be used to lock one type of lock at a time -
 * the iolock, the mmaplock or the ilock, but not more than one at a time.
 * If we lock more than one at a time, lockdep will report false positives
 * saying we have violated locking orders.
 */
static void
xfs_lock_inodes(
	struct xfs_inode	**ips,
	int			inodes,
	uint			lock_mode)
{
	int			attempts = 0;
	uint			i;
	int			j;
	bool			try_lock;
	struct xfs_log_item	*lp;

	/*
	 * Currently supports between 2 and 5 inodes with exclusive locking. We
	 * support an arbitrary depth of locking here, but absolute limits on
	 * inodes depend on the type of locking and the limits placed by
	 * lockdep annotations in xfs_lock_inumorder.  These are all checked by
	 * the asserts.
	 */
	ASSERT(ips && inodes >= 2 && inodes <= 5);
	ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
			    XFS_ILOCK_EXCL));
	ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
			      XFS_ILOCK_SHARED)));
	ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) ||
		inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1);
	ASSERT(!(lock_mode & XFS_ILOCK_EXCL) ||
		inodes <= XFS_ILOCK_MAX_SUBCLASS + 1);

	if (lock_mode & XFS_IOLOCK_EXCL) {
		ASSERT(!(lock_mode & (XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL)));
	} else if (lock_mode & XFS_MMAPLOCK_EXCL)
		ASSERT(!(lock_mode & XFS_ILOCK_EXCL));

again:
	try_lock = false;
	i = 0;
	for (; i < inodes; i++) {
		ASSERT(ips[i]);

		if (i && (ips[i] == ips[i - 1]))	/* Already locked */
			continue;

		/*
		 * If try_lock is not set yet, make sure all locked inodes are
		 * not in the AIL.  If any are, set try_lock to be used later.
		 */
		if (!try_lock) {
			for (j = (i - 1); j >= 0 && !try_lock; j--) {
				lp = &ips[j]->i_itemp->ili_item;
				if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags))
					try_lock = true;
			}
		}

		/*
		 * If any of the previous locks we have locked is in the AIL,
		 * we must TRY to get the second and subsequent locks. If
		 * we can't get any, we must release all we have
		 * and try again.
		 */
		if (!try_lock) {
			xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
			continue;
		}

		/* try_lock means we have an inode locked that is in the AIL. */
		ASSERT(i != 0);
		if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i)))
			continue;

		/*
		 * Unlock all previous guys and try again.  xfs_iunlock will try
		 * to push the tail if the inode is in the AIL.
		 */
		attempts++;
		for (j = i - 1; j >= 0; j--) {
			/*
			 * Check to see if we've already unlocked this one.  Not
			 * the first one going back, and the inode ptr is the
			 * same.
			 */
			if (j != (i - 1) && ips[j] == ips[j + 1])
				continue;

			xfs_iunlock(ips[j], lock_mode);
		}

		if ((attempts % 5) == 0) {
			delay(1); /* Don't just spin the CPU */
		}
		goto again;
	}
}

/*
 * xfs_lock_two_inodes() can only be used to lock ilock. The iolock and
 * mmaplock must be double-locked separately since we use i_rwsem and
 * invalidate_lock for that. We now support taking one lock EXCL and the
 * other SHARED.
 */
void
xfs_lock_two_inodes(
	struct xfs_inode	*ip0,
	uint			ip0_mode,
	struct xfs_inode	*ip1,
	uint			ip1_mode)
{
	int			attempts = 0;
	struct xfs_log_item	*lp;

	ASSERT(hweight32(ip0_mode) == 1);
	ASSERT(hweight32(ip1_mode) == 1);
	ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
	ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)));
	ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
	ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)));
	ASSERT(ip0->i_ino != ip1->i_ino);

	if (ip0->i_ino > ip1->i_ino) {
		swap(ip0, ip1);
		swap(ip0_mode, ip1_mode);
	}

again:
	xfs_ilock(ip0, xfs_lock_inumorder(ip0_mode, 0));

	/*
	 * If the first lock we have locked is in the AIL, we must TRY to get
	 * the second lock. If we can't get it, we must release the first one
	 * and try again.
	 */
	lp = &ip0->i_itemp->ili_item;
	if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) {
		if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) {
			xfs_iunlock(ip0, ip0_mode);
			if ((++attempts % 5) == 0)
				delay(1); /* Don't just spin the CPU */
			goto again;
		}
	} else {
		xfs_ilock(ip1, xfs_lock_inumorder(ip1_mode, 1));
	}
}

uint
xfs_ip2xflags(
	struct xfs_inode	*ip)
{
	uint			flags = 0;

	if (ip->i_diflags & XFS_DIFLAG_ANY) {
		if (ip->i_diflags & XFS_DIFLAG_REALTIME)
			flags |= FS_XFLAG_REALTIME;
		if (ip->i_diflags & XFS_DIFLAG_PREALLOC)
			flags |= FS_XFLAG_PREALLOC;
		if (ip->i_diflags & XFS_DIFLAG_IMMUTABLE)
			flags |= FS_XFLAG_IMMUTABLE;
		if (ip->i_diflags & XFS_DIFLAG_APPEND)
			flags |= FS_XFLAG_APPEND;
		if (ip->i_diflags & XFS_DIFLAG_SYNC)
			flags |= FS_XFLAG_SYNC;
		if (ip->i_diflags & XFS_DIFLAG_NOATIME)
			flags |= FS_XFLAG_NOATIME;
		if (ip->i_diflags & XFS_DIFLAG_NODUMP)
			flags |= FS_XFLAG_NODUMP;
		if (ip->i_diflags & XFS_DIFLAG_RTINHERIT)
			flags |= FS_XFLAG_RTINHERIT;
		if (ip->i_diflags & XFS_DIFLAG_PROJINHERIT)
			flags |= FS_XFLAG_PROJINHERIT;
		if (ip->i_diflags & XFS_DIFLAG_NOSYMLINKS)
			flags |= FS_XFLAG_NOSYMLINKS;
		if (ip->i_diflags & XFS_DIFLAG_EXTSIZE)
			flags |= FS_XFLAG_EXTSIZE;
		if (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT)
			flags |= FS_XFLAG_EXTSZINHERIT;
		if (ip->i_diflags & XFS_DIFLAG_NODEFRAG)
			flags |= FS_XFLAG_NODEFRAG;
		if (ip->i_diflags & XFS_DIFLAG_FILESTREAM)
			flags |= FS_XFLAG_FILESTREAM;
	}

	if (ip->i_diflags2 & XFS_DIFLAG2_ANY) {
		if (ip->i_diflags2 & XFS_DIFLAG2_DAX)
			flags |= FS_XFLAG_DAX;
		if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE)
			flags |= FS_XFLAG_COWEXTSIZE;
	}

	if (xfs_inode_has_attr_fork(ip))
		flags |= FS_XFLAG_HASATTR;
	return flags;
}

/*
 * Looks up an inode from "name". If ci_name is not NULL, then a CI match
 * is allowed, otherwise it has to be an exact match. If a CI match is found,
 * ci_name->name will point to the actual name (caller must free) or
 * will be set to NULL if an exact match is found.
 */
int
xfs_lookup(
	struct xfs_inode	*dp,
	const struct xfs_name	*name,
	struct xfs_inode	**ipp,
	struct xfs_name		*ci_name)
{
	xfs_ino_t		inum;
	int			error;

	trace_xfs_lookup(dp, name);

	if (xfs_is_shutdown(dp->i_mount))
		return -EIO;
	if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
		return -EIO;

	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
	if (error)
		goto out_unlock;

	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
	if (error)
		goto out_free_name;

	return 0;

out_free_name:
	if (ci_name)
		kfree(ci_name->name);
out_unlock:
	*ipp = NULL;
	return error;
}

/*
 * Propagate di_flags from a parent inode to a child inode.
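 *
 * For example, a directory carrying XFS_DIFLAG_EXTSZINHERIT passes its
 * i_extsize hint on: a new subdirectory inherits XFS_DIFLAG_EXTSZINHERIT
 * itself, while a new regular file gets XFS_DIFLAG_EXTSIZE with the same
 * hint value (subject to the validation at the end of this function).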
 */
static void
xfs_inode_inherit_flags(
	struct xfs_inode	*ip,
	const struct xfs_inode	*pip)
{
	unsigned int		di_flags = 0;
	xfs_failaddr_t		failaddr;
	umode_t			mode = VFS_I(ip)->i_mode;

	if (S_ISDIR(mode)) {
		if (pip->i_diflags & XFS_DIFLAG_RTINHERIT)
			di_flags |= XFS_DIFLAG_RTINHERIT;
		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
			ip->i_extsize = pip->i_extsize;
		}
		if (pip->i_diflags & XFS_DIFLAG_PROJINHERIT)
			di_flags |= XFS_DIFLAG_PROJINHERIT;
	} else if (S_ISREG(mode)) {
		if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
		    xfs_has_realtime(ip->i_mount))
			di_flags |= XFS_DIFLAG_REALTIME;
		if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) {
			di_flags |= XFS_DIFLAG_EXTSIZE;
			ip->i_extsize = pip->i_extsize;
		}
	}
	if ((pip->i_diflags & XFS_DIFLAG_NOATIME) &&
	    xfs_inherit_noatime)
		di_flags |= XFS_DIFLAG_NOATIME;
	if ((pip->i_diflags & XFS_DIFLAG_NODUMP) &&
	    xfs_inherit_nodump)
		di_flags |= XFS_DIFLAG_NODUMP;
	if ((pip->i_diflags & XFS_DIFLAG_SYNC) &&
	    xfs_inherit_sync)
		di_flags |= XFS_DIFLAG_SYNC;
	if ((pip->i_diflags & XFS_DIFLAG_NOSYMLINKS) &&
	    xfs_inherit_nosymlinks)
		di_flags |= XFS_DIFLAG_NOSYMLINKS;
	if ((pip->i_diflags & XFS_DIFLAG_NODEFRAG) &&
	    xfs_inherit_nodefrag)
		di_flags |= XFS_DIFLAG_NODEFRAG;
	if (pip->i_diflags & XFS_DIFLAG_FILESTREAM)
		di_flags |= XFS_DIFLAG_FILESTREAM;

	ip->i_diflags |= di_flags;

	/*
	 * Inode verifiers on older kernels only check that the extent size
	 * hint is an integer multiple of the rt extent size on realtime files.
	 * They did not check the hint alignment on a directory with both
	 * rtinherit and extszinherit flags set. If the misaligned hint is
	 * propagated from a directory into a new realtime file, new file
	 * allocations will fail due to math errors in the rt allocator and/or
	 * trip the verifiers. Validate the hint settings in the new file so
	 * that we don't let broken hints propagate.
	 */
	failaddr = xfs_inode_validate_extsize(ip->i_mount, ip->i_extsize,
			VFS_I(ip)->i_mode, ip->i_diflags);
	if (failaddr) {
		ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
				   XFS_DIFLAG_EXTSZINHERIT);
		ip->i_extsize = 0;
	}
}

/* Propagate di_flags2 from a parent inode to a child inode. */
static void
xfs_inode_inherit_flags2(
	struct xfs_inode	*ip,
	const struct xfs_inode	*pip)
{
	xfs_failaddr_t		failaddr;

	if (pip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) {
		ip->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
		ip->i_cowextsize = pip->i_cowextsize;
	}
	if (pip->i_diflags2 & XFS_DIFLAG2_DAX)
		ip->i_diflags2 |= XFS_DIFLAG2_DAX;

	/* Don't let invalid cowextsize hints propagate. */
	failaddr = xfs_inode_validate_cowextsize(ip->i_mount, ip->i_cowextsize,
			VFS_I(ip)->i_mode, ip->i_diflags, ip->i_diflags2);
	if (failaddr) {
		ip->i_diflags2 &= ~XFS_DIFLAG2_COWEXTSIZE;
		ip->i_cowextsize = 0;
	}
}

/*
 * Initialise a newly allocated inode and return the in-core inode to the
 * caller locked exclusively.
 */
int
xfs_init_new_inode(
	struct mnt_idmap	*idmap,
	struct xfs_trans	*tp,
	struct xfs_inode	*pip,
	xfs_ino_t		ino,
	umode_t			mode,
	xfs_nlink_t		nlink,
	dev_t			rdev,
	prid_t			prid,
	bool			init_xattrs,
	struct xfs_inode	**ipp)
{
	struct inode		*dir = pip ?
					VFS_I(pip) : NULL;
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_inode	*ip;
	unsigned int		flags;
	int			error;
	struct timespec64	tv;
	struct inode		*inode;

	/*
	 * Protect against obviously corrupt allocation btree records. Later
	 * xfs_iget checks will catch re-allocation of other active in-memory
	 * and on-disk inodes. If we don't catch reallocating the parent inode
	 * here we will deadlock in xfs_iget() so we have to do these checks
	 * first.
	 */
	if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
		xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
		xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
				XFS_SICK_AG_INOBT);
		return -EFSCORRUPTED;
	}

	/*
	 * Get the in-core inode with the lock held exclusively to prevent
	 * others from looking at it until we're done.
	 */
	error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
	if (error)
		return error;

	ASSERT(ip != NULL);
	inode = VFS_I(ip);
	set_nlink(inode, nlink);
	inode->i_rdev = rdev;
	ip->i_projid = prid;

	if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) {
		inode_fsuid_set(inode, idmap);
		inode->i_gid = dir->i_gid;
		inode->i_mode = mode;
	} else {
		inode_init_owner(idmap, inode, dir, mode);
	}

	/*
	 * If the group ID of the new file does not match the effective group
	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
	 * (and only if the irix_sgid_inherit compatibility variable is set).
	 */
	if (irix_sgid_inherit && (inode->i_mode & S_ISGID) &&
	    !vfsgid_in_group_p(i_gid_into_vfsgid(idmap, inode)))
		inode->i_mode &= ~S_ISGID;

	ip->i_disk_size = 0;
	ip->i_df.if_nextents = 0;
	ASSERT(ip->i_nblocks == 0);

	tv = inode_set_ctime_current(inode);
	inode_set_mtime_to_ts(inode, tv);
	inode_set_atime_to_ts(inode, tv);

	ip->i_extsize = 0;
	ip->i_diflags = 0;

	if (xfs_has_v3inodes(mp)) {
		inode_set_iversion(inode, 1);
		ip->i_cowextsize = 0;
		ip->i_crtime = tv;
	}

	flags = XFS_ILOG_CORE;
	switch (mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		ip->i_df.if_format = XFS_DINODE_FMT_DEV;
		flags |= XFS_ILOG_DEV;
		break;
	case S_IFREG:
	case S_IFDIR:
		if (pip && (pip->i_diflags & XFS_DIFLAG_ANY))
			xfs_inode_inherit_flags(ip, pip);
		if (pip && (pip->i_diflags2 & XFS_DIFLAG2_ANY))
			xfs_inode_inherit_flags2(ip, pip);
		fallthrough;
	case S_IFLNK:
		ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
		ip->i_df.if_bytes = 0;
		ip->i_df.if_data = NULL;
		break;
	default:
		ASSERT(0);
	}

	/*
	 * If we need to create attributes immediately after allocating the
	 * inode, initialise an empty attribute fork right now. We use the
	 * default fork offset for attributes here as we don't know exactly what
	 * size or how many attributes we might be adding. We can do this
	 * safely here because we know the data fork is completely empty and
	 * this saves us from needing to run a separate transaction to set the
	 * fork offset in the immediate future.
	 */
	if (init_xattrs && xfs_has_attr(mp)) {
		ip->i_forkoff = xfs_default_attroffset(ip) >> 3;
		xfs_ifork_init_attr(ip, XFS_DINODE_FMT_EXTENTS, 0);
	}

	/*
	 * Log the new values stuffed into the inode.
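	 *
	 * At this point flags is XFS_ILOG_CORE, plus XFS_ILOG_DEV for special
	 * files, matching the data fork format chosen in the switch above.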
	 */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, flags);

	/* now that we have an i_mode we can setup the inode structure */
	xfs_setup_inode(ip);

	*ipp = ip;
	return 0;
}

/*
 * Decrement the link count on an inode & log the change.  If this causes the
 * link count to go to zero, move the inode to AGI unlinked list so that it can
 * be freed when the last active reference goes away via xfs_inactive().
 */
static int			/* error */
xfs_droplink(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip)
{
	if (VFS_I(ip)->i_nlink == 0) {
		xfs_alert(ip->i_mount,
			  "%s: Attempt to drop inode (%llu) with nlink zero.",
			  __func__, ip->i_ino);
		return -EFSCORRUPTED;
	}

	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

	drop_nlink(VFS_I(ip));
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	if (VFS_I(ip)->i_nlink)
		return 0;

	return xfs_iunlink(tp, ip);
}

/*
 * Increment the link count on an inode & log the change.
 */
void
xfs_bumplink(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

	inc_nlink(VFS_I(ip));
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}

#ifdef CONFIG_XFS_LIVE_HOOKS
/*
 * Use a static key here to reduce the overhead of directory live update
 * hooks.  If the compiler supports jump labels, the static branch will be
 * replaced by a nop sled when there are no hook users.  Online fsck is
 * currently the only caller, so this is a reasonable tradeoff.
 *
 * Note: Patching the kernel code requires taking the cpu hotplug lock.  Other
 * parts of the kernel allocate memory with that lock held, which means that
 * XFS callers cannot hold any locks that might be used by memory reclaim or
 * writeback when calling the static_branch_{inc,dec} functions.
 */
DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch);

void
xfs_dir_hook_disable(void)
{
	xfs_hooks_switch_off(&xfs_dir_hooks_switch);
}

void
xfs_dir_hook_enable(void)
{
	xfs_hooks_switch_on(&xfs_dir_hooks_switch);
}

/* Call hooks for a directory update relating to a child dirent update. */
inline void
xfs_dir_update_hook(
	struct xfs_inode	*dp,
	struct xfs_inode	*ip,
	int			delta,
	const struct xfs_name	*name)
{
	if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) {
		struct xfs_dir_update_params	p = {
			.dp		= dp,
			.ip		= ip,
			.delta		= delta,
			.name		= name,
		};
		struct xfs_mount	*mp = ip->i_mount;

		xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p);
	}
}

/* Call the specified function during a directory update. */
int
xfs_dir_hook_add(
	struct xfs_mount	*mp,
	struct xfs_dir_hook	*hook)
{
	return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook);
}

/* Stop calling the specified function during a directory update. */
void
xfs_dir_hook_del(
	struct xfs_mount	*mp,
	struct xfs_dir_hook	*hook)
{
	xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook);
}

/*
 * Configure directory update hook functions.
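 *
 * For illustration, a hypothetical hook user would pair these helpers
 * roughly like so, where my_dirent_fn is a notifier callback supplied by
 * that caller:
 *
 *	xfs_dir_hook_setup(&hook, my_dirent_fn);
 *	error = xfs_dir_hook_add(mp, &hook);
 *	... receive directory update notifications ...
 *	xfs_dir_hook_del(mp, &hook);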
 */
void
xfs_dir_hook_setup(
	struct xfs_dir_hook	*hook,
	notifier_fn_t		mod_fn)
{
	xfs_hook_setup(&hook->dirent_hook, mod_fn);
}
#endif /* CONFIG_XFS_LIVE_HOOKS */

int
xfs_create(
	struct mnt_idmap	*idmap,
	xfs_inode_t		*dp,
	struct xfs_name		*name,
	umode_t			mode,
	dev_t			rdev,
	bool			init_xattrs,
	xfs_inode_t		**ipp)
{
	int			is_dir = S_ISDIR(mode);
	struct xfs_mount	*mp = dp->i_mount;
	struct xfs_inode	*ip = NULL;
	struct xfs_trans	*tp = NULL;
	int			error;
	bool			unlock_dp_on_error = false;
	prid_t			prid;
	struct xfs_dquot	*udqp = NULL;
	struct xfs_dquot	*gdqp = NULL;
	struct xfs_dquot	*pdqp = NULL;
	struct xfs_trans_res	*tres;
	uint			resblks;
	xfs_ino_t		ino;

	trace_xfs_create(dp, name);

	if (xfs_is_shutdown(mp))
		return -EIO;
	if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
		return -EIO;

	prid = xfs_get_initial_prid(dp);

	/*
	 * Make sure that we have allocated dquot(s) on disk.
	 */
	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
			mapped_fsgid(idmap, &init_user_ns), prid,
			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
			&udqp, &gdqp, &pdqp);
	if (error)
		return error;

	if (is_dir) {
		resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
		tres = &M_RES(mp)->tr_mkdir;
	} else {
		resblks = XFS_CREATE_SPACE_RES(mp, name->len);
		tres = &M_RES(mp)->tr_create;
	}

	/*
	 * Initially assume that the file does not exist and
	 * reserve the resources for that case. If that is not
	 * the case we'll drop the one we have and get a more
	 * appropriate transaction later.
	 */
	error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
			&tp);
	if (error == -ENOSPC) {
		/* flush outstanding delalloc blocks and retry */
		xfs_flush_inodes(mp);
		error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp,
				resblks, &tp);
	}
	if (error)
		goto out_release_dquots;

	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
	unlock_dp_on_error = true;

	/*
	 * A newly created regular or special file just has one directory
	 * entry pointing to them, but a directory also has the "." entry
	 * pointing to itself.
	 */
	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
	if (!error)
		error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
				is_dir ? 2 : 1, rdev, prid, init_xattrs, &ip);
	if (error)
		goto out_trans_cancel;

	/*
	 * Now we join the directory inode to the transaction.  We do not do it
	 * earlier because xfs_dialloc might commit the previous transaction
	 * (and release all the locks).  An error from here on will result in
	 * the transaction cancel unlocking dp so don't do it explicitly in the
	 * error path.
	 */
	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
	unlock_dp_on_error = false;

	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
				   resblks - XFS_IALLOC_SPACE_RES(mp));
	if (error) {
		ASSERT(error != -ENOSPC);
		goto out_trans_cancel;
	}
	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

	if (is_dir) {
		error = xfs_dir_init(tp, ip, dp);
		if (error)
			goto out_trans_cancel;

		xfs_bumplink(tp, dp);
	}

	/*
	 * Create ip with a reference from dp, and add '.' and '..'
	 * references if it's a directory.
	 */
	xfs_dir_update_hook(dp, ip, 1, name);

	/*
	 * If this is a synchronous mount, make sure that the
	 * create transaction goes to disk before returning to
	 * the user.
	 */
	if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
		xfs_trans_set_sync(tp);

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * The ids of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);

	error = xfs_trans_commit(tp);
	if (error)
		goto out_release_inode;

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	*ipp = ip;
	return 0;

 out_trans_cancel:
	xfs_trans_cancel(tp);
 out_release_inode:
	/*
	 * Wait until after the current transaction is aborted to finish the
	 * setup of the inode and release the inode.  This prevents recursive
	 * transactions and deadlocks from xfs_inactive.
	 */
	if (ip) {
		xfs_finish_inode_setup(ip);
		xfs_irele(ip);
	}
 out_release_dquots:
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	if (unlock_dp_on_error)
		xfs_iunlock(dp, XFS_ILOCK_EXCL);
	return error;
}

int
xfs_create_tmpfile(
	struct mnt_idmap	*idmap,
	struct xfs_inode	*dp,
	umode_t			mode,
	struct xfs_inode	**ipp)
{
	struct xfs_mount	*mp = dp->i_mount;
	struct xfs_inode	*ip = NULL;
	struct xfs_trans	*tp = NULL;
	int			error;
	prid_t			prid;
	struct xfs_dquot	*udqp = NULL;
	struct xfs_dquot	*gdqp = NULL;
	struct xfs_dquot	*pdqp = NULL;
	struct xfs_trans_res	*tres;
	uint			resblks;
	xfs_ino_t		ino;

	if (xfs_is_shutdown(mp))
		return -EIO;

	prid = xfs_get_initial_prid(dp);

	/*
	 * Make sure that we have allocated dquot(s) on disk.
	 */
	error = xfs_qm_vop_dqalloc(dp, mapped_fsuid(idmap, &init_user_ns),
			mapped_fsgid(idmap, &init_user_ns), prid,
			XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
			&udqp, &gdqp, &pdqp);
	if (error)
		return error;

	resblks = XFS_IALLOC_SPACE_RES(mp);
	tres = &M_RES(mp)->tr_create_tmpfile;

	error = xfs_trans_alloc_icreate(mp, tres, udqp, gdqp, pdqp, resblks,
			&tp);
	if (error)
		goto out_release_dquots;

	error = xfs_dialloc(&tp, dp->i_ino, mode, &ino);
	if (!error)
		error = xfs_init_new_inode(idmap, tp, dp, ino, mode,
				0, 0, prid, false, &ip);
	if (error)
		goto out_trans_cancel;

	if (xfs_has_wsync(mp))
		xfs_trans_set_sync(tp);

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * The ids of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);

	error = xfs_iunlink(tp, ip);
	if (error)
		goto out_trans_cancel;

	error = xfs_trans_commit(tp);
	if (error)
		goto out_release_inode;

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	*ipp = ip;
	return 0;

 out_trans_cancel:
	xfs_trans_cancel(tp);
 out_release_inode:
	/*
	 * Wait until after the current transaction is aborted to finish the
	 * setup of the inode and release the inode.
	 * This prevents recursive transactions and deadlocks from
	 * xfs_inactive.
	 */
	if (ip) {
		xfs_finish_inode_setup(ip);
		xfs_irele(ip);
	}
 out_release_dquots:
	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	return error;
}

int
xfs_link(
	xfs_inode_t		*tdp,
	xfs_inode_t		*sip,
	struct xfs_name		*target_name)
{
	xfs_mount_t		*mp = tdp->i_mount;
	xfs_trans_t		*tp;
	int			error, nospace_error = 0;
	int			resblks;

	trace_xfs_link(tdp, target_name);

	ASSERT(!S_ISDIR(VFS_I(sip)->i_mode));

	if (xfs_is_shutdown(mp))
		return -EIO;
	if (xfs_ifork_zapped(tdp, XFS_DATA_FORK))
		return -EIO;

	error = xfs_qm_dqattach(sip);
	if (error)
		goto std_return;

	error = xfs_qm_dqattach(tdp);
	if (error)
		goto std_return;

	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
	error = xfs_trans_alloc_dir(tdp, &M_RES(mp)->tr_link, sip, &resblks,
			&tp, &nospace_error);
	if (error)
		goto std_return;

	/*
	 * If we are using project inheritance, we only allow hard link
	 * creation in our tree when the project IDs are the same; else
	 * the tree quota mechanism could be circumvented.
	 */
	if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
		     tdp->i_projid != sip->i_projid)) {
		/*
		 * Project quota setup skips special files which can
		 * leave inodes in a PROJINHERIT directory without a
		 * project ID set. We need to allow links to be made
		 * to these "project-less" inodes because userspace
		 * expects them to succeed after project ID setup,
		 * but everything else should be rejected.
		 */
		if (!special_file(VFS_I(sip)->i_mode) ||
		    sip->i_projid != 0) {
			error = -EXDEV;
			goto error_return;
		}
	}

	if (!resblks) {
		error = xfs_dir_canenter(tp, tdp, target_name);
		if (error)
			goto error_return;
	}

	/*
	 * Handle initial link state of O_TMPFILE inode
	 */
	if (VFS_I(sip)->i_nlink == 0) {
		struct xfs_perag	*pag;

		pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, sip->i_ino));
		error = xfs_iunlink_remove(tp, pag, sip);
		xfs_perag_put(pag);
		if (error)
			goto error_return;
	}

	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
				   resblks);
	if (error)
		goto error_return;
	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);

	xfs_bumplink(tp, sip);
	xfs_dir_update_hook(tdp, sip, 1, target_name);

	/*
	 * If this is a synchronous mount, make sure that the
	 * link transaction goes to disk before returning to
	 * the user.
	 */
	if (xfs_has_wsync(mp) || xfs_has_dirsync(mp))
		xfs_trans_set_sync(tp);

	return xfs_trans_commit(tp);

 error_return:
	xfs_trans_cancel(tp);
 std_return:
	if (error == -ENOSPC && nospace_error)
		error = nospace_error;
	return error;
}

/*
 * Clear the reflink flag and the cowblocks tag if possible.
 */
static void
xfs_itruncate_clear_reflink_flags(
	struct xfs_inode	*ip)
{
	struct xfs_ifork	*dfork;
	struct xfs_ifork	*cfork;

	if (!xfs_is_reflink_inode(ip))
		return;
	dfork = xfs_ifork_ptr(ip, XFS_DATA_FORK);
	cfork = xfs_ifork_ptr(ip, XFS_COW_FORK);
	if (dfork->if_bytes == 0 && cfork->if_bytes == 0)
		ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
	if (cfork->if_bytes == 0)
		xfs_inode_clear_cowblocks_tag(ip);
}

/*
 * Free up the underlying blocks past new_size.  The new size must be smaller
 * than the current size.  This routine can be used both for the attribute and
 * data fork, and does not modify the inode size, which is left to the caller.
 *
 * The transaction passed to this routine must have made a permanent log
 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
 * given transaction and start new ones, so make sure everything involved in
 * the transaction is tidy before calling here.  Some transaction will be
 * returned to the caller to be committed.  The incoming transaction must
 * already include the inode, and both inode locks must be held exclusively.
 * The inode must also be "held" within the transaction.  On return the inode
 * will be "held" within the returned transaction.  This routine does NOT
 * require any disk space to be reserved for it within the transaction.
 *
 * If we get an error, we must return with the inode locked and linked into the
 * current transaction. This keeps things simple for the higher level code,
 * because it always knows that the inode is locked and held in the transaction
 * that returns to it whether errors occur or not.  We don't mark the inode
 * dirty on error so that transactions can be easily aborted if possible.
 */
int
xfs_itruncate_extents_flags(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_fsize_t		new_size,
	int			flags)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp = *tpp;
	xfs_fileoff_t		first_unmap_block;
	int			error = 0;

	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
	if (atomic_read(&VFS_I(ip)->i_count))
		xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
	ASSERT(new_size <= XFS_ISIZE(ip));
	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
	ASSERT(ip->i_itemp != NULL);
	ASSERT(ip->i_itemp->ili_lock_flags == 0);
	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));

	trace_xfs_itruncate_extents_start(ip, new_size);

	flags |= xfs_bmapi_aflag(whichfork);

	/*
	 * Since it is possible for space to become allocated beyond
	 * the end of the file (in a crash where the space is allocated
	 * but the inode size is not yet updated), simply remove any
	 * blocks which show up between the new EOF and the maximum
	 * possible file size.
	 *
	 * We have to free all the blocks to the bmbt maximum offset, even if
	 * the page cache can't scale that far.
	 */
	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
	if (!xfs_verify_fileoff(mp, first_unmap_block)) {
		WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
		return 0;
	}

	error = xfs_bunmapi_range(&tp, ip, flags, first_unmap_block,
			XFS_MAX_FILEOFF);
	if (error)
		goto out;

	if (whichfork == XFS_DATA_FORK) {
		/*
		 * Remove all pending CoW reservations.
		 */
		error = xfs_reflink_cancel_cow_blocks(ip, &tp,
				first_unmap_block, XFS_MAX_FILEOFF, true);
		if (error)
			goto out;

		xfs_itruncate_clear_reflink_flags(ip);
	}

	/*
	 * Always re-log the inode so that our permanent transaction can keep
	 * on rolling it forward in the log.
	 */
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	trace_xfs_itruncate_extents_end(ip, new_size);

out:
	*tpp = tp;
	return error;
}

int
xfs_release(
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp = ip->i_mount;
	int		error = 0;

	if (!S_ISREG(VFS_I(ip)->i_mode) || (VFS_I(ip)->i_mode == 0))
		return 0;

	/* If this is a read-only mount, don't do this (would generate I/O) */
	if (xfs_is_readonly(mp))
		return 0;

	if (!xfs_is_shutdown(mp)) {
		int truncated;

		/*
		 * If we previously truncated this file and removed old data
		 * in the process, we want to initiate "early" writeout on
		 * the last close.  This is an attempt to combat the notorious
		 * NULL files problem which is particularly noticeable from a
		 * truncate down, buffered (re-)write (delalloc), followed by
		 * a crash.  What we are effectively doing here is
		 * significantly reducing the time window where we'd otherwise
		 * be exposed to that problem.
		 */
		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
		if (truncated) {
			xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
			if (ip->i_delayed_blks > 0) {
				error = filemap_flush(VFS_I(ip)->i_mapping);
				if (error)
					return error;
			}
		}
	}

	if (VFS_I(ip)->i_nlink == 0)
		return 0;

	/*
	 * If we can't get the iolock just skip truncating the blocks past EOF
	 * because we could deadlock with the mmap_lock otherwise. We'll get
	 * another chance to drop them once the last reference to the inode is
	 * dropped, so we'll never leak blocks permanently.
	 */
	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
		return 0;

	if (xfs_can_free_eofblocks(ip, false)) {
		/*
		 * If the inode is being opened, written and closed frequently
		 * and we have delayed allocation blocks outstanding
		 * (e.g. streaming writes from the NFS server), truncating the
		 * blocks past EOF will cause fragmentation to occur.
		 *
		 * In this case don't do the truncation, but we have to be
		 * careful how we detect this case. Blocks beyond EOF show up as
		 * i_delayed_blks even when the inode is clean, so we need to
		 * truncate them away first before checking for a dirty release.
		 * Hence on the first dirty close we will still remove the
		 * speculative allocation, but after that we will leave it in
		 * place.
		 */
		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
			goto out_unlock;

		error = xfs_free_eofblocks(ip);
		if (error)
			goto out_unlock;

		/* delalloc blocks after truncation means it really is dirty */
		if (ip->i_delayed_blks)
			xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
	}

out_unlock:
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
	return error;
}

/*
 * Mark all the buffers attached to this directory stale.  In theory we should
 * never be freeing a directory with any blocks at all, but this covers the
 * case where we've recovered a directory swap with a "temporary" directory
 * created by online repair and now need to dump it.
 */
STATIC void
xfs_inactive_dir(
	struct xfs_inode	*dp)
{
	struct xfs_iext_cursor	icur;
	struct xfs_bmbt_irec	got;
	struct xfs_mount	*mp = dp->i_mount;
	struct xfs_da_geometry	*geo = mp->m_dir_geo;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(dp, XFS_DATA_FORK);
	xfs_fileoff_t		off;

	/*
	 * Invalidate each directory block.  All directory blocks are of
	 * fsbcount length and alignment, so we only need to walk those same
	 * offsets.  We hold the only reference to this inode, so we must wait
	 * for the buffer locks.
	 */
	for_each_xfs_iext(ifp, &icur, &got) {
		for (off = round_up(got.br_startoff, geo->fsbcount);
		     off < got.br_startoff + got.br_blockcount;
		     off += geo->fsbcount) {
			struct xfs_buf	*bp = NULL;
			xfs_fsblock_t	fsbno;
			int		error;

			fsbno = (off - got.br_startoff) + got.br_startblock;
			error = xfs_buf_incore(mp->m_ddev_targp,
					XFS_FSB_TO_DADDR(mp, fsbno),
					XFS_FSB_TO_BB(mp, geo->fsbcount),
					XBF_LIVESCAN, &bp);
			if (error)
				continue;

			xfs_buf_stale(bp);
			xfs_buf_relse(bp);
		}
	}
}

/*
 * xfs_inactive_truncate
 *
 * Called to perform a truncate when an inode becomes unlinked.
 */
STATIC int
xfs_inactive_truncate(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
	if (error) {
		ASSERT(xfs_is_shutdown(mp));
		return error;
	}
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	/*
	 * Log the inode size first to prevent stale data exposure in the event
	 * of a system crash before the truncate completes. See the related
	 * comment in xfs_vn_setattr_size() for details.
	 */
	ip->i_disk_size = 0;
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
	if (error)
		goto error_trans_cancel;

	ASSERT(ip->i_df.if_nextents == 0);

	error = xfs_trans_commit(tp);
	if (error)
		goto error_unlock;

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return 0;

error_trans_cancel:
	xfs_trans_cancel(tp);
error_unlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}

/*
 * xfs_inactive_ifree()
 *
 * Perform the inode free when an inode is unlinked.
 */
STATIC int
xfs_inactive_ifree(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;

	/*
	 * We try to use a per-AG reservation for any block needed by the
	 * finobt tree, but as the finobt feature predates the per-AG
	 * reservation support a degraded file system might not have enough
	 * space for the reservation at mount time.  In that case try to dip
	 * into the reserved pool and pray.
	 *
	 * Send a warning if the reservation does happen to fail, as the inode
	 * now remains allocated and sits on the unlinked list until the fs is
	 * repaired.
	 */
	if (unlikely(mp->m_finobt_nores)) {
		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
				XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
				&tp);
	} else {
		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
	}
	if (error) {
		if (error == -ENOSPC) {
			xfs_warn_ratelimited(mp,
			"Failed to remove inode(s) from unlinked list. "
			"Please free space, unmount and run xfs_repair.");
		} else {
			ASSERT(xfs_is_shutdown(mp));
		}
		return error;
	}

	/*
	 * We do not hold the inode locked across the entire rolling transaction
	 * here. We only need to hold it for the first transaction that
	 * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
	 * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
	 * here breaks the relationship between cluster buffer invalidation and
	 * stale inode invalidation on cluster buffer item journal commit
	 * completion, and can result in leaving dirty stale inodes hanging
	 * around in memory.
	 *
	 * We have no need for serialising this inode operation against other
	 * operations - we freed the inode and hence reallocation is required
	 * and that will serialise on reallocating the space the deferops need
	 * to free. Hence we can unlock the inode on the first commit of
	 * the transaction rather than roll it right through the deferops. This
	 * avoids relogging the XFS_ISTALE inode.
	 *
	 * We check that xfs_ifree() hasn't grown an internal transaction roll
	 * by asserting that the inode is still locked when it returns.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	error = xfs_ifree(tp, ip);
	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
	if (error) {
		/*
		 * If we fail to free the inode, shut down.  The cancel
		 * might do that, we need to make sure.  Otherwise the
		 * inode might be lost for a long time or forever.
		 */
		if (!xfs_is_shutdown(mp)) {
			xfs_notice(mp, "%s: xfs_ifree returned error %d",
				__func__, error);
			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		}
		xfs_trans_cancel(tp);
		return error;
	}

	/*
	 * Credit the quota account(s). The inode is gone.
	 */
	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);

	return xfs_trans_commit(tp);
}

/*
 * Returns true if we need to update the on-disk metadata before we can free
 * the memory used by this inode.  Updates include freeing post-eof
 * preallocations; freeing COW staging extents; and marking the inode free in
 * the inobt if it is on the unlinked list.
 */
bool
xfs_inode_needs_inactive(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_ifork	*cow_ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);

	/*
	 * If the inode is already free, then there can be nothing
	 * to clean up here.
	 */
	if (VFS_I(ip)->i_mode == 0)
		return false;

	/*
	 * If this is a read-only mount, don't do this (would generate I/O)
	 * unless we're in log recovery and cleaning the iunlinked list.
	 */
	if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
		return false;

	/* If the log isn't running, push inodes straight to reclaim. */
	if (xfs_is_shutdown(mp) || xfs_has_norecovery(mp))
		return false;

	/*
	 * Metadata inodes require explicit resource cleanup.
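	 * (For example, the realtime bitmap/summary and quota inodes are
	 * released by their own teardown paths rather than by inactivation.)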
	 */
	if (xfs_is_metadata_inode(ip))
		return false;

	/* Want to clean out the cow blocks if there are any. */
	if (cow_ifp && cow_ifp->if_bytes > 0)
		return true;

	/* Unlinked files must be freed. */
	if (VFS_I(ip)->i_nlink == 0)
		return true;

	/*
	 * This file isn't being freed, so check if there are post-eof blocks
	 * to free.  @force is true because we are evicting an inode from the
	 * cache.  Post-eof blocks must be freed, lest we end up with broken
	 * free space accounting.
	 *
	 * Note: don't bother with iolock here since lockdep complains about
	 * acquiring it in reclaim context.  We have the only reference to the
	 * inode at this point anyways.
	 */
	return xfs_can_free_eofblocks(ip, true);
}

/*
 * Save health status somewhere, if we're dumping an inode with uncorrected
 * errors and online repair isn't running.
 */
static inline void
xfs_inactive_health(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;
	unsigned int		sick;
	unsigned int		checked;

	xfs_inode_measure_sickness(ip, &sick, &checked);
	if (!sick)
		return;

	trace_xfs_inode_unfixed_corruption(ip, sick);

	if (sick & XFS_SICK_INO_FORGET)
		return;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	if (!pag) {
		/* There had better still be a perag structure! */
		ASSERT(0);
		return;
	}

	xfs_ag_mark_sick(pag, XFS_SICK_AG_INODES);
	xfs_perag_put(pag);
}

/*
 * xfs_inactive
 *
 * This is called when the vnode reference count for the vnode
 * goes to zero.  If the file has been unlinked, then it must
 * now be truncated.  Also, we clear all of the read-ahead state
 * kept for the inode here since the file is now closed.
 */
int
xfs_inactive(
	xfs_inode_t	*ip)
{
	struct xfs_mount	*mp;
	int			error = 0;
	int			truncate = 0;

	/*
	 * If the inode is already free, then there can be nothing
	 * to clean up here.
	 */
	if (VFS_I(ip)->i_mode == 0) {
		ASSERT(ip->i_df.if_broot_bytes == 0);
		goto out;
	}

	mp = ip->i_mount;
	ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));

	xfs_inactive_health(ip);

	/*
	 * If this is a read-only mount, don't do this (would generate I/O)
	 * unless we're in log recovery and cleaning the iunlinked list.
	 */
	if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
		goto out;

	/* Metadata inodes require explicit resource cleanup. */
	if (xfs_is_metadata_inode(ip))
		goto out;

	/* Try to clean out the cow blocks if there are any. */
	if (xfs_inode_has_cow_data(ip))
		xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, true);

	if (VFS_I(ip)->i_nlink != 0) {
		/*
		 * force is true because we are evicting an inode from the
		 * cache. Post-eof blocks must be freed, lest we end up with
		 * broken free space accounting.
		 *
		 * Note: don't bother with iolock here since lockdep complains
		 * about acquiring it in reclaim context. We have the only
		 * reference to the inode at this point anyways.
1882 */ 1883 if (xfs_can_free_eofblocks(ip, true)) 1884 error = xfs_free_eofblocks(ip); 1885 1886 goto out; 1887 } 1888 1889 if (S_ISREG(VFS_I(ip)->i_mode) && 1890 (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 || 1891 ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) 1892 truncate = 1; 1893 1894 if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) { 1895 /* 1896 * If this inode is being inactivated during a quotacheck and 1897 * has not yet been scanned by quotacheck, we /must/ remove 1898 * the dquots from the inode before inactivation changes the 1899 * block and inode counts. Most probably this is a result of 1900 * reloading the incore iunlinked list to purge unrecovered 1901 * unlinked inodes. 1902 */ 1903 xfs_qm_dqdetach(ip); 1904 } else { 1905 error = xfs_qm_dqattach(ip); 1906 if (error) 1907 goto out; 1908 } 1909 1910 if (S_ISDIR(VFS_I(ip)->i_mode) && ip->i_df.if_nextents > 0) { 1911 xfs_inactive_dir(ip); 1912 truncate = 1; 1913 } 1914 1915 if (S_ISLNK(VFS_I(ip)->i_mode)) 1916 error = xfs_inactive_symlink(ip); 1917 else if (truncate) 1918 error = xfs_inactive_truncate(ip); 1919 if (error) 1920 goto out; 1921 1922 /* 1923 * If there are attributes associated with the file then blow them away 1924 * now. The code calls a routine that recursively deconstructs the 1925 * attribute fork. It also blows away the in-core attribute fork. 1926 */ 1927 if (xfs_inode_has_attr_fork(ip)) { 1928 error = xfs_attr_inactive(ip); 1929 if (error) 1930 goto out; 1931 } 1932 1933 ASSERT(ip->i_forkoff == 0); 1934 1935 /* 1936 * Free the inode. 1937 */ 1938 error = xfs_inactive_ifree(ip); 1939 1940 out: 1941 /* 1942 * We're done making metadata updates for this inode, so we can release 1943 * the attached dquots. 1944 */ 1945 xfs_qm_dqdetach(ip); 1946 return error; 1947 } 1948 1949 /* 1950 * In-Core Unlinked List Lookups 1951 * ============================= 1952 * 1953 * Every inode is supposed to be reachable from some other piece of metadata 1954 * with the exception of the root directory. Inodes with a connection to a 1955 * file descriptor but not linked from anywhere in the on-disk directory tree 1956 * are collectively known as unlinked inodes, though the filesystem itself 1957 * maintains links to these inodes so that on-disk metadata are consistent. 1958 * 1959 * XFS implements a per-AG on-disk hash table of unlinked inodes. The AGI 1960 * header contains a number of buckets that point to an inode, and each inode 1961 * record has a pointer to the next inode in the hash chain. This 1962 * singly-linked list causes scaling problems in the iunlink remove function 1963 * because we must walk that list to find the inode that points to the inode 1964 * being removed from the unlinked hash bucket list. 1965 * 1966 * Hence we keep an in-memory double linked list to link each inode on an 1967 * unlinked list. Because there are 64 unlinked lists per AGI, keeping pointer 1968 * based lists would require having 64 list heads in the perag, one for each 1969 * list. This is expensive in terms of memory (think millions of AGs) and cache 1970 * misses on lookups. Instead, use the fact that inodes on the unlinked list 1971 * must be referenced at the VFS level to keep them on the list and hence we 1972 * have an existence guarantee for inodes on the unlinked list. 1973 * 1974 * Given we have an existence guarantee, we can use lockless inode cache lookups 1975 * to resolve aginos to xfs inodes.
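 *
 * As an illustrative sketch (the aginos below are made up), one bucket with
 * three unlinked inodes looks like this once all three are in memory; the
 * aginos were chosen so they all hash to the same bucket
 * (agino % XFS_AGI_UNLINKED_BUCKETS, 64 buckets as noted above):
 *
 *	agi_unlinked[12] = 140
 *
 *	inode 140: i_prev_unlinked = NULLAGINO   i_next_unlinked = 76
 *	inode  76: i_prev_unlinked = 140         i_next_unlinked = 12
 *	inode  12: i_prev_unlinked = 76          i_next_unlinked = NULLAGINO
 *
 * Only the bucket head and the forward (next) aginos exist on disk; the
 * backwards links live purely in the incore inodes.
 *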
This means we only need 8 bytes per inode 1976 * for the double linked unlinked list, and we don't need any extra locking to 1977 * keep the list safe as all manipulations are done under the AGI buffer lock. 1978 * Keeping the list up to date does not require memory allocation, just finding 1979 * the XFS inode and updating the next/prev unlinked list aginos. 1980 */ 1981 1982 /* 1983 * Find an inode on the unlinked list. This does not take references to the 1984 * inode as we have existence guarantees by holding the AGI buffer lock and that 1985 * only unlinked, referenced inodes can be on the unlinked inode list. If we 1986 * don't find the inode in cache, then let the caller handle the situation. 1987 */ 1988 static struct xfs_inode * 1989 xfs_iunlink_lookup( 1990 struct xfs_perag *pag, 1991 xfs_agino_t agino) 1992 { 1993 struct xfs_inode *ip; 1994 1995 rcu_read_lock(); 1996 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 1997 if (!ip) { 1998 /* Caller can handle inode not being in memory. */ 1999 rcu_read_unlock(); 2000 return NULL; 2001 } 2002 2003 /* 2004 * Inode in RCU freeing limbo should not happen. Warn about this and 2005 * let the caller handle the failure. 2006 */ 2007 if (WARN_ON_ONCE(!ip->i_ino)) { 2008 rcu_read_unlock(); 2009 return NULL; 2010 } 2011 ASSERT(!xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)); 2012 rcu_read_unlock(); 2013 return ip; 2014 } 2015 2016 /* 2017 * Update the prev pointer of the next agino. Returns -ENOLINK if the inode 2018 * is not in cache. 2019 */ 2020 static int 2021 xfs_iunlink_update_backref( 2022 struct xfs_perag *pag, 2023 xfs_agino_t prev_agino, 2024 xfs_agino_t next_agino) 2025 { 2026 struct xfs_inode *ip; 2027 2028 /* No update necessary if we are at the end of the list. */ 2029 if (next_agino == NULLAGINO) 2030 return 0; 2031 2032 ip = xfs_iunlink_lookup(pag, next_agino); 2033 if (!ip) 2034 return -ENOLINK; 2035 2036 ip->i_prev_unlinked = prev_agino; 2037 return 0; 2038 } 2039 2040 /* 2041 * Point the AGI unlinked bucket at an inode and log the results. The caller 2042 * is responsible for validating the old value. 2043 */ 2044 STATIC int 2045 xfs_iunlink_update_bucket( 2046 struct xfs_trans *tp, 2047 struct xfs_perag *pag, 2048 struct xfs_buf *agibp, 2049 unsigned int bucket_index, 2050 xfs_agino_t new_agino) 2051 { 2052 struct xfs_agi *agi = agibp->b_addr; 2053 xfs_agino_t old_value; 2054 int offset; 2055 2056 ASSERT(xfs_verify_agino_or_null(pag, new_agino)); 2057 2058 old_value = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2059 trace_xfs_iunlink_update_bucket(tp->t_mountp, pag->pag_agno, bucket_index, 2060 old_value, new_agino); 2061 2062 /* 2063 * We should never find the head of the list already set to the value 2064 * passed in because either we're adding or removing ourselves from the 2065 * head of the list. 2066 */ 2067 if (old_value == new_agino) { 2068 xfs_buf_mark_corrupt(agibp); 2069 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2070 return -EFSCORRUPTED; 2071 } 2072 2073 agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino); 2074 offset = offsetof(struct xfs_agi, agi_unlinked) + 2075 (sizeof(xfs_agino_t) * bucket_index); 2076 xfs_trans_log_buf(tp, agibp, offset, offset + sizeof(xfs_agino_t) - 1); 2077 return 0; 2078 } 2079 2080 /* 2081 * Load the inode @next_agino into the cache and set its prev_unlinked pointer 2082 * to @prev_agino. Caller must hold the AGI to synchronize with other changes 2083 * to the unlinked list. 
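 *
 * A sketch of how the callers below use this helper (this is the pattern in
 * the insert path; the remove path is analogous): try the incore backref
 * update first and only reload from disk if the next inode is not in cache:
 *
 *	error = xfs_iunlink_update_backref(pag, agino, next_agino);
 *	if (error == -ENOLINK)
 *		error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);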
2084 */ 2085 STATIC int 2086 xfs_iunlink_reload_next( 2087 struct xfs_trans *tp, 2088 struct xfs_buf *agibp, 2089 xfs_agino_t prev_agino, 2090 xfs_agino_t next_agino) 2091 { 2092 struct xfs_perag *pag = agibp->b_pag; 2093 struct xfs_mount *mp = pag->pag_mount; 2094 struct xfs_inode *next_ip = NULL; 2095 xfs_ino_t ino; 2096 int error; 2097 2098 ASSERT(next_agino != NULLAGINO); 2099 2100 #ifdef DEBUG 2101 rcu_read_lock(); 2102 next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino); 2103 ASSERT(next_ip == NULL); 2104 rcu_read_unlock(); 2105 #endif 2106 2107 xfs_info_ratelimited(mp, 2108 "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.", 2109 next_agino, pag->pag_agno); 2110 2111 /* 2112 * Use an untrusted lookup just to be cautious in case the AGI has been 2113 * corrupted and now points at a free inode. That shouldn't happen, 2114 * but we'd rather shut down now since we're already running in a weird 2115 * situation. 2116 */ 2117 ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino); 2118 error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip); 2119 if (error) { 2120 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2121 return error; 2122 } 2123 2124 /* If this is not an unlinked inode, something is very wrong. */ 2125 if (VFS_I(next_ip)->i_nlink != 0) { 2126 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2127 error = -EFSCORRUPTED; 2128 goto rele; 2129 } 2130 2131 next_ip->i_prev_unlinked = prev_agino; 2132 trace_xfs_iunlink_reload_next(next_ip); 2133 rele: 2134 ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE)); 2135 if (xfs_is_quotacheck_running(mp) && next_ip) 2136 xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED); 2137 xfs_irele(next_ip); 2138 return error; 2139 } 2140 2141 static int 2142 xfs_iunlink_insert_inode( 2143 struct xfs_trans *tp, 2144 struct xfs_perag *pag, 2145 struct xfs_buf *agibp, 2146 struct xfs_inode *ip) 2147 { 2148 struct xfs_mount *mp = tp->t_mountp; 2149 struct xfs_agi *agi = agibp->b_addr; 2150 xfs_agino_t next_agino; 2151 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 2152 short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 2153 int error; 2154 2155 /* 2156 * Get the index into the agi hash table for the list this inode will 2157 * go on. Make sure the pointer isn't garbage and that this inode 2158 * isn't already on the list. 2159 */ 2160 next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2161 if (next_agino == agino || 2162 !xfs_verify_agino_or_null(pag, next_agino)) { 2163 xfs_buf_mark_corrupt(agibp); 2164 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2165 return -EFSCORRUPTED; 2166 } 2167 2168 /* 2169 * Update the prev pointer in the next inode to point back to this 2170 * inode. 2171 */ 2172 error = xfs_iunlink_update_backref(pag, agino, next_agino); 2173 if (error == -ENOLINK) 2174 error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino); 2175 if (error) 2176 return error; 2177 2178 if (next_agino != NULLAGINO) { 2179 /* 2180 * There is already another inode in the bucket, so point this 2181 * inode to the current head of the list. 2182 */ 2183 error = xfs_iunlink_log_inode(tp, ip, pag, next_agino); 2184 if (error) 2185 return error; 2186 ip->i_next_unlinked = next_agino; 2187 } 2188 2189 /* Point the head of the list to point to this inode. */ 2190 ip->i_prev_unlinked = NULLAGINO; 2191 return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); 2192 } 2193 2194 /* 2195 * This is called when the inode's link count has gone to 0 or we are creating 2196 * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. 
2197 * 2198 * We place the on-disk inode on a list in the AGI. It will be pulled from this 2199 * list when the inode is freed. 2200 */ 2201 int 2202 xfs_iunlink( 2203 struct xfs_trans *tp, 2204 struct xfs_inode *ip) 2205 { 2206 struct xfs_mount *mp = tp->t_mountp; 2207 struct xfs_perag *pag; 2208 struct xfs_buf *agibp; 2209 int error; 2210 2211 ASSERT(VFS_I(ip)->i_nlink == 0); 2212 ASSERT(VFS_I(ip)->i_mode != 0); 2213 trace_xfs_iunlink(ip); 2214 2215 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2216 2217 /* Get the agi buffer first. It ensures lock ordering on the list. */ 2218 error = xfs_read_agi(pag, tp, 0, &agibp); 2219 if (error) 2220 goto out; 2221 2222 error = xfs_iunlink_insert_inode(tp, pag, agibp, ip); 2223 out: 2224 xfs_perag_put(pag); 2225 return error; 2226 } 2227 2228 static int 2229 xfs_iunlink_remove_inode( 2230 struct xfs_trans *tp, 2231 struct xfs_perag *pag, 2232 struct xfs_buf *agibp, 2233 struct xfs_inode *ip) 2234 { 2235 struct xfs_mount *mp = tp->t_mountp; 2236 struct xfs_agi *agi = agibp->b_addr; 2237 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); 2238 xfs_agino_t head_agino; 2239 short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; 2240 int error; 2241 2242 trace_xfs_iunlink_remove(ip); 2243 2244 /* 2245 * Get the index into the agi hash table for the list this inode will 2246 * go on. Make sure the head pointer isn't garbage. 2247 */ 2248 head_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]); 2249 if (!xfs_verify_agino(pag, head_agino)) { 2250 XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, 2251 agi, sizeof(*agi)); 2252 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI); 2253 return -EFSCORRUPTED; 2254 } 2255 2256 /* 2257 * Set our inode's next_unlinked pointer to NULL and then return 2258 * the old pointer value so that we can update whatever was previous 2259 * to us in the list to point to whatever was next in the list. 2260 */ 2261 error = xfs_iunlink_log_inode(tp, ip, pag, NULLAGINO); 2262 if (error) 2263 return error; 2264 2265 /* 2266 * Update the prev pointer in the next inode to point back to previous 2267 * inode in the chain. 2268 */ 2269 error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, 2270 ip->i_next_unlinked); 2271 if (error == -ENOLINK) 2272 error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked, 2273 ip->i_next_unlinked); 2274 if (error) 2275 return error; 2276 2277 if (head_agino != agino) { 2278 struct xfs_inode *prev_ip; 2279 2280 prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked); 2281 if (!prev_ip) { 2282 xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); 2283 return -EFSCORRUPTED; 2284 } 2285 2286 error = xfs_iunlink_log_inode(tp, prev_ip, pag, 2287 ip->i_next_unlinked); 2288 prev_ip->i_next_unlinked = ip->i_next_unlinked; 2289 } else { 2290 /* Point the head of the list to the next unlinked inode. */ 2291 error = xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, 2292 ip->i_next_unlinked); 2293 } 2294 2295 ip->i_next_unlinked = NULLAGINO; 2296 ip->i_prev_unlinked = 0; 2297 return error; 2298 } 2299 2300 /* 2301 * Pull the on-disk inode from the AGI unlinked list. 2302 */ 2303 int 2304 xfs_iunlink_remove( 2305 struct xfs_trans *tp, 2306 struct xfs_perag *pag, 2307 struct xfs_inode *ip) 2308 { 2309 struct xfs_buf *agibp; 2310 int error; 2311 2312 trace_xfs_iunlink_remove(ip); 2313 2314 /* Get the agi buffer first. It ensures lock ordering on the list. 
*/ 2315 error = xfs_read_agi(pag, tp, 0, &agibp); 2316 if (error) 2317 return error; 2318 2319 return xfs_iunlink_remove_inode(tp, pag, agibp, ip); 2320 } 2321 2322 /* 2323 * Look up the inode number specified and if it is not already marked XFS_ISTALE 2324 * mark it stale. We should only find clean inodes in this lookup that aren't 2325 * already stale. 2326 */ 2327 static void 2328 xfs_ifree_mark_inode_stale( 2329 struct xfs_perag *pag, 2330 struct xfs_inode *free_ip, 2331 xfs_ino_t inum) 2332 { 2333 struct xfs_mount *mp = pag->pag_mount; 2334 struct xfs_inode_log_item *iip; 2335 struct xfs_inode *ip; 2336 2337 retry: 2338 rcu_read_lock(); 2339 ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum)); 2340 2341 /* Inode not in memory, nothing to do */ 2342 if (!ip) { 2343 rcu_read_unlock(); 2344 return; 2345 } 2346 2347 /* 2348 * because this is an RCU protected lookup, we could find a recently 2349 * freed or even reallocated inode during the lookup. We need to check 2350 * under the i_flags_lock for a valid inode here. Skip it if it is not 2351 * valid, the wrong inode or stale. 2352 */ 2353 spin_lock(&ip->i_flags_lock); 2354 if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) 2355 goto out_iflags_unlock; 2356 2357 /* 2358 * Don't try to lock/unlock the current inode, but we _cannot_ skip the 2359 * other inodes that we did not find in the list attached to the buffer 2360 * and are not already marked stale. If we can't lock it, back off and 2361 * retry. 2362 */ 2363 if (ip != free_ip) { 2364 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2365 spin_unlock(&ip->i_flags_lock); 2366 rcu_read_unlock(); 2367 delay(1); 2368 goto retry; 2369 } 2370 } 2371 ip->i_flags |= XFS_ISTALE; 2372 2373 /* 2374 * If the inode is flushing, it is already attached to the buffer. All 2375 * we needed to do here is mark the inode stale so buffer IO completion 2376 * will remove it from the AIL. 2377 */ 2378 iip = ip->i_itemp; 2379 if (__xfs_iflags_test(ip, XFS_IFLUSHING)) { 2380 ASSERT(!list_empty(&iip->ili_item.li_bio_list)); 2381 ASSERT(iip->ili_last_fields); 2382 goto out_iunlock; 2383 } 2384 2385 /* 2386 * Inodes not attached to the buffer can be released immediately. 2387 * Everything else has to go through xfs_iflush_abort() on journal 2388 * commit as the flock synchronises removal of the inode from the 2389 * cluster buffer against inode reclaim. 2390 */ 2391 if (!iip || list_empty(&iip->ili_item.li_bio_list)) 2392 goto out_iunlock; 2393 2394 __xfs_iflags_set(ip, XFS_IFLUSHING); 2395 spin_unlock(&ip->i_flags_lock); 2396 rcu_read_unlock(); 2397 2398 /* we have a dirty inode in memory that has not yet been flushed. */ 2399 spin_lock(&iip->ili_lock); 2400 iip->ili_last_fields = iip->ili_fields; 2401 iip->ili_fields = 0; 2402 iip->ili_fsync_fields = 0; 2403 spin_unlock(&iip->ili_lock); 2404 ASSERT(iip->ili_last_fields); 2405 2406 if (ip != free_ip) 2407 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2408 return; 2409 2410 out_iunlock: 2411 if (ip != free_ip) 2412 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2413 out_iflags_unlock: 2414 spin_unlock(&ip->i_flags_lock); 2415 rcu_read_unlock(); 2416 } 2417 2418 /* 2419 * A big issue when freeing the inode cluster is that we _cannot_ skip any 2420 * inodes that are in memory - they all must be marked stale and attached to 2421 * the cluster buffer. 
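 *
 * To put the loop below in concrete terms with purely illustrative geometry:
 * if an inode chunk spans 8 fs blocks (igeo->ialloc_blks) and each cluster
 * buffer covers 4 blocks (igeo->blocks_per_cluster) holding 32 inodes
 * (igeo->inodes_per_cluster), then nbufs = 8 / 4 = 2 and we walk the chunk
 * in two steps, advancing inum by 32 inodes per cluster buffer.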
2422 */ 2423 static int 2424 xfs_ifree_cluster( 2425 struct xfs_trans *tp, 2426 struct xfs_perag *pag, 2427 struct xfs_inode *free_ip, 2428 struct xfs_icluster *xic) 2429 { 2430 struct xfs_mount *mp = free_ip->i_mount; 2431 struct xfs_ino_geometry *igeo = M_IGEO(mp); 2432 struct xfs_buf *bp; 2433 xfs_daddr_t blkno; 2434 xfs_ino_t inum = xic->first_ino; 2435 int nbufs; 2436 int i, j; 2437 int ioffset; 2438 int error; 2439 2440 nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster; 2441 2442 for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) { 2443 /* 2444 * The allocation bitmap tells us which inodes of the chunk were 2445 * physically allocated. Skip the cluster if an inode falls into 2446 * a sparse region. 2447 */ 2448 ioffset = inum - xic->first_ino; 2449 if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { 2450 ASSERT(ioffset % igeo->inodes_per_cluster == 0); 2451 continue; 2452 } 2453 2454 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 2455 XFS_INO_TO_AGBNO(mp, inum)); 2456 2457 /* 2458 * We obtain and lock the backing buffer first in the process 2459 * here to ensure dirty inodes attached to the buffer remain in 2460 * the flushing state while we mark them stale. 2461 * 2462 * If we scan the in-memory inodes first, then buffer IO can 2463 * complete before we get a lock on it, and hence we may fail 2464 * to mark all the active inodes on the buffer stale. 2465 */ 2466 error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2467 mp->m_bsize * igeo->blocks_per_cluster, 2468 XBF_UNMAPPED, &bp); 2469 if (error) 2470 return error; 2471 2472 /* 2473 * This buffer may not have been correctly initialised as we 2474 * didn't read it from disk. That's not important because we are 2475 * only using it to mark the buffer as stale in the log, and to 2476 * attach stale cached inodes on it. That means it will never be 2477 * dispatched for IO. If it is, we want to know about it, and we 2478 * want it to fail. We can achieve this by adding a write 2479 * verifier to the buffer. 2480 */ 2481 bp->b_ops = &xfs_inode_buf_ops; 2482 2483 /* 2484 * Now we need to set all the cached clean inodes as XFS_ISTALE, 2485 * too. This requires lookups, and will skip inodes that we've 2486 * already marked XFS_ISTALE. 2487 */ 2488 for (i = 0; i < igeo->inodes_per_cluster; i++) 2489 xfs_ifree_mark_inode_stale(pag, free_ip, inum + i); 2490 2491 xfs_trans_stale_inode_buf(tp, bp); 2492 xfs_trans_binval(tp, bp); 2493 } 2494 return 0; 2495 } 2496 2497 /* 2498 * This is called to return an inode to the inode free list. The inode should 2499 * already be truncated to 0 length and have no pages associated with it. This 2500 * routine also assumes that the inode is already a part of the transaction. 2501 * 2502 * The on-disk copy of the inode will have been added to the list of unlinked 2503 * inodes in the AGI. We need to remove the inode from that list atomically with 2504 * respect to freeing it here.
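 *
 * A condensed sketch of how xfs_inactive_ifree() above drives this function
 * (error handling and the finobt reservation fallback elided):
 *
 *	xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 *	error = xfs_ifree(tp, ip);
 *	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
 *	error = xfs_trans_commit(tp);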
2505 */ 2506 int 2507 xfs_ifree( 2508 struct xfs_trans *tp, 2509 struct xfs_inode *ip) 2510 { 2511 struct xfs_mount *mp = ip->i_mount; 2512 struct xfs_perag *pag; 2513 struct xfs_icluster xic = { 0 }; 2514 struct xfs_inode_log_item *iip = ip->i_itemp; 2515 int error; 2516 2517 xfs_assert_ilocked(ip, XFS_ILOCK_EXCL); 2518 ASSERT(VFS_I(ip)->i_nlink == 0); 2519 ASSERT(ip->i_df.if_nextents == 0); 2520 ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode)); 2521 ASSERT(ip->i_nblocks == 0); 2522 2523 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2524 2525 /* 2526 * Free the inode first so that we guarantee that the AGI lock is going 2527 * to be taken before we remove the inode from the unlinked list. This 2528 * makes the AGI lock -> unlinked list modification order the same as 2529 * used in O_TMPFILE creation. 2530 */ 2531 error = xfs_difree(tp, pag, ip->i_ino, &xic); 2532 if (error) 2533 goto out; 2534 2535 error = xfs_iunlink_remove(tp, pag, ip); 2536 if (error) 2537 goto out; 2538 2539 /* 2540 * Free any local-format data sitting around before we reset the 2541 * data fork to extents format. Note that the attr fork data has 2542 * already been freed by xfs_attr_inactive. 2543 */ 2544 if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) { 2545 kfree(ip->i_df.if_data); 2546 ip->i_df.if_data = NULL; 2547 ip->i_df.if_bytes = 0; 2548 } 2549 2550 VFS_I(ip)->i_mode = 0; /* mark incore inode as free */ 2551 ip->i_diflags = 0; 2552 ip->i_diflags2 = mp->m_ino_geo.new_diflags2; 2553 ip->i_forkoff = 0; /* mark the attr fork not in use */ 2554 ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS; 2555 if (xfs_iflags_test(ip, XFS_IPRESERVE_DM_FIELDS)) 2556 xfs_iflags_clear(ip, XFS_IPRESERVE_DM_FIELDS); 2557 2558 /* Don't attempt to replay owner changes for a deleted inode */ 2559 spin_lock(&iip->ili_lock); 2560 iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER); 2561 spin_unlock(&iip->ili_lock); 2562 2563 /* 2564 * Bump the generation count so no one will be confused 2565 * by reincarnations of this inode. 2566 */ 2567 VFS_I(ip)->i_generation++; 2568 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2569 2570 if (xic.deleted) 2571 error = xfs_ifree_cluster(tp, pag, ip, &xic); 2572 out: 2573 xfs_perag_put(pag); 2574 return error; 2575 } 2576 2577 /* 2578 * This is called to unpin an inode. The caller must have the inode locked 2579 * in at least shared mode so that the buffer cannot be subsequently pinned 2580 * once someone is waiting for it to be unpinned. 
2581 */ 2582 static void 2583 xfs_iunpin( 2584 struct xfs_inode *ip) 2585 { 2586 xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); 2587 2588 trace_xfs_inode_unpin_nowait(ip, _RET_IP_); 2589 2590 /* Give the log a push to start the unpinning I/O */ 2591 xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL); 2592 2593 } 2594 2595 static void 2596 __xfs_iunpin_wait( 2597 struct xfs_inode *ip) 2598 { 2599 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT); 2600 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT); 2601 2602 xfs_iunpin(ip); 2603 2604 do { 2605 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 2606 if (xfs_ipincount(ip)) 2607 io_schedule(); 2608 } while (xfs_ipincount(ip)); 2609 finish_wait(wq, &wait.wq_entry); 2610 } 2611 2612 void 2613 xfs_iunpin_wait( 2614 struct xfs_inode *ip) 2615 { 2616 if (xfs_ipincount(ip)) 2617 __xfs_iunpin_wait(ip); 2618 } 2619 2620 /* 2621 * Removing an inode from the namespace involves removing the directory entry 2622 * and dropping the link count on the inode. Removing the directory entry can 2623 * result in locking an AGF (directory blocks were freed) and removing a link 2624 * count can result in placing the inode on an unlinked list which results in 2625 * locking an AGI. 2626 * 2627 * The big problem here is that we have an ordering constraint on AGF and AGI 2628 * locking - inode allocation locks the AGI, then can allocate a new extent for 2629 * new inodes, locking the AGF after the AGI. Similarly, freeing the inode 2630 * removes the inode from the unlinked list, requiring that we lock the AGI 2631 * first, and then freeing the inode can result in an inode chunk being freed 2632 * and hence freeing disk space requiring that we lock an AGF. 2633 * 2634 * Hence the ordering that is imposed by other parts of the code is AGI before 2635 * AGF. This means we cannot remove the directory entry before we drop the inode 2636 * reference count and put it on the unlinked list as this results in a lock 2637 * order of AGF then AGI, and this can deadlock against inode allocation and 2638 * freeing. Therefore we must drop the link counts before we remove the 2639 * directory entry. 2640 * 2641 * This is still safe from a transactional point of view - it is not until we 2642 * get to xfs_defer_finish() that we have the possibility of multiple 2643 * transactions in this operation. Hence as long as we remove the directory 2644 * entry and drop the link count in the first transaction of the remove 2645 * operation, there are no transactional constraints on the ordering here. 2646 */ 2647 int 2648 xfs_remove( 2649 xfs_inode_t *dp, 2650 struct xfs_name *name, 2651 xfs_inode_t *ip) 2652 { 2653 xfs_mount_t *mp = dp->i_mount; 2654 xfs_trans_t *tp = NULL; 2655 int is_dir = S_ISDIR(VFS_I(ip)->i_mode); 2656 int dontcare; 2657 int error = 0; 2658 uint resblks; 2659 2660 trace_xfs_remove(dp, name); 2661 2662 if (xfs_is_shutdown(mp)) 2663 return -EIO; 2664 if (xfs_ifork_zapped(dp, XFS_DATA_FORK)) 2665 return -EIO; 2666 2667 error = xfs_qm_dqattach(dp); 2668 if (error) 2669 goto std_return; 2670 2671 error = xfs_qm_dqattach(ip); 2672 if (error) 2673 goto std_return; 2674 2675 /* 2676 * We try to get the real space reservation first, allowing for 2677 * directory btree deletion(s) implying possible bmap insert(s). 
If we 2678 * can't get the space reservation then we use 0 instead, and avoid the 2679 * bmap btree insert(s) in the directory code by, if the bmap insert 2680 * tries to happen, instead trimming the LAST block from the directory. 2681 * 2682 * Ignore EDQUOT and ENOSPC being returned via nospace_error because 2683 * the directory code can handle a reservationless update and we don't 2684 * want to prevent a user from trying to free space by deleting things. 2685 */ 2686 resblks = XFS_REMOVE_SPACE_RES(mp); 2687 error = xfs_trans_alloc_dir(dp, &M_RES(mp)->tr_remove, ip, &resblks, 2688 &tp, &dontcare); 2689 if (error) { 2690 ASSERT(error != -ENOSPC); 2691 goto std_return; 2692 } 2693 2694 /* 2695 * If we're removing a directory perform some additional validation. 2696 */ 2697 if (is_dir) { 2698 ASSERT(VFS_I(ip)->i_nlink >= 2); 2699 if (VFS_I(ip)->i_nlink != 2) { 2700 error = -ENOTEMPTY; 2701 goto out_trans_cancel; 2702 } 2703 if (!xfs_dir_isempty(ip)) { 2704 error = -ENOTEMPTY; 2705 goto out_trans_cancel; 2706 } 2707 2708 /* Drop the link from ip's "..". */ 2709 error = xfs_droplink(tp, dp); 2710 if (error) 2711 goto out_trans_cancel; 2712 2713 /* Drop the "." link from ip to self. */ 2714 error = xfs_droplink(tp, ip); 2715 if (error) 2716 goto out_trans_cancel; 2717 2718 /* 2719 * Point the unlinked child directory's ".." entry to the root 2720 * directory to eliminate back-references to inodes that may 2721 * get freed before the child directory is closed. If the fs 2722 * gets shrunk, this can lead to dirent inode validation errors. 2723 */ 2724 if (dp->i_ino != tp->t_mountp->m_sb.sb_rootino) { 2725 error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, 2726 tp->t_mountp->m_sb.sb_rootino, 0); 2727 if (error) 2728 goto out_trans_cancel; 2729 } 2730 } else { 2731 /* 2732 * When removing a non-directory we need to log the parent 2733 * inode here. For a directory this is done implicitly 2734 * by the xfs_droplink call for the ".." entry. 2735 */ 2736 xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); 2737 } 2738 xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2739 2740 /* Drop the link from dp to ip. */ 2741 error = xfs_droplink(tp, ip); 2742 if (error) 2743 goto out_trans_cancel; 2744 2745 error = xfs_dir_removename(tp, dp, name, ip->i_ino, resblks); 2746 if (error) { 2747 ASSERT(error != -ENOENT); 2748 goto out_trans_cancel; 2749 } 2750 2751 /* 2752 * Drop the link from dp to ip, and if ip was a directory, remove the 2753 * '.' and '..' references since we freed the directory. 2754 */ 2755 xfs_dir_update_hook(dp, ip, -1, name); 2756 2757 /* 2758 * If this is a synchronous mount, make sure that the 2759 * remove transaction goes to disk before returning to 2760 * the user. 2761 */ 2762 if (xfs_has_wsync(mp) || xfs_has_dirsync(mp)) 2763 xfs_trans_set_sync(tp); 2764 2765 error = xfs_trans_commit(tp); 2766 if (error) 2767 goto std_return; 2768 2769 if (is_dir && xfs_inode_is_filestream(ip)) 2770 xfs_filestream_deassociate(ip); 2771 2772 return 0; 2773 2774 out_trans_cancel: 2775 xfs_trans_cancel(tp); 2776 std_return: 2777 return error; 2778 } 2779 2780 /* 2781 * Enter all inodes for a rename transaction into a sorted array. 
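 *
 * A worked example with made-up inode numbers: for dp1 = 200, dp2 = 75,
 * ip1 = 130 and no ip2 or wip, i_tab ends up as { 75, 130, 200 } and
 * *num_inodes is 3, so the locks are always taken in ascending inode number
 * order regardless of the order the caller passed the inodes in.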
2782 */ 2783 #define __XFS_SORT_INODES 5 2784 STATIC void 2785 xfs_sort_for_rename( 2786 struct xfs_inode *dp1, /* in: old (source) directory inode */ 2787 struct xfs_inode *dp2, /* in: new (target) directory inode */ 2788 struct xfs_inode *ip1, /* in: inode of old entry */ 2789 struct xfs_inode *ip2, /* in: inode of new entry */ 2790 struct xfs_inode *wip, /* in: whiteout inode */ 2791 struct xfs_inode **i_tab,/* out: sorted array of inodes */ 2792 int *num_inodes) /* in/out: inodes in array */ 2793 { 2794 int i, j; 2795 2796 ASSERT(*num_inodes == __XFS_SORT_INODES); 2797 memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *)); 2798 2799 /* 2800 * i_tab contains a list of pointers to inodes. We initialize 2801 * the table here & we'll sort it. We will then use it to 2802 * order the acquisition of the inode locks. 2803 * 2804 * Note that the table may contain duplicates. e.g., dp1 == dp2. 2805 */ 2806 i = 0; 2807 i_tab[i++] = dp1; 2808 i_tab[i++] = dp2; 2809 i_tab[i++] = ip1; 2810 if (ip2) 2811 i_tab[i++] = ip2; 2812 if (wip) 2813 i_tab[i++] = wip; 2814 *num_inodes = i; 2815 2816 /* 2817 * Sort the elements via bubble sort. (Remember, there are at 2818 * most 5 elements to sort, so this is adequate.) 2819 */ 2820 for (i = 0; i < *num_inodes; i++) { 2821 for (j = 1; j < *num_inodes; j++) { 2822 if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { 2823 struct xfs_inode *temp = i_tab[j]; 2824 i_tab[j] = i_tab[j-1]; 2825 i_tab[j-1] = temp; 2826 } 2827 } 2828 } 2829 } 2830 2831 static int 2832 xfs_finish_rename( 2833 struct xfs_trans *tp) 2834 { 2835 /* 2836 * If this is a synchronous mount, make sure that the rename transaction 2837 * goes to disk before returning to the user. 2838 */ 2839 if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp)) 2840 xfs_trans_set_sync(tp); 2841 2842 return xfs_trans_commit(tp); 2843 } 2844 2845 /* 2846 * xfs_cross_rename() 2847 * 2848 * responsible for handling RENAME_EXCHANGE flag in renameat2() syscall 2849 */ 2850 STATIC int 2851 xfs_cross_rename( 2852 struct xfs_trans *tp, 2853 struct xfs_inode *dp1, 2854 struct xfs_name *name1, 2855 struct xfs_inode *ip1, 2856 struct xfs_inode *dp2, 2857 struct xfs_name *name2, 2858 struct xfs_inode *ip2, 2859 int spaceres) 2860 { 2861 int error = 0; 2862 int ip1_flags = 0; 2863 int ip2_flags = 0; 2864 int dp2_flags = 0; 2865 2866 /* Swap inode number for dirent in first parent */ 2867 error = xfs_dir_replace(tp, dp1, name1, ip2->i_ino, spaceres); 2868 if (error) 2869 goto out_trans_abort; 2870 2871 /* Swap inode number for dirent in second parent */ 2872 error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, spaceres); 2873 if (error) 2874 goto out_trans_abort; 2875 2876 /* 2877 * If we're renaming one or more directories across different parents, 2878 * update the respective ".." entries (and link counts) to match the new 2879 * parents. 2880 */ 2881 if (dp1 != dp2) { 2882 dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2883 2884 if (S_ISDIR(VFS_I(ip2)->i_mode)) { 2885 error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot, 2886 dp1->i_ino, spaceres); 2887 if (error) 2888 goto out_trans_abort; 2889 2890 /* transfer ip2 ".." 
reference to dp1 */ 2891 if (!S_ISDIR(VFS_I(ip1)->i_mode)) { 2892 error = xfs_droplink(tp, dp2); 2893 if (error) 2894 goto out_trans_abort; 2895 xfs_bumplink(tp, dp1); 2896 } 2897 2898 /* 2899 * Although ip1 isn't changed here, userspace needs 2900 * to be warned about the change, so that applications 2901 * relying on it (like backup ones), will properly 2902 * notify the change 2903 */ 2904 ip1_flags |= XFS_ICHGTIME_CHG; 2905 ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2906 } 2907 2908 if (S_ISDIR(VFS_I(ip1)->i_mode)) { 2909 error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot, 2910 dp2->i_ino, spaceres); 2911 if (error) 2912 goto out_trans_abort; 2913 2914 /* transfer ip1 ".." reference to dp2 */ 2915 if (!S_ISDIR(VFS_I(ip2)->i_mode)) { 2916 error = xfs_droplink(tp, dp1); 2917 if (error) 2918 goto out_trans_abort; 2919 xfs_bumplink(tp, dp2); 2920 } 2921 2922 /* 2923 * Although ip2 isn't changed here, userspace needs 2924 * to be warned about the change, so that applications 2925 * relying on it (like backup ones), will properly 2926 * notify the change 2927 */ 2928 ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; 2929 ip2_flags |= XFS_ICHGTIME_CHG; 2930 } 2931 } 2932 2933 if (ip1_flags) { 2934 xfs_trans_ichgtime(tp, ip1, ip1_flags); 2935 xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE); 2936 } 2937 if (ip2_flags) { 2938 xfs_trans_ichgtime(tp, ip2, ip2_flags); 2939 xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE); 2940 } 2941 if (dp2_flags) { 2942 xfs_trans_ichgtime(tp, dp2, dp2_flags); 2943 xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE); 2944 } 2945 xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2946 xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); 2947 2948 /* 2949 * Inform our hook clients that we've finished an exchange operation as 2950 * follows: removed the source and target files from their directories; 2951 * added the target to the source directory; and added the source to 2952 * the target directory. All inodes are locked, so it's ok to model a 2953 * rename this way so long as we say we deleted entries before we add 2954 * new ones. 2955 */ 2956 xfs_dir_update_hook(dp1, ip1, -1, name1); 2957 xfs_dir_update_hook(dp2, ip2, -1, name2); 2958 xfs_dir_update_hook(dp1, ip2, 1, name1); 2959 xfs_dir_update_hook(dp2, ip1, 1, name2); 2960 2961 return xfs_finish_rename(tp); 2962 2963 out_trans_abort: 2964 xfs_trans_cancel(tp); 2965 return error; 2966 } 2967 2968 /* 2969 * xfs_rename_alloc_whiteout() 2970 * 2971 * Return a referenced, unlinked, unlocked inode that can be used as a 2972 * whiteout in a rename transaction. We use a tmpfile inode here so that if we 2973 * crash between allocating the inode and linking it into the rename transaction 2974 * recovery will free the inode and we won't leak it. 2975 */ 2976 static int 2977 xfs_rename_alloc_whiteout( 2978 struct mnt_idmap *idmap, 2979 struct xfs_name *src_name, 2980 struct xfs_inode *dp, 2981 struct xfs_inode **wip) 2982 { 2983 struct xfs_inode *tmpfile; 2984 struct qstr name; 2985 int error; 2986 2987 error = xfs_create_tmpfile(idmap, dp, S_IFCHR | WHITEOUT_MODE, 2988 &tmpfile); 2989 if (error) 2990 return error; 2991 2992 name.name = src_name->name; 2993 name.len = src_name->len; 2994 error = xfs_inode_init_security(VFS_I(tmpfile), VFS_I(dp), &name); 2995 if (error) { 2996 xfs_finish_inode_setup(tmpfile); 2997 xfs_irele(tmpfile); 2998 return error; 2999 } 3000 3001 /* 3002 * Prepare the tmpfile inode as if it were created through the VFS. 3003 * Complete the inode setup and flag it as linkable. 
nlink is already 3004 * zero, so we can skip the drop_nlink. 3005 */ 3006 xfs_setup_iops(tmpfile); 3007 xfs_finish_inode_setup(tmpfile); 3008 VFS_I(tmpfile)->i_state |= I_LINKABLE; 3009 3010 *wip = tmpfile; 3011 return 0; 3012 } 3013 3014 /* 3015 * xfs_rename 3016 */ 3017 int 3018 xfs_rename( 3019 struct mnt_idmap *idmap, 3020 struct xfs_inode *src_dp, 3021 struct xfs_name *src_name, 3022 struct xfs_inode *src_ip, 3023 struct xfs_inode *target_dp, 3024 struct xfs_name *target_name, 3025 struct xfs_inode *target_ip, 3026 unsigned int flags) 3027 { 3028 struct xfs_mount *mp = src_dp->i_mount; 3029 struct xfs_trans *tp; 3030 struct xfs_inode *wip = NULL; /* whiteout inode */ 3031 struct xfs_inode *inodes[__XFS_SORT_INODES]; 3032 int i; 3033 int num_inodes = __XFS_SORT_INODES; 3034 bool new_parent = (src_dp != target_dp); 3035 bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode); 3036 int spaceres; 3037 bool retried = false; 3038 int error, nospace_error = 0; 3039 3040 trace_xfs_rename(src_dp, target_dp, src_name, target_name); 3041 3042 if ((flags & RENAME_EXCHANGE) && !target_ip) 3043 return -EINVAL; 3044 3045 /* 3046 * If we are doing a whiteout operation, allocate the whiteout inode 3047 * we will be placing at the target and ensure the type is set 3048 * appropriately. 3049 */ 3050 if (flags & RENAME_WHITEOUT) { 3051 error = xfs_rename_alloc_whiteout(idmap, src_name, 3052 target_dp, &wip); 3053 if (error) 3054 return error; 3055 3056 /* setup target dirent info as whiteout */ 3057 src_name->type = XFS_DIR3_FT_CHRDEV; 3058 } 3059 3060 xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, 3061 inodes, &num_inodes); 3062 3063 retry: 3064 nospace_error = 0; 3065 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); 3066 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, spaceres, 0, 0, &tp); 3067 if (error == -ENOSPC) { 3068 nospace_error = error; 3069 spaceres = 0; 3070 error = xfs_trans_alloc(mp, &M_RES(mp)->tr_rename, 0, 0, 0, 3071 &tp); 3072 } 3073 if (error) 3074 goto out_release_wip; 3075 3076 /* 3077 * Attach the dquots to the inodes 3078 */ 3079 error = xfs_qm_vop_rename_dqattach(inodes); 3080 if (error) 3081 goto out_trans_cancel; 3082 3083 /* 3084 * Lock all the participating inodes. Depending upon whether 3085 * the target_name exists in the target directory, and 3086 * whether the target directory is the same as the source 3087 * directory, we can lock from 2 to 5 inodes. 3088 */ 3089 xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); 3090 3091 /* 3092 * Join all the inodes to the transaction. From this point on, 3093 * we can rely on either trans_commit or trans_cancel to unlock 3094 * them. 3095 */ 3096 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); 3097 if (new_parent) 3098 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); 3099 xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); 3100 if (target_ip) 3101 xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); 3102 if (wip) 3103 xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL); 3104 3105 /* 3106 * If we are using project inheritance, we only allow renames 3107 * into our tree when the project IDs are the same; else the 3108 * tree quota mechanism would be circumvented. 3109 */ 3110 if (unlikely((target_dp->i_diflags & XFS_DIFLAG_PROJINHERIT) && 3111 target_dp->i_projid != src_ip->i_projid)) { 3112 error = -EXDEV; 3113 goto out_trans_cancel; 3114 } 3115 3116 /* RENAME_EXCHANGE is unique from here on. 
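 * xfs_cross_rename() commits or cancels the transaction itself (via
 * xfs_finish_rename() or xfs_trans_cancel()), which is why the exchange
 * case returns its result directly instead of falling through to the rest
 * of this function.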
*/ 3117 if (flags & RENAME_EXCHANGE) 3118 return xfs_cross_rename(tp, src_dp, src_name, src_ip, 3119 target_dp, target_name, target_ip, 3120 spaceres); 3121 3122 /* 3123 * Try to reserve quota to handle an expansion of the target directory. 3124 * We'll allow the rename to continue in reservationless mode if we hit 3125 * a space usage constraint. If we trigger reservationless mode, save 3126 * the errno if there isn't any free space in the target directory. 3127 */ 3128 if (spaceres != 0) { 3129 error = xfs_trans_reserve_quota_nblks(tp, target_dp, spaceres, 3130 0, false); 3131 if (error == -EDQUOT || error == -ENOSPC) { 3132 if (!retried) { 3133 xfs_trans_cancel(tp); 3134 xfs_blockgc_free_quota(target_dp, 0); 3135 retried = true; 3136 goto retry; 3137 } 3138 3139 nospace_error = error; 3140 spaceres = 0; 3141 error = 0; 3142 } 3143 if (error) 3144 goto out_trans_cancel; 3145 } 3146 3147 /* 3148 * Check for expected errors before we dirty the transaction 3149 * so we can return an error without a transaction abort. 3150 */ 3151 if (target_ip == NULL) { 3152 /* 3153 * If there's no space reservation, check the entry will 3154 * fit before actually inserting it. 3155 */ 3156 if (!spaceres) { 3157 error = xfs_dir_canenter(tp, target_dp, target_name); 3158 if (error) 3159 goto out_trans_cancel; 3160 } 3161 } else { 3162 /* 3163 * If target exists and it's a directory, check that whether 3164 * it can be destroyed. 3165 */ 3166 if (S_ISDIR(VFS_I(target_ip)->i_mode) && 3167 (!xfs_dir_isempty(target_ip) || 3168 (VFS_I(target_ip)->i_nlink > 2))) { 3169 error = -EEXIST; 3170 goto out_trans_cancel; 3171 } 3172 } 3173 3174 /* 3175 * Lock the AGI buffers we need to handle bumping the nlink of the 3176 * whiteout inode off the unlinked list and to handle dropping the 3177 * nlink of the target inode. Per locking order rules, do this in 3178 * increasing AG order and before directory block allocation tries to 3179 * grab AGFs because we grab AGIs before AGFs. 3180 * 3181 * The (vfs) caller must ensure that if src is a directory then 3182 * target_ip is either null or an empty directory. 3183 */ 3184 for (i = 0; i < num_inodes && inodes[i] != NULL; i++) { 3185 if (inodes[i] == wip || 3186 (inodes[i] == target_ip && 3187 (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) { 3188 struct xfs_perag *pag; 3189 struct xfs_buf *bp; 3190 3191 pag = xfs_perag_get(mp, 3192 XFS_INO_TO_AGNO(mp, inodes[i]->i_ino)); 3193 error = xfs_read_agi(pag, tp, 0, &bp); 3194 xfs_perag_put(pag); 3195 if (error) 3196 goto out_trans_cancel; 3197 } 3198 } 3199 3200 /* 3201 * Directory entry creation below may acquire the AGF. Remove 3202 * the whiteout from the unlinked list first to preserve correct 3203 * AGI/AGF locking order. This dirties the transaction so failures 3204 * after this point will abort and log recovery will clean up the 3205 * mess. 3206 * 3207 * For whiteouts, we need to bump the link count on the whiteout 3208 * inode. After this point, we have a real link, clear the tmpfile 3209 * state flag from the inode so it doesn't accidentally get misused 3210 * in future. 3211 */ 3212 if (wip) { 3213 struct xfs_perag *pag; 3214 3215 ASSERT(VFS_I(wip)->i_nlink == 0); 3216 3217 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, wip->i_ino)); 3218 error = xfs_iunlink_remove(tp, pag, wip); 3219 xfs_perag_put(pag); 3220 if (error) 3221 goto out_trans_cancel; 3222 3223 xfs_bumplink(tp, wip); 3224 VFS_I(wip)->i_state &= ~I_LINKABLE; 3225 } 3226 3227 /* 3228 * Set up the target. 
3229 */ 3230 if (target_ip == NULL) { 3231 /* 3232 * If target does not exist and the rename crosses 3233 * directories, adjust the target directory link count 3234 * to account for the ".." reference from the new entry. 3235 */ 3236 error = xfs_dir_createname(tp, target_dp, target_name, 3237 src_ip->i_ino, spaceres); 3238 if (error) 3239 goto out_trans_cancel; 3240 3241 xfs_trans_ichgtime(tp, target_dp, 3242 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3243 3244 if (new_parent && src_is_directory) { 3245 xfs_bumplink(tp, target_dp); 3246 } 3247 } else { /* target_ip != NULL */ 3248 /* 3249 * Link the source inode under the target name. 3250 * If the source inode is a directory and we are moving 3251 * it across directories, its ".." entry will be 3252 * inconsistent until we replace that down below. 3253 * 3254 * In case there is already an entry with the same 3255 * name at the destination directory, remove it first. 3256 */ 3257 error = xfs_dir_replace(tp, target_dp, target_name, 3258 src_ip->i_ino, spaceres); 3259 if (error) 3260 goto out_trans_cancel; 3261 3262 xfs_trans_ichgtime(tp, target_dp, 3263 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3264 3265 /* 3266 * Decrement the link count on the target since the target 3267 * dir no longer points to it. 3268 */ 3269 error = xfs_droplink(tp, target_ip); 3270 if (error) 3271 goto out_trans_cancel; 3272 3273 if (src_is_directory) { 3274 /* 3275 * Drop the link from the old "." entry. 3276 */ 3277 error = xfs_droplink(tp, target_ip); 3278 if (error) 3279 goto out_trans_cancel; 3280 } 3281 } /* target_ip != NULL */ 3282 3283 /* 3284 * Remove the source. 3285 */ 3286 if (new_parent && src_is_directory) { 3287 /* 3288 * Rewrite the ".." entry to point to the new 3289 * directory. 3290 */ 3291 error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot, 3292 target_dp->i_ino, spaceres); 3293 ASSERT(error != -EEXIST); 3294 if (error) 3295 goto out_trans_cancel; 3296 } 3297 3298 /* 3299 * We always want to hit the ctime on the source inode. 3300 * 3301 * This isn't strictly required by the standards since the source 3302 * inode isn't really being changed, but old unix file systems did 3303 * it and some incremental backup programs won't work without it. 3304 */ 3305 xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG); 3306 xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE); 3307 3308 /* 3309 * Adjust the link count on src_dp. This is necessary when 3310 * renaming a directory, either within one parent when 3311 * the target existed, or across two parent directories. 3312 */ 3313 if (src_is_directory && (new_parent || target_ip != NULL)) { 3314 3315 /* 3316 * Decrement link count on src_directory since the 3317 * entry that's moved no longer points to it. 3318 */ 3319 error = xfs_droplink(tp, src_dp); 3320 if (error) 3321 goto out_trans_cancel; 3322 } 3323 3324 /* 3325 * For whiteouts, we only need to update the source dirent with the 3326 * inode number of the whiteout inode rather than removing it 3327 * altogether. 
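 *
 * Illustrative directory state for a RENAME_WHITEOUT of src_dp/"a" to
 * target_dp/"b" (hypothetical names, target absent beforehand):
 *
 *	before:	src_dp/"a" -> src_ip		target_dp/"b" -> (none)
 *	after:	src_dp/"a" -> wip (chardev)	target_dp/"b" -> src_ip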
3328 */ 3329 if (wip) 3330 error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, 3331 spaceres); 3332 else 3333 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, 3334 spaceres); 3335 3336 if (error) 3337 goto out_trans_cancel; 3338 3339 xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3340 xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); 3341 if (new_parent) 3342 xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); 3343 3344 /* 3345 * Inform our hook clients that we've finished a rename operation as 3346 * follows: removed the source and target files from their directories; 3347 * that we've added the source to the target directory; and finally 3348 * that we've added the whiteout, if there was one. All inodes are 3349 * locked, so it's ok to model a rename this way so long as we say we 3350 * deleted entries before we add new ones. 3351 */ 3352 if (target_ip) 3353 xfs_dir_update_hook(target_dp, target_ip, -1, target_name); 3354 xfs_dir_update_hook(src_dp, src_ip, -1, src_name); 3355 xfs_dir_update_hook(target_dp, src_ip, 1, target_name); 3356 if (wip) 3357 xfs_dir_update_hook(src_dp, wip, 1, src_name); 3358 3359 error = xfs_finish_rename(tp); 3360 if (wip) 3361 xfs_irele(wip); 3362 return error; 3363 3364 out_trans_cancel: 3365 xfs_trans_cancel(tp); 3366 out_release_wip: 3367 if (wip) 3368 xfs_irele(wip); 3369 if (error == -ENOSPC && nospace_error) 3370 error = nospace_error; 3371 return error; 3372 } 3373 3374 static int 3375 xfs_iflush( 3376 struct xfs_inode *ip, 3377 struct xfs_buf *bp) 3378 { 3379 struct xfs_inode_log_item *iip = ip->i_itemp; 3380 struct xfs_dinode *dip; 3381 struct xfs_mount *mp = ip->i_mount; 3382 int error; 3383 3384 xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED); 3385 ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING)); 3386 ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || 3387 ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK)); 3388 ASSERT(iip->ili_item.li_buf == bp); 3389 3390 dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); 3391 3392 /* 3393 * We don't flush the inode if any of the following checks fail, but we 3394 * do still update the log item and attach to the backing buffer as if 3395 * the flush happened. This is a formality to facilitate predictable 3396 * error handling as the caller will shutdown and fail the buffer. 
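 *
 * Concretely, each failed check below jumps to the flush_out label with
 * error still set to -EFSCORRUPTED, so ili_fields is still moved to
 * ili_last_fields, the flush LSN is still recorded and the inode is marked
 * sick before we return the error to the caller.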
3397 */ 3398 error = -EFSCORRUPTED; 3399 if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), 3400 mp, XFS_ERRTAG_IFLUSH_1)) { 3401 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3402 "%s: Bad inode %llu magic number 0x%x, ptr "PTR_FMT, 3403 __func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip); 3404 goto flush_out; 3405 } 3406 if (S_ISREG(VFS_I(ip)->i_mode)) { 3407 if (XFS_TEST_ERROR( 3408 ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && 3409 ip->i_df.if_format != XFS_DINODE_FMT_BTREE, 3410 mp, XFS_ERRTAG_IFLUSH_3)) { 3411 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3412 "%s: Bad regular inode %llu, ptr "PTR_FMT, 3413 __func__, ip->i_ino, ip); 3414 goto flush_out; 3415 } 3416 } else if (S_ISDIR(VFS_I(ip)->i_mode)) { 3417 if (XFS_TEST_ERROR( 3418 ip->i_df.if_format != XFS_DINODE_FMT_EXTENTS && 3419 ip->i_df.if_format != XFS_DINODE_FMT_BTREE && 3420 ip->i_df.if_format != XFS_DINODE_FMT_LOCAL, 3421 mp, XFS_ERRTAG_IFLUSH_4)) { 3422 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3423 "%s: Bad directory inode %llu, ptr "PTR_FMT, 3424 __func__, ip->i_ino, ip); 3425 goto flush_out; 3426 } 3427 } 3428 if (XFS_TEST_ERROR(ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af) > 3429 ip->i_nblocks, mp, XFS_ERRTAG_IFLUSH_5)) { 3430 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3431 "%s: detected corrupt incore inode %llu, " 3432 "total extents = %llu nblocks = %lld, ptr "PTR_FMT, 3433 __func__, ip->i_ino, 3434 ip->i_df.if_nextents + xfs_ifork_nextents(&ip->i_af), 3435 ip->i_nblocks, ip); 3436 goto flush_out; 3437 } 3438 if (XFS_TEST_ERROR(ip->i_forkoff > mp->m_sb.sb_inodesize, 3439 mp, XFS_ERRTAG_IFLUSH_6)) { 3440 xfs_alert_tag(mp, XFS_PTAG_IFLUSH, 3441 "%s: bad inode %llu, forkoff 0x%x, ptr "PTR_FMT, 3442 __func__, ip->i_ino, ip->i_forkoff, ip); 3443 goto flush_out; 3444 } 3445 3446 /* 3447 * Inode item log recovery for v2 inodes are dependent on the flushiter 3448 * count for correct sequencing. We bump the flush iteration count so 3449 * we can detect flushes which postdate a log record during recovery. 3450 * This is redundant as we now log every change and hence this can't 3451 * happen but we need to still do it to ensure backwards compatibility 3452 * with old kernels that predate logging all inode changes. 3453 */ 3454 if (!xfs_has_v3inodes(mp)) 3455 ip->i_flushiter++; 3456 3457 /* 3458 * If there are inline format data / attr forks attached to this inode, 3459 * make sure they are not corrupt. 3460 */ 3461 if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL && 3462 xfs_ifork_verify_local_data(ip)) 3463 goto flush_out; 3464 if (xfs_inode_has_attr_fork(ip) && 3465 ip->i_af.if_format == XFS_DINODE_FMT_LOCAL && 3466 xfs_ifork_verify_local_attr(ip)) 3467 goto flush_out; 3468 3469 /* 3470 * Copy the dirty parts of the inode into the on-disk inode. We always 3471 * copy out the core of the inode, because if the inode is dirty at all 3472 * the core must be. 3473 */ 3474 xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn); 3475 3476 /* Wrap, we never let the log put out DI_MAX_FLUSH */ 3477 if (!xfs_has_v3inodes(mp)) { 3478 if (ip->i_flushiter == DI_MAX_FLUSH) 3479 ip->i_flushiter = 0; 3480 } 3481 3482 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK); 3483 if (xfs_inode_has_attr_fork(ip)) 3484 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK); 3485 3486 /* 3487 * We've recorded everything logged in the inode, so we'd like to clear 3488 * the ili_fields bits so we don't log and flush things unnecessarily. 3489 * However, we can't stop logging all this information until the data 3490 * we've copied into the disk buffer is written to disk. 
If we did we 3491 * might overwrite the copy of the inode in the log with all the data 3492 * after re-logging only part of it, and in the face of a crash we 3493 * wouldn't have all the data we need to recover. 3494 * 3495 * What we do is move the bits to the ili_last_fields field. When 3496 * logging the inode, these bits are moved back to the ili_fields field. 3497 * In the xfs_buf_inode_iodone() routine we clear ili_last_fields, since 3498 * we know that the information those bits represent is permanently on 3499 * disk. As long as the flush completes before the inode is logged 3500 * again, then both ili_fields and ili_last_fields will be cleared. 3501 */ 3502 error = 0; 3503 flush_out: 3504 spin_lock(&iip->ili_lock); 3505 iip->ili_last_fields = iip->ili_fields; 3506 iip->ili_fields = 0; 3507 iip->ili_fsync_fields = 0; 3508 spin_unlock(&iip->ili_lock); 3509 3510 /* 3511 * Store the current LSN of the inode so that we can tell whether the 3512 * item has moved in the AIL from xfs_buf_inode_iodone(). 3513 */ 3514 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn, 3515 &iip->ili_item.li_lsn); 3516 3517 /* generate the checksum. */ 3518 xfs_dinode_calc_crc(mp, dip); 3519 if (error) 3520 xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); 3521 return error; 3522 } 3523 3524 /* 3525 * Non-blocking flush of dirty inode metadata into the backing buffer. 3526 * 3527 * The caller must have a reference to the inode and hold the cluster buffer 3528 * locked. The function will walk across all the inodes on the cluster buffer it 3529 * can find and lock without blocking, and flush them to the cluster buffer. 3530 * 3531 * On successful flushing of at least one inode, the caller must write out the 3532 * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and 3533 * the caller needs to release the buffer. On failure, the filesystem will be 3534 * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED 3535 * will be returned. 3536 */ 3537 int 3538 xfs_iflush_cluster( 3539 struct xfs_buf *bp) 3540 { 3541 struct xfs_mount *mp = bp->b_mount; 3542 struct xfs_log_item *lip, *n; 3543 struct xfs_inode *ip; 3544 struct xfs_inode_log_item *iip; 3545 int clcount = 0; 3546 int error = 0; 3547 3548 /* 3549 * We must use the safe variant here as on shutdown xfs_iflush_abort() 3550 * will remove itself from the list. 3551 */ 3552 list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { 3553 iip = (struct xfs_inode_log_item *)lip; 3554 ip = iip->ili_inode; 3555 3556 /* 3557 * Quick and dirty check to avoid locks if possible. 3558 */ 3559 if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) 3560 continue; 3561 if (xfs_ipincount(ip)) 3562 continue; 3563 3564 /* 3565 * The inode is still attached to the buffer, which means it is 3566 * dirty but reclaim might try to grab it. Check carefully for 3567 * that, and grab the ilock while still holding the i_flags_lock 3568 * to guarantee reclaim will not be able to reclaim this inode 3569 * once we drop the i_flags_lock. 3570 */ 3571 spin_lock(&ip->i_flags_lock); 3572 ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE)); 3573 if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLUSHING)) { 3574 spin_unlock(&ip->i_flags_lock); 3575 continue; 3576 } 3577 3578 /* 3579 * ILOCK will pin the inode against reclaim and prevent 3580 * concurrent transactions modifying the inode while we are 3581 * flushing the inode. If we get the lock, set the flushing 3582 * state before we drop the i_flags_lock. 
3583 */ 3584 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) { 3585 spin_unlock(&ip->i_flags_lock); 3586 continue; 3587 } 3588 __xfs_iflags_set(ip, XFS_IFLUSHING); 3589 spin_unlock(&ip->i_flags_lock); 3590 3591 /* 3592 * Abort flushing this inode if we are shut down because the 3593 * inode may not currently be in the AIL. This can occur when 3594 * log I/O failure unpins the inode without inserting into the 3595 * AIL, leaving a dirty/unpinned inode attached to the buffer 3596 * that otherwise looks like it should be flushed. 3597 */ 3598 if (xlog_is_shutdown(mp->m_log)) { 3599 xfs_iunpin_wait(ip); 3600 xfs_iflush_abort(ip); 3601 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3602 error = -EIO; 3603 continue; 3604 } 3605 3606 /* don't block waiting on a log force to unpin dirty inodes */ 3607 if (xfs_ipincount(ip)) { 3608 xfs_iflags_clear(ip, XFS_IFLUSHING); 3609 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3610 continue; 3611 } 3612 3613 if (!xfs_inode_clean(ip)) 3614 error = xfs_iflush(ip, bp); 3615 else 3616 xfs_iflags_clear(ip, XFS_IFLUSHING); 3617 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3618 if (error) 3619 break; 3620 clcount++; 3621 } 3622 3623 if (error) { 3624 /* 3625 * Shutdown first so we kill the log before we release this 3626 * buffer. If it is an INODE_ALLOC buffer and pins the tail 3627 * of the log, failing it before the _log_ is shut down can 3628 * result in the log tail being moved forward in the journal 3629 * on disk because log writes can still be taking place. Hence 3630 * unpinning the tail will allow the ICREATE intent to be 3631 * removed from the log and recovery will fail with uninitialised 3632 * inode cluster buffers. 3633 */ 3634 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 3635 bp->b_flags |= XBF_ASYNC; 3636 xfs_buf_ioend_fail(bp); 3637 return error; 3638 } 3639 3640 if (!clcount) 3641 return -EAGAIN; 3642 3643 XFS_STATS_INC(mp, xs_icluster_flushcnt); 3644 XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount); 3645 return 0; 3646 3647 } 3648 3649 /* Release an inode. */ 3650 void 3651 xfs_irele( 3652 struct xfs_inode *ip) 3653 { 3654 trace_xfs_irele(ip, _RET_IP_); 3655 iput(VFS_I(ip)); 3656 } 3657 3658 /* 3659 * Ensure all committed transactions touching the inode are written to the log. 3660 */ 3661 int 3662 xfs_log_force_inode( 3663 struct xfs_inode *ip) 3664 { 3665 xfs_csn_t seq = 0; 3666 3667 xfs_ilock(ip, XFS_ILOCK_SHARED); 3668 if (xfs_ipincount(ip)) 3669 seq = ip->i_itemp->ili_commit_seq; 3670 xfs_iunlock(ip, XFS_ILOCK_SHARED); 3671 3672 if (!seq) 3673 return 0; 3674 return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL); 3675 } 3676 3677 /* 3678 * Grab the exclusive iolock for a data copy from src to dest, making sure to 3679 * abide vfs locking order (lowest pointer value goes first) and breaking the 3680 * layout leases before proceeding. The loop is needed because we cannot call 3681 * the blocking break_layout() with the iolocks held, and therefore have to 3682 * back out both locks. 3683 */ 3684 static int 3685 xfs_iolock_two_inodes_and_break_layout( 3686 struct inode *src, 3687 struct inode *dest) 3688 { 3689 int error; 3690 3691 if (src > dest) 3692 swap(src, dest); 3693 3694 retry: 3695 /* Wait to break both inodes' layouts before we start locking. */ 3696 error = break_layout(src, true); 3697 if (error) 3698 return error; 3699 if (src != dest) { 3700 error = break_layout(dest, true); 3701 if (error) 3702 return error; 3703 } 3704 3705 /* Lock one inode and make sure nobody got in and leased it.
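 * If the non-blocking break_layout() check below returns -EWOULDBLOCK, we
 * drop the inode lock and jump back to the top of the loop, where the
 * blocking break_layout() can wait for the lease holder with no inode
 * locks held.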
/*
 * Grab the exclusive iolock for a data copy from src to dest, making sure to
 * abide by the vfs locking order (lowest pointer value goes first) and
 * breaking the layout leases before proceeding. The loop is needed because we
 * cannot call the blocking break_layout() with the iolocks held, and
 * therefore have to back out both locks.
 */
static int
xfs_iolock_two_inodes_and_break_layout(
	struct inode		*src,
	struct inode		*dest)
{
	int			error;

	if (src > dest)
		swap(src, dest);

retry:
	/* Wait to break both inodes' layouts before we start locking. */
	error = break_layout(src, true);
	if (error)
		return error;
	if (src != dest) {
		error = break_layout(dest, true);
		if (error)
			return error;
	}

	/* Lock one inode and make sure nobody got in and leased it. */
	inode_lock(src);
	error = break_layout(src, false);
	if (error) {
		inode_unlock(src);
		if (error == -EWOULDBLOCK)
			goto retry;
		return error;
	}

	if (src == dest)
		return 0;

	/* Lock the other inode and make sure nobody got in and leased it. */
	inode_lock_nested(dest, I_MUTEX_NONDIR2);
	error = break_layout(dest, false);
	if (error) {
		inode_unlock(src);
		inode_unlock(dest);
		if (error == -EWOULDBLOCK)
			goto retry;
		return error;
	}

	return 0;
}

static int
xfs_mmaplock_two_inodes_and_break_dax_layout(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	int			error;
	bool			retry;
	struct page		*page;

	if (ip1->i_ino > ip2->i_ino)
		swap(ip1, ip2);

again:
	retry = false;
	/* Lock the first inode */
	xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
	error = xfs_break_dax_layouts(VFS_I(ip1), &retry);
	if (error || retry) {
		xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
		if (error == 0 && retry)
			goto again;
		return error;
	}

	if (ip1 == ip2)
		return 0;

	/* Nested lock the second inode */
	xfs_ilock(ip2, xfs_lock_inumorder(XFS_MMAPLOCK_EXCL, 1));
	/*
	 * We cannot use xfs_break_dax_layouts() directly here because it may
	 * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable
	 * for this nested lock case.
	 */
	page = dax_layout_busy_page(VFS_I(ip2)->i_mapping);
	if (page && page_ref_count(page) != 1) {
		xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
		xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
		goto again;
	}

	return 0;
}

/*
 * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
 * mmap activity.
 */
int
xfs_ilock2_io_mmap(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	int			ret;

	ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
	if (ret)
		return ret;

	if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
		ret = xfs_mmaplock_two_inodes_and_break_dax_layout(ip1, ip2);
		if (ret) {
			inode_unlock(VFS_I(ip2));
			if (ip1 != ip2)
				inode_unlock(VFS_I(ip1));
			return ret;
		}
	} else
		filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping,
					    VFS_I(ip2)->i_mapping);

	return 0;
}

/* Unlock both inodes to allow IO and mmap activity. */
void
xfs_iunlock2_io_mmap(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	if (IS_DAX(VFS_I(ip1)) && IS_DAX(VFS_I(ip2))) {
		xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
		if (ip1 != ip2)
			xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
	} else
		filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping,
					      VFS_I(ip2)->i_mapping);

	inode_unlock(VFS_I(ip2));
	if (ip1 != ip2)
		inode_unlock(VFS_I(ip1));
}

/* Drop the MMAPLOCK and the IOLOCK after a remap completes. */
void
xfs_iunlock2_remapping(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	xfs_iflags_clear(ip1, XFS_IREMAPPING);

	if (ip1 != ip2)
		xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED);
	xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);

	if (ip1 != ip2)
		inode_unlock_shared(VFS_I(ip1));
	inode_unlock(VFS_I(ip2));
}
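
/*
 * Editor's illustrative sketch, not part of the original file: how a caller
 * is expected to pair xfs_ilock2_io_mmap() and xfs_iunlock2_io_mmap() around
 * an operation that must exclude both syscall I/O and page faults on the two
 * inodes. The function name is hypothetical and the snippet is compiled out.
 */
#if 0
static int
xfs_example_two_inode_operation(
	struct xfs_inode	*ip1,
	struct xfs_inode	*ip2)
{
	int			error;

	error = xfs_ilock2_io_mmap(ip1, ip2);
	if (error)
		return error;

	/* ...manipulate the extents of both inodes here... */

	xfs_iunlock2_io_mmap(ip1, ip2);
	return 0;
}
#endif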
/*
 * Reload the incore unlinked list for this inode. Caller should ensure that
 * the link count cannot change, either by taking ILOCK_SHARED or otherwise
 * preventing other threads from executing.
 */
int
xfs_inode_reload_unlinked_bucket(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_buf		*agibp;
	struct xfs_agi		*agi;
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
	xfs_agino_t		prev_agino, next_agino;
	unsigned int		bucket;
	bool			foundit = false;
	int			error;

	/* Grab the first inode in the list */
	pag = xfs_perag_get(mp, agno);
	error = xfs_ialloc_read_agi(pag, tp, 0, &agibp);
	xfs_perag_put(pag);
	if (error)
		return error;

	/*
	 * We've taken ILOCK_SHARED and the AGI buffer lock to stabilize the
	 * incore unlinked list pointers for this inode. Check once more to
	 * see if we raced with anyone else to reload the unlinked list.
	 */
	if (!xfs_inode_unlinked_incomplete(ip)) {
		foundit = true;
		goto out_agibp;
	}

	bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
	agi = agibp->b_addr;

	trace_xfs_inode_reload_unlinked_bucket(ip);

	xfs_info_ratelimited(mp,
		"Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating list recovery.",
			agino, agno);

	prev_agino = NULLAGINO;
	next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
	while (next_agino != NULLAGINO) {
		struct xfs_inode	*next_ip = NULL;

		/* Found this caller's inode, set its backlink. */
		if (next_agino == agino) {
			next_ip = ip;
			next_ip->i_prev_unlinked = prev_agino;
			foundit = true;
			goto next_inode;
		}

		/* Try in-memory lookup first. */
		next_ip = xfs_iunlink_lookup(pag, next_agino);
		if (next_ip)
			goto next_inode;

		/* Inode not in memory, try reloading it. */
		error = xfs_iunlink_reload_next(tp, agibp, prev_agino,
				next_agino);
		if (error)
			break;

		/* Grab the reloaded inode. */
		next_ip = xfs_iunlink_lookup(pag, next_agino);
		if (!next_ip) {
			/* No incore inode at all? We reloaded it... */
			ASSERT(next_ip != NULL);
			error = -EFSCORRUPTED;
			break;
		}

next_inode:
		prev_agino = next_agino;
		next_agino = next_ip->i_next_unlinked;
	}

out_agibp:
	xfs_trans_brelse(tp, agibp);
	/* Should have found this inode somewhere in the iunlinked bucket. */
	if (!error && !foundit)
		error = -EFSCORRUPTED;
	return error;
}

/* Decide if this inode is missing its unlinked list and reload it. */
int
xfs_inode_reload_unlinked(
	struct xfs_inode	*ip)
{
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc_empty(ip->i_mount, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_inode_unlinked_incomplete(ip))
		error = xfs_inode_reload_unlinked_bucket(tp, ip);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	xfs_trans_cancel(tp);

	return error;
}
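
/*
 * Editor's illustrative sketch, not part of the original file: a caller that
 * needs trustworthy unlinked list pointers can use the predicate and reload
 * helper above before walking the list. The function name is hypothetical
 * and the snippet is compiled out.
 */
#if 0
static int
xfs_example_ensure_unlinked_list_loaded(
	struct xfs_inode	*ip)
{
	if (!xfs_inode_unlinked_incomplete(ip))
		return 0;
	return xfs_inode_reload_unlinked(ip);
}
#endif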
/* Has this inode fork been zapped by repair? */
bool
xfs_ifork_zapped(
	const struct xfs_inode	*ip,
	int			whichfork)
{
	unsigned int		datamask = 0;

	switch (whichfork) {
	case XFS_DATA_FORK:
		switch (ip->i_vnode.i_mode & S_IFMT) {
		case S_IFDIR:
			datamask = XFS_SICK_INO_DIR_ZAPPED;
			break;
		case S_IFLNK:
			datamask = XFS_SICK_INO_SYMLINK_ZAPPED;
			break;
		}
		return ip->i_sick & (XFS_SICK_INO_BMBTD_ZAPPED | datamask);
	case XFS_ATTR_FORK:
		return ip->i_sick & XFS_SICK_INO_BMBTA_ZAPPED;
	default:
		return false;
	}
}

/* Compute the number of data and realtime blocks used by a file. */
void
xfs_inode_count_blocks(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	xfs_filblks_t		*dblocks,
	xfs_filblks_t		*rblocks)
{
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);

	*rblocks = 0;
	if (XFS_IS_REALTIME_INODE(ip))
		xfs_bmap_count_leaves(ifp, rblocks);
	*dblocks = ip->i_nblocks - *rblocks;
}

static void
xfs_wait_dax_page(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);

	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
	schedule();
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
}

int
xfs_break_dax_layouts(
	struct inode		*inode,
	bool			*retry)
{
	struct page		*page;

	xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL);

	page = dax_layout_busy_page(inode->i_mapping);
	if (!page)
		return 0;

	*retry = true;
	return ___wait_var_event(&page->_refcount,
			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
			0, 0, xfs_wait_dax_page(inode));
}

int
xfs_break_layouts(
	struct inode		*inode,
	uint			*iolock,
	enum layout_break_reason reason)
{
	bool			retry;
	int			error;

	xfs_assert_ilocked(XFS_I(inode), XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL);

	do {
		retry = false;
		switch (reason) {
		case BREAK_UNMAP:
			error = xfs_break_dax_layouts(inode, &retry);
			if (error || retry)
				break;
			fallthrough;
		case BREAK_WRITE:
			error = xfs_break_leased_layouts(inode, iolock, &retry);
			break;
		default:
			WARN_ON_ONCE(1);
			error = -EINVAL;
		}
	} while (error == 0 && retry);

	return error;
}

/* Returns the size of the fundamental allocation unit for a file, in bytes. */
unsigned int
xfs_inode_alloc_unitsize(
	struct xfs_inode	*ip)
{
	unsigned int		blocks = 1;

	if (XFS_IS_REALTIME_INODE(ip))
		blocks = ip->i_mount->m_sb.sb_rextsize;

	return XFS_FSB_TO_B(ip->i_mount, blocks);
}
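
/*
 * Editor's illustrative sketch, not part of the original file: callers of
 * xfs_inode_alloc_unitsize() typically use the returned byte count to check
 * that a file range covers whole allocation units (one filesystem block, or
 * one realtime extent for realtime files). The helper name is hypothetical,
 * the snippet is compiled out, and div_u64_rem() is assumed to be available
 * via linux/math64.h.
 */
#if 0
static bool
xfs_example_range_covers_whole_units(
	struct xfs_inode	*ip,
	loff_t			pos,
	loff_t			len)
{
	unsigned int		alloc_unit = xfs_inode_alloc_unitsize(ip);
	u32			mod;

	div_u64_rem(pos, alloc_unit, &mod);
	if (mod)
		return false;
	div_u64_rem(len, alloc_unit, &mod);
	return mod == 0;
}
#endif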