1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_inode.h" 14 #include "xfs_trans.h" 15 #include "xfs_trans_priv.h" 16 #include "xfs_inode_item.h" 17 #include "xfs_quota.h" 18 #include "xfs_trace.h" 19 #include "xfs_icache.h" 20 #include "xfs_bmap_util.h" 21 #include "xfs_dquot_item.h" 22 #include "xfs_dquot.h" 23 #include "xfs_reflink.h" 24 #include "xfs_ialloc.h" 25 #include "xfs_ag.h" 26 27 #include <linux/iversion.h> 28 29 /* 30 * Allocate and initialise an xfs_inode. 31 */ 32 struct xfs_inode * 33 xfs_inode_alloc( 34 struct xfs_mount *mp, 35 xfs_ino_t ino) 36 { 37 struct xfs_inode *ip; 38 39 /* 40 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL 41 * and return NULL here on ENOMEM. 42 */ 43 ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL); 44 45 if (inode_init_always(mp->m_super, VFS_I(ip))) { 46 kmem_cache_free(xfs_inode_zone, ip); 47 return NULL; 48 } 49 50 /* VFS doesn't initialise i_mode! 
*/ 51 VFS_I(ip)->i_mode = 0; 52 53 XFS_STATS_INC(mp, vn_active); 54 ASSERT(atomic_read(&ip->i_pincount) == 0); 55 ASSERT(ip->i_ino == 0); 56 57 /* initialise the xfs inode */ 58 ip->i_ino = ino; 59 ip->i_mount = mp; 60 memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); 61 ip->i_afp = NULL; 62 ip->i_cowfp = NULL; 63 memset(&ip->i_df, 0, sizeof(ip->i_df)); 64 ip->i_flags = 0; 65 ip->i_delayed_blks = 0; 66 ip->i_diflags2 = mp->m_ino_geo.new_diflags2; 67 ip->i_nblocks = 0; 68 ip->i_forkoff = 0; 69 ip->i_sick = 0; 70 ip->i_checked = 0; 71 INIT_WORK(&ip->i_ioend_work, xfs_end_io); 72 INIT_LIST_HEAD(&ip->i_ioend_list); 73 spin_lock_init(&ip->i_ioend_lock); 74 75 return ip; 76 } 77 78 STATIC void 79 xfs_inode_free_callback( 80 struct rcu_head *head) 81 { 82 struct inode *inode = container_of(head, struct inode, i_rcu); 83 struct xfs_inode *ip = XFS_I(inode); 84 85 switch (VFS_I(ip)->i_mode & S_IFMT) { 86 case S_IFREG: 87 case S_IFDIR: 88 case S_IFLNK: 89 xfs_idestroy_fork(&ip->i_df); 90 break; 91 } 92 93 if (ip->i_afp) { 94 xfs_idestroy_fork(ip->i_afp); 95 kmem_cache_free(xfs_ifork_zone, ip->i_afp); 96 } 97 if (ip->i_cowfp) { 98 xfs_idestroy_fork(ip->i_cowfp); 99 kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); 100 } 101 if (ip->i_itemp) { 102 ASSERT(!test_bit(XFS_LI_IN_AIL, 103 &ip->i_itemp->ili_item.li_flags)); 104 xfs_inode_item_destroy(ip); 105 ip->i_itemp = NULL; 106 } 107 108 kmem_cache_free(xfs_inode_zone, ip); 109 } 110 111 static void 112 __xfs_inode_free( 113 struct xfs_inode *ip) 114 { 115 /* asserts to verify all state is correct here */ 116 ASSERT(atomic_read(&ip->i_pincount) == 0); 117 ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list)); 118 XFS_STATS_DEC(ip->i_mount, vn_active); 119 120 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); 121 } 122 123 void 124 xfs_inode_free( 125 struct xfs_inode *ip) 126 { 127 ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING)); 128 129 /* 130 * Because we use RCU freeing we need to ensure the inode always 131 * 
appears to be reclaimed with an invalid inode number when in the 132 * free state. The ip->i_flags_lock provides the barrier against lookup 133 * races. 134 */ 135 spin_lock(&ip->i_flags_lock); 136 ip->i_flags = XFS_IRECLAIM; 137 ip->i_ino = 0; 138 spin_unlock(&ip->i_flags_lock); 139 140 __xfs_inode_free(ip); 141 } 142 143 /* 144 * Queue background inode reclaim work if there are reclaimable inodes and there 145 * isn't reclaim work already scheduled or in progress. 146 */ 147 static void 148 xfs_reclaim_work_queue( 149 struct xfs_mount *mp) 150 { 151 152 rcu_read_lock(); 153 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { 154 queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, 155 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); 156 } 157 rcu_read_unlock(); 158 } 159 160 static void 161 xfs_perag_set_reclaim_tag( 162 struct xfs_perag *pag) 163 { 164 struct xfs_mount *mp = pag->pag_mount; 165 166 lockdep_assert_held(&pag->pag_ici_lock); 167 if (pag->pag_ici_reclaimable++) 168 return; 169 170 /* propagate the reclaim tag up into the perag radix tree */ 171 spin_lock(&mp->m_perag_lock); 172 radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, 173 XFS_ICI_RECLAIM_TAG); 174 spin_unlock(&mp->m_perag_lock); 175 176 /* schedule periodic background inode reclaim */ 177 xfs_reclaim_work_queue(mp); 178 179 trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_); 180 } 181 182 static void 183 xfs_perag_clear_reclaim_tag( 184 struct xfs_perag *pag) 185 { 186 struct xfs_mount *mp = pag->pag_mount; 187 188 lockdep_assert_held(&pag->pag_ici_lock); 189 if (--pag->pag_ici_reclaimable) 190 return; 191 192 /* clear the reclaim tag from the perag radix tree */ 193 spin_lock(&mp->m_perag_lock); 194 radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, 195 XFS_ICI_RECLAIM_TAG); 196 spin_unlock(&mp->m_perag_lock); 197 trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_); 198 } 199 200 201 /* 202 * We set the inode flag atomically with the 
radix tree tag. 203 * Once we get tag lookups on the radix tree, this inode flag 204 * can go away. 205 */ 206 void 207 xfs_inode_set_reclaim_tag( 208 struct xfs_inode *ip) 209 { 210 struct xfs_mount *mp = ip->i_mount; 211 struct xfs_perag *pag; 212 213 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 214 spin_lock(&pag->pag_ici_lock); 215 spin_lock(&ip->i_flags_lock); 216 217 radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), 218 XFS_ICI_RECLAIM_TAG); 219 xfs_perag_set_reclaim_tag(pag); 220 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 221 222 spin_unlock(&ip->i_flags_lock); 223 spin_unlock(&pag->pag_ici_lock); 224 xfs_perag_put(pag); 225 } 226 227 STATIC void 228 xfs_inode_clear_reclaim_tag( 229 struct xfs_perag *pag, 230 xfs_ino_t ino) 231 { 232 radix_tree_tag_clear(&pag->pag_ici_root, 233 XFS_INO_TO_AGINO(pag->pag_mount, ino), 234 XFS_ICI_RECLAIM_TAG); 235 xfs_perag_clear_reclaim_tag(pag); 236 } 237 238 static void 239 xfs_inew_wait( 240 struct xfs_inode *ip) 241 { 242 wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT); 243 DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT); 244 245 do { 246 prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 247 if (!xfs_iflags_test(ip, XFS_INEW)) 248 break; 249 schedule(); 250 } while (true); 251 finish_wait(wq, &wait.wq_entry); 252 } 253 254 /* 255 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode 256 * part of the structure. This is made more complex by the fact we store 257 * information about the on-disk values in the VFS inode and so we can't just 258 * overwrite the values unconditionally. Hence we save the parameters we 259 * need to retain across reinitialisation, and rewrite them into the VFS inode 260 * after reinitialisation even if it fails. 
*/
static int
xfs_reinit_inode(
	struct xfs_mount	*mp,
	struct inode		*inode)
{
	int			error;
	/* snapshot of the fields that must survive inode_init_always() */
	uint32_t		nlink = inode->i_nlink;
	uint32_t		generation = inode->i_generation;
	uint64_t		version = inode_peek_iversion(inode);
	umode_t			mode = inode->i_mode;
	dev_t			dev = inode->i_rdev;
	kuid_t			uid = inode->i_uid;
	kgid_t			gid = inode->i_gid;

	error = inode_init_always(mp->m_super, inode);

	/* restore the saved values whether or not reinitialisation succeeded */
	set_nlink(inode, nlink);
	inode->i_generation = generation;
	inode_set_iversion_queried(inode, version);
	inode->i_mode = mode;
	inode->i_rdev = dev;
	inode->i_uid = uid;
	inode->i_gid = gid;
	return error;
}

/*
 * If we are allocating a new inode, then check what was returned is
 * actually a free, empty inode. If we are not allocating an inode,
 * then check we didn't find a free inode.
 *
 * Returns:
 *	0		if the inode free state matches the lookup context
 *	-ENOENT		if the inode is free and we are not allocating
 *	-EFSCORRUPTED	if there is any state mismatch at all
 */
static int
xfs_iget_check_free_state(
	struct xfs_inode	*ip,
	int			flags)
{
	if (flags & XFS_IGET_CREATE) {
		/* should be a free inode */
		if (VFS_I(ip)->i_mode != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
				ip->i_ino, VFS_I(ip)->i_mode);
			return -EFSCORRUPTED;
		}

		if (ip->i_nblocks != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx has blocks allocated!",
				ip->i_ino);
			return -EFSCORRUPTED;
		}
		return 0;
	}

	/* should be an allocated inode */
	if (VFS_I(ip)->i_mode == 0)
		return -ENOENT;

	return 0;
}

/*
 * Check the validity of the inode we just found in the cache.
 *
 * Called with the RCU read lock held; it has been dropped again on every
 * return path, success or failure.
 *
 * Returns 0 if @ip can be handed to the caller (it is then locked according
 * to @lock_flags), -EAGAIN if the lookup raced with inode setup, recycling or
 * teardown, or the error reported by xfs_iget_check_free_state() or
 * xfs_reinit_inode().
 */
static int
xfs_iget_cache_hit(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip,
	xfs_ino_t		ino,
	int			flags,
	int			lock_flags) __releases(RCU)
{
	struct inode		*inode = VFS_I(ip);
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/*
	 * check for re-use of an inode within an RCU grace period due to the
	 * radix tree nodes not being updated yet. We monitor for this by
	 * setting the inode number to zero before freeing the inode structure.
	 * If the inode has been reallocated and set up, then the inode number
	 * will not match, so check for that, too.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino) {
		trace_xfs_iget_skip(ip);
		XFS_STATS_INC(mp, xs_ig_frecycle);
		error = -EAGAIN;
		goto out_error;
	}


	/*
	 * If we are racing with another cache hit that is currently
	 * instantiating this inode or currently recycling it out of
	 * reclaimable state, wait for the initialisation to complete
	 * before continuing.
	 *
	 * XXX(hch): eventually we should do something equivalent to
	 *	     wait_on_inode to wait for these flags to be cleared
	 *	     instead of polling for it.
	 */
	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
		trace_xfs_iget_skip(ip);
		XFS_STATS_INC(mp, xs_ig_frecycle);
		error = -EAGAIN;
		goto out_error;
	}

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_error;

	/*
	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
	 * Need to carefully get it back into useable state.
	 */
	if (ip->i_flags & XFS_IRECLAIMABLE) {
		trace_xfs_iget_reclaim(ip);

		/* incore-only lookups never recycle a reclaimable inode */
		if (flags & XFS_IGET_INCORE) {
			error = -EAGAIN;
			goto out_error;
		}

		/*
		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
		 * from stomping over us while we recycle the inode. We can't
		 * clear the radix tree reclaimable tag yet as it requires
		 * pag_ici_lock to be held exclusive.
		 */
		ip->i_flags |= XFS_IRECLAIM;

		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();

		ASSERT(!rwsem_is_locked(&inode->i_rwsem));
		error = xfs_reinit_inode(mp, inode);
		if (error) {
			bool wake;
			/*
			 * Re-initializing the inode failed, and we are in deep
			 * trouble. Try to re-add it to the reclaim list.
			 *
			 * Both locks are retaken here because out_error
			 * releases them unconditionally.
			 */
			rcu_read_lock();
			spin_lock(&ip->i_flags_lock);
			wake = !!__xfs_iflags_test(ip, XFS_INEW);
			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
			if (wake)
				wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
			trace_xfs_iget_reclaim_fail(ip);
			goto out_error;
		}

		spin_lock(&pag->pag_ici_lock);
		spin_lock(&ip->i_flags_lock);

		/*
		 * Clear the per-lifetime state in the inode as we are now
		 * effectively a new inode and need to return to the initial
		 * state before reuse occurs.
		 */
		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
		ip->i_flags |= XFS_INEW;
		xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
		inode->i_state = I_NEW;
		ip->i_sick = 0;
		ip->i_checked = 0;

		spin_unlock(&ip->i_flags_lock);
		spin_unlock(&pag->pag_ici_lock);
	} else {
		/* If the VFS inode is being torn down, pause and try again. */
		if (!igrab(inode)) {
			trace_xfs_iget_skip(ip);
			error = -EAGAIN;
			goto out_error;
		}

		/* We've got a live one. */
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		trace_xfs_iget_hit(ip);
	}

	if (lock_flags != 0)
		xfs_ilock(ip, lock_flags);

	if (!(flags & XFS_IGET_INCORE))
		xfs_iflags_clear(ip, XFS_ISTALE);
	XFS_STATS_INC(mp, xs_ig_found);

	return 0;

out_error:
	/* reached with both i_flags_lock and the RCU read lock held */
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	return error;
}


/*
 * Handle a lookup of @ino that found nothing in the inode cache: allocate a
 * new incore inode, populate it (from disk unless the fast path below
 * applies), validate its free state and insert it into the radix tree of
 * @pag.
 *
 * On success returns 0 with *@ipp set to the new inode, which has XFS_INEW
 * set and is locked according to @lock_flags. Returns -ENOMEM if the inode
 * could not be allocated, -EAGAIN if the radix tree preload or insertion
 * failed, or the error from xfs_imap(), xfs_imap_to_bp(),
 * xfs_inode_from_disk() or xfs_iget_check_free_state().
 */
static int
xfs_iget_cache_miss(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_trans_t		*tp,
	xfs_ino_t		ino,
	struct xfs_inode	**ipp,
	int			flags,
	int			lock_flags)
{
	struct xfs_inode	*ip;
	int			error;
	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
	int			iflags;

	ip = xfs_inode_alloc(mp, ino);
	if (!ip)
		return -ENOMEM;

	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags);
	if (error)
		goto out_destroy;

	/*
	 * For version 5 superblocks, if we are initialising a new inode and we
	 * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can
	 * simply build the new inode core with a random generation number.
	 *
	 * For version 4 (and older) superblocks, log recovery is dependent on
	 * the i_flushiter field being initialised from the current on-disk
	 * value and hence we must also read the inode off disk even when
	 * initializing new inodes.
	 */
	if (xfs_sb_version_has_v3inode(&mp->m_sb) &&
	    (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) {
		VFS_I(ip)->i_generation = prandom_u32();
	} else {
		struct xfs_buf		*bp;

		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
		if (error)
			goto out_destroy;

		error = xfs_inode_from_disk(ip,
				xfs_buf_offset(bp, ip->i_imap.im_boffset));
		if (!error)
			xfs_buf_set_ref(bp, XFS_INO_REF);
		/* the buffer is released whether or not decoding succeeded */
		xfs_trans_brelse(tp, bp);

		if (error)
			goto out_destroy;
	}

	trace_xfs_iget_miss(ip);

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_destroy;

	/*
	 * Preload the radix tree so we can insert safely under the
	 * write spinlock. Note that we cannot sleep inside the preload
	 * region. Since we can be called from transaction context, don't
	 * recurse into the file system.
	 */
	if (radix_tree_preload(GFP_NOFS)) {
		error = -EAGAIN;
		goto out_destroy;
	}

	/*
	 * Because the inode hasn't been added to the radix-tree yet it can't
	 * be found by another thread, so we can do the non-sleeping lock here.
	 */
	if (lock_flags) {
		if (!xfs_ilock_nowait(ip, lock_flags))
			BUG();
	}

	/*
	 * These values must be set before inserting the inode into the radix
	 * tree as the moment it is inserted a concurrent lookup (allowed by the
	 * RCU locking mechanism) can find it and that lookup must see that this
	 * is an inode currently under construction (i.e. that XFS_INEW is set).
	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
	 * memory barrier that ensures this detection works correctly at lookup
	 * time.
	 */
	iflags = XFS_INEW;
	if (flags & XFS_IGET_DONTCACHE)
		d_mark_dontcache(VFS_I(ip));
	ip->i_udquot = NULL;
	ip->i_gdquot = NULL;
	ip->i_pdquot = NULL;
	xfs_iflags_set(ip, iflags);

	/* insert the new inode */
	spin_lock(&pag->pag_ici_lock);
	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
	if (unlikely(error)) {
		/* any insertion failure is reported to the caller as -EAGAIN */
		WARN_ON(error != -EEXIST);
		XFS_STATS_INC(mp, xs_ig_dup);
		error = -EAGAIN;
		goto out_preload_end;
	}
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();

	*ipp = ip;
	return 0;

out_preload_end:
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
out_destroy:
	__destroy_inode(VFS_I(ip));
	xfs_inode_free(ip);
	return error;
}

/*
 * Look up an inode by number in the given file system. The inode is looked up
 * in the cache held in each AG. If the inode is found in the cache, initialise
 * the vfs inode if necessary.
 *
 * If it is not in core, read it in from the file system's device, add it to the
 * cache and initialise the vfs inode.
 *
 * The inode is locked according to the value of the lock_flags parameter.
 * Inode lookup is only done during metadata operations and not as part of the
 * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
610 */ 611 int 612 xfs_iget( 613 struct xfs_mount *mp, 614 struct xfs_trans *tp, 615 xfs_ino_t ino, 616 uint flags, 617 uint lock_flags, 618 struct xfs_inode **ipp) 619 { 620 struct xfs_inode *ip; 621 struct xfs_perag *pag; 622 xfs_agino_t agino; 623 int error; 624 625 ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); 626 627 /* reject inode numbers outside existing AGs */ 628 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) 629 return -EINVAL; 630 631 XFS_STATS_INC(mp, xs_ig_attempts); 632 633 /* get the perag structure and ensure that it's inode capable */ 634 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); 635 agino = XFS_INO_TO_AGINO(mp, ino); 636 637 again: 638 error = 0; 639 rcu_read_lock(); 640 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 641 642 if (ip) { 643 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); 644 if (error) 645 goto out_error_or_again; 646 } else { 647 rcu_read_unlock(); 648 if (flags & XFS_IGET_INCORE) { 649 error = -ENODATA; 650 goto out_error_or_again; 651 } 652 XFS_STATS_INC(mp, xs_ig_missed); 653 654 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, 655 flags, lock_flags); 656 if (error) 657 goto out_error_or_again; 658 } 659 xfs_perag_put(pag); 660 661 *ipp = ip; 662 663 /* 664 * If we have a real type for an on-disk inode, we can setup the inode 665 * now. If it's a new inode being created, xfs_ialloc will handle it. 666 */ 667 if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0) 668 xfs_setup_existing_inode(ip); 669 return 0; 670 671 out_error_or_again: 672 if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) { 673 delay(1); 674 goto again; 675 } 676 xfs_perag_put(pag); 677 return error; 678 } 679 680 /* 681 * "Is this a cached inode that's also allocated?" 682 * 683 * Look up an inode by number in the given file system. If the inode is 684 * in cache and isn't in purgatory, return 1 if the inode is allocated 685 * and 0 if it is not. 
For all other cases (not in cache, being torn
 * down, etc.), return a negative error code.
 *
 * The allocation state is reported through @inuse; on success the function
 * itself returns 0.
 *
 * The caller has to prevent inode allocation and freeing activity,
 * presumably by locking the AGI buffer. This is to ensure that an
 * inode cannot transition from allocated to freed until the caller is
 * ready to allow that. If the inode is in an intermediate state (new,
 * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
 * inode is not in the cache, -ENODATA will be returned (that is what
 * xfs_iget() uses for an XFS_IGET_INCORE miss). -ENOENT is what
 * xfs_iget_check_free_state() returns for a cached inode whose mode is
 * zero. The caller must deal with these scenarios appropriately.
 *
 * This is a specialized use case for the online scrubber; if you're
 * reading this, you probably want xfs_iget.
 */
int
xfs_icache_inode_is_allocated(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	bool			*inuse)
{
	struct xfs_inode	*ip;
	int			error;

	/* incore-only lookup: never reads the inode in from disk */
	error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
	if (error)
		return error;

	*inuse = !!(VFS_I(ip)->i_mode);
	xfs_irele(ip);
	return 0;
}

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum. The batch size is a trade off between
 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH	32

/*
 * Decide if the given @ip is eligible to be a part of the inode walk, and
 * grab it if so. Returns true if it's ready to go or false if we should just
 * ignore it.
 *
 * Must be called with the RCU read lock held.
 */
STATIC bool
xfs_inode_walk_ag_grab(
	struct xfs_inode	*ip,
	int			flags)
{
	struct inode		*inode = VFS_I(ip);
	bool			newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT);

	ASSERT(rcu_read_lock_held());

	/* Check for stale RCU freed inode */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
	if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM))
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return false;

	/* If we can't grab the inode, it must be on its way to reclaim. */
	if (!igrab(inode))
		return false;

	/* inode is valid */
	return true;

out_unlock_noent:
	spin_unlock(&ip->i_flags_lock);
	return false;
}

/*
 * For a given per-AG structure @pag, grab, @execute, and rele all incore
 * inodes with the given radix tree @tag.
 *
 * Inodes for which @execute returns -EAGAIN are counted as skipped, and the
 * whole AG is walked again after a short delay while any were skipped.
 * Returns the last other error reported by @execute; once -EFSCORRUPTED has
 * been recorded it is not overwritten.
 */
STATIC int
xfs_inode_walk_ag(
	struct xfs_perag	*pag,
	int			iter_flags,
	int			(*execute)(struct xfs_inode *ip, void *args),
	void			*args,
	int			tag)
{
	struct xfs_mount	*mp = pag->pag_mount;
	uint32_t		first_index;
	int			last_error = 0;
	int			skipped;
	bool			done;
	int			nr_found;

restart:
	done = false;
	skipped = 0;
	first_index = 0;
	nr_found = 0;
	do {
		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
		int		error = 0;
		int		i;

		rcu_read_lock();

		if (tag == XFS_ICI_NO_TAG)
			nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH);
		else
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **) batch, first_index,
					XFS_LOOKUP_BATCH, tag);

		if (!nr_found) {
			rcu_read_unlock();
			break;
		}

		/*
		 * Grab the inodes before we drop the lock. if we found
		 * nothing, nr == 0 and the loop will be skipped.
		 */
		for (i = 0; i < nr_found; i++) {
			struct xfs_inode *ip = batch[i];

			/* a NULL slot marks an inode the second pass skips */
			if (done || !xfs_inode_walk_ag_grab(ip, iter_flags))
				batch[i] = NULL;

			/*
			 * Update the index for the next lookup. Catch
			 * overflows into the next AG range which can occur if
			 * we have inodes in the last block of the AG and we
			 * are currently pointing to the last inode.
			 *
			 * Because we may see inodes that are from the wrong AG
			 * due to RCU freeing and reallocation, only update the
			 * index if it lies in this AG. It was a race that led
			 * us to see this inode, so another lookup from the
			 * same index will not find it again.
			 */
			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
				continue;
			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
				done = true;
		}

		/* unlock now we've grabbed the inodes. */
		rcu_read_unlock();

		for (i = 0; i < nr_found; i++) {
			if (!batch[i])
				continue;
			if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) &&
			    xfs_iflags_test(batch[i], XFS_INEW))
				xfs_inew_wait(batch[i]);
			error = execute(batch[i], args);
			xfs_irele(batch[i]);
			if (error == -EAGAIN) {
				skipped++;
				continue;
			}
			if (error && last_error != -EFSCORRUPTED)
				last_error = error;
		}

		/*
		 * bail out if the filesystem is corrupted. Note this tests
		 * the result of the last inode executed in this batch.
		 */
		if (error == -EFSCORRUPTED)
			break;

		cond_resched();

	} while (nr_found && !done);

	if (skipped) {
		delay(1);
		goto restart;
	}
	return last_error;
}

/* Fetch the next (possibly tagged) per-AG structure. */
static inline struct xfs_perag *
xfs_inode_walk_get_perag(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	int			tag)
{
	if (tag == XFS_ICI_NO_TAG)
		return xfs_perag_get(mp, agno);
	return xfs_perag_get_tag(mp, agno, tag);
}

/*
 * Call the @execute function on all incore inodes matching the radix tree
 * @tag.
*/
int
xfs_inode_walk(
	struct xfs_mount	*mp,
	int			iter_flags,
	int			(*execute)(struct xfs_inode *ip, void *args),
	void			*args,
	int			tag)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;

	ag = 0;
	while ((pag = xfs_inode_walk_get_perag(mp, ag, tag))) {
		/* resume the next lookup after the AG we were just given */
		ag = pag->pag_agno + 1;
		error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag);
		xfs_perag_put(pag);
		if (error) {
			last_error = error;
			/* corruption stops the walk; other errors do not */
			if (error == -EFSCORRUPTED)
				break;
		}
	}
	return last_error;
}

/*
 * Grab the inode for reclaim exclusively.
 *
 * We have found this inode via a lookup under RCU, so the inode may have
 * already been freed, or it may be in the process of being recycled by
 * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
 * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
 * will not be set. Hence we need to check for both these flag conditions to
 * avoid inodes that are no longer reclaim candidates.
 *
 * Note: checking for other state flags here, under the i_flags_lock or not, is
 * racy and should be avoided. Those races should be resolved only after we have
 * ensured that we are able to reclaim this inode and the world can see that we
 * are going to reclaim it.
 *
 * Return true if we grabbed it, false otherwise.
 */
static bool
xfs_reclaim_inode_grab(
	struct xfs_inode	*ip)
{
	ASSERT(rcu_read_lock_held());

	spin_lock(&ip->i_flags_lock);
	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* not a reclaim candidate. */
		spin_unlock(&ip->i_flags_lock);
		return false;
	}
	/* claim the inode: XFS_IRECLAIM marks it as ours to reclaim */
	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	return true;
}

/*
 * Inode reclaim is non-blocking, so the default action if progress cannot be
 * made is to "requeue" the inode for reclaim by unlocking it and clearing the
 * XFS_IRECLAIM flag. If we are in a shutdown state, we don't care about
 * blocking anymore and hence we can wait for the inode to be able to reclaim
 * it.
 *
 * We do no IO here - if callers require inodes to be cleaned they must push the
 * AIL first to trigger writeback of dirty inodes. This enables writeback to be
 * done in the background in a non-blocking manner, and enables memory reclaim
 * to make progress without blocking.
 */
static void
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag)
{
	xfs_ino_t		ino = ip->i_ino; /* for radix_tree_delete */

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
		goto out;
	if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
		goto out_iunlock;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		xfs_iunpin_wait(ip);
		xfs_iflush_abort(ip);
		goto reclaim;
	}
	/* pinned or dirty inodes are left for a later reclaim pass */
	if (xfs_ipincount(ip))
		goto out_clear_flush;
	if (!xfs_inode_clean(ip))
		goto out_clear_flush;

	xfs_iflags_clear(ip, XFS_IFLUSHING);
reclaim:

	/*
	 * Because we use RCU freeing we need to ensure the inode always appears
	 * to be reclaimed with an invalid inode number when in the free state.
	 * We do this as early as possible under the ILOCK so that
	 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
	 * detect races with us here. By doing this, we guarantee that once
	 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
	 * it will see either a valid inode that will serialise correctly, or it
	 * will see an invalid inode that it can skip.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree assert that it's been there before to catch
	 * problems with the inode life time early on.
	 *
	 * The saved @ino is used because ip->i_ino was zeroed above.
	 */
	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ino)))
		ASSERT(0);
	xfs_perag_clear_reclaim_tag(pag);
	spin_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups. This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_qm_dqdetach(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	ASSERT(xfs_inode_clean(ip));

	__xfs_inode_free(ip);
	return;

out_clear_flush:
	xfs_iflags_clear(ip, XFS_IFLUSHING);
out_iunlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	/* give the inode back: it remains a candidate for a later pass */
	xfs_iflags_clear(ip, XFS_IRECLAIM);
}

/*
 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
 * corrupted, we still want to try to reclaim all the inodes. If we don't,
 * then a shut down during the filesystem unmount reclaim walk will leak all
 * the unreclaimed inodes.
 *
 * NOTE(review): this function is declared void, so nothing is returned to
 * say whether AGs or inodes were skipped. A caller that wants to block until
 * everything has been reclaimed has to loop on the perag reclaim tag instead,
 * as xfs_reclaim_inodes() does.
*/
static void
xfs_reclaim_inodes_ag(
	struct xfs_mount	*mp,
	int			*nr_to_scan)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		ag = 0;

	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		unsigned long	first_index = 0;
		int		done = 0;
		int		nr_found = 0;

		ag = pag->pag_agno + 1;

		/* resume from where the previous pass over this AG stopped */
		first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
		do {
			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
			int	i;

			rcu_read_lock();
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH,
					XFS_ICI_RECLAIM_TAG);
			if (!nr_found) {
				done = 1;
				rcu_read_unlock();
				break;
			}

			/*
			 * Grab the inodes before we drop the lock. if we found
			 * nothing, nr == 0 and the loop will be skipped.
			 */
			for (i = 0; i < nr_found; i++) {
				struct xfs_inode *ip = batch[i];

				/* a NULL slot is skipped by the reclaim loop */
				if (done || !xfs_reclaim_inode_grab(ip))
					batch[i] = NULL;

				/*
				 * Update the index for the next lookup. Catch
				 * overflows into the next AG range which can
				 * occur if we have inodes in the last block of
				 * the AG and we are currently pointing to the
				 * last inode.
				 *
				 * Because we may see inodes that are from the
				 * wrong AG due to RCU freeing and
				 * reallocation, only update the index if it
				 * lies in this AG. It was a race that led us
				 * to see this inode, so another lookup from
				 * the same index will not find it again.
				 */
				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
								pag->pag_agno)
					continue;
				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
					done = 1;
			}

			/* unlock now we've grabbed the inodes. */
			rcu_read_unlock();

			for (i = 0; i < nr_found; i++) {
				if (batch[i])
					xfs_reclaim_inode(batch[i], pag);
			}

			/* the budget is charged a full batch per lookup */
			*nr_to_scan -= XFS_LOOKUP_BATCH;
			cond_resched();
		} while (nr_found && !done && *nr_to_scan > 0);

		/* a completed pass restarts from index 0 next time */
		if (done)
			first_index = 0;
		WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
		xfs_perag_put(pag);
	}
}

/*
 * Reclaim inodes until no AG in @mp carries the reclaim tag any more,
 * synchronously pushing the whole AIL before each pass.
 */
void
xfs_reclaim_inodes(
	struct xfs_mount	*mp)
{
	int		nr_to_scan = INT_MAX;

	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		xfs_ail_push_all_sync(mp->m_ail);
		xfs_reclaim_inodes_ag(mp, &nr_to_scan);
	}
}

/*
 * The shrinker infrastructure determines how many inodes we should scan for
 * reclaim. We want as many clean inodes ready to reclaim as possible, so we
 * push the AIL here. We also want to proactively free up memory if we can to
 * minimise the amount of work memory reclaim has to do so we kick the
 * background reclaim if it isn't already scheduled.
 *
 * Always returns 0.
 */
long
xfs_reclaim_inodes_nr(
	struct xfs_mount	*mp,
	int			nr_to_scan)
{
	/* kick background reclaimer and push the AIL */
	xfs_reclaim_work_queue(mp);
	xfs_ail_push_all(mp->m_ail);

	xfs_reclaim_inodes_ag(mp, &nr_to_scan);
	return 0;
}

/*
 * Return the number of reclaimable inodes in the filesystem for
 * the shrinker to determine how much to reclaim.
1176 */ 1177 int 1178 xfs_reclaim_inodes_count( 1179 struct xfs_mount *mp) 1180 { 1181 struct xfs_perag *pag; 1182 xfs_agnumber_t ag = 0; 1183 int reclaimable = 0; 1184 1185 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { 1186 ag = pag->pag_agno + 1; 1187 reclaimable += pag->pag_ici_reclaimable; 1188 xfs_perag_put(pag); 1189 } 1190 return reclaimable; 1191 } 1192 1193 STATIC bool 1194 xfs_inode_match_id( 1195 struct xfs_inode *ip, 1196 struct xfs_eofblocks *eofb) 1197 { 1198 if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && 1199 !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) 1200 return false; 1201 1202 if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && 1203 !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) 1204 return false; 1205 1206 if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && 1207 ip->i_projid != eofb->eof_prid) 1208 return false; 1209 1210 return true; 1211 } 1212 1213 /* 1214 * A union-based inode filtering algorithm. Process the inode if any of the 1215 * criteria match. This is for global/internal scans only. 1216 */ 1217 STATIC bool 1218 xfs_inode_match_id_union( 1219 struct xfs_inode *ip, 1220 struct xfs_eofblocks *eofb) 1221 { 1222 if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && 1223 uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) 1224 return true; 1225 1226 if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && 1227 gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) 1228 return true; 1229 1230 if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && 1231 ip->i_projid == eofb->eof_prid) 1232 return true; 1233 1234 return false; 1235 } 1236 1237 /* 1238 * Is this inode @ip eligible for eof/cow block reclamation, given some 1239 * filtering parameters @eofb? The inode is eligible if @eofb is null or 1240 * if the predicate functions match. 
1241 */ 1242 static bool 1243 xfs_inode_matches_eofb( 1244 struct xfs_inode *ip, 1245 struct xfs_eofblocks *eofb) 1246 { 1247 bool match; 1248 1249 if (!eofb) 1250 return true; 1251 1252 if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) 1253 match = xfs_inode_match_id_union(ip, eofb); 1254 else 1255 match = xfs_inode_match_id(ip, eofb); 1256 if (!match) 1257 return false; 1258 1259 /* skip the inode if the file size is too small */ 1260 if ((eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) && 1261 XFS_ISIZE(ip) < eofb->eof_min_file_size) 1262 return false; 1263 1264 return true; 1265 } 1266 1267 /* 1268 * This is a fast pass over the inode cache to try to get reclaim moving on as 1269 * many inodes as possible in a short period of time. It kicks itself every few 1270 * seconds, as well as being kicked by the inode cache shrinker when memory 1271 * goes low. 1272 */ 1273 void 1274 xfs_reclaim_worker( 1275 struct work_struct *work) 1276 { 1277 struct xfs_mount *mp = container_of(to_delayed_work(work), 1278 struct xfs_mount, m_reclaim_work); 1279 int nr_to_scan = INT_MAX; 1280 1281 xfs_reclaim_inodes_ag(mp, &nr_to_scan); 1282 xfs_reclaim_work_queue(mp); 1283 } 1284 1285 STATIC int 1286 xfs_inode_free_eofblocks( 1287 struct xfs_inode *ip, 1288 void *args, 1289 unsigned int *lockflags) 1290 { 1291 struct xfs_eofblocks *eofb = args; 1292 bool wait; 1293 1294 wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); 1295 1296 if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS)) 1297 return 0; 1298 1299 /* 1300 * If the mapping is dirty the operation can block and wait for some 1301 * time. Unless we are waiting, skip it. 1302 */ 1303 if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) 1304 return 0; 1305 1306 if (!xfs_inode_matches_eofb(ip, eofb)) 1307 return 0; 1308 1309 /* 1310 * If the caller is waiting, return -EAGAIN to keep the background 1311 * scanner moving and revisit the inode in a subsequent pass. 
1312 */ 1313 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 1314 if (wait) 1315 return -EAGAIN; 1316 return 0; 1317 } 1318 *lockflags |= XFS_IOLOCK_EXCL; 1319 1320 if (xfs_can_free_eofblocks(ip, false)) 1321 return xfs_free_eofblocks(ip); 1322 1323 /* inode could be preallocated or append-only */ 1324 trace_xfs_inode_free_eofblocks_invalid(ip); 1325 xfs_inode_clear_eofblocks_tag(ip); 1326 return 0; 1327 } 1328 1329 /* 1330 * Background scanning to trim preallocated space. This is queued based on the 1331 * 'speculative_prealloc_lifetime' tunable (5m by default). 1332 */ 1333 static inline void 1334 xfs_blockgc_queue( 1335 struct xfs_perag *pag) 1336 { 1337 rcu_read_lock(); 1338 if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) 1339 queue_delayed_work(pag->pag_mount->m_gc_workqueue, 1340 &pag->pag_blockgc_work, 1341 msecs_to_jiffies(xfs_blockgc_secs * 1000)); 1342 rcu_read_unlock(); 1343 } 1344 1345 static void 1346 xfs_blockgc_set_iflag( 1347 struct xfs_inode *ip, 1348 unsigned long iflag) 1349 { 1350 struct xfs_mount *mp = ip->i_mount; 1351 struct xfs_perag *pag; 1352 int tagged; 1353 1354 ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0); 1355 1356 /* 1357 * Don't bother locking the AG and looking up in the radix trees 1358 * if we already know that we have the tag set. 
1359 */ 1360 if (ip->i_flags & iflag) 1361 return; 1362 spin_lock(&ip->i_flags_lock); 1363 ip->i_flags |= iflag; 1364 spin_unlock(&ip->i_flags_lock); 1365 1366 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 1367 spin_lock(&pag->pag_ici_lock); 1368 1369 tagged = radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG); 1370 radix_tree_tag_set(&pag->pag_ici_root, 1371 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), 1372 XFS_ICI_BLOCKGC_TAG); 1373 if (!tagged) { 1374 /* propagate the blockgc tag up into the perag radix tree */ 1375 spin_lock(&ip->i_mount->m_perag_lock); 1376 radix_tree_tag_set(&ip->i_mount->m_perag_tree, 1377 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 1378 XFS_ICI_BLOCKGC_TAG); 1379 spin_unlock(&ip->i_mount->m_perag_lock); 1380 1381 /* kick off background trimming */ 1382 xfs_blockgc_queue(pag); 1383 1384 trace_xfs_perag_set_blockgc(ip->i_mount, pag->pag_agno, -1, 1385 _RET_IP_); 1386 } 1387 1388 spin_unlock(&pag->pag_ici_lock); 1389 xfs_perag_put(pag); 1390 } 1391 1392 void 1393 xfs_inode_set_eofblocks_tag( 1394 xfs_inode_t *ip) 1395 { 1396 trace_xfs_inode_set_eofblocks_tag(ip); 1397 return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS); 1398 } 1399 1400 static void 1401 xfs_blockgc_clear_iflag( 1402 struct xfs_inode *ip, 1403 unsigned long iflag) 1404 { 1405 struct xfs_mount *mp = ip->i_mount; 1406 struct xfs_perag *pag; 1407 bool clear_tag; 1408 1409 ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0); 1410 1411 spin_lock(&ip->i_flags_lock); 1412 ip->i_flags &= ~iflag; 1413 clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0; 1414 spin_unlock(&ip->i_flags_lock); 1415 1416 if (!clear_tag) 1417 return; 1418 1419 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 1420 spin_lock(&pag->pag_ici_lock); 1421 1422 radix_tree_tag_clear(&pag->pag_ici_root, 1423 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), 1424 XFS_ICI_BLOCKGC_TAG); 1425 if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) { 1426 /* clear the blockgc tag from 
the perag radix tree */ 1427 spin_lock(&ip->i_mount->m_perag_lock); 1428 radix_tree_tag_clear(&ip->i_mount->m_perag_tree, 1429 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 1430 XFS_ICI_BLOCKGC_TAG); 1431 spin_unlock(&ip->i_mount->m_perag_lock); 1432 trace_xfs_perag_clear_blockgc(ip->i_mount, pag->pag_agno, -1, 1433 _RET_IP_); 1434 } 1435 1436 spin_unlock(&pag->pag_ici_lock); 1437 xfs_perag_put(pag); 1438 } 1439 1440 void 1441 xfs_inode_clear_eofblocks_tag( 1442 xfs_inode_t *ip) 1443 { 1444 trace_xfs_inode_clear_eofblocks_tag(ip); 1445 return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS); 1446 } 1447 1448 /* 1449 * Set ourselves up to free CoW blocks from this file. If it's already clean 1450 * then we can bail out quickly, but otherwise we must back off if the file 1451 * is undergoing some kind of write. 1452 */ 1453 static bool 1454 xfs_prep_free_cowblocks( 1455 struct xfs_inode *ip) 1456 { 1457 /* 1458 * Just clear the tag if we have an empty cow fork or none at all. It's 1459 * possible the inode was fully unshared since it was originally tagged. 1460 */ 1461 if (!xfs_inode_has_cow_data(ip)) { 1462 trace_xfs_inode_free_cowblocks_invalid(ip); 1463 xfs_inode_clear_cowblocks_tag(ip); 1464 return false; 1465 } 1466 1467 /* 1468 * If the mapping is dirty or under writeback we cannot touch the 1469 * CoW fork. Leave it alone if we're in the midst of a directio. 1470 */ 1471 if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) || 1472 mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || 1473 mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || 1474 atomic_read(&VFS_I(ip)->i_dio_count)) 1475 return false; 1476 1477 return true; 1478 } 1479 1480 /* 1481 * Automatic CoW Reservation Freeing 1482 * 1483 * These functions automatically garbage collect leftover CoW reservations 1484 * that were made on behalf of a cowextsize hint when we start to run out 1485 * of quota or when the reservations sit around for too long. 
If the file 1486 * has dirty pages or is undergoing writeback, its CoW reservations will 1487 * be retained. 1488 * 1489 * The actual garbage collection piggybacks off the same code that runs 1490 * the speculative EOF preallocation garbage collector. 1491 */ 1492 STATIC int 1493 xfs_inode_free_cowblocks( 1494 struct xfs_inode *ip, 1495 void *args, 1496 unsigned int *lockflags) 1497 { 1498 struct xfs_eofblocks *eofb = args; 1499 bool wait; 1500 int ret = 0; 1501 1502 wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); 1503 1504 if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS)) 1505 return 0; 1506 1507 if (!xfs_prep_free_cowblocks(ip)) 1508 return 0; 1509 1510 if (!xfs_inode_matches_eofb(ip, eofb)) 1511 return 0; 1512 1513 /* 1514 * If the caller is waiting, return -EAGAIN to keep the background 1515 * scanner moving and revisit the inode in a subsequent pass. 1516 */ 1517 if (!(*lockflags & XFS_IOLOCK_EXCL) && 1518 !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 1519 if (wait) 1520 return -EAGAIN; 1521 return 0; 1522 } 1523 *lockflags |= XFS_IOLOCK_EXCL; 1524 1525 if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) { 1526 if (wait) 1527 return -EAGAIN; 1528 return 0; 1529 } 1530 *lockflags |= XFS_MMAPLOCK_EXCL; 1531 1532 /* 1533 * Check again, nobody else should be able to dirty blocks or change 1534 * the reflink iflag now that we have the first two locks held. 
1535 */ 1536 if (xfs_prep_free_cowblocks(ip)) 1537 ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); 1538 return ret; 1539 } 1540 1541 void 1542 xfs_inode_set_cowblocks_tag( 1543 xfs_inode_t *ip) 1544 { 1545 trace_xfs_inode_set_cowblocks_tag(ip); 1546 return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS); 1547 } 1548 1549 void 1550 xfs_inode_clear_cowblocks_tag( 1551 xfs_inode_t *ip) 1552 { 1553 trace_xfs_inode_clear_cowblocks_tag(ip); 1554 return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS); 1555 } 1556 1557 #define for_each_perag_tag(mp, next_agno, pag, tag) \ 1558 for ((next_agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \ 1559 (pag) != NULL; \ 1560 (next_agno) = (pag)->pag_agno + 1, \ 1561 xfs_perag_put(pag), \ 1562 (pag) = xfs_perag_get_tag((mp), (next_agno), (tag))) 1563 1564 1565 /* Disable post-EOF and CoW block auto-reclamation. */ 1566 void 1567 xfs_blockgc_stop( 1568 struct xfs_mount *mp) 1569 { 1570 struct xfs_perag *pag; 1571 xfs_agnumber_t agno; 1572 1573 for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) 1574 cancel_delayed_work_sync(&pag->pag_blockgc_work); 1575 } 1576 1577 /* Enable post-EOF and CoW block auto-reclamation. */ 1578 void 1579 xfs_blockgc_start( 1580 struct xfs_mount *mp) 1581 { 1582 struct xfs_perag *pag; 1583 xfs_agnumber_t agno; 1584 1585 for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) 1586 xfs_blockgc_queue(pag); 1587 } 1588 1589 /* Scan one incore inode for block preallocations that we can remove. */ 1590 static int 1591 xfs_blockgc_scan_inode( 1592 struct xfs_inode *ip, 1593 void *args) 1594 { 1595 unsigned int lockflags = 0; 1596 int error; 1597 1598 error = xfs_inode_free_eofblocks(ip, args, &lockflags); 1599 if (error) 1600 goto unlock; 1601 1602 error = xfs_inode_free_cowblocks(ip, args, &lockflags); 1603 unlock: 1604 if (lockflags) 1605 xfs_iunlock(ip, lockflags); 1606 return error; 1607 } 1608 1609 /* Background worker that trims preallocated space. 
*/ 1610 void 1611 xfs_blockgc_worker( 1612 struct work_struct *work) 1613 { 1614 struct xfs_perag *pag = container_of(to_delayed_work(work), 1615 struct xfs_perag, pag_blockgc_work); 1616 struct xfs_mount *mp = pag->pag_mount; 1617 int error; 1618 1619 if (!sb_start_write_trylock(mp->m_super)) 1620 return; 1621 error = xfs_inode_walk_ag(pag, 0, xfs_blockgc_scan_inode, NULL, 1622 XFS_ICI_BLOCKGC_TAG); 1623 if (error) 1624 xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", 1625 pag->pag_agno, error); 1626 sb_end_write(mp->m_super); 1627 xfs_blockgc_queue(pag); 1628 } 1629 1630 /* 1631 * Try to free space in the filesystem by purging eofblocks and cowblocks. 1632 */ 1633 int 1634 xfs_blockgc_free_space( 1635 struct xfs_mount *mp, 1636 struct xfs_eofblocks *eofb) 1637 { 1638 trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_); 1639 1640 return xfs_inode_walk(mp, 0, xfs_blockgc_scan_inode, eofb, 1641 XFS_ICI_BLOCKGC_TAG); 1642 } 1643 1644 /* 1645 * Run cow/eofblocks scans on the supplied dquots. We don't know exactly which 1646 * quota caused an allocation failure, so we make a best effort by including 1647 * each quota under low free space conditions (less than 1% free space) in the 1648 * scan. 1649 * 1650 * Callers must not hold any inode's ILOCK. If requesting a synchronous scan 1651 * (XFS_EOF_FLAGS_SYNC), the caller also must not hold any inode's IOLOCK or 1652 * MMAPLOCK. 1653 */ 1654 int 1655 xfs_blockgc_free_dquots( 1656 struct xfs_mount *mp, 1657 struct xfs_dquot *udqp, 1658 struct xfs_dquot *gdqp, 1659 struct xfs_dquot *pdqp, 1660 unsigned int eof_flags) 1661 { 1662 struct xfs_eofblocks eofb = {0}; 1663 bool do_work = false; 1664 1665 if (!udqp && !gdqp && !pdqp) 1666 return 0; 1667 1668 /* 1669 * Run a scan to free blocks using the union filter to cover all 1670 * applicable quotas in a single scan. 
1671 */ 1672 eofb.eof_flags = XFS_EOF_FLAGS_UNION | eof_flags; 1673 1674 if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) { 1675 eofb.eof_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id); 1676 eofb.eof_flags |= XFS_EOF_FLAGS_UID; 1677 do_work = true; 1678 } 1679 1680 if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) { 1681 eofb.eof_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id); 1682 eofb.eof_flags |= XFS_EOF_FLAGS_GID; 1683 do_work = true; 1684 } 1685 1686 if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) { 1687 eofb.eof_prid = pdqp->q_id; 1688 eofb.eof_flags |= XFS_EOF_FLAGS_PRID; 1689 do_work = true; 1690 } 1691 1692 if (!do_work) 1693 return 0; 1694 1695 return xfs_blockgc_free_space(mp, &eofb); 1696 } 1697 1698 /* Run cow/eofblocks scans on the quotas attached to the inode. */ 1699 int 1700 xfs_blockgc_free_quota( 1701 struct xfs_inode *ip, 1702 unsigned int eof_flags) 1703 { 1704 return xfs_blockgc_free_dquots(ip->i_mount, 1705 xfs_inode_dquot(ip, XFS_DQTYPE_USER), 1706 xfs_inode_dquot(ip, XFS_DQTYPE_GROUP), 1707 xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), eof_flags); 1708 } 1709