1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_inode.h" 14 #include "xfs_trans.h" 15 #include "xfs_trans_priv.h" 16 #include "xfs_inode_item.h" 17 #include "xfs_quota.h" 18 #include "xfs_trace.h" 19 #include "xfs_icache.h" 20 #include "xfs_bmap_util.h" 21 #include "xfs_dquot_item.h" 22 #include "xfs_dquot.h" 23 #include "xfs_reflink.h" 24 #include "xfs_ialloc.h" 25 #include "xfs_ag.h" 26 #include "xfs_log_priv.h" 27 #include "xfs_health.h" 28 29 #include <linux/iversion.h> 30 31 /* Radix tree tags for incore inode tree. */ 32 33 /* inode is to be reclaimed */ 34 #define XFS_ICI_RECLAIM_TAG 0 35 /* Inode has speculative preallocations (posteof or cow) to clean. */ 36 #define XFS_ICI_BLOCKGC_TAG 1 37 38 /* 39 * The goal for walking incore inodes. These can correspond with incore inode 40 * radix tree tags when convenient. Avoid existing XFS_IWALK namespace. 41 */ 42 enum xfs_icwalk_goal { 43 /* Goals directly associated with tagged inodes. */ 44 XFS_ICWALK_BLOCKGC = XFS_ICI_BLOCKGC_TAG, 45 XFS_ICWALK_RECLAIM = XFS_ICI_RECLAIM_TAG, 46 }; 47 48 static int xfs_icwalk(struct xfs_mount *mp, 49 enum xfs_icwalk_goal goal, struct xfs_icwalk *icw); 50 static int xfs_icwalk_ag(struct xfs_perag *pag, 51 enum xfs_icwalk_goal goal, struct xfs_icwalk *icw); 52 53 /* 54 * Private inode cache walk flags for struct xfs_icwalk. Must not 55 * coincide with XFS_ICWALK_FLAGS_VALID. 56 */ 57 58 /* Stop scanning after icw_scan_limit inodes. */ 59 #define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28) 60 61 #define XFS_ICWALK_FLAG_RECLAIM_SICK (1U << 27) 62 #define XFS_ICWALK_FLAG_UNION (1U << 26) /* union filter algorithm */ 63 64 #define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_SCAN_LIMIT | \ 65 XFS_ICWALK_FLAG_RECLAIM_SICK | \ 66 XFS_ICWALK_FLAG_UNION) 67 68 /* 69 * Allocate and initialise an xfs_inode. 70 */ 71 struct xfs_inode * 72 xfs_inode_alloc( 73 struct xfs_mount *mp, 74 xfs_ino_t ino) 75 { 76 struct xfs_inode *ip; 77 78 /* 79 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL 80 * and return NULL here on ENOMEM. 81 */ 82 ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL); 83 84 if (inode_init_always(mp->m_super, VFS_I(ip))) { 85 kmem_cache_free(xfs_inode_cache, ip); 86 return NULL; 87 } 88 89 /* VFS doesn't initialise i_mode or i_state! */ 90 VFS_I(ip)->i_mode = 0; 91 VFS_I(ip)->i_state = 0; 92 mapping_set_large_folios(VFS_I(ip)->i_mapping); 93 94 XFS_STATS_INC(mp, vn_active); 95 ASSERT(atomic_read(&ip->i_pincount) == 0); 96 ASSERT(ip->i_ino == 0); 97 98 /* initialise the xfs inode */ 99 ip->i_ino = ino; 100 ip->i_mount = mp; 101 memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); 102 ip->i_cowfp = NULL; 103 memset(&ip->i_af, 0, sizeof(ip->i_af)); 104 ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS; 105 memset(&ip->i_df, 0, sizeof(ip->i_df)); 106 ip->i_flags = 0; 107 ip->i_delayed_blks = 0; 108 ip->i_diflags2 = mp->m_ino_geo.new_diflags2; 109 ip->i_nblocks = 0; 110 ip->i_forkoff = 0; 111 ip->i_sick = 0; 112 ip->i_checked = 0; 113 INIT_WORK(&ip->i_ioend_work, xfs_end_io); 114 INIT_LIST_HEAD(&ip->i_ioend_list); 115 spin_lock_init(&ip->i_ioend_lock); 116 ip->i_next_unlinked = NULLAGINO; 117 ip->i_prev_unlinked = 0; 118 119 return ip; 120 } 121 122 STATIC void 123 xfs_inode_free_callback( 124 struct rcu_head *head) 125 { 126 struct inode *inode = container_of(head, struct inode, i_rcu); 127 struct xfs_inode *ip = XFS_I(inode); 128 129 switch (VFS_I(ip)->i_mode & S_IFMT) { 130 case S_IFREG: 131 case S_IFDIR: 132 case S_IFLNK: 133 xfs_idestroy_fork(&ip->i_df); 134 break; 135 } 136 137 xfs_ifork_zap_attr(ip); 138 139 if (ip->i_cowfp) { 140 xfs_idestroy_fork(ip->i_cowfp); 141 kmem_cache_free(xfs_ifork_cache, ip->i_cowfp); 142 } 143 if (ip->i_itemp) { 144 ASSERT(!test_bit(XFS_LI_IN_AIL, 145 &ip->i_itemp->ili_item.li_flags)); 146 xfs_inode_item_destroy(ip); 147 ip->i_itemp = NULL; 148 } 149 150 kmem_cache_free(xfs_inode_cache, ip); 151 } 152 153 static void 154 __xfs_inode_free( 155 struct xfs_inode *ip) 156 { 157 /* asserts to verify all state is correct here */ 158 ASSERT(atomic_read(&ip->i_pincount) == 0); 159 ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list)); 160 XFS_STATS_DEC(ip->i_mount, vn_active); 161 162 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); 163 } 164 165 void 166 xfs_inode_free( 167 struct xfs_inode *ip) 168 { 169 ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING)); 170 171 /* 172 * Because we use RCU freeing we need to ensure the inode always 173 * appears to be reclaimed with an invalid inode number when in the 174 * free state. The ip->i_flags_lock provides the barrier against lookup 175 * races. 176 */ 177 spin_lock(&ip->i_flags_lock); 178 ip->i_flags = XFS_IRECLAIM; 179 ip->i_ino = 0; 180 spin_unlock(&ip->i_flags_lock); 181 182 __xfs_inode_free(ip); 183 } 184 185 /* 186 * Queue background inode reclaim work if there are reclaimable inodes and there 187 * isn't reclaim work already scheduled or in progress. 188 */ 189 static void 190 xfs_reclaim_work_queue( 191 struct xfs_mount *mp) 192 { 193 194 rcu_read_lock(); 195 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { 196 queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, 197 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); 198 } 199 rcu_read_unlock(); 200 } 201 202 /* 203 * Background scanning to trim preallocated space. This is queued based on the 204 * 'speculative_prealloc_lifetime' tunable (5m by default). 205 */ 206 static inline void 207 xfs_blockgc_queue( 208 struct xfs_perag *pag) 209 { 210 struct xfs_mount *mp = pag->pag_mount; 211 212 if (!xfs_is_blockgc_enabled(mp)) 213 return; 214 215 rcu_read_lock(); 216 if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) 217 queue_delayed_work(pag->pag_mount->m_blockgc_wq, 218 &pag->pag_blockgc_work, 219 msecs_to_jiffies(xfs_blockgc_secs * 1000)); 220 rcu_read_unlock(); 221 } 222 223 /* Set a tag on both the AG incore inode tree and the AG radix tree. */ 224 static void 225 xfs_perag_set_inode_tag( 226 struct xfs_perag *pag, 227 xfs_agino_t agino, 228 unsigned int tag) 229 { 230 struct xfs_mount *mp = pag->pag_mount; 231 bool was_tagged; 232 233 lockdep_assert_held(&pag->pag_ici_lock); 234 235 was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag); 236 radix_tree_tag_set(&pag->pag_ici_root, agino, tag); 237 238 if (tag == XFS_ICI_RECLAIM_TAG) 239 pag->pag_ici_reclaimable++; 240 241 if (was_tagged) 242 return; 243 244 /* propagate the tag up into the perag radix tree */ 245 spin_lock(&mp->m_perag_lock); 246 radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag); 247 spin_unlock(&mp->m_perag_lock); 248 249 /* start background work */ 250 switch (tag) { 251 case XFS_ICI_RECLAIM_TAG: 252 xfs_reclaim_work_queue(mp); 253 break; 254 case XFS_ICI_BLOCKGC_TAG: 255 xfs_blockgc_queue(pag); 256 break; 257 } 258 259 trace_xfs_perag_set_inode_tag(pag, _RET_IP_); 260 } 261 262 /* Clear a tag on both the AG incore inode tree and the AG radix tree. */ 263 static void 264 xfs_perag_clear_inode_tag( 265 struct xfs_perag *pag, 266 xfs_agino_t agino, 267 unsigned int tag) 268 { 269 struct xfs_mount *mp = pag->pag_mount; 270 271 lockdep_assert_held(&pag->pag_ici_lock); 272 273 /* 274 * Reclaim can signal (with a null agino) that it cleared its own tag 275 * by removing the inode from the radix tree. 276 */ 277 if (agino != NULLAGINO) 278 radix_tree_tag_clear(&pag->pag_ici_root, agino, tag); 279 else 280 ASSERT(tag == XFS_ICI_RECLAIM_TAG); 281 282 if (tag == XFS_ICI_RECLAIM_TAG) 283 pag->pag_ici_reclaimable--; 284 285 if (radix_tree_tagged(&pag->pag_ici_root, tag)) 286 return; 287 288 /* clear the tag from the perag radix tree */ 289 spin_lock(&mp->m_perag_lock); 290 radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag); 291 spin_unlock(&mp->m_perag_lock); 292 293 trace_xfs_perag_clear_inode_tag(pag, _RET_IP_); 294 } 295 296 /* 297 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode 298 * part of the structure. This is made more complex by the fact we store 299 * information about the on-disk values in the VFS inode and so we can't just 300 * overwrite the values unconditionally. Hence we save the parameters we 301 * need to retain across reinitialisation, and rewrite them into the VFS inode 302 * after reinitialisation even if it fails. 303 */ 304 static int 305 xfs_reinit_inode( 306 struct xfs_mount *mp, 307 struct inode *inode) 308 { 309 int error; 310 uint32_t nlink = inode->i_nlink; 311 uint32_t generation = inode->i_generation; 312 uint64_t version = inode_peek_iversion(inode); 313 umode_t mode = inode->i_mode; 314 dev_t dev = inode->i_rdev; 315 kuid_t uid = inode->i_uid; 316 kgid_t gid = inode->i_gid; 317 318 error = inode_init_always(mp->m_super, inode); 319 320 set_nlink(inode, nlink); 321 inode->i_generation = generation; 322 inode_set_iversion_queried(inode, version); 323 inode->i_mode = mode; 324 inode->i_rdev = dev; 325 inode->i_uid = uid; 326 inode->i_gid = gid; 327 mapping_set_large_folios(inode->i_mapping); 328 return error; 329 } 330 331 /* 332 * Carefully nudge an inode whose VFS state has been torn down back into a 333 * usable state. Drops the i_flags_lock and the rcu read lock. 334 */ 335 static int 336 xfs_iget_recycle( 337 struct xfs_perag *pag, 338 struct xfs_inode *ip) __releases(&ip->i_flags_lock) 339 { 340 struct xfs_mount *mp = ip->i_mount; 341 struct inode *inode = VFS_I(ip); 342 int error; 343 344 trace_xfs_iget_recycle(ip); 345 346 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) 347 return -EAGAIN; 348 349 /* 350 * We need to make it look like the inode is being reclaimed to prevent 351 * the actual reclaim workers from stomping over us while we recycle 352 * the inode. We can't clear the radix tree tag yet as it requires 353 * pag_ici_lock to be held exclusive. 354 */ 355 ip->i_flags |= XFS_IRECLAIM; 356 357 spin_unlock(&ip->i_flags_lock); 358 rcu_read_unlock(); 359 360 ASSERT(!rwsem_is_locked(&inode->i_rwsem)); 361 error = xfs_reinit_inode(mp, inode); 362 xfs_iunlock(ip, XFS_ILOCK_EXCL); 363 if (error) { 364 /* 365 * Re-initializing the inode failed, and we are in deep 366 * trouble. Try to re-add it to the reclaim list. 367 */ 368 rcu_read_lock(); 369 spin_lock(&ip->i_flags_lock); 370 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); 371 ASSERT(ip->i_flags & XFS_IRECLAIMABLE); 372 spin_unlock(&ip->i_flags_lock); 373 rcu_read_unlock(); 374 375 trace_xfs_iget_recycle_fail(ip); 376 return error; 377 } 378 379 spin_lock(&pag->pag_ici_lock); 380 spin_lock(&ip->i_flags_lock); 381 382 /* 383 * Clear the per-lifetime state in the inode as we are now effectively 384 * a new inode and need to return to the initial state before reuse 385 * occurs. 386 */ 387 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; 388 ip->i_flags |= XFS_INEW; 389 xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), 390 XFS_ICI_RECLAIM_TAG); 391 inode->i_state = I_NEW; 392 spin_unlock(&ip->i_flags_lock); 393 spin_unlock(&pag->pag_ici_lock); 394 395 return 0; 396 } 397 398 /* 399 * If we are allocating a new inode, then check what was returned is 400 * actually a free, empty inode. If we are not allocating an inode, 401 * then check we didn't find a free inode. 402 * 403 * Returns: 404 * 0 if the inode free state matches the lookup context 405 * -ENOENT if the inode is free and we are not allocating 406 * -EFSCORRUPTED if there is any state mismatch at all 407 */ 408 static int 409 xfs_iget_check_free_state( 410 struct xfs_inode *ip, 411 int flags) 412 { 413 if (flags & XFS_IGET_CREATE) { 414 /* should be a free inode */ 415 if (VFS_I(ip)->i_mode != 0) { 416 xfs_warn(ip->i_mount, 417 "Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)", 418 ip->i_ino, VFS_I(ip)->i_mode); 419 xfs_agno_mark_sick(ip->i_mount, 420 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 421 XFS_SICK_AG_INOBT); 422 return -EFSCORRUPTED; 423 } 424 425 if (ip->i_nblocks != 0) { 426 xfs_warn(ip->i_mount, 427 "Corruption detected! Free inode 0x%llx has blocks allocated!", 428 ip->i_ino); 429 xfs_agno_mark_sick(ip->i_mount, 430 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 431 XFS_SICK_AG_INOBT); 432 return -EFSCORRUPTED; 433 } 434 return 0; 435 } 436 437 /* should be an allocated inode */ 438 if (VFS_I(ip)->i_mode == 0) 439 return -ENOENT; 440 441 return 0; 442 } 443 444 /* Make all pending inactivation work start immediately. */ 445 static bool 446 xfs_inodegc_queue_all( 447 struct xfs_mount *mp) 448 { 449 struct xfs_inodegc *gc; 450 int cpu; 451 bool ret = false; 452 453 for_each_cpu(cpu, &mp->m_inodegc_cpumask) { 454 gc = per_cpu_ptr(mp->m_inodegc, cpu); 455 if (!llist_empty(&gc->list)) { 456 mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); 457 ret = true; 458 } 459 } 460 461 return ret; 462 } 463 464 /* Wait for all queued work and collect errors */ 465 static int 466 xfs_inodegc_wait_all( 467 struct xfs_mount *mp) 468 { 469 int cpu; 470 int error = 0; 471 472 flush_workqueue(mp->m_inodegc_wq); 473 for_each_cpu(cpu, &mp->m_inodegc_cpumask) { 474 struct xfs_inodegc *gc; 475 476 gc = per_cpu_ptr(mp->m_inodegc, cpu); 477 if (gc->error && !error) 478 error = gc->error; 479 gc->error = 0; 480 } 481 482 return error; 483 } 484 485 /* 486 * Check the validity of the inode we just found it the cache 487 */ 488 static int 489 xfs_iget_cache_hit( 490 struct xfs_perag *pag, 491 struct xfs_inode *ip, 492 xfs_ino_t ino, 493 int flags, 494 int lock_flags) __releases(RCU) 495 { 496 struct inode *inode = VFS_I(ip); 497 struct xfs_mount *mp = ip->i_mount; 498 int error; 499 500 /* 501 * check for re-use of an inode within an RCU grace period due to the 502 * radix tree nodes not being updated yet. We monitor for this by 503 * setting the inode number to zero before freeing the inode structure. 504 * If the inode has been reallocated and set up, then the inode number 505 * will not match, so check for that, too. 506 */ 507 spin_lock(&ip->i_flags_lock); 508 if (ip->i_ino != ino) 509 goto out_skip; 510 511 /* 512 * If we are racing with another cache hit that is currently 513 * instantiating this inode or currently recycling it out of 514 * reclaimable state, wait for the initialisation to complete 515 * before continuing. 516 * 517 * If we're racing with the inactivation worker we also want to wait. 518 * If we're creating a new file, it's possible that the worker 519 * previously marked the inode as free on disk but hasn't finished 520 * updating the incore state yet. The AGI buffer will be dirty and 521 * locked to the icreate transaction, so a synchronous push of the 522 * inodegc workers would result in deadlock. For a regular iget, the 523 * worker is running already, so we might as well wait. 524 * 525 * XXX(hch): eventually we should do something equivalent to 526 * wait_on_inode to wait for these flags to be cleared 527 * instead of polling for it. 528 */ 529 if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING)) 530 goto out_skip; 531 532 if (ip->i_flags & XFS_NEED_INACTIVE) { 533 /* Unlinked inodes cannot be re-grabbed. */ 534 if (VFS_I(ip)->i_nlink == 0) { 535 error = -ENOENT; 536 goto out_error; 537 } 538 goto out_inodegc_flush; 539 } 540 541 /* 542 * Check the inode free state is valid. This also detects lookup 543 * racing with unlinks. 544 */ 545 error = xfs_iget_check_free_state(ip, flags); 546 if (error) 547 goto out_error; 548 549 /* Skip inodes that have no vfs state. */ 550 if ((flags & XFS_IGET_INCORE) && 551 (ip->i_flags & XFS_IRECLAIMABLE)) 552 goto out_skip; 553 554 /* The inode fits the selection criteria; process it. */ 555 if (ip->i_flags & XFS_IRECLAIMABLE) { 556 /* Drops i_flags_lock and RCU read lock. */ 557 error = xfs_iget_recycle(pag, ip); 558 if (error == -EAGAIN) 559 goto out_skip; 560 if (error) 561 return error; 562 } else { 563 /* If the VFS inode is being torn down, pause and try again. */ 564 if (!igrab(inode)) 565 goto out_skip; 566 567 /* We've got a live one. */ 568 spin_unlock(&ip->i_flags_lock); 569 rcu_read_unlock(); 570 trace_xfs_iget_hit(ip); 571 } 572 573 if (lock_flags != 0) 574 xfs_ilock(ip, lock_flags); 575 576 if (!(flags & XFS_IGET_INCORE)) 577 xfs_iflags_clear(ip, XFS_ISTALE); 578 XFS_STATS_INC(mp, xs_ig_found); 579 580 return 0; 581 582 out_skip: 583 trace_xfs_iget_skip(ip); 584 XFS_STATS_INC(mp, xs_ig_frecycle); 585 error = -EAGAIN; 586 out_error: 587 spin_unlock(&ip->i_flags_lock); 588 rcu_read_unlock(); 589 return error; 590 591 out_inodegc_flush: 592 spin_unlock(&ip->i_flags_lock); 593 rcu_read_unlock(); 594 /* 595 * Do not wait for the workers, because the caller could hold an AGI 596 * buffer lock. We're just going to sleep in a loop anyway. 597 */ 598 if (xfs_is_inodegc_enabled(mp)) 599 xfs_inodegc_queue_all(mp); 600 return -EAGAIN; 601 } 602 603 static int 604 xfs_iget_cache_miss( 605 struct xfs_mount *mp, 606 struct xfs_perag *pag, 607 xfs_trans_t *tp, 608 xfs_ino_t ino, 609 struct xfs_inode **ipp, 610 int flags, 611 int lock_flags) 612 { 613 struct xfs_inode *ip; 614 int error; 615 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); 616 int iflags; 617 618 ip = xfs_inode_alloc(mp, ino); 619 if (!ip) 620 return -ENOMEM; 621 622 error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, flags); 623 if (error) 624 goto out_destroy; 625 626 /* 627 * For version 5 superblocks, if we are initialising a new inode and we 628 * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can 629 * simply build the new inode core with a random generation number. 630 * 631 * For version 4 (and older) superblocks, log recovery is dependent on 632 * the i_flushiter field being initialised from the current on-disk 633 * value and hence we must also read the inode off disk even when 634 * initializing new inodes. 635 */ 636 if (xfs_has_v3inodes(mp) && 637 (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) { 638 VFS_I(ip)->i_generation = get_random_u32(); 639 } else { 640 struct xfs_buf *bp; 641 642 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp); 643 if (error) 644 goto out_destroy; 645 646 error = xfs_inode_from_disk(ip, 647 xfs_buf_offset(bp, ip->i_imap.im_boffset)); 648 if (!error) 649 xfs_buf_set_ref(bp, XFS_INO_REF); 650 else 651 xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); 652 xfs_trans_brelse(tp, bp); 653 654 if (error) 655 goto out_destroy; 656 } 657 658 trace_xfs_iget_miss(ip); 659 660 /* 661 * Check the inode free state is valid. This also detects lookup 662 * racing with unlinks. 663 */ 664 error = xfs_iget_check_free_state(ip, flags); 665 if (error) 666 goto out_destroy; 667 668 /* 669 * Preload the radix tree so we can insert safely under the 670 * write spinlock. Note that we cannot sleep inside the preload 671 * region. 672 */ 673 if (radix_tree_preload(GFP_KERNEL | __GFP_NOLOCKDEP)) { 674 error = -EAGAIN; 675 goto out_destroy; 676 } 677 678 /* 679 * Because the inode hasn't been added to the radix-tree yet it can't 680 * be found by another thread, so we can do the non-sleeping lock here. 681 */ 682 if (lock_flags) { 683 if (!xfs_ilock_nowait(ip, lock_flags)) 684 BUG(); 685 } 686 687 /* 688 * These values must be set before inserting the inode into the radix 689 * tree as the moment it is inserted a concurrent lookup (allowed by the 690 * RCU locking mechanism) can find it and that lookup must see that this 691 * is an inode currently under construction (i.e. that XFS_INEW is set). 692 * The ip->i_flags_lock that protects the XFS_INEW flag forms the 693 * memory barrier that ensures this detection works correctly at lookup 694 * time. 695 */ 696 iflags = XFS_INEW; 697 if (flags & XFS_IGET_DONTCACHE) 698 d_mark_dontcache(VFS_I(ip)); 699 ip->i_udquot = NULL; 700 ip->i_gdquot = NULL; 701 ip->i_pdquot = NULL; 702 xfs_iflags_set(ip, iflags); 703 704 /* insert the new inode */ 705 spin_lock(&pag->pag_ici_lock); 706 error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 707 if (unlikely(error)) { 708 WARN_ON(error != -EEXIST); 709 XFS_STATS_INC(mp, xs_ig_dup); 710 error = -EAGAIN; 711 goto out_preload_end; 712 } 713 spin_unlock(&pag->pag_ici_lock); 714 radix_tree_preload_end(); 715 716 *ipp = ip; 717 return 0; 718 719 out_preload_end: 720 spin_unlock(&pag->pag_ici_lock); 721 radix_tree_preload_end(); 722 if (lock_flags) 723 xfs_iunlock(ip, lock_flags); 724 out_destroy: 725 __destroy_inode(VFS_I(ip)); 726 xfs_inode_free(ip); 727 return error; 728 } 729 730 /* 731 * Look up an inode by number in the given file system. The inode is looked up 732 * in the cache held in each AG. If the inode is found in the cache, initialise 733 * the vfs inode if necessary. 734 * 735 * If it is not in core, read it in from the file system's device, add it to the 736 * cache and initialise the vfs inode. 737 * 738 * The inode is locked according to the value of the lock_flags parameter. 739 * Inode lookup is only done during metadata operations and not as part of the 740 * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup. 741 */ 742 int 743 xfs_iget( 744 struct xfs_mount *mp, 745 struct xfs_trans *tp, 746 xfs_ino_t ino, 747 uint flags, 748 uint lock_flags, 749 struct xfs_inode **ipp) 750 { 751 struct xfs_inode *ip; 752 struct xfs_perag *pag; 753 xfs_agino_t agino; 754 int error; 755 756 ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); 757 758 /* reject inode numbers outside existing AGs */ 759 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) 760 return -EINVAL; 761 762 XFS_STATS_INC(mp, xs_ig_attempts); 763 764 /* get the perag structure and ensure that it's inode capable */ 765 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); 766 agino = XFS_INO_TO_AGINO(mp, ino); 767 768 again: 769 error = 0; 770 rcu_read_lock(); 771 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 772 773 if (ip) { 774 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); 775 if (error) 776 goto out_error_or_again; 777 } else { 778 rcu_read_unlock(); 779 if (flags & XFS_IGET_INCORE) { 780 error = -ENODATA; 781 goto out_error_or_again; 782 } 783 XFS_STATS_INC(mp, xs_ig_missed); 784 785 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, 786 flags, lock_flags); 787 if (error) 788 goto out_error_or_again; 789 } 790 xfs_perag_put(pag); 791 792 *ipp = ip; 793 794 /* 795 * If we have a real type for an on-disk inode, we can setup the inode 796 * now. If it's a new inode being created, xfs_init_new_inode will 797 * handle it. 798 */ 799 if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0) 800 xfs_setup_existing_inode(ip); 801 return 0; 802 803 out_error_or_again: 804 if (!(flags & (XFS_IGET_INCORE | XFS_IGET_NORETRY)) && 805 error == -EAGAIN) { 806 delay(1); 807 goto again; 808 } 809 xfs_perag_put(pag); 810 return error; 811 } 812 813 /* 814 * Grab the inode for reclaim exclusively. 815 * 816 * We have found this inode via a lookup under RCU, so the inode may have 817 * already been freed, or it may be in the process of being recycled by 818 * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode 819 * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE 820 * will not be set. Hence we need to check for both these flag conditions to 821 * avoid inodes that are no longer reclaim candidates. 822 * 823 * Note: checking for other state flags here, under the i_flags_lock or not, is 824 * racy and should be avoided. Those races should be resolved only after we have 825 * ensured that we are able to reclaim this inode and the world can see that we 826 * are going to reclaim it. 827 * 828 * Return true if we grabbed it, false otherwise. 829 */ 830 static bool 831 xfs_reclaim_igrab( 832 struct xfs_inode *ip, 833 struct xfs_icwalk *icw) 834 { 835 ASSERT(rcu_read_lock_held()); 836 837 spin_lock(&ip->i_flags_lock); 838 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || 839 __xfs_iflags_test(ip, XFS_IRECLAIM)) { 840 /* not a reclaim candidate. */ 841 spin_unlock(&ip->i_flags_lock); 842 return false; 843 } 844 845 /* Don't reclaim a sick inode unless the caller asked for it. */ 846 if (ip->i_sick && 847 (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) { 848 spin_unlock(&ip->i_flags_lock); 849 return false; 850 } 851 852 __xfs_iflags_set(ip, XFS_IRECLAIM); 853 spin_unlock(&ip->i_flags_lock); 854 return true; 855 } 856 857 /* 858 * Inode reclaim is non-blocking, so the default action if progress cannot be 859 * made is to "requeue" the inode for reclaim by unlocking it and clearing the 860 * XFS_IRECLAIM flag. If we are in a shutdown state, we don't care about 861 * blocking anymore and hence we can wait for the inode to be able to reclaim 862 * it. 863 * 864 * We do no IO here - if callers require inodes to be cleaned they must push the 865 * AIL first to trigger writeback of dirty inodes. This enables writeback to be 866 * done in the background in a non-blocking manner, and enables memory reclaim 867 * to make progress without blocking. 868 */ 869 static void 870 xfs_reclaim_inode( 871 struct xfs_inode *ip, 872 struct xfs_perag *pag) 873 { 874 xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ 875 876 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) 877 goto out; 878 if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING)) 879 goto out_iunlock; 880 881 /* 882 * Check for log shutdown because aborting the inode can move the log 883 * tail and corrupt in memory state. This is fine if the log is shut 884 * down, but if the log is still active and only the mount is shut down 885 * then the in-memory log tail movement caused by the abort can be 886 * incorrectly propagated to disk. 887 */ 888 if (xlog_is_shutdown(ip->i_mount->m_log)) { 889 xfs_iunpin_wait(ip); 890 xfs_iflush_shutdown_abort(ip); 891 goto reclaim; 892 } 893 if (xfs_ipincount(ip)) 894 goto out_clear_flush; 895 if (!xfs_inode_clean(ip)) 896 goto out_clear_flush; 897 898 xfs_iflags_clear(ip, XFS_IFLUSHING); 899 reclaim: 900 trace_xfs_inode_reclaiming(ip); 901 902 /* 903 * Because we use RCU freeing we need to ensure the inode always appears 904 * to be reclaimed with an invalid inode number when in the free state. 905 * We do this as early as possible under the ILOCK so that 906 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to 907 * detect races with us here. By doing this, we guarantee that once 908 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that 909 * it will see either a valid inode that will serialise correctly, or it 910 * will see an invalid inode that it can skip. 911 */ 912 spin_lock(&ip->i_flags_lock); 913 ip->i_flags = XFS_IRECLAIM; 914 ip->i_ino = 0; 915 ip->i_sick = 0; 916 ip->i_checked = 0; 917 spin_unlock(&ip->i_flags_lock); 918 919 ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL); 920 xfs_iunlock(ip, XFS_ILOCK_EXCL); 921 922 XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); 923 /* 924 * Remove the inode from the per-AG radix tree. 925 * 926 * Because radix_tree_delete won't complain even if the item was never 927 * added to the tree assert that it's been there before to catch 928 * problems with the inode life time early on. 929 */ 930 spin_lock(&pag->pag_ici_lock); 931 if (!radix_tree_delete(&pag->pag_ici_root, 932 XFS_INO_TO_AGINO(ip->i_mount, ino))) 933 ASSERT(0); 934 xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG); 935 spin_unlock(&pag->pag_ici_lock); 936 937 /* 938 * Here we do an (almost) spurious inode lock in order to coordinate 939 * with inode cache radix tree lookups. This is because the lookup 940 * can reference the inodes in the cache without taking references. 941 * 942 * We make that OK here by ensuring that we wait until the inode is 943 * unlocked after the lookup before we go ahead and free it. 944 */ 945 xfs_ilock(ip, XFS_ILOCK_EXCL); 946 ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot); 947 xfs_iunlock(ip, XFS_ILOCK_EXCL); 948 ASSERT(xfs_inode_clean(ip)); 949 950 __xfs_inode_free(ip); 951 return; 952 953 out_clear_flush: 954 xfs_iflags_clear(ip, XFS_IFLUSHING); 955 out_iunlock: 956 xfs_iunlock(ip, XFS_ILOCK_EXCL); 957 out: 958 xfs_iflags_clear(ip, XFS_IRECLAIM); 959 } 960 961 /* Reclaim sick inodes if we're unmounting or the fs went down. */ 962 static inline bool 963 xfs_want_reclaim_sick( 964 struct xfs_mount *mp) 965 { 966 return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) || 967 xfs_is_shutdown(mp); 968 } 969 970 void 971 xfs_reclaim_inodes( 972 struct xfs_mount *mp) 973 { 974 struct xfs_icwalk icw = { 975 .icw_flags = 0, 976 }; 977 978 if (xfs_want_reclaim_sick(mp)) 979 icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; 980 981 while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { 982 xfs_ail_push_all_sync(mp->m_ail); 983 xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw); 984 } 985 } 986 987 /* 988 * The shrinker infrastructure determines how many inodes we should scan for 989 * reclaim. We want as many clean inodes ready to reclaim as possible, so we 990 * push the AIL here. We also want to proactively free up memory if we can to 991 * minimise the amount of work memory reclaim has to do so we kick the 992 * background reclaim if it isn't already scheduled. 993 */ 994 long 995 xfs_reclaim_inodes_nr( 996 struct xfs_mount *mp, 997 unsigned long nr_to_scan) 998 { 999 struct xfs_icwalk icw = { 1000 .icw_flags = XFS_ICWALK_FLAG_SCAN_LIMIT, 1001 .icw_scan_limit = min_t(unsigned long, LONG_MAX, nr_to_scan), 1002 }; 1003 1004 if (xfs_want_reclaim_sick(mp)) 1005 icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; 1006 1007 /* kick background reclaimer and push the AIL */ 1008 xfs_reclaim_work_queue(mp); 1009 xfs_ail_push_all(mp->m_ail); 1010 1011 xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw); 1012 return 0; 1013 } 1014 1015 /* 1016 * Return the number of reclaimable inodes in the filesystem for 1017 * the shrinker to determine how much to reclaim. 1018 */ 1019 long 1020 xfs_reclaim_inodes_count( 1021 struct xfs_mount *mp) 1022 { 1023 struct xfs_perag *pag; 1024 xfs_agnumber_t ag = 0; 1025 long reclaimable = 0; 1026 1027 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { 1028 ag = pag->pag_agno + 1; 1029 reclaimable += pag->pag_ici_reclaimable; 1030 xfs_perag_put(pag); 1031 } 1032 return reclaimable; 1033 } 1034 1035 STATIC bool 1036 xfs_icwalk_match_id( 1037 struct xfs_inode *ip, 1038 struct xfs_icwalk *icw) 1039 { 1040 if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) && 1041 !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid)) 1042 return false; 1043 1044 if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) && 1045 !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid)) 1046 return false; 1047 1048 if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) && 1049 ip->i_projid != icw->icw_prid) 1050 return false; 1051 1052 return true; 1053 } 1054 1055 /* 1056 * A union-based inode filtering algorithm. Process the inode if any of the 1057 * criteria match. This is for global/internal scans only. 1058 */ 1059 STATIC bool 1060 xfs_icwalk_match_id_union( 1061 struct xfs_inode *ip, 1062 struct xfs_icwalk *icw) 1063 { 1064 if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) && 1065 uid_eq(VFS_I(ip)->i_uid, icw->icw_uid)) 1066 return true; 1067 1068 if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) && 1069 gid_eq(VFS_I(ip)->i_gid, icw->icw_gid)) 1070 return true; 1071 1072 if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) && 1073 ip->i_projid == icw->icw_prid) 1074 return true; 1075 1076 return false; 1077 } 1078 1079 /* 1080 * Is this inode @ip eligible for eof/cow block reclamation, given some 1081 * filtering parameters @icw? The inode is eligible if @icw is null or 1082 * if the predicate functions match. 1083 */ 1084 static bool 1085 xfs_icwalk_match( 1086 struct xfs_inode *ip, 1087 struct xfs_icwalk *icw) 1088 { 1089 bool match; 1090 1091 if (!icw) 1092 return true; 1093 1094 if (icw->icw_flags & XFS_ICWALK_FLAG_UNION) 1095 match = xfs_icwalk_match_id_union(ip, icw); 1096 else 1097 match = xfs_icwalk_match_id(ip, icw); 1098 if (!match) 1099 return false; 1100 1101 /* skip the inode if the file size is too small */ 1102 if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) && 1103 XFS_ISIZE(ip) < icw->icw_min_file_size) 1104 return false; 1105 1106 return true; 1107 } 1108 1109 /* 1110 * This is a fast pass over the inode cache to try to get reclaim moving on as 1111 * many inodes as possible in a short period of time. It kicks itself every few 1112 * seconds, as well as being kicked by the inode cache shrinker when memory 1113 * goes low. 1114 */ 1115 void 1116 xfs_reclaim_worker( 1117 struct work_struct *work) 1118 { 1119 struct xfs_mount *mp = container_of(to_delayed_work(work), 1120 struct xfs_mount, m_reclaim_work); 1121 1122 xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL); 1123 xfs_reclaim_work_queue(mp); 1124 } 1125 1126 STATIC int 1127 xfs_inode_free_eofblocks( 1128 struct xfs_inode *ip, 1129 struct xfs_icwalk *icw, 1130 unsigned int *lockflags) 1131 { 1132 bool wait; 1133 1134 wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC); 1135 1136 if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS)) 1137 return 0; 1138 1139 /* 1140 * If the mapping is dirty the operation can block and wait for some 1141 * time. Unless we are waiting, skip it. 1142 */ 1143 if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) 1144 return 0; 1145 1146 if (!xfs_icwalk_match(ip, icw)) 1147 return 0; 1148 1149 /* 1150 * If the caller is waiting, return -EAGAIN to keep the background 1151 * scanner moving and revisit the inode in a subsequent pass. 1152 */ 1153 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 1154 if (wait) 1155 return -EAGAIN; 1156 return 0; 1157 } 1158 *lockflags |= XFS_IOLOCK_EXCL; 1159 1160 if (xfs_can_free_eofblocks(ip, false)) 1161 return xfs_free_eofblocks(ip); 1162 1163 /* inode could be preallocated or append-only */ 1164 trace_xfs_inode_free_eofblocks_invalid(ip); 1165 xfs_inode_clear_eofblocks_tag(ip); 1166 return 0; 1167 } 1168 1169 static void 1170 xfs_blockgc_set_iflag( 1171 struct xfs_inode *ip, 1172 unsigned long iflag) 1173 { 1174 struct xfs_mount *mp = ip->i_mount; 1175 struct xfs_perag *pag; 1176 1177 ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0); 1178 1179 /* 1180 * Don't bother locking the AG and looking up in the radix trees 1181 * if we already know that we have the tag set. 1182 */ 1183 if (ip->i_flags & iflag) 1184 return; 1185 spin_lock(&ip->i_flags_lock); 1186 ip->i_flags |= iflag; 1187 spin_unlock(&ip->i_flags_lock); 1188 1189 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 1190 spin_lock(&pag->pag_ici_lock); 1191 1192 xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), 1193 XFS_ICI_BLOCKGC_TAG); 1194 1195 spin_unlock(&pag->pag_ici_lock); 1196 xfs_perag_put(pag); 1197 } 1198 1199 void 1200 xfs_inode_set_eofblocks_tag( 1201 xfs_inode_t *ip) 1202 { 1203 trace_xfs_inode_set_eofblocks_tag(ip); 1204 return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS); 1205 } 1206 1207 static void 1208 xfs_blockgc_clear_iflag( 1209 struct xfs_inode *ip, 1210 unsigned long iflag) 1211 { 1212 struct xfs_mount *mp = ip->i_mount; 1213 struct xfs_perag *pag; 1214 bool clear_tag; 1215 1216 ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0); 1217 1218 spin_lock(&ip->i_flags_lock); 1219 ip->i_flags &= ~iflag; 1220 clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0; 1221 spin_unlock(&ip->i_flags_lock); 1222 1223 if (!clear_tag) 1224 return; 1225 1226 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 1227 spin_lock(&pag->pag_ici_lock); 1228 1229 xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), 1230 XFS_ICI_BLOCKGC_TAG); 1231 1232 spin_unlock(&pag->pag_ici_lock); 1233 xfs_perag_put(pag); 1234 } 1235 1236 void 1237 xfs_inode_clear_eofblocks_tag( 1238 xfs_inode_t *ip) 1239 { 1240 trace_xfs_inode_clear_eofblocks_tag(ip); 1241 return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS); 1242 } 1243 1244 /* 1245 * Set ourselves up to free CoW blocks from this file. If it's already clean 1246 * then we can bail out quickly, but otherwise we must back off if the file 1247 * is undergoing some kind of write. 1248 */ 1249 static bool 1250 xfs_prep_free_cowblocks( 1251 struct xfs_inode *ip) 1252 { 1253 /* 1254 * Just clear the tag if we have an empty cow fork or none at all. It's 1255 * possible the inode was fully unshared since it was originally tagged. 1256 */ 1257 if (!xfs_inode_has_cow_data(ip)) { 1258 trace_xfs_inode_free_cowblocks_invalid(ip); 1259 xfs_inode_clear_cowblocks_tag(ip); 1260 return false; 1261 } 1262 1263 /* 1264 * If the mapping is dirty or under writeback we cannot touch the 1265 * CoW fork. Leave it alone if we're in the midst of a directio. 1266 */ 1267 if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) || 1268 mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || 1269 mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || 1270 atomic_read(&VFS_I(ip)->i_dio_count)) 1271 return false; 1272 1273 return true; 1274 } 1275 1276 /* 1277 * Automatic CoW Reservation Freeing 1278 * 1279 * These functions automatically garbage collect leftover CoW reservations 1280 * that were made on behalf of a cowextsize hint when we start to run out 1281 * of quota or when the reservations sit around for too long. If the file 1282 * has dirty pages or is undergoing writeback, its CoW reservations will 1283 * be retained. 1284 * 1285 * The actual garbage collection piggybacks off the same code that runs 1286 * the speculative EOF preallocation garbage collector. 1287 */ 1288 STATIC int 1289 xfs_inode_free_cowblocks( 1290 struct xfs_inode *ip, 1291 struct xfs_icwalk *icw, 1292 unsigned int *lockflags) 1293 { 1294 bool wait; 1295 int ret = 0; 1296 1297 wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC); 1298 1299 if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS)) 1300 return 0; 1301 1302 if (!xfs_prep_free_cowblocks(ip)) 1303 return 0; 1304 1305 if (!xfs_icwalk_match(ip, icw)) 1306 return 0; 1307 1308 /* 1309 * If the caller is waiting, return -EAGAIN to keep the background 1310 * scanner moving and revisit the inode in a subsequent pass. 1311 */ 1312 if (!(*lockflags & XFS_IOLOCK_EXCL) && 1313 !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 1314 if (wait) 1315 return -EAGAIN; 1316 return 0; 1317 } 1318 *lockflags |= XFS_IOLOCK_EXCL; 1319 1320 if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) { 1321 if (wait) 1322 return -EAGAIN; 1323 return 0; 1324 } 1325 *lockflags |= XFS_MMAPLOCK_EXCL; 1326 1327 /* 1328 * Check again, nobody else should be able to dirty blocks or change 1329 * the reflink iflag now that we have the first two locks held. 1330 */ 1331 if (xfs_prep_free_cowblocks(ip)) 1332 ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); 1333 return ret; 1334 } 1335 1336 void 1337 xfs_inode_set_cowblocks_tag( 1338 xfs_inode_t *ip) 1339 { 1340 trace_xfs_inode_set_cowblocks_tag(ip); 1341 return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS); 1342 } 1343 1344 void 1345 xfs_inode_clear_cowblocks_tag( 1346 xfs_inode_t *ip) 1347 { 1348 trace_xfs_inode_clear_cowblocks_tag(ip); 1349 return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS); 1350 } 1351 1352 /* Disable post-EOF and CoW block auto-reclamation. */ 1353 void 1354 xfs_blockgc_stop( 1355 struct xfs_mount *mp) 1356 { 1357 struct xfs_perag *pag; 1358 xfs_agnumber_t agno; 1359 1360 if (!xfs_clear_blockgc_enabled(mp)) 1361 return; 1362 1363 for_each_perag(mp, agno, pag) 1364 cancel_delayed_work_sync(&pag->pag_blockgc_work); 1365 trace_xfs_blockgc_stop(mp, __return_address); 1366 } 1367 1368 /* Enable post-EOF and CoW block auto-reclamation. */ 1369 void 1370 xfs_blockgc_start( 1371 struct xfs_mount *mp) 1372 { 1373 struct xfs_perag *pag; 1374 xfs_agnumber_t agno; 1375 1376 if (xfs_set_blockgc_enabled(mp)) 1377 return; 1378 1379 trace_xfs_blockgc_start(mp, __return_address); 1380 for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) 1381 xfs_blockgc_queue(pag); 1382 } 1383 1384 /* Don't try to run block gc on an inode that's in any of these states. */ 1385 #define XFS_BLOCKGC_NOGRAB_IFLAGS (XFS_INEW | \ 1386 XFS_NEED_INACTIVE | \ 1387 XFS_INACTIVATING | \ 1388 XFS_IRECLAIMABLE | \ 1389 XFS_IRECLAIM) 1390 /* 1391 * Decide if the given @ip is eligible for garbage collection of speculative 1392 * preallocations, and grab it if so. Returns true if it's ready to go or 1393 * false if we should just ignore it. 1394 */ 1395 static bool 1396 xfs_blockgc_igrab( 1397 struct xfs_inode *ip) 1398 { 1399 struct inode *inode = VFS_I(ip); 1400 1401 ASSERT(rcu_read_lock_held()); 1402 1403 /* Check for stale RCU freed inode */ 1404 spin_lock(&ip->i_flags_lock); 1405 if (!ip->i_ino) 1406 goto out_unlock_noent; 1407 1408 if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS) 1409 goto out_unlock_noent; 1410 spin_unlock(&ip->i_flags_lock); 1411 1412 /* nothing to sync during shutdown */ 1413 if (xfs_is_shutdown(ip->i_mount)) 1414 return false; 1415 1416 /* If we can't grab the inode, it must on it's way to reclaim. */ 1417 if (!igrab(inode)) 1418 return false; 1419 1420 /* inode is valid */ 1421 return true; 1422 1423 out_unlock_noent: 1424 spin_unlock(&ip->i_flags_lock); 1425 return false; 1426 } 1427 1428 /* Scan one incore inode for block preallocations that we can remove. */ 1429 static int 1430 xfs_blockgc_scan_inode( 1431 struct xfs_inode *ip, 1432 struct xfs_icwalk *icw) 1433 { 1434 unsigned int lockflags = 0; 1435 int error; 1436 1437 error = xfs_inode_free_eofblocks(ip, icw, &lockflags); 1438 if (error) 1439 goto unlock; 1440 1441 error = xfs_inode_free_cowblocks(ip, icw, &lockflags); 1442 unlock: 1443 if (lockflags) 1444 xfs_iunlock(ip, lockflags); 1445 xfs_irele(ip); 1446 return error; 1447 } 1448 1449 /* Background worker that trims preallocated space. */ 1450 void 1451 xfs_blockgc_worker( 1452 struct work_struct *work) 1453 { 1454 struct xfs_perag *pag = container_of(to_delayed_work(work), 1455 struct xfs_perag, pag_blockgc_work); 1456 struct xfs_mount *mp = pag->pag_mount; 1457 int error; 1458 1459 trace_xfs_blockgc_worker(mp, __return_address); 1460 1461 error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL); 1462 if (error) 1463 xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", 1464 pag->pag_agno, error); 1465 xfs_blockgc_queue(pag); 1466 } 1467 1468 /* 1469 * Try to free space in the filesystem by purging inactive inodes, eofblocks 1470 * and cowblocks. 1471 */ 1472 int 1473 xfs_blockgc_free_space( 1474 struct xfs_mount *mp, 1475 struct xfs_icwalk *icw) 1476 { 1477 int error; 1478 1479 trace_xfs_blockgc_free_space(mp, icw, _RET_IP_); 1480 1481 error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw); 1482 if (error) 1483 return error; 1484 1485 return xfs_inodegc_flush(mp); 1486 } 1487 1488 /* 1489 * Reclaim all the free space that we can by scheduling the background blockgc 1490 * and inodegc workers immediately and waiting for them all to clear. 1491 */ 1492 int 1493 xfs_blockgc_flush_all( 1494 struct xfs_mount *mp) 1495 { 1496 struct xfs_perag *pag; 1497 xfs_agnumber_t agno; 1498 1499 trace_xfs_blockgc_flush_all(mp, __return_address); 1500 1501 /* 1502 * For each blockgc worker, move its queue time up to now. If it 1503 * wasn't queued, it will not be requeued. Then flush whatever's 1504 * left. 1505 */ 1506 for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) 1507 mod_delayed_work(pag->pag_mount->m_blockgc_wq, 1508 &pag->pag_blockgc_work, 0); 1509 1510 for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) 1511 flush_delayed_work(&pag->pag_blockgc_work); 1512 1513 return xfs_inodegc_flush(mp); 1514 } 1515 1516 /* 1517 * Run cow/eofblocks scans on the supplied dquots. We don't know exactly which 1518 * quota caused an allocation failure, so we make a best effort by including 1519 * each quota under low free space conditions (less than 1% free space) in the 1520 * scan. 1521 * 1522 * Callers must not hold any inode's ILOCK. If requesting a synchronous scan 1523 * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or 1524 * MMAPLOCK. 1525 */ 1526 int 1527 xfs_blockgc_free_dquots( 1528 struct xfs_mount *mp, 1529 struct xfs_dquot *udqp, 1530 struct xfs_dquot *gdqp, 1531 struct xfs_dquot *pdqp, 1532 unsigned int iwalk_flags) 1533 { 1534 struct xfs_icwalk icw = {0}; 1535 bool do_work = false; 1536 1537 if (!udqp && !gdqp && !pdqp) 1538 return 0; 1539 1540 /* 1541 * Run a scan to free blocks using the union filter to cover all 1542 * applicable quotas in a single scan. 1543 */ 1544 icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags; 1545 1546 if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) { 1547 icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id); 1548 icw.icw_flags |= XFS_ICWALK_FLAG_UID; 1549 do_work = true; 1550 } 1551 1552 if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) { 1553 icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id); 1554 icw.icw_flags |= XFS_ICWALK_FLAG_GID; 1555 do_work = true; 1556 } 1557 1558 if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) { 1559 icw.icw_prid = pdqp->q_id; 1560 icw.icw_flags |= XFS_ICWALK_FLAG_PRID; 1561 do_work = true; 1562 } 1563 1564 if (!do_work) 1565 return 0; 1566 1567 return xfs_blockgc_free_space(mp, &icw); 1568 } 1569 1570 /* Run cow/eofblocks scans on the quotas attached to the inode. */ 1571 int 1572 xfs_blockgc_free_quota( 1573 struct xfs_inode *ip, 1574 unsigned int iwalk_flags) 1575 { 1576 return xfs_blockgc_free_dquots(ip->i_mount, 1577 xfs_inode_dquot(ip, XFS_DQTYPE_USER), 1578 xfs_inode_dquot(ip, XFS_DQTYPE_GROUP), 1579 xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags); 1580 } 1581 1582 /* XFS Inode Cache Walking Code */ 1583 1584 /* 1585 * The inode lookup is done in batches to keep the amount of lock traffic and 1586 * radix tree lookups to a minimum. The batch size is a trade off between 1587 * lookup reduction and stack usage. This is in the reclaim path, so we can't 1588 * be too greedy. 1589 */ 1590 #define XFS_LOOKUP_BATCH 32 1591 1592 1593 /* 1594 * Decide if we want to grab this inode in anticipation of doing work towards 1595 * the goal. 1596 */ 1597 static inline bool 1598 xfs_icwalk_igrab( 1599 enum xfs_icwalk_goal goal, 1600 struct xfs_inode *ip, 1601 struct xfs_icwalk *icw) 1602 { 1603 switch (goal) { 1604 case XFS_ICWALK_BLOCKGC: 1605 return xfs_blockgc_igrab(ip); 1606 case XFS_ICWALK_RECLAIM: 1607 return xfs_reclaim_igrab(ip, icw); 1608 default: 1609 return false; 1610 } 1611 } 1612 1613 /* 1614 * Process an inode. Each processing function must handle any state changes 1615 * made by the icwalk igrab function. Return -EAGAIN to skip an inode. 1616 */ 1617 static inline int 1618 xfs_icwalk_process_inode( 1619 enum xfs_icwalk_goal goal, 1620 struct xfs_inode *ip, 1621 struct xfs_perag *pag, 1622 struct xfs_icwalk *icw) 1623 { 1624 int error = 0; 1625 1626 switch (goal) { 1627 case XFS_ICWALK_BLOCKGC: 1628 error = xfs_blockgc_scan_inode(ip, icw); 1629 break; 1630 case XFS_ICWALK_RECLAIM: 1631 xfs_reclaim_inode(ip, pag); 1632 break; 1633 } 1634 return error; 1635 } 1636 1637 /* 1638 * For a given per-AG structure @pag and a goal, grab qualifying inodes and 1639 * process them in some manner. 1640 */ 1641 static int 1642 xfs_icwalk_ag( 1643 struct xfs_perag *pag, 1644 enum xfs_icwalk_goal goal, 1645 struct xfs_icwalk *icw) 1646 { 1647 struct xfs_mount *mp = pag->pag_mount; 1648 uint32_t first_index; 1649 int last_error = 0; 1650 int skipped; 1651 bool done; 1652 int nr_found; 1653 1654 restart: 1655 done = false; 1656 skipped = 0; 1657 if (goal == XFS_ICWALK_RECLAIM) 1658 first_index = READ_ONCE(pag->pag_ici_reclaim_cursor); 1659 else 1660 first_index = 0; 1661 nr_found = 0; 1662 do { 1663 struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 1664 int error = 0; 1665 int i; 1666 1667 rcu_read_lock(); 1668 1669 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, 1670 (void **) batch, first_index, 1671 XFS_LOOKUP_BATCH, goal); 1672 if (!nr_found) { 1673 done = true; 1674 rcu_read_unlock(); 1675 break; 1676 } 1677 1678 /* 1679 * Grab the inodes before we drop the lock. if we found 1680 * nothing, nr == 0 and the loop will be skipped. 1681 */ 1682 for (i = 0; i < nr_found; i++) { 1683 struct xfs_inode *ip = batch[i]; 1684 1685 if (done || !xfs_icwalk_igrab(goal, ip, icw)) 1686 batch[i] = NULL; 1687 1688 /* 1689 * Update the index for the next lookup. Catch 1690 * overflows into the next AG range which can occur if 1691 * we have inodes in the last block of the AG and we 1692 * are currently pointing to the last inode. 1693 * 1694 * Because we may see inodes that are from the wrong AG 1695 * due to RCU freeing and reallocation, only update the 1696 * index if it lies in this AG. It was a race that lead 1697 * us to see this inode, so another lookup from the 1698 * same index will not find it again. 1699 */ 1700 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) 1701 continue; 1702 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 1703 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 1704 done = true; 1705 } 1706 1707 /* unlock now we've grabbed the inodes. */ 1708 rcu_read_unlock(); 1709 1710 for (i = 0; i < nr_found; i++) { 1711 if (!batch[i]) 1712 continue; 1713 error = xfs_icwalk_process_inode(goal, batch[i], pag, 1714 icw); 1715 if (error == -EAGAIN) { 1716 skipped++; 1717 continue; 1718 } 1719 if (error && last_error != -EFSCORRUPTED) 1720 last_error = error; 1721 } 1722 1723 /* bail out if the filesystem is corrupted. */ 1724 if (error == -EFSCORRUPTED) 1725 break; 1726 1727 cond_resched(); 1728 1729 if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) { 1730 icw->icw_scan_limit -= XFS_LOOKUP_BATCH; 1731 if (icw->icw_scan_limit <= 0) 1732 break; 1733 } 1734 } while (nr_found && !done); 1735 1736 if (goal == XFS_ICWALK_RECLAIM) { 1737 if (done) 1738 first_index = 0; 1739 WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index); 1740 } 1741 1742 if (skipped) { 1743 delay(1); 1744 goto restart; 1745 } 1746 return last_error; 1747 } 1748 1749 /* Walk all incore inodes to achieve a given goal. */ 1750 static int 1751 xfs_icwalk( 1752 struct xfs_mount *mp, 1753 enum xfs_icwalk_goal goal, 1754 struct xfs_icwalk *icw) 1755 { 1756 struct xfs_perag *pag; 1757 int error = 0; 1758 int last_error = 0; 1759 xfs_agnumber_t agno; 1760 1761 for_each_perag_tag(mp, agno, pag, goal) { 1762 error = xfs_icwalk_ag(pag, goal, icw); 1763 if (error) { 1764 last_error = error; 1765 if (error == -EFSCORRUPTED) { 1766 xfs_perag_rele(pag); 1767 break; 1768 } 1769 } 1770 } 1771 return last_error; 1772 BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID); 1773 } 1774 1775 #ifdef DEBUG 1776 static void 1777 xfs_check_delalloc( 1778 struct xfs_inode *ip, 1779 int whichfork) 1780 { 1781 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); 1782 struct xfs_bmbt_irec got; 1783 struct xfs_iext_cursor icur; 1784 1785 if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got)) 1786 return; 1787 do { 1788 if (isnullstartblock(got.br_startblock)) { 1789 xfs_warn(ip->i_mount, 1790 "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]", 1791 ip->i_ino, 1792 whichfork == XFS_DATA_FORK ? "data" : "cow", 1793 got.br_startoff, got.br_blockcount); 1794 } 1795 } while (xfs_iext_next_extent(ifp, &icur, &got)); 1796 } 1797 #else 1798 #define xfs_check_delalloc(ip, whichfork) do { } while (0) 1799 #endif 1800 1801 /* Schedule the inode for reclaim. */ 1802 static void 1803 xfs_inodegc_set_reclaimable( 1804 struct xfs_inode *ip) 1805 { 1806 struct xfs_mount *mp = ip->i_mount; 1807 struct xfs_perag *pag; 1808 1809 if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) { 1810 xfs_check_delalloc(ip, XFS_DATA_FORK); 1811 xfs_check_delalloc(ip, XFS_COW_FORK); 1812 ASSERT(0); 1813 } 1814 1815 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 1816 spin_lock(&pag->pag_ici_lock); 1817 spin_lock(&ip->i_flags_lock); 1818 1819 trace_xfs_inode_set_reclaimable(ip); 1820 ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING); 1821 ip->i_flags |= XFS_IRECLAIMABLE; 1822 xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), 1823 XFS_ICI_RECLAIM_TAG); 1824 1825 spin_unlock(&ip->i_flags_lock); 1826 spin_unlock(&pag->pag_ici_lock); 1827 xfs_perag_put(pag); 1828 } 1829 1830 /* 1831 * Free all speculative preallocations and possibly even the inode itself. 1832 * This is the last chance to make changes to an otherwise unreferenced file 1833 * before incore reclamation happens. 1834 */ 1835 static int 1836 xfs_inodegc_inactivate( 1837 struct xfs_inode *ip) 1838 { 1839 int error; 1840 1841 trace_xfs_inode_inactivating(ip); 1842 error = xfs_inactive(ip); 1843 xfs_inodegc_set_reclaimable(ip); 1844 return error; 1845 1846 } 1847 1848 void 1849 xfs_inodegc_worker( 1850 struct work_struct *work) 1851 { 1852 struct xfs_inodegc *gc = container_of(to_delayed_work(work), 1853 struct xfs_inodegc, work); 1854 struct llist_node *node = llist_del_all(&gc->list); 1855 struct xfs_inode *ip, *n; 1856 struct xfs_mount *mp = gc->mp; 1857 unsigned int nofs_flag; 1858 1859 /* 1860 * Clear the cpu mask bit and ensure that we have seen the latest 1861 * update of the gc structure associated with this CPU. This matches 1862 * with the release semantics used when setting the cpumask bit in 1863 * xfs_inodegc_queue. 1864 */ 1865 cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask); 1866 smp_mb__after_atomic(); 1867 1868 WRITE_ONCE(gc->items, 0); 1869 1870 if (!node) 1871 return; 1872 1873 /* 1874 * We can allocate memory here while doing writeback on behalf of 1875 * memory reclaim. To avoid memory allocation deadlocks set the 1876 * task-wide nofs context for the following operations. 1877 */ 1878 nofs_flag = memalloc_nofs_save(); 1879 1880 ip = llist_entry(node, struct xfs_inode, i_gclist); 1881 trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits)); 1882 1883 WRITE_ONCE(gc->shrinker_hits, 0); 1884 llist_for_each_entry_safe(ip, n, node, i_gclist) { 1885 int error; 1886 1887 xfs_iflags_set(ip, XFS_INACTIVATING); 1888 error = xfs_inodegc_inactivate(ip); 1889 if (error && !gc->error) 1890 gc->error = error; 1891 } 1892 1893 memalloc_nofs_restore(nofs_flag); 1894 } 1895 1896 /* 1897 * Expedite all pending inodegc work to run immediately. This does not wait for 1898 * completion of the work. 1899 */ 1900 void 1901 xfs_inodegc_push( 1902 struct xfs_mount *mp) 1903 { 1904 if (!xfs_is_inodegc_enabled(mp)) 1905 return; 1906 trace_xfs_inodegc_push(mp, __return_address); 1907 xfs_inodegc_queue_all(mp); 1908 } 1909 1910 /* 1911 * Force all currently queued inode inactivation work to run immediately and 1912 * wait for the work to finish. 1913 */ 1914 int 1915 xfs_inodegc_flush( 1916 struct xfs_mount *mp) 1917 { 1918 xfs_inodegc_push(mp); 1919 trace_xfs_inodegc_flush(mp, __return_address); 1920 return xfs_inodegc_wait_all(mp); 1921 } 1922 1923 /* 1924 * Flush all the pending work and then disable the inode inactivation background 1925 * workers and wait for them to stop. Caller must hold sb->s_umount to 1926 * coordinate changes in the inodegc_enabled state. 1927 */ 1928 void 1929 xfs_inodegc_stop( 1930 struct xfs_mount *mp) 1931 { 1932 bool rerun; 1933 1934 if (!xfs_clear_inodegc_enabled(mp)) 1935 return; 1936 1937 /* 1938 * Drain all pending inodegc work, including inodes that could be 1939 * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan 1940 * threads that sample the inodegc state just prior to us clearing it. 1941 * The inodegc flag state prevents new threads from queuing more 1942 * inodes, so we queue pending work items and flush the workqueue until 1943 * all inodegc lists are empty. IOWs, we cannot use drain_workqueue 1944 * here because it does not allow other unserialized mechanisms to 1945 * reschedule inodegc work while this draining is in progress. 1946 */ 1947 xfs_inodegc_queue_all(mp); 1948 do { 1949 flush_workqueue(mp->m_inodegc_wq); 1950 rerun = xfs_inodegc_queue_all(mp); 1951 } while (rerun); 1952 1953 trace_xfs_inodegc_stop(mp, __return_address); 1954 } 1955 1956 /* 1957 * Enable the inode inactivation background workers and schedule deferred inode 1958 * inactivation work if there is any. Caller must hold sb->s_umount to 1959 * coordinate changes in the inodegc_enabled state. 1960 */ 1961 void 1962 xfs_inodegc_start( 1963 struct xfs_mount *mp) 1964 { 1965 if (xfs_set_inodegc_enabled(mp)) 1966 return; 1967 1968 trace_xfs_inodegc_start(mp, __return_address); 1969 xfs_inodegc_queue_all(mp); 1970 } 1971 1972 #ifdef CONFIG_XFS_RT 1973 static inline bool 1974 xfs_inodegc_want_queue_rt_file( 1975 struct xfs_inode *ip) 1976 { 1977 struct xfs_mount *mp = ip->i_mount; 1978 1979 if (!XFS_IS_REALTIME_INODE(ip)) 1980 return false; 1981 1982 if (__percpu_counter_compare(&mp->m_frextents, 1983 mp->m_low_rtexts[XFS_LOWSP_5_PCNT], 1984 XFS_FDBLOCKS_BATCH) < 0) 1985 return true; 1986 1987 return false; 1988 } 1989 #else 1990 # define xfs_inodegc_want_queue_rt_file(ip) (false) 1991 #endif /* CONFIG_XFS_RT */ 1992 1993 /* 1994 * Schedule the inactivation worker when: 1995 * 1996 * - We've accumulated more than one inode cluster buffer's worth of inodes. 1997 * - There is less than 5% free space left. 1998 * - Any of the quotas for this inode are near an enforcement limit. 1999 */ 2000 static inline bool 2001 xfs_inodegc_want_queue_work( 2002 struct xfs_inode *ip, 2003 unsigned int items) 2004 { 2005 struct xfs_mount *mp = ip->i_mount; 2006 2007 if (items > mp->m_ino_geo.inodes_per_cluster) 2008 return true; 2009 2010 if (__percpu_counter_compare(&mp->m_fdblocks, 2011 mp->m_low_space[XFS_LOWSP_5_PCNT], 2012 XFS_FDBLOCKS_BATCH) < 0) 2013 return true; 2014 2015 if (xfs_inodegc_want_queue_rt_file(ip)) 2016 return true; 2017 2018 if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER)) 2019 return true; 2020 2021 if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP)) 2022 return true; 2023 2024 if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ)) 2025 return true; 2026 2027 return false; 2028 } 2029 2030 /* 2031 * Upper bound on the number of inodes in each AG that can be queued for 2032 * inactivation at any given time, to avoid monopolizing the workqueue. 2033 */ 2034 #define XFS_INODEGC_MAX_BACKLOG (4 * XFS_INODES_PER_CHUNK) 2035 2036 /* 2037 * Make the frontend wait for inactivations when: 2038 * 2039 * - Memory shrinkers queued the inactivation worker and it hasn't finished. 2040 * - The queue depth exceeds the maximum allowable percpu backlog. 2041 * 2042 * Note: If we are in a NOFS context here (e.g. current thread is running a 2043 * transaction) the we don't want to block here as inodegc progress may require 2044 * filesystem resources we hold to make progress and that could result in a 2045 * deadlock. Hence we skip out of here if we are in a scoped NOFS context. 2046 */ 2047 static inline bool 2048 xfs_inodegc_want_flush_work( 2049 struct xfs_inode *ip, 2050 unsigned int items, 2051 unsigned int shrinker_hits) 2052 { 2053 if (current->flags & PF_MEMALLOC_NOFS) 2054 return false; 2055 2056 if (shrinker_hits > 0) 2057 return true; 2058 2059 if (items > XFS_INODEGC_MAX_BACKLOG) 2060 return true; 2061 2062 return false; 2063 } 2064 2065 /* 2066 * Queue a background inactivation worker if there are inodes that need to be 2067 * inactivated and higher level xfs code hasn't disabled the background 2068 * workers. 2069 */ 2070 static void 2071 xfs_inodegc_queue( 2072 struct xfs_inode *ip) 2073 { 2074 struct xfs_mount *mp = ip->i_mount; 2075 struct xfs_inodegc *gc; 2076 int items; 2077 unsigned int shrinker_hits; 2078 unsigned int cpu_nr; 2079 unsigned long queue_delay = 1; 2080 2081 trace_xfs_inode_set_need_inactive(ip); 2082 spin_lock(&ip->i_flags_lock); 2083 ip->i_flags |= XFS_NEED_INACTIVE; 2084 spin_unlock(&ip->i_flags_lock); 2085 2086 cpu_nr = get_cpu(); 2087 gc = this_cpu_ptr(mp->m_inodegc); 2088 llist_add(&ip->i_gclist, &gc->list); 2089 items = READ_ONCE(gc->items); 2090 WRITE_ONCE(gc->items, items + 1); 2091 shrinker_hits = READ_ONCE(gc->shrinker_hits); 2092 2093 /* 2094 * Ensure the list add is always seen by anyone who finds the cpumask 2095 * bit set. This effectively gives the cpumask bit set operation 2096 * release ordering semantics. 2097 */ 2098 smp_mb__before_atomic(); 2099 if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask)) 2100 cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask); 2101 2102 /* 2103 * We queue the work while holding the current CPU so that the work 2104 * is scheduled to run on this CPU. 2105 */ 2106 if (!xfs_is_inodegc_enabled(mp)) { 2107 put_cpu(); 2108 return; 2109 } 2110 2111 if (xfs_inodegc_want_queue_work(ip, items)) 2112 queue_delay = 0; 2113 2114 trace_xfs_inodegc_queue(mp, __return_address); 2115 mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work, 2116 queue_delay); 2117 put_cpu(); 2118 2119 if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { 2120 trace_xfs_inodegc_throttle(mp, __return_address); 2121 flush_delayed_work(&gc->work); 2122 } 2123 } 2124 2125 /* 2126 * We set the inode flag atomically with the radix tree tag. Once we get tag 2127 * lookups on the radix tree, this inode flag can go away. 2128 * 2129 * We always use background reclaim here because even if the inode is clean, it 2130 * still may be under IO and hence we have wait for IO completion to occur 2131 * before we can reclaim the inode. The background reclaim path handles this 2132 * more efficiently than we can here, so simply let background reclaim tear down 2133 * all inodes. 2134 */ 2135 void 2136 xfs_inode_mark_reclaimable( 2137 struct xfs_inode *ip) 2138 { 2139 struct xfs_mount *mp = ip->i_mount; 2140 bool need_inactive; 2141 2142 XFS_STATS_INC(mp, vn_reclaim); 2143 2144 /* 2145 * We should never get here with any of the reclaim flags already set. 2146 */ 2147 ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS)); 2148 2149 need_inactive = xfs_inode_needs_inactive(ip); 2150 if (need_inactive) { 2151 xfs_inodegc_queue(ip); 2152 return; 2153 } 2154 2155 /* Going straight to reclaim, so drop the dquots. */ 2156 xfs_qm_dqdetach(ip); 2157 xfs_inodegc_set_reclaimable(ip); 2158 } 2159 2160 /* 2161 * Register a phony shrinker so that we can run background inodegc sooner when 2162 * there's memory pressure. Inactivation does not itself free any memory but 2163 * it does make inodes reclaimable, which eventually frees memory. 2164 * 2165 * The count function, seek value, and batch value are crafted to trigger the 2166 * scan function during the second round of scanning. Hopefully this means 2167 * that we reclaimed enough memory that initiating metadata transactions won't 2168 * make things worse. 2169 */ 2170 #define XFS_INODEGC_SHRINKER_COUNT (1UL << DEF_PRIORITY) 2171 #define XFS_INODEGC_SHRINKER_BATCH ((XFS_INODEGC_SHRINKER_COUNT / 2) + 1) 2172 2173 static unsigned long 2174 xfs_inodegc_shrinker_count( 2175 struct shrinker *shrink, 2176 struct shrink_control *sc) 2177 { 2178 struct xfs_mount *mp = shrink->private_data; 2179 struct xfs_inodegc *gc; 2180 int cpu; 2181 2182 if (!xfs_is_inodegc_enabled(mp)) 2183 return 0; 2184 2185 for_each_cpu(cpu, &mp->m_inodegc_cpumask) { 2186 gc = per_cpu_ptr(mp->m_inodegc, cpu); 2187 if (!llist_empty(&gc->list)) 2188 return XFS_INODEGC_SHRINKER_COUNT; 2189 } 2190 2191 return 0; 2192 } 2193 2194 static unsigned long 2195 xfs_inodegc_shrinker_scan( 2196 struct shrinker *shrink, 2197 struct shrink_control *sc) 2198 { 2199 struct xfs_mount *mp = shrink->private_data; 2200 struct xfs_inodegc *gc; 2201 int cpu; 2202 bool no_items = true; 2203 2204 if (!xfs_is_inodegc_enabled(mp)) 2205 return SHRINK_STOP; 2206 2207 trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address); 2208 2209 for_each_cpu(cpu, &mp->m_inodegc_cpumask) { 2210 gc = per_cpu_ptr(mp->m_inodegc, cpu); 2211 if (!llist_empty(&gc->list)) { 2212 unsigned int h = READ_ONCE(gc->shrinker_hits); 2213 2214 WRITE_ONCE(gc->shrinker_hits, h + 1); 2215 mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); 2216 no_items = false; 2217 } 2218 } 2219 2220 /* 2221 * If there are no inodes to inactivate, we don't want the shrinker 2222 * to think there's deferred work to call us back about. 2223 */ 2224 if (no_items) 2225 return LONG_MAX; 2226 2227 return SHRINK_STOP; 2228 } 2229 2230 /* Register a shrinker so we can accelerate inodegc and throttle queuing. */ 2231 int 2232 xfs_inodegc_register_shrinker( 2233 struct xfs_mount *mp) 2234 { 2235 mp->m_inodegc_shrinker = shrinker_alloc(SHRINKER_NONSLAB, 2236 "xfs-inodegc:%s", 2237 mp->m_super->s_id); 2238 if (!mp->m_inodegc_shrinker) 2239 return -ENOMEM; 2240 2241 mp->m_inodegc_shrinker->count_objects = xfs_inodegc_shrinker_count; 2242 mp->m_inodegc_shrinker->scan_objects = xfs_inodegc_shrinker_scan; 2243 mp->m_inodegc_shrinker->seeks = 0; 2244 mp->m_inodegc_shrinker->batch = XFS_INODEGC_SHRINKER_BATCH; 2245 mp->m_inodegc_shrinker->private_data = mp; 2246 2247 shrinker_register(mp->m_inodegc_shrinker); 2248 2249 return 0; 2250 } 2251