1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_inode.h" 14 #include "xfs_trans.h" 15 #include "xfs_trans_priv.h" 16 #include "xfs_inode_item.h" 17 #include "xfs_quota.h" 18 #include "xfs_trace.h" 19 #include "xfs_icache.h" 20 #include "xfs_bmap_util.h" 21 #include "xfs_dquot_item.h" 22 #include "xfs_dquot.h" 23 #include "xfs_reflink.h" 24 #include "xfs_ialloc.h" 25 #include "xfs_ag.h" 26 #include "xfs_log_priv.h" 27 #include "xfs_health.h" 28 29 #include <linux/iversion.h> 30 31 /* Radix tree tags for incore inode tree. */ 32 33 /* inode is to be reclaimed */ 34 #define XFS_ICI_RECLAIM_TAG 0 35 /* Inode has speculative preallocations (posteof or cow) to clean. */ 36 #define XFS_ICI_BLOCKGC_TAG 1 37 38 /* 39 * The goal for walking incore inodes. These can correspond with incore inode 40 * radix tree tags when convenient. Avoid existing XFS_IWALK namespace. 41 */ 42 enum xfs_icwalk_goal { 43 /* Goals directly associated with tagged inodes. */ 44 XFS_ICWALK_BLOCKGC = XFS_ICI_BLOCKGC_TAG, 45 XFS_ICWALK_RECLAIM = XFS_ICI_RECLAIM_TAG, 46 }; 47 48 static int xfs_icwalk(struct xfs_mount *mp, 49 enum xfs_icwalk_goal goal, struct xfs_icwalk *icw); 50 static int xfs_icwalk_ag(struct xfs_perag *pag, 51 enum xfs_icwalk_goal goal, struct xfs_icwalk *icw); 52 53 /* 54 * Private inode cache walk flags for struct xfs_icwalk. Must not 55 * coincide with XFS_ICWALK_FLAGS_VALID. 56 */ 57 58 /* Stop scanning after icw_scan_limit inodes. 
*/ 59 #define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28) 60 61 #define XFS_ICWALK_FLAG_RECLAIM_SICK (1U << 27) 62 #define XFS_ICWALK_FLAG_UNION (1U << 26) /* union filter algorithm */ 63 64 #define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_SCAN_LIMIT | \ 65 XFS_ICWALK_FLAG_RECLAIM_SICK | \ 66 XFS_ICWALK_FLAG_UNION) 67 68 /* 69 * Allocate and initialise an xfs_inode. 70 */ 71 struct xfs_inode * 72 xfs_inode_alloc( 73 struct xfs_mount *mp, 74 xfs_ino_t ino) 75 { 76 struct xfs_inode *ip; 77 78 /* 79 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL 80 * and return NULL here on ENOMEM. 81 */ 82 ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL); 83 84 if (inode_init_always(mp->m_super, VFS_I(ip))) { 85 kmem_cache_free(xfs_inode_cache, ip); 86 return NULL; 87 } 88 89 /* VFS doesn't initialise i_mode or i_state! */ 90 VFS_I(ip)->i_mode = 0; 91 VFS_I(ip)->i_state = 0; 92 mapping_set_large_folios(VFS_I(ip)->i_mapping); 93 94 XFS_STATS_INC(mp, vn_active); 95 ASSERT(atomic_read(&ip->i_pincount) == 0); 96 ASSERT(ip->i_ino == 0); 97 98 /* initialise the xfs inode */ 99 ip->i_ino = ino; 100 ip->i_mount = mp; 101 memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); 102 ip->i_cowfp = NULL; 103 memset(&ip->i_af, 0, sizeof(ip->i_af)); 104 ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS; 105 memset(&ip->i_df, 0, sizeof(ip->i_df)); 106 ip->i_flags = 0; 107 ip->i_delayed_blks = 0; 108 ip->i_diflags2 = mp->m_ino_geo.new_diflags2; 109 ip->i_nblocks = 0; 110 ip->i_forkoff = 0; 111 ip->i_sick = 0; 112 ip->i_checked = 0; 113 INIT_WORK(&ip->i_ioend_work, xfs_end_io); 114 INIT_LIST_HEAD(&ip->i_ioend_list); 115 spin_lock_init(&ip->i_ioend_lock); 116 ip->i_next_unlinked = NULLAGINO; 117 ip->i_prev_unlinked = 0; 118 119 return ip; 120 } 121 122 STATIC void 123 xfs_inode_free_callback( 124 struct rcu_head *head) 125 { 126 struct inode *inode = container_of(head, struct inode, i_rcu); 127 struct xfs_inode *ip = XFS_I(inode); 128 129 switch (VFS_I(ip)->i_mode & 
S_IFMT) { 130 case S_IFREG: 131 case S_IFDIR: 132 case S_IFLNK: 133 xfs_idestroy_fork(&ip->i_df); 134 break; 135 } 136 137 xfs_ifork_zap_attr(ip); 138 139 if (ip->i_cowfp) { 140 xfs_idestroy_fork(ip->i_cowfp); 141 kmem_cache_free(xfs_ifork_cache, ip->i_cowfp); 142 } 143 if (ip->i_itemp) { 144 ASSERT(!test_bit(XFS_LI_IN_AIL, 145 &ip->i_itemp->ili_item.li_flags)); 146 xfs_inode_item_destroy(ip); 147 ip->i_itemp = NULL; 148 } 149 150 kmem_cache_free(xfs_inode_cache, ip); 151 } 152 153 static void 154 __xfs_inode_free( 155 struct xfs_inode *ip) 156 { 157 /* asserts to verify all state is correct here */ 158 ASSERT(atomic_read(&ip->i_pincount) == 0); 159 ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list)); 160 XFS_STATS_DEC(ip->i_mount, vn_active); 161 162 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); 163 } 164 165 void 166 xfs_inode_free( 167 struct xfs_inode *ip) 168 { 169 ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING)); 170 171 /* 172 * Because we use RCU freeing we need to ensure the inode always 173 * appears to be reclaimed with an invalid inode number when in the 174 * free state. The ip->i_flags_lock provides the barrier against lookup 175 * races. 176 */ 177 spin_lock(&ip->i_flags_lock); 178 ip->i_flags = XFS_IRECLAIM; 179 ip->i_ino = 0; 180 spin_unlock(&ip->i_flags_lock); 181 182 __xfs_inode_free(ip); 183 } 184 185 /* 186 * Queue background inode reclaim work if there are reclaimable inodes and there 187 * isn't reclaim work already scheduled or in progress. 188 */ 189 static void 190 xfs_reclaim_work_queue( 191 struct xfs_mount *mp) 192 { 193 194 rcu_read_lock(); 195 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { 196 queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, 197 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); 198 } 199 rcu_read_unlock(); 200 } 201 202 /* 203 * Background scanning to trim preallocated space. 
This is queued based on the 204 * 'speculative_prealloc_lifetime' tunable (5m by default). 205 */ 206 static inline void 207 xfs_blockgc_queue( 208 struct xfs_perag *pag) 209 { 210 struct xfs_mount *mp = pag->pag_mount; 211 212 if (!xfs_is_blockgc_enabled(mp)) 213 return; 214 215 rcu_read_lock(); 216 if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) 217 queue_delayed_work(pag->pag_mount->m_blockgc_wq, 218 &pag->pag_blockgc_work, 219 msecs_to_jiffies(xfs_blockgc_secs * 1000)); 220 rcu_read_unlock(); 221 } 222 223 /* Set a tag on both the AG incore inode tree and the AG radix tree. */ 224 static void 225 xfs_perag_set_inode_tag( 226 struct xfs_perag *pag, 227 xfs_agino_t agino, 228 unsigned int tag) 229 { 230 struct xfs_mount *mp = pag->pag_mount; 231 bool was_tagged; 232 233 lockdep_assert_held(&pag->pag_ici_lock); 234 235 was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag); 236 radix_tree_tag_set(&pag->pag_ici_root, agino, tag); 237 238 if (tag == XFS_ICI_RECLAIM_TAG) 239 pag->pag_ici_reclaimable++; 240 241 if (was_tagged) 242 return; 243 244 /* propagate the tag up into the perag radix tree */ 245 spin_lock(&mp->m_perag_lock); 246 radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag); 247 spin_unlock(&mp->m_perag_lock); 248 249 /* start background work */ 250 switch (tag) { 251 case XFS_ICI_RECLAIM_TAG: 252 xfs_reclaim_work_queue(mp); 253 break; 254 case XFS_ICI_BLOCKGC_TAG: 255 xfs_blockgc_queue(pag); 256 break; 257 } 258 259 trace_xfs_perag_set_inode_tag(pag, _RET_IP_); 260 } 261 262 /* Clear a tag on both the AG incore inode tree and the AG radix tree. */ 263 static void 264 xfs_perag_clear_inode_tag( 265 struct xfs_perag *pag, 266 xfs_agino_t agino, 267 unsigned int tag) 268 { 269 struct xfs_mount *mp = pag->pag_mount; 270 271 lockdep_assert_held(&pag->pag_ici_lock); 272 273 /* 274 * Reclaim can signal (with a null agino) that it cleared its own tag 275 * by removing the inode from the radix tree. 
276 */ 277 if (agino != NULLAGINO) 278 radix_tree_tag_clear(&pag->pag_ici_root, agino, tag); 279 else 280 ASSERT(tag == XFS_ICI_RECLAIM_TAG); 281 282 if (tag == XFS_ICI_RECLAIM_TAG) 283 pag->pag_ici_reclaimable--; 284 285 if (radix_tree_tagged(&pag->pag_ici_root, tag)) 286 return; 287 288 /* clear the tag from the perag radix tree */ 289 spin_lock(&mp->m_perag_lock); 290 radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag); 291 spin_unlock(&mp->m_perag_lock); 292 293 trace_xfs_perag_clear_inode_tag(pag, _RET_IP_); 294 } 295 296 /* 297 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode 298 * part of the structure. This is made more complex by the fact we store 299 * information about the on-disk values in the VFS inode and so we can't just 300 * overwrite the values unconditionally. Hence we save the parameters we 301 * need to retain across reinitialisation, and rewrite them into the VFS inode 302 * after reinitialisation even if it fails. 303 */ 304 static int 305 xfs_reinit_inode( 306 struct xfs_mount *mp, 307 struct inode *inode) 308 { 309 int error; 310 uint32_t nlink = inode->i_nlink; 311 uint32_t generation = inode->i_generation; 312 uint64_t version = inode_peek_iversion(inode); 313 umode_t mode = inode->i_mode; 314 dev_t dev = inode->i_rdev; 315 kuid_t uid = inode->i_uid; 316 kgid_t gid = inode->i_gid; 317 318 error = inode_init_always(mp->m_super, inode); 319 320 set_nlink(inode, nlink); 321 inode->i_generation = generation; 322 inode_set_iversion_queried(inode, version); 323 inode->i_mode = mode; 324 inode->i_rdev = dev; 325 inode->i_uid = uid; 326 inode->i_gid = gid; 327 mapping_set_large_folios(inode->i_mapping); 328 return error; 329 } 330 331 /* 332 * Carefully nudge an inode whose VFS state has been torn down back into a 333 * usable state. Drops the i_flags_lock and the rcu read lock. 
334 */ 335 static int 336 xfs_iget_recycle( 337 struct xfs_perag *pag, 338 struct xfs_inode *ip) __releases(&ip->i_flags_lock) 339 { 340 struct xfs_mount *mp = ip->i_mount; 341 struct inode *inode = VFS_I(ip); 342 int error; 343 344 trace_xfs_iget_recycle(ip); 345 346 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) 347 return -EAGAIN; 348 349 /* 350 * We need to make it look like the inode is being reclaimed to prevent 351 * the actual reclaim workers from stomping over us while we recycle 352 * the inode. We can't clear the radix tree tag yet as it requires 353 * pag_ici_lock to be held exclusive. 354 */ 355 ip->i_flags |= XFS_IRECLAIM; 356 357 spin_unlock(&ip->i_flags_lock); 358 rcu_read_unlock(); 359 360 ASSERT(!rwsem_is_locked(&inode->i_rwsem)); 361 error = xfs_reinit_inode(mp, inode); 362 xfs_iunlock(ip, XFS_ILOCK_EXCL); 363 if (error) { 364 /* 365 * Re-initializing the inode failed, and we are in deep 366 * trouble. Try to re-add it to the reclaim list. 367 */ 368 rcu_read_lock(); 369 spin_lock(&ip->i_flags_lock); 370 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); 371 ASSERT(ip->i_flags & XFS_IRECLAIMABLE); 372 spin_unlock(&ip->i_flags_lock); 373 rcu_read_unlock(); 374 375 trace_xfs_iget_recycle_fail(ip); 376 return error; 377 } 378 379 spin_lock(&pag->pag_ici_lock); 380 spin_lock(&ip->i_flags_lock); 381 382 /* 383 * Clear the per-lifetime state in the inode as we are now effectively 384 * a new inode and need to return to the initial state before reuse 385 * occurs. 386 */ 387 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; 388 ip->i_flags |= XFS_INEW; 389 xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), 390 XFS_ICI_RECLAIM_TAG); 391 inode->i_state = I_NEW; 392 spin_unlock(&ip->i_flags_lock); 393 spin_unlock(&pag->pag_ici_lock); 394 395 return 0; 396 } 397 398 /* 399 * If we are allocating a new inode, then check what was returned is 400 * actually a free, empty inode. If we are not allocating an inode, 401 * then check we didn't find a free inode. 
402 * 403 * Returns: 404 * 0 if the inode free state matches the lookup context 405 * -ENOENT if the inode is free and we are not allocating 406 * -EFSCORRUPTED if there is any state mismatch at all 407 */ 408 static int 409 xfs_iget_check_free_state( 410 struct xfs_inode *ip, 411 int flags) 412 { 413 if (flags & XFS_IGET_CREATE) { 414 /* should be a free inode */ 415 if (VFS_I(ip)->i_mode != 0) { 416 xfs_warn(ip->i_mount, 417 "Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)", 418 ip->i_ino, VFS_I(ip)->i_mode); 419 xfs_agno_mark_sick(ip->i_mount, 420 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 421 XFS_SICK_AG_INOBT); 422 return -EFSCORRUPTED; 423 } 424 425 if (ip->i_nblocks != 0) { 426 xfs_warn(ip->i_mount, 427 "Corruption detected! Free inode 0x%llx has blocks allocated!", 428 ip->i_ino); 429 xfs_agno_mark_sick(ip->i_mount, 430 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 431 XFS_SICK_AG_INOBT); 432 return -EFSCORRUPTED; 433 } 434 return 0; 435 } 436 437 /* should be an allocated inode */ 438 if (VFS_I(ip)->i_mode == 0) 439 return -ENOENT; 440 441 return 0; 442 } 443 444 /* Make all pending inactivation work start immediately. 
*/ 445 static bool 446 xfs_inodegc_queue_all( 447 struct xfs_mount *mp) 448 { 449 struct xfs_inodegc *gc; 450 int cpu; 451 bool ret = false; 452 453 for_each_cpu(cpu, &mp->m_inodegc_cpumask) { 454 gc = per_cpu_ptr(mp->m_inodegc, cpu); 455 if (!llist_empty(&gc->list)) { 456 mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); 457 ret = true; 458 } 459 } 460 461 return ret; 462 } 463 464 /* Wait for all queued work and collect errors */ 465 static int 466 xfs_inodegc_wait_all( 467 struct xfs_mount *mp) 468 { 469 int cpu; 470 int error = 0; 471 472 flush_workqueue(mp->m_inodegc_wq); 473 for_each_cpu(cpu, &mp->m_inodegc_cpumask) { 474 struct xfs_inodegc *gc; 475 476 gc = per_cpu_ptr(mp->m_inodegc, cpu); 477 if (gc->error && !error) 478 error = gc->error; 479 gc->error = 0; 480 } 481 482 return error; 483 } 484 485 /* 486 * Check the validity of the inode we just found it the cache 487 */ 488 static int 489 xfs_iget_cache_hit( 490 struct xfs_perag *pag, 491 struct xfs_inode *ip, 492 xfs_ino_t ino, 493 int flags, 494 int lock_flags) __releases(RCU) 495 { 496 struct inode *inode = VFS_I(ip); 497 struct xfs_mount *mp = ip->i_mount; 498 int error; 499 500 /* 501 * check for re-use of an inode within an RCU grace period due to the 502 * radix tree nodes not being updated yet. We monitor for this by 503 * setting the inode number to zero before freeing the inode structure. 504 * If the inode has been reallocated and set up, then the inode number 505 * will not match, so check for that, too. 506 */ 507 spin_lock(&ip->i_flags_lock); 508 if (ip->i_ino != ino) 509 goto out_skip; 510 511 /* 512 * If we are racing with another cache hit that is currently 513 * instantiating this inode or currently recycling it out of 514 * reclaimable state, wait for the initialisation to complete 515 * before continuing. 516 * 517 * If we're racing with the inactivation worker we also want to wait. 
518 * If we're creating a new file, it's possible that the worker 519 * previously marked the inode as free on disk but hasn't finished 520 * updating the incore state yet. The AGI buffer will be dirty and 521 * locked to the icreate transaction, so a synchronous push of the 522 * inodegc workers would result in deadlock. For a regular iget, the 523 * worker is running already, so we might as well wait. 524 * 525 * XXX(hch): eventually we should do something equivalent to 526 * wait_on_inode to wait for these flags to be cleared 527 * instead of polling for it. 528 */ 529 if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING)) 530 goto out_skip; 531 532 if (ip->i_flags & XFS_NEED_INACTIVE) { 533 /* Unlinked inodes cannot be re-grabbed. */ 534 if (VFS_I(ip)->i_nlink == 0) { 535 error = -ENOENT; 536 goto out_error; 537 } 538 goto out_inodegc_flush; 539 } 540 541 /* 542 * Check the inode free state is valid. This also detects lookup 543 * racing with unlinks. 544 */ 545 error = xfs_iget_check_free_state(ip, flags); 546 if (error) 547 goto out_error; 548 549 /* Skip inodes that have no vfs state. */ 550 if ((flags & XFS_IGET_INCORE) && 551 (ip->i_flags & XFS_IRECLAIMABLE)) 552 goto out_skip; 553 554 /* The inode fits the selection criteria; process it. */ 555 if (ip->i_flags & XFS_IRECLAIMABLE) { 556 /* Drops i_flags_lock and RCU read lock. */ 557 error = xfs_iget_recycle(pag, ip); 558 if (error == -EAGAIN) 559 goto out_skip; 560 if (error) 561 return error; 562 } else { 563 /* If the VFS inode is being torn down, pause and try again. */ 564 if (!igrab(inode)) 565 goto out_skip; 566 567 /* We've got a live one. 
*/ 568 spin_unlock(&ip->i_flags_lock); 569 rcu_read_unlock(); 570 trace_xfs_iget_hit(ip); 571 } 572 573 if (lock_flags != 0) 574 xfs_ilock(ip, lock_flags); 575 576 if (!(flags & XFS_IGET_INCORE)) 577 xfs_iflags_clear(ip, XFS_ISTALE); 578 XFS_STATS_INC(mp, xs_ig_found); 579 580 return 0; 581 582 out_skip: 583 trace_xfs_iget_skip(ip); 584 XFS_STATS_INC(mp, xs_ig_frecycle); 585 error = -EAGAIN; 586 out_error: 587 spin_unlock(&ip->i_flags_lock); 588 rcu_read_unlock(); 589 return error; 590 591 out_inodegc_flush: 592 spin_unlock(&ip->i_flags_lock); 593 rcu_read_unlock(); 594 /* 595 * Do not wait for the workers, because the caller could hold an AGI 596 * buffer lock. We're just going to sleep in a loop anyway. 597 */ 598 if (xfs_is_inodegc_enabled(mp)) 599 xfs_inodegc_queue_all(mp); 600 return -EAGAIN; 601 } 602 603 static int 604 xfs_iget_cache_miss( 605 struct xfs_mount *mp, 606 struct xfs_perag *pag, 607 xfs_trans_t *tp, 608 xfs_ino_t ino, 609 struct xfs_inode **ipp, 610 int flags, 611 int lock_flags) 612 { 613 struct xfs_inode *ip; 614 int error; 615 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); 616 617 ip = xfs_inode_alloc(mp, ino); 618 if (!ip) 619 return -ENOMEM; 620 621 error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, flags); 622 if (error) 623 goto out_destroy; 624 625 /* 626 * For version 5 superblocks, if we are initialising a new inode and we 627 * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can 628 * simply build the new inode core with a random generation number. 629 * 630 * For version 4 (and older) superblocks, log recovery is dependent on 631 * the i_flushiter field being initialised from the current on-disk 632 * value and hence we must also read the inode off disk even when 633 * initializing new inodes. 
634 */ 635 if (xfs_has_v3inodes(mp) && 636 (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) { 637 VFS_I(ip)->i_generation = get_random_u32(); 638 } else { 639 struct xfs_buf *bp; 640 641 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp); 642 if (error) 643 goto out_destroy; 644 645 error = xfs_inode_from_disk(ip, 646 xfs_buf_offset(bp, ip->i_imap.im_boffset)); 647 if (!error) 648 xfs_buf_set_ref(bp, XFS_INO_REF); 649 else 650 xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); 651 xfs_trans_brelse(tp, bp); 652 653 if (error) 654 goto out_destroy; 655 } 656 657 trace_xfs_iget_miss(ip); 658 659 /* 660 * Check the inode free state is valid. This also detects lookup 661 * racing with unlinks. 662 */ 663 error = xfs_iget_check_free_state(ip, flags); 664 if (error) 665 goto out_destroy; 666 667 /* 668 * Preload the radix tree so we can insert safely under the 669 * write spinlock. Note that we cannot sleep inside the preload 670 * region. 671 */ 672 if (radix_tree_preload(GFP_KERNEL | __GFP_NOLOCKDEP)) { 673 error = -EAGAIN; 674 goto out_destroy; 675 } 676 677 /* 678 * Because the inode hasn't been added to the radix-tree yet it can't 679 * be found by another thread, so we can do the non-sleeping lock here. 680 */ 681 if (lock_flags) { 682 if (!xfs_ilock_nowait(ip, lock_flags)) 683 BUG(); 684 } 685 686 /* 687 * These values must be set before inserting the inode into the radix 688 * tree as the moment it is inserted a concurrent lookup (allowed by the 689 * RCU locking mechanism) can find it and that lookup must see that this 690 * is an inode currently under construction (i.e. that XFS_INEW is set). 691 * The ip->i_flags_lock that protects the XFS_INEW flag forms the 692 * memory barrier that ensures this detection works correctly at lookup 693 * time. 
694 */ 695 if (flags & XFS_IGET_DONTCACHE) 696 d_mark_dontcache(VFS_I(ip)); 697 ip->i_udquot = NULL; 698 ip->i_gdquot = NULL; 699 ip->i_pdquot = NULL; 700 xfs_iflags_set(ip, XFS_INEW); 701 702 /* insert the new inode */ 703 spin_lock(&pag->pag_ici_lock); 704 error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 705 if (unlikely(error)) { 706 WARN_ON(error != -EEXIST); 707 XFS_STATS_INC(mp, xs_ig_dup); 708 error = -EAGAIN; 709 goto out_preload_end; 710 } 711 spin_unlock(&pag->pag_ici_lock); 712 radix_tree_preload_end(); 713 714 *ipp = ip; 715 return 0; 716 717 out_preload_end: 718 spin_unlock(&pag->pag_ici_lock); 719 radix_tree_preload_end(); 720 if (lock_flags) 721 xfs_iunlock(ip, lock_flags); 722 out_destroy: 723 __destroy_inode(VFS_I(ip)); 724 xfs_inode_free(ip); 725 return error; 726 } 727 728 /* 729 * Look up an inode by number in the given file system. The inode is looked up 730 * in the cache held in each AG. If the inode is found in the cache, initialise 731 * the vfs inode if necessary. 732 * 733 * If it is not in core, read it in from the file system's device, add it to the 734 * cache and initialise the vfs inode. 735 * 736 * The inode is locked according to the value of the lock_flags parameter. 737 * Inode lookup is only done during metadata operations and not as part of the 738 * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup. 
739 */ 740 int 741 xfs_iget( 742 struct xfs_mount *mp, 743 struct xfs_trans *tp, 744 xfs_ino_t ino, 745 uint flags, 746 uint lock_flags, 747 struct xfs_inode **ipp) 748 { 749 struct xfs_inode *ip; 750 struct xfs_perag *pag; 751 xfs_agino_t agino; 752 int error; 753 754 ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); 755 756 /* reject inode numbers outside existing AGs */ 757 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) 758 return -EINVAL; 759 760 XFS_STATS_INC(mp, xs_ig_attempts); 761 762 /* get the perag structure and ensure that it's inode capable */ 763 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); 764 agino = XFS_INO_TO_AGINO(mp, ino); 765 766 again: 767 error = 0; 768 rcu_read_lock(); 769 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 770 771 if (ip) { 772 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); 773 if (error) 774 goto out_error_or_again; 775 } else { 776 rcu_read_unlock(); 777 if (flags & XFS_IGET_INCORE) { 778 error = -ENODATA; 779 goto out_error_or_again; 780 } 781 XFS_STATS_INC(mp, xs_ig_missed); 782 783 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, 784 flags, lock_flags); 785 if (error) 786 goto out_error_or_again; 787 } 788 xfs_perag_put(pag); 789 790 *ipp = ip; 791 792 /* 793 * If we have a real type for an on-disk inode, we can setup the inode 794 * now. If it's a new inode being created, xfs_init_new_inode will 795 * handle it. 796 */ 797 if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0) 798 xfs_setup_existing_inode(ip); 799 return 0; 800 801 out_error_or_again: 802 if (!(flags & (XFS_IGET_INCORE | XFS_IGET_NORETRY)) && 803 error == -EAGAIN) { 804 delay(1); 805 goto again; 806 } 807 xfs_perag_put(pag); 808 return error; 809 } 810 811 /* 812 * Grab the inode for reclaim exclusively. 813 * 814 * We have found this inode via a lookup under RCU, so the inode may have 815 * already been freed, or it may be in the process of being recycled by 816 * xfs_iget(). 
In both cases, the inode will have XFS_IRECLAIM set.  If the inode
 * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
 * will not be set. Hence we need to check for both these flag conditions to
 * avoid inodes that are no longer reclaim candidates.
 *
 * Note: checking for other state flags here, under the i_flags_lock or not, is
 * racy and should be avoided. Those races should be resolved only after we have
 * ensured that we are able to reclaim this inode and the world can see that we
 * are going to reclaim it.
 *
 * Return true if we grabbed it, false otherwise.
 */
static bool
xfs_reclaim_igrab(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	bool			grabbed = false;

	ASSERT(rcu_read_lock_held());

	spin_lock(&ip->i_flags_lock);

	/* Not a reclaim candidate: not reclaimable, or already claimed. */
	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM))
		goto out_unlock;

	/* Don't reclaim a sick inode unless the caller asked for it. */
	if (ip->i_sick &&
	    (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK)))
		goto out_unlock;

	/* Claim it while still holding the i_flags_lock. */
	__xfs_iflags_set(ip, XFS_IRECLAIM);
	grabbed = true;

out_unlock:
	spin_unlock(&ip->i_flags_lock);
	return grabbed;
}

/*
 * Inode reclaim is non-blocking, so the default action if progress cannot be
 * made is to "requeue" the inode for reclaim by unlocking it and clearing the
 * XFS_IRECLAIM flag.  If we are in a shutdown state, we don't care about
 * blocking anymore and hence we can wait for the inode to be able to reclaim
 * it.
 *
 * We do no IO here - if callers require inodes to be cleaned they must push the
 * AIL first to trigger writeback of dirty inodes.  This enables writeback to be
 * done in the background in a non-blocking manner, and enables memory reclaim
 * to make progress without blocking.
866 */ 867 static void 868 xfs_reclaim_inode( 869 struct xfs_inode *ip, 870 struct xfs_perag *pag) 871 { 872 xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ 873 874 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) 875 goto out; 876 if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING)) 877 goto out_iunlock; 878 879 /* 880 * Check for log shutdown because aborting the inode can move the log 881 * tail and corrupt in memory state. This is fine if the log is shut 882 * down, but if the log is still active and only the mount is shut down 883 * then the in-memory log tail movement caused by the abort can be 884 * incorrectly propagated to disk. 885 */ 886 if (xlog_is_shutdown(ip->i_mount->m_log)) { 887 xfs_iunpin_wait(ip); 888 xfs_iflush_shutdown_abort(ip); 889 goto reclaim; 890 } 891 if (xfs_ipincount(ip)) 892 goto out_clear_flush; 893 if (!xfs_inode_clean(ip)) 894 goto out_clear_flush; 895 896 xfs_iflags_clear(ip, XFS_IFLUSHING); 897 reclaim: 898 trace_xfs_inode_reclaiming(ip); 899 900 /* 901 * Because we use RCU freeing we need to ensure the inode always appears 902 * to be reclaimed with an invalid inode number when in the free state. 903 * We do this as early as possible under the ILOCK so that 904 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to 905 * detect races with us here. By doing this, we guarantee that once 906 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that 907 * it will see either a valid inode that will serialise correctly, or it 908 * will see an invalid inode that it can skip. 909 */ 910 spin_lock(&ip->i_flags_lock); 911 ip->i_flags = XFS_IRECLAIM; 912 ip->i_ino = 0; 913 ip->i_sick = 0; 914 ip->i_checked = 0; 915 spin_unlock(&ip->i_flags_lock); 916 917 ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL); 918 xfs_iunlock(ip, XFS_ILOCK_EXCL); 919 920 XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); 921 /* 922 * Remove the inode from the per-AG radix tree. 
923 * 924 * Because radix_tree_delete won't complain even if the item was never 925 * added to the tree assert that it's been there before to catch 926 * problems with the inode life time early on. 927 */ 928 spin_lock(&pag->pag_ici_lock); 929 if (!radix_tree_delete(&pag->pag_ici_root, 930 XFS_INO_TO_AGINO(ip->i_mount, ino))) 931 ASSERT(0); 932 xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG); 933 spin_unlock(&pag->pag_ici_lock); 934 935 /* 936 * Here we do an (almost) spurious inode lock in order to coordinate 937 * with inode cache radix tree lookups. This is because the lookup 938 * can reference the inodes in the cache without taking references. 939 * 940 * We make that OK here by ensuring that we wait until the inode is 941 * unlocked after the lookup before we go ahead and free it. 942 */ 943 xfs_ilock(ip, XFS_ILOCK_EXCL); 944 ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot); 945 xfs_iunlock(ip, XFS_ILOCK_EXCL); 946 ASSERT(xfs_inode_clean(ip)); 947 948 __xfs_inode_free(ip); 949 return; 950 951 out_clear_flush: 952 xfs_iflags_clear(ip, XFS_IFLUSHING); 953 out_iunlock: 954 xfs_iunlock(ip, XFS_ILOCK_EXCL); 955 out: 956 xfs_iflags_clear(ip, XFS_IRECLAIM); 957 } 958 959 /* Reclaim sick inodes if we're unmounting or the fs went down. */ 960 static inline bool 961 xfs_want_reclaim_sick( 962 struct xfs_mount *mp) 963 { 964 return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) || 965 xfs_is_shutdown(mp); 966 } 967 968 void 969 xfs_reclaim_inodes( 970 struct xfs_mount *mp) 971 { 972 struct xfs_icwalk icw = { 973 .icw_flags = 0, 974 }; 975 976 if (xfs_want_reclaim_sick(mp)) 977 icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; 978 979 while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { 980 xfs_ail_push_all_sync(mp->m_ail); 981 xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw); 982 } 983 } 984 985 /* 986 * The shrinker infrastructure determines how many inodes we should scan for 987 * reclaim. 
We want as many clean inodes ready to reclaim as possible, so we 988 * push the AIL here. We also want to proactively free up memory if we can to 989 * minimise the amount of work memory reclaim has to do so we kick the 990 * background reclaim if it isn't already scheduled. 991 */ 992 long 993 xfs_reclaim_inodes_nr( 994 struct xfs_mount *mp, 995 unsigned long nr_to_scan) 996 { 997 struct xfs_icwalk icw = { 998 .icw_flags = XFS_ICWALK_FLAG_SCAN_LIMIT, 999 .icw_scan_limit = min_t(unsigned long, LONG_MAX, nr_to_scan), 1000 }; 1001 1002 if (xfs_want_reclaim_sick(mp)) 1003 icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; 1004 1005 /* kick background reclaimer and push the AIL */ 1006 xfs_reclaim_work_queue(mp); 1007 xfs_ail_push_all(mp->m_ail); 1008 1009 xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw); 1010 return 0; 1011 } 1012 1013 /* 1014 * Return the number of reclaimable inodes in the filesystem for 1015 * the shrinker to determine how much to reclaim. 1016 */ 1017 long 1018 xfs_reclaim_inodes_count( 1019 struct xfs_mount *mp) 1020 { 1021 struct xfs_perag *pag; 1022 xfs_agnumber_t ag = 0; 1023 long reclaimable = 0; 1024 1025 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { 1026 ag = pag->pag_agno + 1; 1027 reclaimable += pag->pag_ici_reclaimable; 1028 xfs_perag_put(pag); 1029 } 1030 return reclaimable; 1031 } 1032 1033 STATIC bool 1034 xfs_icwalk_match_id( 1035 struct xfs_inode *ip, 1036 struct xfs_icwalk *icw) 1037 { 1038 if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) && 1039 !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid)) 1040 return false; 1041 1042 if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) && 1043 !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid)) 1044 return false; 1045 1046 if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) && 1047 ip->i_projid != icw->icw_prid) 1048 return false; 1049 1050 return true; 1051 } 1052 1053 /* 1054 * A union-based inode filtering algorithm. Process the inode if any of the 1055 * criteria match. This is for global/internal scans only. 
1056 */ 1057 STATIC bool 1058 xfs_icwalk_match_id_union( 1059 struct xfs_inode *ip, 1060 struct xfs_icwalk *icw) 1061 { 1062 if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) && 1063 uid_eq(VFS_I(ip)->i_uid, icw->icw_uid)) 1064 return true; 1065 1066 if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) && 1067 gid_eq(VFS_I(ip)->i_gid, icw->icw_gid)) 1068 return true; 1069 1070 if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) && 1071 ip->i_projid == icw->icw_prid) 1072 return true; 1073 1074 return false; 1075 } 1076 1077 /* 1078 * Is this inode @ip eligible for eof/cow block reclamation, given some 1079 * filtering parameters @icw? The inode is eligible if @icw is null or 1080 * if the predicate functions match. 1081 */ 1082 static bool 1083 xfs_icwalk_match( 1084 struct xfs_inode *ip, 1085 struct xfs_icwalk *icw) 1086 { 1087 bool match; 1088 1089 if (!icw) 1090 return true; 1091 1092 if (icw->icw_flags & XFS_ICWALK_FLAG_UNION) 1093 match = xfs_icwalk_match_id_union(ip, icw); 1094 else 1095 match = xfs_icwalk_match_id(ip, icw); 1096 if (!match) 1097 return false; 1098 1099 /* skip the inode if the file size is too small */ 1100 if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) && 1101 XFS_ISIZE(ip) < icw->icw_min_file_size) 1102 return false; 1103 1104 return true; 1105 } 1106 1107 /* 1108 * This is a fast pass over the inode cache to try to get reclaim moving on as 1109 * many inodes as possible in a short period of time. It kicks itself every few 1110 * seconds, as well as being kicked by the inode cache shrinker when memory 1111 * goes low. 
1112 */ 1113 void 1114 xfs_reclaim_worker( 1115 struct work_struct *work) 1116 { 1117 struct xfs_mount *mp = container_of(to_delayed_work(work), 1118 struct xfs_mount, m_reclaim_work); 1119 1120 xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL); 1121 xfs_reclaim_work_queue(mp); 1122 } 1123 1124 STATIC int 1125 xfs_inode_free_eofblocks( 1126 struct xfs_inode *ip, 1127 struct xfs_icwalk *icw, 1128 unsigned int *lockflags) 1129 { 1130 bool wait; 1131 1132 wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC); 1133 1134 if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS)) 1135 return 0; 1136 1137 /* 1138 * If the mapping is dirty the operation can block and wait for some 1139 * time. Unless we are waiting, skip it. 1140 */ 1141 if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) 1142 return 0; 1143 1144 if (!xfs_icwalk_match(ip, icw)) 1145 return 0; 1146 1147 /* 1148 * If the caller is waiting, return -EAGAIN to keep the background 1149 * scanner moving and revisit the inode in a subsequent pass. 1150 */ 1151 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 1152 if (wait) 1153 return -EAGAIN; 1154 return 0; 1155 } 1156 *lockflags |= XFS_IOLOCK_EXCL; 1157 1158 if (xfs_can_free_eofblocks(ip)) 1159 return xfs_free_eofblocks(ip); 1160 1161 /* inode could be preallocated or append-only */ 1162 trace_xfs_inode_free_eofblocks_invalid(ip); 1163 xfs_inode_clear_eofblocks_tag(ip); 1164 return 0; 1165 } 1166 1167 static void 1168 xfs_blockgc_set_iflag( 1169 struct xfs_inode *ip, 1170 unsigned long iflag) 1171 { 1172 struct xfs_mount *mp = ip->i_mount; 1173 struct xfs_perag *pag; 1174 1175 ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0); 1176 1177 /* 1178 * Don't bother locking the AG and looking up in the radix trees 1179 * if we already know that we have the tag set. 
1180 */ 1181 if (ip->i_flags & iflag) 1182 return; 1183 spin_lock(&ip->i_flags_lock); 1184 ip->i_flags |= iflag; 1185 spin_unlock(&ip->i_flags_lock); 1186 1187 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 1188 spin_lock(&pag->pag_ici_lock); 1189 1190 xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), 1191 XFS_ICI_BLOCKGC_TAG); 1192 1193 spin_unlock(&pag->pag_ici_lock); 1194 xfs_perag_put(pag); 1195 } 1196 1197 void 1198 xfs_inode_set_eofblocks_tag( 1199 xfs_inode_t *ip) 1200 { 1201 trace_xfs_inode_set_eofblocks_tag(ip); 1202 return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS); 1203 } 1204 1205 static void 1206 xfs_blockgc_clear_iflag( 1207 struct xfs_inode *ip, 1208 unsigned long iflag) 1209 { 1210 struct xfs_mount *mp = ip->i_mount; 1211 struct xfs_perag *pag; 1212 bool clear_tag; 1213 1214 ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0); 1215 1216 spin_lock(&ip->i_flags_lock); 1217 ip->i_flags &= ~iflag; 1218 clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0; 1219 spin_unlock(&ip->i_flags_lock); 1220 1221 if (!clear_tag) 1222 return; 1223 1224 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 1225 spin_lock(&pag->pag_ici_lock); 1226 1227 xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), 1228 XFS_ICI_BLOCKGC_TAG); 1229 1230 spin_unlock(&pag->pag_ici_lock); 1231 xfs_perag_put(pag); 1232 } 1233 1234 void 1235 xfs_inode_clear_eofblocks_tag( 1236 xfs_inode_t *ip) 1237 { 1238 trace_xfs_inode_clear_eofblocks_tag(ip); 1239 return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS); 1240 } 1241 1242 /* 1243 * Set ourselves up to free CoW blocks from this file. If it's already clean 1244 * then we can bail out quickly, but otherwise we must back off if the file 1245 * is undergoing some kind of write. 1246 */ 1247 static bool 1248 xfs_prep_free_cowblocks( 1249 struct xfs_inode *ip) 1250 { 1251 /* 1252 * Just clear the tag if we have an empty cow fork or none at all. 
It's 1253 * possible the inode was fully unshared since it was originally tagged. 1254 */ 1255 if (!xfs_inode_has_cow_data(ip)) { 1256 trace_xfs_inode_free_cowblocks_invalid(ip); 1257 xfs_inode_clear_cowblocks_tag(ip); 1258 return false; 1259 } 1260 1261 /* 1262 * If the mapping is dirty or under writeback we cannot touch the 1263 * CoW fork. Leave it alone if we're in the midst of a directio. 1264 */ 1265 if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) || 1266 mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || 1267 mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || 1268 atomic_read(&VFS_I(ip)->i_dio_count)) 1269 return false; 1270 1271 return true; 1272 } 1273 1274 /* 1275 * Automatic CoW Reservation Freeing 1276 * 1277 * These functions automatically garbage collect leftover CoW reservations 1278 * that were made on behalf of a cowextsize hint when we start to run out 1279 * of quota or when the reservations sit around for too long. If the file 1280 * has dirty pages or is undergoing writeback, its CoW reservations will 1281 * be retained. 1282 * 1283 * The actual garbage collection piggybacks off the same code that runs 1284 * the speculative EOF preallocation garbage collector. 1285 */ 1286 STATIC int 1287 xfs_inode_free_cowblocks( 1288 struct xfs_inode *ip, 1289 struct xfs_icwalk *icw, 1290 unsigned int *lockflags) 1291 { 1292 bool wait; 1293 int ret = 0; 1294 1295 wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC); 1296 1297 if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS)) 1298 return 0; 1299 1300 if (!xfs_prep_free_cowblocks(ip)) 1301 return 0; 1302 1303 if (!xfs_icwalk_match(ip, icw)) 1304 return 0; 1305 1306 /* 1307 * If the caller is waiting, return -EAGAIN to keep the background 1308 * scanner moving and revisit the inode in a subsequent pass. 
1309 */ 1310 if (!(*lockflags & XFS_IOLOCK_EXCL) && 1311 !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 1312 if (wait) 1313 return -EAGAIN; 1314 return 0; 1315 } 1316 *lockflags |= XFS_IOLOCK_EXCL; 1317 1318 if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) { 1319 if (wait) 1320 return -EAGAIN; 1321 return 0; 1322 } 1323 *lockflags |= XFS_MMAPLOCK_EXCL; 1324 1325 /* 1326 * Check again, nobody else should be able to dirty blocks or change 1327 * the reflink iflag now that we have the first two locks held. 1328 */ 1329 if (xfs_prep_free_cowblocks(ip)) 1330 ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); 1331 return ret; 1332 } 1333 1334 void 1335 xfs_inode_set_cowblocks_tag( 1336 xfs_inode_t *ip) 1337 { 1338 trace_xfs_inode_set_cowblocks_tag(ip); 1339 return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS); 1340 } 1341 1342 void 1343 xfs_inode_clear_cowblocks_tag( 1344 xfs_inode_t *ip) 1345 { 1346 trace_xfs_inode_clear_cowblocks_tag(ip); 1347 return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS); 1348 } 1349 1350 /* Disable post-EOF and CoW block auto-reclamation. */ 1351 void 1352 xfs_blockgc_stop( 1353 struct xfs_mount *mp) 1354 { 1355 struct xfs_perag *pag; 1356 xfs_agnumber_t agno; 1357 1358 if (!xfs_clear_blockgc_enabled(mp)) 1359 return; 1360 1361 for_each_perag(mp, agno, pag) 1362 cancel_delayed_work_sync(&pag->pag_blockgc_work); 1363 trace_xfs_blockgc_stop(mp, __return_address); 1364 } 1365 1366 /* Enable post-EOF and CoW block auto-reclamation. */ 1367 void 1368 xfs_blockgc_start( 1369 struct xfs_mount *mp) 1370 { 1371 struct xfs_perag *pag; 1372 xfs_agnumber_t agno; 1373 1374 if (xfs_set_blockgc_enabled(mp)) 1375 return; 1376 1377 trace_xfs_blockgc_start(mp, __return_address); 1378 for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) 1379 xfs_blockgc_queue(pag); 1380 } 1381 1382 /* Don't try to run block gc on an inode that's in any of these states. 
*/ 1383 #define XFS_BLOCKGC_NOGRAB_IFLAGS (XFS_INEW | \ 1384 XFS_NEED_INACTIVE | \ 1385 XFS_INACTIVATING | \ 1386 XFS_IRECLAIMABLE | \ 1387 XFS_IRECLAIM) 1388 /* 1389 * Decide if the given @ip is eligible for garbage collection of speculative 1390 * preallocations, and grab it if so. Returns true if it's ready to go or 1391 * false if we should just ignore it. 1392 */ 1393 static bool 1394 xfs_blockgc_igrab( 1395 struct xfs_inode *ip) 1396 { 1397 struct inode *inode = VFS_I(ip); 1398 1399 ASSERT(rcu_read_lock_held()); 1400 1401 /* Check for stale RCU freed inode */ 1402 spin_lock(&ip->i_flags_lock); 1403 if (!ip->i_ino) 1404 goto out_unlock_noent; 1405 1406 if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS) 1407 goto out_unlock_noent; 1408 spin_unlock(&ip->i_flags_lock); 1409 1410 /* nothing to sync during shutdown */ 1411 if (xfs_is_shutdown(ip->i_mount)) 1412 return false; 1413 1414 /* If we can't grab the inode, it must on it's way to reclaim. */ 1415 if (!igrab(inode)) 1416 return false; 1417 1418 /* inode is valid */ 1419 return true; 1420 1421 out_unlock_noent: 1422 spin_unlock(&ip->i_flags_lock); 1423 return false; 1424 } 1425 1426 /* Scan one incore inode for block preallocations that we can remove. */ 1427 static int 1428 xfs_blockgc_scan_inode( 1429 struct xfs_inode *ip, 1430 struct xfs_icwalk *icw) 1431 { 1432 unsigned int lockflags = 0; 1433 int error; 1434 1435 error = xfs_inode_free_eofblocks(ip, icw, &lockflags); 1436 if (error) 1437 goto unlock; 1438 1439 error = xfs_inode_free_cowblocks(ip, icw, &lockflags); 1440 unlock: 1441 if (lockflags) 1442 xfs_iunlock(ip, lockflags); 1443 xfs_irele(ip); 1444 return error; 1445 } 1446 1447 /* Background worker that trims preallocated space. 
*/ 1448 void 1449 xfs_blockgc_worker( 1450 struct work_struct *work) 1451 { 1452 struct xfs_perag *pag = container_of(to_delayed_work(work), 1453 struct xfs_perag, pag_blockgc_work); 1454 struct xfs_mount *mp = pag->pag_mount; 1455 int error; 1456 1457 trace_xfs_blockgc_worker(mp, __return_address); 1458 1459 error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL); 1460 if (error) 1461 xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", 1462 pag->pag_agno, error); 1463 xfs_blockgc_queue(pag); 1464 } 1465 1466 /* 1467 * Try to free space in the filesystem by purging inactive inodes, eofblocks 1468 * and cowblocks. 1469 */ 1470 int 1471 xfs_blockgc_free_space( 1472 struct xfs_mount *mp, 1473 struct xfs_icwalk *icw) 1474 { 1475 int error; 1476 1477 trace_xfs_blockgc_free_space(mp, icw, _RET_IP_); 1478 1479 error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw); 1480 if (error) 1481 return error; 1482 1483 return xfs_inodegc_flush(mp); 1484 } 1485 1486 /* 1487 * Reclaim all the free space that we can by scheduling the background blockgc 1488 * and inodegc workers immediately and waiting for them all to clear. 1489 */ 1490 int 1491 xfs_blockgc_flush_all( 1492 struct xfs_mount *mp) 1493 { 1494 struct xfs_perag *pag; 1495 xfs_agnumber_t agno; 1496 1497 trace_xfs_blockgc_flush_all(mp, __return_address); 1498 1499 /* 1500 * For each blockgc worker, move its queue time up to now. If it 1501 * wasn't queued, it will not be requeued. Then flush whatever's 1502 * left. 1503 */ 1504 for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) 1505 mod_delayed_work(pag->pag_mount->m_blockgc_wq, 1506 &pag->pag_blockgc_work, 0); 1507 1508 for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) 1509 flush_delayed_work(&pag->pag_blockgc_work); 1510 1511 return xfs_inodegc_flush(mp); 1512 } 1513 1514 /* 1515 * Run cow/eofblocks scans on the supplied dquots. 
We don't know exactly which 1516 * quota caused an allocation failure, so we make a best effort by including 1517 * each quota under low free space conditions (less than 1% free space) in the 1518 * scan. 1519 * 1520 * Callers must not hold any inode's ILOCK. If requesting a synchronous scan 1521 * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or 1522 * MMAPLOCK. 1523 */ 1524 int 1525 xfs_blockgc_free_dquots( 1526 struct xfs_mount *mp, 1527 struct xfs_dquot *udqp, 1528 struct xfs_dquot *gdqp, 1529 struct xfs_dquot *pdqp, 1530 unsigned int iwalk_flags) 1531 { 1532 struct xfs_icwalk icw = {0}; 1533 bool do_work = false; 1534 1535 if (!udqp && !gdqp && !pdqp) 1536 return 0; 1537 1538 /* 1539 * Run a scan to free blocks using the union filter to cover all 1540 * applicable quotas in a single scan. 1541 */ 1542 icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags; 1543 1544 if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) { 1545 icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id); 1546 icw.icw_flags |= XFS_ICWALK_FLAG_UID; 1547 do_work = true; 1548 } 1549 1550 if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) { 1551 icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id); 1552 icw.icw_flags |= XFS_ICWALK_FLAG_GID; 1553 do_work = true; 1554 } 1555 1556 if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) { 1557 icw.icw_prid = pdqp->q_id; 1558 icw.icw_flags |= XFS_ICWALK_FLAG_PRID; 1559 do_work = true; 1560 } 1561 1562 if (!do_work) 1563 return 0; 1564 1565 return xfs_blockgc_free_space(mp, &icw); 1566 } 1567 1568 /* Run cow/eofblocks scans on the quotas attached to the inode. 
*/ 1569 int 1570 xfs_blockgc_free_quota( 1571 struct xfs_inode *ip, 1572 unsigned int iwalk_flags) 1573 { 1574 return xfs_blockgc_free_dquots(ip->i_mount, 1575 xfs_inode_dquot(ip, XFS_DQTYPE_USER), 1576 xfs_inode_dquot(ip, XFS_DQTYPE_GROUP), 1577 xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags); 1578 } 1579 1580 /* XFS Inode Cache Walking Code */ 1581 1582 /* 1583 * The inode lookup is done in batches to keep the amount of lock traffic and 1584 * radix tree lookups to a minimum. The batch size is a trade off between 1585 * lookup reduction and stack usage. This is in the reclaim path, so we can't 1586 * be too greedy. 1587 */ 1588 #define XFS_LOOKUP_BATCH 32 1589 1590 1591 /* 1592 * Decide if we want to grab this inode in anticipation of doing work towards 1593 * the goal. 1594 */ 1595 static inline bool 1596 xfs_icwalk_igrab( 1597 enum xfs_icwalk_goal goal, 1598 struct xfs_inode *ip, 1599 struct xfs_icwalk *icw) 1600 { 1601 switch (goal) { 1602 case XFS_ICWALK_BLOCKGC: 1603 return xfs_blockgc_igrab(ip); 1604 case XFS_ICWALK_RECLAIM: 1605 return xfs_reclaim_igrab(ip, icw); 1606 default: 1607 return false; 1608 } 1609 } 1610 1611 /* 1612 * Process an inode. Each processing function must handle any state changes 1613 * made by the icwalk igrab function. Return -EAGAIN to skip an inode. 1614 */ 1615 static inline int 1616 xfs_icwalk_process_inode( 1617 enum xfs_icwalk_goal goal, 1618 struct xfs_inode *ip, 1619 struct xfs_perag *pag, 1620 struct xfs_icwalk *icw) 1621 { 1622 int error = 0; 1623 1624 switch (goal) { 1625 case XFS_ICWALK_BLOCKGC: 1626 error = xfs_blockgc_scan_inode(ip, icw); 1627 break; 1628 case XFS_ICWALK_RECLAIM: 1629 xfs_reclaim_inode(ip, pag); 1630 break; 1631 } 1632 return error; 1633 } 1634 1635 /* 1636 * For a given per-AG structure @pag and a goal, grab qualifying inodes and 1637 * process them in some manner. 
1638 */ 1639 static int 1640 xfs_icwalk_ag( 1641 struct xfs_perag *pag, 1642 enum xfs_icwalk_goal goal, 1643 struct xfs_icwalk *icw) 1644 { 1645 struct xfs_mount *mp = pag->pag_mount; 1646 uint32_t first_index; 1647 int last_error = 0; 1648 int skipped; 1649 bool done; 1650 int nr_found; 1651 1652 restart: 1653 done = false; 1654 skipped = 0; 1655 if (goal == XFS_ICWALK_RECLAIM) 1656 first_index = READ_ONCE(pag->pag_ici_reclaim_cursor); 1657 else 1658 first_index = 0; 1659 nr_found = 0; 1660 do { 1661 struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 1662 int error = 0; 1663 int i; 1664 1665 rcu_read_lock(); 1666 1667 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, 1668 (void **) batch, first_index, 1669 XFS_LOOKUP_BATCH, goal); 1670 if (!nr_found) { 1671 done = true; 1672 rcu_read_unlock(); 1673 break; 1674 } 1675 1676 /* 1677 * Grab the inodes before we drop the lock. if we found 1678 * nothing, nr == 0 and the loop will be skipped. 1679 */ 1680 for (i = 0; i < nr_found; i++) { 1681 struct xfs_inode *ip = batch[i]; 1682 1683 if (done || !xfs_icwalk_igrab(goal, ip, icw)) 1684 batch[i] = NULL; 1685 1686 /* 1687 * Update the index for the next lookup. Catch 1688 * overflows into the next AG range which can occur if 1689 * we have inodes in the last block of the AG and we 1690 * are currently pointing to the last inode. 1691 * 1692 * Because we may see inodes that are from the wrong AG 1693 * due to RCU freeing and reallocation, only update the 1694 * index if it lies in this AG. It was a race that lead 1695 * us to see this inode, so another lookup from the 1696 * same index will not find it again. 1697 */ 1698 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) 1699 continue; 1700 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 1701 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 1702 done = true; 1703 } 1704 1705 /* unlock now we've grabbed the inodes. 
*/ 1706 rcu_read_unlock(); 1707 1708 for (i = 0; i < nr_found; i++) { 1709 if (!batch[i]) 1710 continue; 1711 error = xfs_icwalk_process_inode(goal, batch[i], pag, 1712 icw); 1713 if (error == -EAGAIN) { 1714 skipped++; 1715 continue; 1716 } 1717 if (error && last_error != -EFSCORRUPTED) 1718 last_error = error; 1719 } 1720 1721 /* bail out if the filesystem is corrupted. */ 1722 if (error == -EFSCORRUPTED) 1723 break; 1724 1725 cond_resched(); 1726 1727 if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) { 1728 icw->icw_scan_limit -= XFS_LOOKUP_BATCH; 1729 if (icw->icw_scan_limit <= 0) 1730 break; 1731 } 1732 } while (nr_found && !done); 1733 1734 if (goal == XFS_ICWALK_RECLAIM) { 1735 if (done) 1736 first_index = 0; 1737 WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index); 1738 } 1739 1740 if (skipped) { 1741 delay(1); 1742 goto restart; 1743 } 1744 return last_error; 1745 } 1746 1747 /* Walk all incore inodes to achieve a given goal. */ 1748 static int 1749 xfs_icwalk( 1750 struct xfs_mount *mp, 1751 enum xfs_icwalk_goal goal, 1752 struct xfs_icwalk *icw) 1753 { 1754 struct xfs_perag *pag; 1755 int error = 0; 1756 int last_error = 0; 1757 xfs_agnumber_t agno; 1758 1759 for_each_perag_tag(mp, agno, pag, goal) { 1760 error = xfs_icwalk_ag(pag, goal, icw); 1761 if (error) { 1762 last_error = error; 1763 if (error == -EFSCORRUPTED) { 1764 xfs_perag_rele(pag); 1765 break; 1766 } 1767 } 1768 } 1769 return last_error; 1770 BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID); 1771 } 1772 1773 #ifdef DEBUG 1774 static void 1775 xfs_check_delalloc( 1776 struct xfs_inode *ip, 1777 int whichfork) 1778 { 1779 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); 1780 struct xfs_bmbt_irec got; 1781 struct xfs_iext_cursor icur; 1782 1783 if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got)) 1784 return; 1785 do { 1786 if (isnullstartblock(got.br_startblock)) { 1787 xfs_warn(ip->i_mount, 1788 "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]", 
1789 ip->i_ino, 1790 whichfork == XFS_DATA_FORK ? "data" : "cow", 1791 got.br_startoff, got.br_blockcount); 1792 } 1793 } while (xfs_iext_next_extent(ifp, &icur, &got)); 1794 } 1795 #else 1796 #define xfs_check_delalloc(ip, whichfork) do { } while (0) 1797 #endif 1798 1799 /* Schedule the inode for reclaim. */ 1800 static void 1801 xfs_inodegc_set_reclaimable( 1802 struct xfs_inode *ip) 1803 { 1804 struct xfs_mount *mp = ip->i_mount; 1805 struct xfs_perag *pag; 1806 1807 if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) { 1808 xfs_check_delalloc(ip, XFS_DATA_FORK); 1809 xfs_check_delalloc(ip, XFS_COW_FORK); 1810 ASSERT(0); 1811 } 1812 1813 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 1814 spin_lock(&pag->pag_ici_lock); 1815 spin_lock(&ip->i_flags_lock); 1816 1817 trace_xfs_inode_set_reclaimable(ip); 1818 ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING); 1819 ip->i_flags |= XFS_IRECLAIMABLE; 1820 xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), 1821 XFS_ICI_RECLAIM_TAG); 1822 1823 spin_unlock(&ip->i_flags_lock); 1824 spin_unlock(&pag->pag_ici_lock); 1825 xfs_perag_put(pag); 1826 } 1827 1828 /* 1829 * Free all speculative preallocations and possibly even the inode itself. 1830 * This is the last chance to make changes to an otherwise unreferenced file 1831 * before incore reclamation happens. 
1832 */ 1833 static int 1834 xfs_inodegc_inactivate( 1835 struct xfs_inode *ip) 1836 { 1837 int error; 1838 1839 trace_xfs_inode_inactivating(ip); 1840 error = xfs_inactive(ip); 1841 xfs_inodegc_set_reclaimable(ip); 1842 return error; 1843 1844 } 1845 1846 void 1847 xfs_inodegc_worker( 1848 struct work_struct *work) 1849 { 1850 struct xfs_inodegc *gc = container_of(to_delayed_work(work), 1851 struct xfs_inodegc, work); 1852 struct llist_node *node = llist_del_all(&gc->list); 1853 struct xfs_inode *ip, *n; 1854 struct xfs_mount *mp = gc->mp; 1855 unsigned int nofs_flag; 1856 1857 /* 1858 * Clear the cpu mask bit and ensure that we have seen the latest 1859 * update of the gc structure associated with this CPU. This matches 1860 * with the release semantics used when setting the cpumask bit in 1861 * xfs_inodegc_queue. 1862 */ 1863 cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask); 1864 smp_mb__after_atomic(); 1865 1866 WRITE_ONCE(gc->items, 0); 1867 1868 if (!node) 1869 return; 1870 1871 /* 1872 * We can allocate memory here while doing writeback on behalf of 1873 * memory reclaim. To avoid memory allocation deadlocks set the 1874 * task-wide nofs context for the following operations. 1875 */ 1876 nofs_flag = memalloc_nofs_save(); 1877 1878 ip = llist_entry(node, struct xfs_inode, i_gclist); 1879 trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits)); 1880 1881 WRITE_ONCE(gc->shrinker_hits, 0); 1882 llist_for_each_entry_safe(ip, n, node, i_gclist) { 1883 int error; 1884 1885 xfs_iflags_set(ip, XFS_INACTIVATING); 1886 error = xfs_inodegc_inactivate(ip); 1887 if (error && !gc->error) 1888 gc->error = error; 1889 } 1890 1891 memalloc_nofs_restore(nofs_flag); 1892 } 1893 1894 /* 1895 * Expedite all pending inodegc work to run immediately. This does not wait for 1896 * completion of the work. 
1897 */ 1898 void 1899 xfs_inodegc_push( 1900 struct xfs_mount *mp) 1901 { 1902 if (!xfs_is_inodegc_enabled(mp)) 1903 return; 1904 trace_xfs_inodegc_push(mp, __return_address); 1905 xfs_inodegc_queue_all(mp); 1906 } 1907 1908 /* 1909 * Force all currently queued inode inactivation work to run immediately and 1910 * wait for the work to finish. 1911 */ 1912 int 1913 xfs_inodegc_flush( 1914 struct xfs_mount *mp) 1915 { 1916 xfs_inodegc_push(mp); 1917 trace_xfs_inodegc_flush(mp, __return_address); 1918 return xfs_inodegc_wait_all(mp); 1919 } 1920 1921 /* 1922 * Flush all the pending work and then disable the inode inactivation background 1923 * workers and wait for them to stop. Caller must hold sb->s_umount to 1924 * coordinate changes in the inodegc_enabled state. 1925 */ 1926 void 1927 xfs_inodegc_stop( 1928 struct xfs_mount *mp) 1929 { 1930 bool rerun; 1931 1932 if (!xfs_clear_inodegc_enabled(mp)) 1933 return; 1934 1935 /* 1936 * Drain all pending inodegc work, including inodes that could be 1937 * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan 1938 * threads that sample the inodegc state just prior to us clearing it. 1939 * The inodegc flag state prevents new threads from queuing more 1940 * inodes, so we queue pending work items and flush the workqueue until 1941 * all inodegc lists are empty. IOWs, we cannot use drain_workqueue 1942 * here because it does not allow other unserialized mechanisms to 1943 * reschedule inodegc work while this draining is in progress. 1944 */ 1945 xfs_inodegc_queue_all(mp); 1946 do { 1947 flush_workqueue(mp->m_inodegc_wq); 1948 rerun = xfs_inodegc_queue_all(mp); 1949 } while (rerun); 1950 1951 trace_xfs_inodegc_stop(mp, __return_address); 1952 } 1953 1954 /* 1955 * Enable the inode inactivation background workers and schedule deferred inode 1956 * inactivation work if there is any. Caller must hold sb->s_umount to 1957 * coordinate changes in the inodegc_enabled state. 
1958 */ 1959 void 1960 xfs_inodegc_start( 1961 struct xfs_mount *mp) 1962 { 1963 if (xfs_set_inodegc_enabled(mp)) 1964 return; 1965 1966 trace_xfs_inodegc_start(mp, __return_address); 1967 xfs_inodegc_queue_all(mp); 1968 } 1969 1970 #ifdef CONFIG_XFS_RT 1971 static inline bool 1972 xfs_inodegc_want_queue_rt_file( 1973 struct xfs_inode *ip) 1974 { 1975 struct xfs_mount *mp = ip->i_mount; 1976 1977 if (!XFS_IS_REALTIME_INODE(ip)) 1978 return false; 1979 1980 if (__percpu_counter_compare(&mp->m_frextents, 1981 mp->m_low_rtexts[XFS_LOWSP_5_PCNT], 1982 XFS_FDBLOCKS_BATCH) < 0) 1983 return true; 1984 1985 return false; 1986 } 1987 #else 1988 # define xfs_inodegc_want_queue_rt_file(ip) (false) 1989 #endif /* CONFIG_XFS_RT */ 1990 1991 /* 1992 * Schedule the inactivation worker when: 1993 * 1994 * - We've accumulated more than one inode cluster buffer's worth of inodes. 1995 * - There is less than 5% free space left. 1996 * - Any of the quotas for this inode are near an enforcement limit. 1997 */ 1998 static inline bool 1999 xfs_inodegc_want_queue_work( 2000 struct xfs_inode *ip, 2001 unsigned int items) 2002 { 2003 struct xfs_mount *mp = ip->i_mount; 2004 2005 if (items > mp->m_ino_geo.inodes_per_cluster) 2006 return true; 2007 2008 if (__percpu_counter_compare(&mp->m_fdblocks, 2009 mp->m_low_space[XFS_LOWSP_5_PCNT], 2010 XFS_FDBLOCKS_BATCH) < 0) 2011 return true; 2012 2013 if (xfs_inodegc_want_queue_rt_file(ip)) 2014 return true; 2015 2016 if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER)) 2017 return true; 2018 2019 if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP)) 2020 return true; 2021 2022 if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ)) 2023 return true; 2024 2025 return false; 2026 } 2027 2028 /* 2029 * Upper bound on the number of inodes in each AG that can be queued for 2030 * inactivation at any given time, to avoid monopolizing the workqueue. 
2031 */ 2032 #define XFS_INODEGC_MAX_BACKLOG (4 * XFS_INODES_PER_CHUNK) 2033 2034 /* 2035 * Make the frontend wait for inactivations when: 2036 * 2037 * - Memory shrinkers queued the inactivation worker and it hasn't finished. 2038 * - The queue depth exceeds the maximum allowable percpu backlog. 2039 * 2040 * Note: If we are in a NOFS context here (e.g. current thread is running a 2041 * transaction) the we don't want to block here as inodegc progress may require 2042 * filesystem resources we hold to make progress and that could result in a 2043 * deadlock. Hence we skip out of here if we are in a scoped NOFS context. 2044 */ 2045 static inline bool 2046 xfs_inodegc_want_flush_work( 2047 struct xfs_inode *ip, 2048 unsigned int items, 2049 unsigned int shrinker_hits) 2050 { 2051 if (current->flags & PF_MEMALLOC_NOFS) 2052 return false; 2053 2054 if (shrinker_hits > 0) 2055 return true; 2056 2057 if (items > XFS_INODEGC_MAX_BACKLOG) 2058 return true; 2059 2060 return false; 2061 } 2062 2063 /* 2064 * Queue a background inactivation worker if there are inodes that need to be 2065 * inactivated and higher level xfs code hasn't disabled the background 2066 * workers. 2067 */ 2068 static void 2069 xfs_inodegc_queue( 2070 struct xfs_inode *ip) 2071 { 2072 struct xfs_mount *mp = ip->i_mount; 2073 struct xfs_inodegc *gc; 2074 int items; 2075 unsigned int shrinker_hits; 2076 unsigned int cpu_nr; 2077 unsigned long queue_delay = 1; 2078 2079 trace_xfs_inode_set_need_inactive(ip); 2080 spin_lock(&ip->i_flags_lock); 2081 ip->i_flags |= XFS_NEED_INACTIVE; 2082 spin_unlock(&ip->i_flags_lock); 2083 2084 cpu_nr = get_cpu(); 2085 gc = this_cpu_ptr(mp->m_inodegc); 2086 llist_add(&ip->i_gclist, &gc->list); 2087 items = READ_ONCE(gc->items); 2088 WRITE_ONCE(gc->items, items + 1); 2089 shrinker_hits = READ_ONCE(gc->shrinker_hits); 2090 2091 /* 2092 * Ensure the list add is always seen by anyone who finds the cpumask 2093 * bit set. 
This effectively gives the cpumask bit set operation 2094 * release ordering semantics. 2095 */ 2096 smp_mb__before_atomic(); 2097 if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask)) 2098 cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask); 2099 2100 /* 2101 * We queue the work while holding the current CPU so that the work 2102 * is scheduled to run on this CPU. 2103 */ 2104 if (!xfs_is_inodegc_enabled(mp)) { 2105 put_cpu(); 2106 return; 2107 } 2108 2109 if (xfs_inodegc_want_queue_work(ip, items)) 2110 queue_delay = 0; 2111 2112 trace_xfs_inodegc_queue(mp, __return_address); 2113 mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work, 2114 queue_delay); 2115 put_cpu(); 2116 2117 if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { 2118 trace_xfs_inodegc_throttle(mp, __return_address); 2119 flush_delayed_work(&gc->work); 2120 } 2121 } 2122 2123 /* 2124 * We set the inode flag atomically with the radix tree tag. Once we get tag 2125 * lookups on the radix tree, this inode flag can go away. 2126 * 2127 * We always use background reclaim here because even if the inode is clean, it 2128 * still may be under IO and hence we have wait for IO completion to occur 2129 * before we can reclaim the inode. The background reclaim path handles this 2130 * more efficiently than we can here, so simply let background reclaim tear down 2131 * all inodes. 2132 */ 2133 void 2134 xfs_inode_mark_reclaimable( 2135 struct xfs_inode *ip) 2136 { 2137 struct xfs_mount *mp = ip->i_mount; 2138 bool need_inactive; 2139 2140 XFS_STATS_INC(mp, vn_reclaim); 2141 2142 /* 2143 * We should never get here with any of the reclaim flags already set. 2144 */ 2145 ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS)); 2146 2147 need_inactive = xfs_inode_needs_inactive(ip); 2148 if (need_inactive) { 2149 xfs_inodegc_queue(ip); 2150 return; 2151 } 2152 2153 /* Going straight to reclaim, so drop the dquots. 
*/ 2154 xfs_qm_dqdetach(ip); 2155 xfs_inodegc_set_reclaimable(ip); 2156 } 2157 2158 /* 2159 * Register a phony shrinker so that we can run background inodegc sooner when 2160 * there's memory pressure. Inactivation does not itself free any memory but 2161 * it does make inodes reclaimable, which eventually frees memory. 2162 * 2163 * The count function, seek value, and batch value are crafted to trigger the 2164 * scan function during the second round of scanning. Hopefully this means 2165 * that we reclaimed enough memory that initiating metadata transactions won't 2166 * make things worse. 2167 */ 2168 #define XFS_INODEGC_SHRINKER_COUNT (1UL << DEF_PRIORITY) 2169 #define XFS_INODEGC_SHRINKER_BATCH ((XFS_INODEGC_SHRINKER_COUNT / 2) + 1) 2170 2171 static unsigned long 2172 xfs_inodegc_shrinker_count( 2173 struct shrinker *shrink, 2174 struct shrink_control *sc) 2175 { 2176 struct xfs_mount *mp = shrink->private_data; 2177 struct xfs_inodegc *gc; 2178 int cpu; 2179 2180 if (!xfs_is_inodegc_enabled(mp)) 2181 return 0; 2182 2183 for_each_cpu(cpu, &mp->m_inodegc_cpumask) { 2184 gc = per_cpu_ptr(mp->m_inodegc, cpu); 2185 if (!llist_empty(&gc->list)) 2186 return XFS_INODEGC_SHRINKER_COUNT; 2187 } 2188 2189 return 0; 2190 } 2191 2192 static unsigned long 2193 xfs_inodegc_shrinker_scan( 2194 struct shrinker *shrink, 2195 struct shrink_control *sc) 2196 { 2197 struct xfs_mount *mp = shrink->private_data; 2198 struct xfs_inodegc *gc; 2199 int cpu; 2200 bool no_items = true; 2201 2202 if (!xfs_is_inodegc_enabled(mp)) 2203 return SHRINK_STOP; 2204 2205 trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address); 2206 2207 for_each_cpu(cpu, &mp->m_inodegc_cpumask) { 2208 gc = per_cpu_ptr(mp->m_inodegc, cpu); 2209 if (!llist_empty(&gc->list)) { 2210 unsigned int h = READ_ONCE(gc->shrinker_hits); 2211 2212 WRITE_ONCE(gc->shrinker_hits, h + 1); 2213 mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); 2214 no_items = false; 2215 } 2216 } 2217 2218 /* 2219 * If there are no 
inodes to inactivate, we don't want the shrinker 2220 * to think there's deferred work to call us back about. 2221 */ 2222 if (no_items) 2223 return LONG_MAX; 2224 2225 return SHRINK_STOP; 2226 } 2227 2228 /* Register a shrinker so we can accelerate inodegc and throttle queuing. */ 2229 int 2230 xfs_inodegc_register_shrinker( 2231 struct xfs_mount *mp) 2232 { 2233 mp->m_inodegc_shrinker = shrinker_alloc(SHRINKER_NONSLAB, 2234 "xfs-inodegc:%s", 2235 mp->m_super->s_id); 2236 if (!mp->m_inodegc_shrinker) 2237 return -ENOMEM; 2238 2239 mp->m_inodegc_shrinker->count_objects = xfs_inodegc_shrinker_count; 2240 mp->m_inodegc_shrinker->scan_objects = xfs_inodegc_shrinker_scan; 2241 mp->m_inodegc_shrinker->seeks = 0; 2242 mp->m_inodegc_shrinker->batch = XFS_INODEGC_SHRINKER_BATCH; 2243 mp->m_inodegc_shrinker->private_data = mp; 2244 2245 shrinker_register(mp->m_inodegc_shrinker); 2246 2247 return 0; 2248 } 2249