1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_inode.h" 14 #include "xfs_trans.h" 15 #include "xfs_trans_priv.h" 16 #include "xfs_inode_item.h" 17 #include "xfs_quota.h" 18 #include "xfs_trace.h" 19 #include "xfs_icache.h" 20 #include "xfs_bmap_util.h" 21 #include "xfs_dquot_item.h" 22 #include "xfs_dquot.h" 23 #include "xfs_reflink.h" 24 #include "xfs_ialloc.h" 25 #include "xfs_ag.h" 26 #include "xfs_log_priv.h" 27 #include "xfs_health.h" 28 #include "xfs_da_format.h" 29 #include "xfs_dir2.h" 30 #include "xfs_metafile.h" 31 32 #include <linux/iversion.h> 33 34 /* Radix tree tags for incore inode tree. */ 35 36 /* inode is to be reclaimed */ 37 #define XFS_ICI_RECLAIM_TAG 0 38 /* Inode has speculative preallocations (posteof or cow) to clean. */ 39 #define XFS_ICI_BLOCKGC_TAG 1 40 41 /* 42 * The goal for walking incore inodes. These can correspond with incore inode 43 * radix tree tags when convenient. Avoid existing XFS_IWALK namespace. 44 */ 45 enum xfs_icwalk_goal { 46 /* Goals directly associated with tagged inodes. */ 47 XFS_ICWALK_BLOCKGC = XFS_ICI_BLOCKGC_TAG, 48 XFS_ICWALK_RECLAIM = XFS_ICI_RECLAIM_TAG, 49 }; 50 51 static int xfs_icwalk(struct xfs_mount *mp, 52 enum xfs_icwalk_goal goal, struct xfs_icwalk *icw); 53 static int xfs_icwalk_ag(struct xfs_perag *pag, 54 enum xfs_icwalk_goal goal, struct xfs_icwalk *icw); 55 56 /* 57 * Private inode cache walk flags for struct xfs_icwalk. Must not 58 * coincide with XFS_ICWALK_FLAGS_VALID. 59 */ 60 61 /* Stop scanning after icw_scan_limit inodes. */ 62 #define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28) 63 64 #define XFS_ICWALK_FLAG_RECLAIM_SICK (1U << 27) 65 #define XFS_ICWALK_FLAG_UNION (1U << 26) /* union filter algorithm */ 66 67 #define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_SCAN_LIMIT | \ 68 XFS_ICWALK_FLAG_RECLAIM_SICK | \ 69 XFS_ICWALK_FLAG_UNION) 70 71 /* Marks for the perag xarray */ 72 #define XFS_PERAG_RECLAIM_MARK XA_MARK_0 73 #define XFS_PERAG_BLOCKGC_MARK XA_MARK_1 74 75 static inline xa_mark_t ici_tag_to_mark(unsigned int tag) 76 { 77 if (tag == XFS_ICI_RECLAIM_TAG) 78 return XFS_PERAG_RECLAIM_MARK; 79 ASSERT(tag == XFS_ICI_BLOCKGC_TAG); 80 return XFS_PERAG_BLOCKGC_MARK; 81 } 82 83 /* 84 * Allocate and initialise an xfs_inode. 85 */ 86 struct xfs_inode * 87 xfs_inode_alloc( 88 struct xfs_mount *mp, 89 xfs_ino_t ino) 90 { 91 struct xfs_inode *ip; 92 93 /* 94 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL 95 * and return NULL here on ENOMEM. 96 */ 97 ip = alloc_inode_sb(mp->m_super, xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL); 98 99 if (inode_init_always(mp->m_super, VFS_I(ip))) { 100 kmem_cache_free(xfs_inode_cache, ip); 101 return NULL; 102 } 103 104 /* VFS doesn't initialise i_mode! 
*/ 105 VFS_I(ip)->i_mode = 0; 106 mapping_set_folio_min_order(VFS_I(ip)->i_mapping, 107 M_IGEO(mp)->min_folio_order); 108 109 XFS_STATS_INC(mp, vn_active); 110 ASSERT(atomic_read(&ip->i_pincount) == 0); 111 ASSERT(ip->i_ino == 0); 112 113 /* initialise the xfs inode */ 114 ip->i_ino = ino; 115 ip->i_mount = mp; 116 memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); 117 ip->i_cowfp = NULL; 118 memset(&ip->i_af, 0, sizeof(ip->i_af)); 119 ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS; 120 memset(&ip->i_df, 0, sizeof(ip->i_df)); 121 ip->i_flags = 0; 122 ip->i_delayed_blks = 0; 123 ip->i_diflags2 = mp->m_ino_geo.new_diflags2; 124 ip->i_nblocks = 0; 125 ip->i_forkoff = 0; 126 ip->i_sick = 0; 127 ip->i_checked = 0; 128 INIT_WORK(&ip->i_ioend_work, xfs_end_io); 129 INIT_LIST_HEAD(&ip->i_ioend_list); 130 spin_lock_init(&ip->i_ioend_lock); 131 ip->i_next_unlinked = NULLAGINO; 132 ip->i_prev_unlinked = 0; 133 134 return ip; 135 } 136 137 STATIC void 138 xfs_inode_free_callback( 139 struct rcu_head *head) 140 { 141 struct inode *inode = container_of(head, struct inode, i_rcu); 142 struct xfs_inode *ip = XFS_I(inode); 143 144 switch (VFS_I(ip)->i_mode & S_IFMT) { 145 case S_IFREG: 146 case S_IFDIR: 147 case S_IFLNK: 148 xfs_idestroy_fork(&ip->i_df); 149 break; 150 } 151 152 xfs_ifork_zap_attr(ip); 153 154 if (ip->i_cowfp) { 155 xfs_idestroy_fork(ip->i_cowfp); 156 kmem_cache_free(xfs_ifork_cache, ip->i_cowfp); 157 } 158 if (ip->i_itemp) { 159 ASSERT(!test_bit(XFS_LI_IN_AIL, 160 &ip->i_itemp->ili_item.li_flags)); 161 xfs_inode_item_destroy(ip); 162 ip->i_itemp = NULL; 163 } 164 165 kmem_cache_free(xfs_inode_cache, ip); 166 } 167 168 static void 169 __xfs_inode_free( 170 struct xfs_inode *ip) 171 { 172 /* asserts to verify all state is correct here */ 173 ASSERT(atomic_read(&ip->i_pincount) == 0); 174 ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list)); 175 XFS_STATS_DEC(ip->i_mount, vn_active); 176 177 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); 178 } 179 180 void 181 xfs_inode_free( 182 struct xfs_inode *ip) 183 { 184 ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING)); 185 186 /* 187 * Because we use RCU freeing we need to ensure the inode always 188 * appears to be reclaimed with an invalid inode number when in the 189 * free state. The ip->i_flags_lock provides the barrier against lookup 190 * races. 191 */ 192 spin_lock(&ip->i_flags_lock); 193 ip->i_flags = XFS_IRECLAIM; 194 ip->i_ino = 0; 195 spin_unlock(&ip->i_flags_lock); 196 197 __xfs_inode_free(ip); 198 } 199 200 /* 201 * Queue background inode reclaim work if there are reclaimable inodes and there 202 * isn't reclaim work already scheduled or in progress. 203 */ 204 static void 205 xfs_reclaim_work_queue( 206 struct xfs_mount *mp) 207 { 208 209 rcu_read_lock(); 210 if (xfs_group_marked(mp, XG_TYPE_AG, XFS_PERAG_RECLAIM_MARK)) { 211 queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, 212 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); 213 } 214 rcu_read_unlock(); 215 } 216 217 /* 218 * Background scanning to trim preallocated space. This is queued based on the 219 * 'speculative_prealloc_lifetime' tunable (5m by default). 
220 */ 221 static inline void 222 xfs_blockgc_queue( 223 struct xfs_perag *pag) 224 { 225 struct xfs_mount *mp = pag_mount(pag); 226 227 if (!xfs_is_blockgc_enabled(mp)) 228 return; 229 230 rcu_read_lock(); 231 if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) 232 queue_delayed_work(mp->m_blockgc_wq, &pag->pag_blockgc_work, 233 secs_to_jiffies(xfs_blockgc_secs)); 234 rcu_read_unlock(); 235 } 236 237 /* Set a tag on both the AG incore inode tree and the AG radix tree. */ 238 static void 239 xfs_perag_set_inode_tag( 240 struct xfs_perag *pag, 241 xfs_agino_t agino, 242 unsigned int tag) 243 { 244 bool was_tagged; 245 246 lockdep_assert_held(&pag->pag_ici_lock); 247 248 was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag); 249 radix_tree_tag_set(&pag->pag_ici_root, agino, tag); 250 251 if (tag == XFS_ICI_RECLAIM_TAG) 252 pag->pag_ici_reclaimable++; 253 254 if (was_tagged) 255 return; 256 257 /* propagate the tag up into the pag xarray tree */ 258 xfs_group_set_mark(pag_group(pag), ici_tag_to_mark(tag)); 259 260 /* start background work */ 261 switch (tag) { 262 case XFS_ICI_RECLAIM_TAG: 263 xfs_reclaim_work_queue(pag_mount(pag)); 264 break; 265 case XFS_ICI_BLOCKGC_TAG: 266 xfs_blockgc_queue(pag); 267 break; 268 } 269 270 trace_xfs_perag_set_inode_tag(pag, _RET_IP_); 271 } 272 273 /* Clear a tag on both the AG incore inode tree and the AG radix tree. */ 274 static void 275 xfs_perag_clear_inode_tag( 276 struct xfs_perag *pag, 277 xfs_agino_t agino, 278 unsigned int tag) 279 { 280 lockdep_assert_held(&pag->pag_ici_lock); 281 282 /* 283 * Reclaim can signal (with a null agino) that it cleared its own tag 284 * by removing the inode from the radix tree. 285 */ 286 if (agino != NULLAGINO) 287 radix_tree_tag_clear(&pag->pag_ici_root, agino, tag); 288 else 289 ASSERT(tag == XFS_ICI_RECLAIM_TAG); 290 291 if (tag == XFS_ICI_RECLAIM_TAG) 292 pag->pag_ici_reclaimable--; 293 294 if (radix_tree_tagged(&pag->pag_ici_root, tag)) 295 return; 296 297 /* clear the tag from the pag xarray */ 298 xfs_group_clear_mark(pag_group(pag), ici_tag_to_mark(tag)); 299 trace_xfs_perag_clear_inode_tag(pag, _RET_IP_); 300 } 301 302 /* 303 * Find the next AG after @pag, or the first AG if @pag is NULL. 304 */ 305 static struct xfs_perag * 306 xfs_perag_grab_next_tag( 307 struct xfs_mount *mp, 308 struct xfs_perag *pag, 309 int tag) 310 { 311 return to_perag(xfs_group_grab_next_mark(mp, 312 pag ? pag_group(pag) : NULL, 313 ici_tag_to_mark(tag), XG_TYPE_AG)); 314 } 315 316 /* 317 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode 318 * part of the structure. This is made more complex by the fact we store 319 * information about the on-disk values in the VFS inode and so we can't just 320 * overwrite the values unconditionally. Hence we save the parameters we 321 * need to retain across reinitialisation, and rewrite them into the VFS inode 322 * after reinitialisation even if it fails. 
323 */ 324 static int 325 xfs_reinit_inode( 326 struct xfs_mount *mp, 327 struct inode *inode) 328 { 329 int error; 330 uint32_t nlink = inode->i_nlink; 331 uint32_t generation = inode->i_generation; 332 uint64_t version = inode_peek_iversion(inode); 333 umode_t mode = inode->i_mode; 334 dev_t dev = inode->i_rdev; 335 kuid_t uid = inode->i_uid; 336 kgid_t gid = inode->i_gid; 337 unsigned long state = inode_state_read_once(inode); 338 339 error = inode_init_always(mp->m_super, inode); 340 341 set_nlink(inode, nlink); 342 inode->i_generation = generation; 343 inode_set_iversion_queried(inode, version); 344 inode->i_mode = mode; 345 inode->i_rdev = dev; 346 inode->i_uid = uid; 347 inode->i_gid = gid; 348 inode_state_assign_raw(inode, state); 349 mapping_set_folio_min_order(inode->i_mapping, 350 M_IGEO(mp)->min_folio_order); 351 return error; 352 } 353 354 /* 355 * Carefully nudge an inode whose VFS state has been torn down back into a 356 * usable state. Drops the i_flags_lock and the rcu read lock. 357 */ 358 static int 359 xfs_iget_recycle( 360 struct xfs_perag *pag, 361 struct xfs_inode *ip) 362 { 363 struct xfs_mount *mp = ip->i_mount; 364 struct inode *inode = VFS_I(ip); 365 int error; 366 367 trace_xfs_iget_recycle(ip); 368 369 ASSERT(!rwsem_is_locked(&inode->i_rwsem)); 370 error = xfs_reinit_inode(mp, inode); 371 xfs_iunlock(ip, XFS_ILOCK_EXCL); 372 if (error) { 373 /* 374 * Re-initializing the inode failed, and we are in deep 375 * trouble. Try to re-add it to the reclaim list. 376 */ 377 rcu_read_lock(); 378 spin_lock(&ip->i_flags_lock); 379 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); 380 ASSERT(ip->i_flags & XFS_IRECLAIMABLE); 381 spin_unlock(&ip->i_flags_lock); 382 rcu_read_unlock(); 383 384 trace_xfs_iget_recycle_fail(ip); 385 return error; 386 } 387 388 spin_lock(&pag->pag_ici_lock); 389 spin_lock(&ip->i_flags_lock); 390 391 /* 392 * Clear the per-lifetime state in the inode as we are now effectively 393 * a new inode and need to return to the initial state before reuse 394 * occurs. 395 */ 396 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; 397 ip->i_flags |= XFS_INEW; 398 xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), 399 XFS_ICI_RECLAIM_TAG); 400 inode_state_assign_raw(inode, I_NEW); 401 spin_unlock(&ip->i_flags_lock); 402 spin_unlock(&pag->pag_ici_lock); 403 404 return 0; 405 } 406 407 /* 408 * If we are allocating a new inode, then check what was returned is 409 * actually a free, empty inode. If we are not allocating an inode, 410 * then check we didn't find a free inode. 411 * 412 * Returns: 413 * 0 if the inode free state matches the lookup context 414 * -ENOENT if the inode is free and we are not allocating 415 * -EFSCORRUPTED if there is any state mismatch at all 416 */ 417 static int 418 xfs_iget_check_free_state( 419 struct xfs_inode *ip, 420 int flags) 421 { 422 if (flags & XFS_IGET_CREATE) { 423 /* should be a free inode */ 424 if (VFS_I(ip)->i_mode != 0) { 425 xfs_warn(ip->i_mount, 426 "Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)", 427 ip->i_ino, VFS_I(ip)->i_mode); 428 xfs_agno_mark_sick(ip->i_mount, 429 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 430 XFS_SICK_AG_INOBT); 431 return -EFSCORRUPTED; 432 } 433 434 if (ip->i_nblocks != 0) { 435 xfs_warn(ip->i_mount, 436 "Corruption detected! 
Free inode 0x%llx has blocks allocated!", 437 ip->i_ino); 438 xfs_agno_mark_sick(ip->i_mount, 439 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 440 XFS_SICK_AG_INOBT); 441 return -EFSCORRUPTED; 442 } 443 return 0; 444 } 445 446 /* should be an allocated inode */ 447 if (VFS_I(ip)->i_mode == 0) 448 return -ENOENT; 449 450 return 0; 451 } 452 453 /* Make all pending inactivation work start immediately. */ 454 static bool 455 xfs_inodegc_queue_all( 456 struct xfs_mount *mp) 457 { 458 struct xfs_inodegc *gc; 459 int cpu; 460 bool ret = false; 461 462 for_each_cpu(cpu, &mp->m_inodegc_cpumask) { 463 gc = per_cpu_ptr(mp->m_inodegc, cpu); 464 if (!llist_empty(&gc->list)) { 465 mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); 466 ret = true; 467 } 468 } 469 470 return ret; 471 } 472 473 /* Wait for all queued work and collect errors */ 474 static int 475 xfs_inodegc_wait_all( 476 struct xfs_mount *mp) 477 { 478 int cpu; 479 int error = 0; 480 481 flush_workqueue(mp->m_inodegc_wq); 482 for_each_cpu(cpu, &mp->m_inodegc_cpumask) { 483 struct xfs_inodegc *gc; 484 485 gc = per_cpu_ptr(mp->m_inodegc, cpu); 486 if (gc->error && !error) 487 error = gc->error; 488 gc->error = 0; 489 } 490 491 return error; 492 } 493 494 /* 495 * Check the validity of the inode we just found it the cache 496 */ 497 static int 498 xfs_iget_cache_hit( 499 struct xfs_perag *pag, 500 struct xfs_inode *ip, 501 xfs_ino_t ino, 502 int flags, 503 int lock_flags) __releases(RCU) 504 { 505 struct inode *inode = VFS_I(ip); 506 struct xfs_mount *mp = ip->i_mount; 507 int error; 508 509 /* 510 * check for re-use of an inode within an RCU grace period due to the 511 * radix tree nodes not being updated yet. We monitor for this by 512 * setting the inode number to zero before freeing the inode structure. 513 * If the inode has been reallocated and set up, then the inode number 514 * will not match, so check for that, too. 515 */ 516 spin_lock(&ip->i_flags_lock); 517 if (ip->i_ino != ino) 518 goto out_skip; 519 520 /* 521 * If we are racing with another cache hit that is currently 522 * instantiating this inode or currently recycling it out of 523 * reclaimable state, wait for the initialisation to complete 524 * before continuing. 525 * 526 * If we're racing with the inactivation worker we also want to wait. 527 * If we're creating a new file, it's possible that the worker 528 * previously marked the inode as free on disk but hasn't finished 529 * updating the incore state yet. The AGI buffer will be dirty and 530 * locked to the icreate transaction, so a synchronous push of the 531 * inodegc workers would result in deadlock. For a regular iget, the 532 * worker is running already, so we might as well wait. 533 * 534 * XXX(hch): eventually we should do something equivalent to 535 * wait_on_inode to wait for these flags to be cleared 536 * instead of polling for it. 537 */ 538 if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING)) 539 goto out_skip; 540 541 if (ip->i_flags & XFS_NEED_INACTIVE) { 542 /* Unlinked inodes cannot be re-grabbed. */ 543 if (VFS_I(ip)->i_nlink == 0) { 544 error = -ENOENT; 545 goto out_error; 546 } 547 goto out_inodegc_flush; 548 } 549 550 /* 551 * Check the inode free state is valid. This also detects lookup 552 * racing with unlinks. 553 */ 554 error = xfs_iget_check_free_state(ip, flags); 555 if (error) 556 goto out_error; 557 558 /* Skip inodes that have no vfs state. 
*/ 559 if ((flags & XFS_IGET_INCORE) && 560 (ip->i_flags & XFS_IRECLAIMABLE)) 561 goto out_skip; 562 563 /* The inode fits the selection criteria; process it. */ 564 if (ip->i_flags & XFS_IRECLAIMABLE) { 565 /* 566 * We need to make it look like the inode is being reclaimed to 567 * prevent the actual reclaim workers from stomping over us 568 * while we recycle the inode. We can't clear the radix tree 569 * tag yet as it requires pag_ici_lock to be held exclusive. 570 */ 571 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) 572 goto out_skip; 573 ip->i_flags |= XFS_IRECLAIM; 574 spin_unlock(&ip->i_flags_lock); 575 rcu_read_unlock(); 576 577 error = xfs_iget_recycle(pag, ip); 578 if (error) 579 return error; 580 } else { 581 /* If the VFS inode is being torn down, pause and try again. */ 582 if (!igrab(inode)) 583 goto out_skip; 584 585 /* We've got a live one. */ 586 spin_unlock(&ip->i_flags_lock); 587 rcu_read_unlock(); 588 trace_xfs_iget_hit(ip); 589 } 590 591 if (lock_flags != 0) 592 xfs_ilock(ip, lock_flags); 593 594 if (!(flags & XFS_IGET_INCORE)) 595 xfs_iflags_clear(ip, XFS_ISTALE); 596 XFS_STATS_INC(mp, xs_ig_found); 597 598 return 0; 599 600 out_skip: 601 trace_xfs_iget_skip(ip); 602 XFS_STATS_INC(mp, xs_ig_frecycle); 603 error = -EAGAIN; 604 out_error: 605 spin_unlock(&ip->i_flags_lock); 606 rcu_read_unlock(); 607 return error; 608 609 out_inodegc_flush: 610 spin_unlock(&ip->i_flags_lock); 611 rcu_read_unlock(); 612 /* 613 * Do not wait for the workers, because the caller could hold an AGI 614 * buffer lock. We're just going to sleep in a loop anyway. 615 */ 616 if (xfs_is_inodegc_enabled(mp)) 617 xfs_inodegc_queue_all(mp); 618 return -EAGAIN; 619 } 620 621 static int 622 xfs_iget_cache_miss( 623 struct xfs_mount *mp, 624 struct xfs_perag *pag, 625 xfs_trans_t *tp, 626 xfs_ino_t ino, 627 struct xfs_inode **ipp, 628 int flags, 629 int lock_flags) 630 { 631 struct xfs_inode *ip; 632 int error; 633 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); 634 635 ip = xfs_inode_alloc(mp, ino); 636 if (!ip) 637 return -ENOMEM; 638 639 error = xfs_imap(pag, tp, ip->i_ino, &ip->i_imap, flags); 640 if (error) 641 goto out_destroy; 642 643 /* 644 * For version 5 superblocks, if we are initialising a new inode, we 645 * simply build the new inode core with a random generation number. 646 * 647 * For version 4 (and older) superblocks, log recovery is dependent on 648 * the i_flushiter field being initialised from the current on-disk 649 * value and hence we must also read the inode off disk even when 650 * initializing new inodes. 651 */ 652 if (xfs_has_v3inodes(mp) && (flags & XFS_IGET_CREATE)) { 653 VFS_I(ip)->i_generation = get_random_u32(); 654 } else { 655 struct xfs_buf *bp; 656 657 error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp); 658 if (error) 659 goto out_destroy; 660 661 error = xfs_inode_from_disk(ip, 662 xfs_buf_offset(bp, ip->i_imap.im_boffset)); 663 if (!error) 664 xfs_buf_set_ref(bp, XFS_INO_REF); 665 else 666 xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE); 667 xfs_trans_brelse(tp, bp); 668 669 if (error) 670 goto out_destroy; 671 } 672 673 trace_xfs_iget_miss(ip); 674 675 /* 676 * Check the inode free state is valid. This also detects lookup 677 * racing with unlinks. 678 */ 679 error = xfs_iget_check_free_state(ip, flags); 680 if (error) 681 goto out_destroy; 682 683 /* 684 * Preload the radix tree so we can insert safely under the 685 * write spinlock. Note that we cannot sleep inside the preload 686 * region. 
687 */ 688 if (radix_tree_preload(GFP_KERNEL | __GFP_NOLOCKDEP)) { 689 error = -EAGAIN; 690 goto out_destroy; 691 } 692 693 /* 694 * Because the inode hasn't been added to the radix-tree yet it can't 695 * be found by another thread, so we can do the non-sleeping lock here. 696 */ 697 if (lock_flags) { 698 if (!xfs_ilock_nowait(ip, lock_flags)) 699 BUG(); 700 } 701 702 /* 703 * These values must be set before inserting the inode into the radix 704 * tree as the moment it is inserted a concurrent lookup (allowed by the 705 * RCU locking mechanism) can find it and that lookup must see that this 706 * is an inode currently under construction (i.e. that XFS_INEW is set). 707 * The ip->i_flags_lock that protects the XFS_INEW flag forms the 708 * memory barrier that ensures this detection works correctly at lookup 709 * time. 710 */ 711 if (flags & XFS_IGET_DONTCACHE) 712 d_mark_dontcache(VFS_I(ip)); 713 ip->i_udquot = NULL; 714 ip->i_gdquot = NULL; 715 ip->i_pdquot = NULL; 716 xfs_iflags_set(ip, XFS_INEW); 717 718 /* insert the new inode */ 719 spin_lock(&pag->pag_ici_lock); 720 error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 721 if (unlikely(error)) { 722 WARN_ON(error != -EEXIST); 723 XFS_STATS_INC(mp, xs_ig_dup); 724 error = -EAGAIN; 725 goto out_preload_end; 726 } 727 spin_unlock(&pag->pag_ici_lock); 728 radix_tree_preload_end(); 729 730 *ipp = ip; 731 return 0; 732 733 out_preload_end: 734 spin_unlock(&pag->pag_ici_lock); 735 radix_tree_preload_end(); 736 if (lock_flags) 737 xfs_iunlock(ip, lock_flags); 738 out_destroy: 739 __destroy_inode(VFS_I(ip)); 740 xfs_inode_free(ip); 741 return error; 742 } 743 744 /* 745 * Look up an inode by number in the given file system. The inode is looked up 746 * in the cache held in each AG. If the inode is found in the cache, initialise 747 * the vfs inode if necessary. 748 * 749 * If it is not in core, read it in from the file system's device, add it to the 750 * cache and initialise the vfs inode. 751 * 752 * The inode is locked according to the value of the lock_flags parameter. 753 * Inode lookup is only done during metadata operations and not as part of the 754 * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup. 755 */ 756 int 757 xfs_iget( 758 struct xfs_mount *mp, 759 struct xfs_trans *tp, 760 xfs_ino_t ino, 761 uint flags, 762 uint lock_flags, 763 struct xfs_inode **ipp) 764 { 765 struct xfs_inode *ip; 766 struct xfs_perag *pag; 767 xfs_agino_t agino; 768 int error; 769 770 ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); 771 772 /* reject inode numbers outside existing AGs */ 773 if (!xfs_verify_ino(mp, ino)) 774 return -EINVAL; 775 776 XFS_STATS_INC(mp, xs_ig_attempts); 777 778 /* get the perag structure and ensure that it's inode capable */ 779 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); 780 agino = XFS_INO_TO_AGINO(mp, ino); 781 782 again: 783 error = 0; 784 rcu_read_lock(); 785 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 786 787 if (ip) { 788 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); 789 if (error) 790 goto out_error_or_again; 791 } else { 792 rcu_read_unlock(); 793 if (flags & XFS_IGET_INCORE) { 794 error = -ENODATA; 795 goto out_error_or_again; 796 } 797 XFS_STATS_INC(mp, xs_ig_missed); 798 799 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, 800 flags, lock_flags); 801 if (error) 802 goto out_error_or_again; 803 } 804 xfs_perag_put(pag); 805 806 *ipp = ip; 807 808 /* 809 * If we have a real type for an on-disk inode, we can setup the inode 810 * now. 
If it's a new inode being created, xfs_init_new_inode will 811 * handle it. 812 */ 813 if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0) 814 xfs_setup_existing_inode(ip); 815 return 0; 816 817 out_error_or_again: 818 if (!(flags & (XFS_IGET_INCORE | XFS_IGET_NORETRY)) && 819 error == -EAGAIN) { 820 delay(1); 821 goto again; 822 } 823 xfs_perag_put(pag); 824 return error; 825 } 826 827 /* 828 * Get a metadata inode. 829 * 830 * The metafile type must match the file mode exactly, and for files in the 831 * metadata directory tree, it must match the inode's metatype exactly. 832 */ 833 int 834 xfs_trans_metafile_iget( 835 struct xfs_trans *tp, 836 xfs_ino_t ino, 837 enum xfs_metafile_type metafile_type, 838 struct xfs_inode **ipp) 839 { 840 struct xfs_mount *mp = tp->t_mountp; 841 struct xfs_inode *ip; 842 umode_t mode; 843 int error; 844 845 error = xfs_iget(mp, tp, ino, 0, 0, &ip); 846 if (error == -EFSCORRUPTED || error == -EINVAL) 847 goto whine; 848 if (error) 849 return error; 850 851 if (VFS_I(ip)->i_nlink == 0) 852 goto bad_rele; 853 854 if (metafile_type == XFS_METAFILE_DIR) 855 mode = S_IFDIR; 856 else 857 mode = S_IFREG; 858 if (inode_wrong_type(VFS_I(ip), mode)) 859 goto bad_rele; 860 if (xfs_has_metadir(mp)) { 861 if (!xfs_is_metadir_inode(ip)) 862 goto bad_rele; 863 if (metafile_type != ip->i_metatype) 864 goto bad_rele; 865 } 866 867 *ipp = ip; 868 return 0; 869 bad_rele: 870 xfs_irele(ip); 871 whine: 872 xfs_err(mp, "metadata inode 0x%llx type %u is corrupt", ino, 873 metafile_type); 874 xfs_fs_mark_sick(mp, XFS_SICK_FS_METADIR); 875 return -EFSCORRUPTED; 876 } 877 878 /* Grab a metadata file if the caller doesn't already have a transaction. */ 879 int 880 xfs_metafile_iget( 881 struct xfs_mount *mp, 882 xfs_ino_t ino, 883 enum xfs_metafile_type metafile_type, 884 struct xfs_inode **ipp) 885 { 886 struct xfs_trans *tp; 887 int error; 888 889 tp = xfs_trans_alloc_empty(mp); 890 error = xfs_trans_metafile_iget(tp, ino, metafile_type, ipp); 891 xfs_trans_cancel(tp); 892 return error; 893 } 894 895 /* 896 * Grab the inode for reclaim exclusively. 897 * 898 * We have found this inode via a lookup under RCU, so the inode may have 899 * already been freed, or it may be in the process of being recycled by 900 * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode 901 * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE 902 * will not be set. Hence we need to check for both these flag conditions to 903 * avoid inodes that are no longer reclaim candidates. 904 * 905 * Note: checking for other state flags here, under the i_flags_lock or not, is 906 * racy and should be avoided. Those races should be resolved only after we have 907 * ensured that we are able to reclaim this inode and the world can see that we 908 * are going to reclaim it. 909 * 910 * Return true if we grabbed it, false otherwise. 911 */ 912 static bool 913 xfs_reclaim_igrab( 914 struct xfs_inode *ip, 915 struct xfs_icwalk *icw) 916 { 917 ASSERT(rcu_read_lock_held()); 918 919 spin_lock(&ip->i_flags_lock); 920 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || 921 __xfs_iflags_test(ip, XFS_IRECLAIM)) { 922 /* not a reclaim candidate. */ 923 spin_unlock(&ip->i_flags_lock); 924 return false; 925 } 926 927 /* Don't reclaim a sick inode unless the caller asked for it. 
*/ 928 if (ip->i_sick && 929 (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) { 930 spin_unlock(&ip->i_flags_lock); 931 return false; 932 } 933 934 __xfs_iflags_set(ip, XFS_IRECLAIM); 935 spin_unlock(&ip->i_flags_lock); 936 return true; 937 } 938 939 /* 940 * Inode reclaim is non-blocking, so the default action if progress cannot be 941 * made is to "requeue" the inode for reclaim by unlocking it and clearing the 942 * XFS_IRECLAIM flag. If we are in a shutdown state, we don't care about 943 * blocking anymore and hence we can wait for the inode to be able to reclaim 944 * it. 945 * 946 * We do no IO here - if callers require inodes to be cleaned they must push the 947 * AIL first to trigger writeback of dirty inodes. This enables writeback to be 948 * done in the background in a non-blocking manner, and enables memory reclaim 949 * to make progress without blocking. 950 */ 951 static void 952 xfs_reclaim_inode( 953 struct xfs_inode *ip, 954 struct xfs_perag *pag) 955 { 956 xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ 957 958 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) 959 goto out; 960 if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING)) 961 goto out_iunlock; 962 963 /* 964 * Check for log shutdown because aborting the inode can move the log 965 * tail and corrupt in memory state. This is fine if the log is shut 966 * down, but if the log is still active and only the mount is shut down 967 * then the in-memory log tail movement caused by the abort can be 968 * incorrectly propagated to disk. 969 */ 970 if (xlog_is_shutdown(ip->i_mount->m_log)) { 971 xfs_iunpin_wait(ip); 972 /* 973 * Avoid a ABBA deadlock on the inode cluster buffer vs 974 * concurrent xfs_ifree_cluster() trying to mark the inode 975 * stale. We don't need the inode locked to run the flush abort 976 * code, but the flush abort needs to lock the cluster buffer. 977 */ 978 xfs_iunlock(ip, XFS_ILOCK_EXCL); 979 xfs_iflush_shutdown_abort(ip); 980 xfs_ilock(ip, XFS_ILOCK_EXCL); 981 goto reclaim; 982 } 983 if (xfs_ipincount(ip)) 984 goto out_clear_flush; 985 if (!xfs_inode_clean(ip)) 986 goto out_clear_flush; 987 988 xfs_iflags_clear(ip, XFS_IFLUSHING); 989 reclaim: 990 trace_xfs_inode_reclaiming(ip); 991 992 /* 993 * Because we use RCU freeing we need to ensure the inode always appears 994 * to be reclaimed with an invalid inode number when in the free state. 995 * We do this as early as possible under the ILOCK so that 996 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to 997 * detect races with us here. By doing this, we guarantee that once 998 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that 999 * it will see either a valid inode that will serialise correctly, or it 1000 * will see an invalid inode that it can skip. 1001 */ 1002 spin_lock(&ip->i_flags_lock); 1003 ip->i_flags = XFS_IRECLAIM; 1004 ip->i_ino = 0; 1005 ip->i_sick = 0; 1006 ip->i_checked = 0; 1007 spin_unlock(&ip->i_flags_lock); 1008 1009 ASSERT(!ip->i_itemp || ip->i_itemp->ili_item.li_buf == NULL); 1010 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1011 1012 XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); 1013 /* 1014 * Remove the inode from the per-AG radix tree. 1015 * 1016 * Because radix_tree_delete won't complain even if the item was never 1017 * added to the tree assert that it's been there before to catch 1018 * problems with the inode life time early on. 
1019 */ 1020 spin_lock(&pag->pag_ici_lock); 1021 if (!radix_tree_delete(&pag->pag_ici_root, 1022 XFS_INO_TO_AGINO(ip->i_mount, ino))) 1023 ASSERT(0); 1024 xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG); 1025 spin_unlock(&pag->pag_ici_lock); 1026 1027 /* 1028 * Here we do an (almost) spurious inode lock in order to coordinate 1029 * with inode cache radix tree lookups. This is because the lookup 1030 * can reference the inodes in the cache without taking references. 1031 * 1032 * We make that OK here by ensuring that we wait until the inode is 1033 * unlocked after the lookup before we go ahead and free it. 1034 */ 1035 xfs_ilock(ip, XFS_ILOCK_EXCL); 1036 ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot); 1037 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1038 ASSERT(xfs_inode_clean(ip)); 1039 1040 __xfs_inode_free(ip); 1041 return; 1042 1043 out_clear_flush: 1044 xfs_iflags_clear(ip, XFS_IFLUSHING); 1045 out_iunlock: 1046 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1047 out: 1048 xfs_iflags_clear(ip, XFS_IRECLAIM); 1049 } 1050 1051 /* Reclaim sick inodes if we're unmounting or the fs went down. */ 1052 static inline bool 1053 xfs_want_reclaim_sick( 1054 struct xfs_mount *mp) 1055 { 1056 return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) || 1057 xfs_is_shutdown(mp); 1058 } 1059 1060 void 1061 xfs_reclaim_inodes( 1062 struct xfs_mount *mp) 1063 { 1064 struct xfs_icwalk icw = { 1065 .icw_flags = 0, 1066 }; 1067 1068 if (xfs_want_reclaim_sick(mp)) 1069 icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; 1070 1071 while (xfs_group_marked(mp, XG_TYPE_AG, XFS_PERAG_RECLAIM_MARK)) { 1072 xfs_ail_push_all_sync(mp->m_ail); 1073 xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw); 1074 } 1075 } 1076 1077 /* 1078 * The shrinker infrastructure determines how many inodes we should scan for 1079 * reclaim. We want as many clean inodes ready to reclaim as possible, so we 1080 * push the AIL here. We also want to proactively free up memory if we can to 1081 * minimise the amount of work memory reclaim has to do so we kick the 1082 * background reclaim if it isn't already scheduled. 1083 */ 1084 long 1085 xfs_reclaim_inodes_nr( 1086 struct xfs_mount *mp, 1087 unsigned long nr_to_scan) 1088 { 1089 struct xfs_icwalk icw = { 1090 .icw_flags = XFS_ICWALK_FLAG_SCAN_LIMIT, 1091 .icw_scan_limit = min_t(unsigned long, LONG_MAX, nr_to_scan), 1092 }; 1093 1094 if (xfs_want_reclaim_sick(mp)) 1095 icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; 1096 1097 /* kick background reclaimer and push the AIL */ 1098 xfs_reclaim_work_queue(mp); 1099 xfs_ail_push_all(mp->m_ail); 1100 1101 xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw); 1102 return 0; 1103 } 1104 1105 /* 1106 * Return the number of reclaimable inodes in the filesystem for 1107 * the shrinker to determine how much to reclaim. 
1108 */ 1109 long 1110 xfs_reclaim_inodes_count( 1111 struct xfs_mount *mp) 1112 { 1113 XA_STATE (xas, &mp->m_groups[XG_TYPE_AG].xa, 0); 1114 long reclaimable = 0; 1115 struct xfs_perag *pag; 1116 1117 rcu_read_lock(); 1118 xas_for_each_marked(&xas, pag, ULONG_MAX, XFS_PERAG_RECLAIM_MARK) { 1119 trace_xfs_reclaim_inodes_count(pag, _THIS_IP_); 1120 reclaimable += pag->pag_ici_reclaimable; 1121 } 1122 rcu_read_unlock(); 1123 1124 return reclaimable; 1125 } 1126 1127 STATIC bool 1128 xfs_icwalk_match_id( 1129 struct xfs_inode *ip, 1130 struct xfs_icwalk *icw) 1131 { 1132 if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) && 1133 !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid)) 1134 return false; 1135 1136 if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) && 1137 !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid)) 1138 return false; 1139 1140 if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) && 1141 ip->i_projid != icw->icw_prid) 1142 return false; 1143 1144 return true; 1145 } 1146 1147 /* 1148 * A union-based inode filtering algorithm. Process the inode if any of the 1149 * criteria match. This is for global/internal scans only. 1150 */ 1151 STATIC bool 1152 xfs_icwalk_match_id_union( 1153 struct xfs_inode *ip, 1154 struct xfs_icwalk *icw) 1155 { 1156 if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) && 1157 uid_eq(VFS_I(ip)->i_uid, icw->icw_uid)) 1158 return true; 1159 1160 if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) && 1161 gid_eq(VFS_I(ip)->i_gid, icw->icw_gid)) 1162 return true; 1163 1164 if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) && 1165 ip->i_projid == icw->icw_prid) 1166 return true; 1167 1168 return false; 1169 } 1170 1171 /* 1172 * Is this inode @ip eligible for eof/cow block reclamation, given some 1173 * filtering parameters @icw? The inode is eligible if @icw is null or 1174 * if the predicate functions match. 1175 */ 1176 static bool 1177 xfs_icwalk_match( 1178 struct xfs_inode *ip, 1179 struct xfs_icwalk *icw) 1180 { 1181 bool match; 1182 1183 if (!icw) 1184 return true; 1185 1186 if (icw->icw_flags & XFS_ICWALK_FLAG_UNION) 1187 match = xfs_icwalk_match_id_union(ip, icw); 1188 else 1189 match = xfs_icwalk_match_id(ip, icw); 1190 if (!match) 1191 return false; 1192 1193 /* skip the inode if the file size is too small */ 1194 if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) && 1195 XFS_ISIZE(ip) < icw->icw_min_file_size) 1196 return false; 1197 1198 return true; 1199 } 1200 1201 /* 1202 * This is a fast pass over the inode cache to try to get reclaim moving on as 1203 * many inodes as possible in a short period of time. It kicks itself every few 1204 * seconds, as well as being kicked by the inode cache shrinker when memory 1205 * goes low. 1206 */ 1207 void 1208 xfs_reclaim_worker( 1209 struct work_struct *work) 1210 { 1211 struct xfs_mount *mp = container_of(to_delayed_work(work), 1212 struct xfs_mount, m_reclaim_work); 1213 1214 xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL); 1215 xfs_reclaim_work_queue(mp); 1216 } 1217 1218 STATIC int 1219 xfs_inode_free_eofblocks( 1220 struct xfs_inode *ip, 1221 struct xfs_icwalk *icw, 1222 unsigned int *lockflags) 1223 { 1224 bool wait; 1225 1226 wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC); 1227 1228 if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS)) 1229 return 0; 1230 1231 /* 1232 * If the mapping is dirty the operation can block and wait for some 1233 * time. Unless we are waiting, skip it. 
1234 */ 1235 if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) 1236 return 0; 1237 1238 if (!xfs_icwalk_match(ip, icw)) 1239 return 0; 1240 1241 /* 1242 * If the caller is waiting, return -EAGAIN to keep the background 1243 * scanner moving and revisit the inode in a subsequent pass. 1244 */ 1245 if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 1246 if (wait) 1247 return -EAGAIN; 1248 return 0; 1249 } 1250 *lockflags |= XFS_IOLOCK_EXCL; 1251 1252 if (xfs_can_free_eofblocks(ip)) 1253 return xfs_free_eofblocks(ip); 1254 1255 /* inode could be preallocated */ 1256 trace_xfs_inode_free_eofblocks_invalid(ip); 1257 xfs_inode_clear_eofblocks_tag(ip); 1258 return 0; 1259 } 1260 1261 static void 1262 xfs_blockgc_set_iflag( 1263 struct xfs_inode *ip, 1264 unsigned long iflag) 1265 { 1266 struct xfs_mount *mp = ip->i_mount; 1267 struct xfs_perag *pag; 1268 1269 ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0); 1270 1271 /* 1272 * Don't bother locking the AG and looking up in the radix trees 1273 * if we already know that we have the tag set. 1274 */ 1275 if (ip->i_flags & iflag) 1276 return; 1277 spin_lock(&ip->i_flags_lock); 1278 ip->i_flags |= iflag; 1279 spin_unlock(&ip->i_flags_lock); 1280 1281 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 1282 spin_lock(&pag->pag_ici_lock); 1283 1284 xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), 1285 XFS_ICI_BLOCKGC_TAG); 1286 1287 spin_unlock(&pag->pag_ici_lock); 1288 xfs_perag_put(pag); 1289 } 1290 1291 void 1292 xfs_inode_set_eofblocks_tag( 1293 xfs_inode_t *ip) 1294 { 1295 trace_xfs_inode_set_eofblocks_tag(ip); 1296 return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS); 1297 } 1298 1299 static void 1300 xfs_blockgc_clear_iflag( 1301 struct xfs_inode *ip, 1302 unsigned long iflag) 1303 { 1304 struct xfs_mount *mp = ip->i_mount; 1305 struct xfs_perag *pag; 1306 bool clear_tag; 1307 1308 ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0); 1309 1310 spin_lock(&ip->i_flags_lock); 1311 ip->i_flags &= ~iflag; 1312 clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0; 1313 spin_unlock(&ip->i_flags_lock); 1314 1315 if (!clear_tag) 1316 return; 1317 1318 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 1319 spin_lock(&pag->pag_ici_lock); 1320 1321 xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), 1322 XFS_ICI_BLOCKGC_TAG); 1323 1324 spin_unlock(&pag->pag_ici_lock); 1325 xfs_perag_put(pag); 1326 } 1327 1328 void 1329 xfs_inode_clear_eofblocks_tag( 1330 xfs_inode_t *ip) 1331 { 1332 trace_xfs_inode_clear_eofblocks_tag(ip); 1333 return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS); 1334 } 1335 1336 /* 1337 * Prepare to free COW fork blocks from an inode. 1338 */ 1339 static bool 1340 xfs_prep_free_cowblocks( 1341 struct xfs_inode *ip, 1342 struct xfs_icwalk *icw) 1343 { 1344 bool sync; 1345 1346 sync = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC); 1347 1348 /* 1349 * Just clear the tag if we have an empty cow fork or none at all. It's 1350 * possible the inode was fully unshared since it was originally tagged. 1351 */ 1352 if (!xfs_inode_has_cow_data(ip)) { 1353 trace_xfs_inode_free_cowblocks_invalid(ip); 1354 xfs_inode_clear_cowblocks_tag(ip); 1355 return false; 1356 } 1357 1358 /* 1359 * A cowblocks trim of an inode can have a significant effect on 1360 * fragmentation even when a reasonable COW extent size hint is set. 1361 * Therefore, we prefer to not process cowblocks unless they are clean 1362 * and idle. 
We can never process a cowblocks inode that is dirty or has 1363 * in-flight I/O under any circumstances, because outstanding writeback 1364 * or dio expects targeted COW fork blocks exist through write 1365 * completion where they can be remapped into the data fork. 1366 * 1367 * Therefore, the heuristic used here is to never process inodes 1368 * currently opened for write from background (i.e. non-sync) scans. For 1369 * sync scans, use the pagecache/dio state of the inode to ensure we 1370 * never free COW fork blocks out from under pending I/O. 1371 */ 1372 if (!sync && inode_is_open_for_write(VFS_I(ip))) 1373 return false; 1374 return xfs_can_free_cowblocks(ip); 1375 } 1376 1377 /* 1378 * Automatic CoW Reservation Freeing 1379 * 1380 * These functions automatically garbage collect leftover CoW reservations 1381 * that were made on behalf of a cowextsize hint when we start to run out 1382 * of quota or when the reservations sit around for too long. If the file 1383 * has dirty pages or is undergoing writeback, its CoW reservations will 1384 * be retained. 1385 * 1386 * The actual garbage collection piggybacks off the same code that runs 1387 * the speculative EOF preallocation garbage collector. 1388 */ 1389 STATIC int 1390 xfs_inode_free_cowblocks( 1391 struct xfs_inode *ip, 1392 struct xfs_icwalk *icw, 1393 unsigned int *lockflags) 1394 { 1395 bool wait; 1396 int ret = 0; 1397 1398 wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC); 1399 1400 if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS)) 1401 return 0; 1402 1403 if (!xfs_prep_free_cowblocks(ip, icw)) 1404 return 0; 1405 1406 if (!xfs_icwalk_match(ip, icw)) 1407 return 0; 1408 1409 /* 1410 * If the caller is waiting, return -EAGAIN to keep the background 1411 * scanner moving and revisit the inode in a subsequent pass. 1412 */ 1413 if (!(*lockflags & XFS_IOLOCK_EXCL) && 1414 !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 1415 if (wait) 1416 return -EAGAIN; 1417 return 0; 1418 } 1419 *lockflags |= XFS_IOLOCK_EXCL; 1420 1421 if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) { 1422 if (wait) 1423 return -EAGAIN; 1424 return 0; 1425 } 1426 *lockflags |= XFS_MMAPLOCK_EXCL; 1427 1428 /* 1429 * Check again, nobody else should be able to dirty blocks or change 1430 * the reflink iflag now that we have the first two locks held. 1431 */ 1432 if (xfs_prep_free_cowblocks(ip, icw)) 1433 ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); 1434 return ret; 1435 } 1436 1437 void 1438 xfs_inode_set_cowblocks_tag( 1439 xfs_inode_t *ip) 1440 { 1441 trace_xfs_inode_set_cowblocks_tag(ip); 1442 return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS); 1443 } 1444 1445 void 1446 xfs_inode_clear_cowblocks_tag( 1447 xfs_inode_t *ip) 1448 { 1449 trace_xfs_inode_clear_cowblocks_tag(ip); 1450 return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS); 1451 } 1452 1453 /* Disable post-EOF and CoW block auto-reclamation. */ 1454 void 1455 xfs_blockgc_stop( 1456 struct xfs_mount *mp) 1457 { 1458 struct xfs_perag *pag = NULL; 1459 1460 if (!xfs_clear_blockgc_enabled(mp)) 1461 return; 1462 1463 while ((pag = xfs_perag_next(mp, pag))) 1464 cancel_delayed_work_sync(&pag->pag_blockgc_work); 1465 trace_xfs_blockgc_stop(mp, __return_address); 1466 } 1467 1468 /* Enable post-EOF and CoW block auto-reclamation. 
*/ 1469 void 1470 xfs_blockgc_start( 1471 struct xfs_mount *mp) 1472 { 1473 struct xfs_perag *pag = NULL; 1474 1475 if (xfs_set_blockgc_enabled(mp)) 1476 return; 1477 1478 trace_xfs_blockgc_start(mp, __return_address); 1479 while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG))) 1480 xfs_blockgc_queue(pag); 1481 } 1482 1483 /* Don't try to run block gc on an inode that's in any of these states. */ 1484 #define XFS_BLOCKGC_NOGRAB_IFLAGS (XFS_INEW | \ 1485 XFS_NEED_INACTIVE | \ 1486 XFS_INACTIVATING | \ 1487 XFS_IRECLAIMABLE | \ 1488 XFS_IRECLAIM) 1489 /* 1490 * Decide if the given @ip is eligible for garbage collection of speculative 1491 * preallocations, and grab it if so. Returns true if it's ready to go or 1492 * false if we should just ignore it. 1493 */ 1494 static bool 1495 xfs_blockgc_igrab( 1496 struct xfs_inode *ip) 1497 { 1498 struct inode *inode = VFS_I(ip); 1499 1500 ASSERT(rcu_read_lock_held()); 1501 1502 /* Check for stale RCU freed inode */ 1503 spin_lock(&ip->i_flags_lock); 1504 if (!ip->i_ino) 1505 goto out_unlock_noent; 1506 1507 if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS) 1508 goto out_unlock_noent; 1509 spin_unlock(&ip->i_flags_lock); 1510 1511 /* nothing to sync during shutdown */ 1512 if (xfs_is_shutdown(ip->i_mount)) 1513 return false; 1514 1515 /* If we can't grab the inode, it must on it's way to reclaim. */ 1516 if (!igrab(inode)) 1517 return false; 1518 1519 /* inode is valid */ 1520 return true; 1521 1522 out_unlock_noent: 1523 spin_unlock(&ip->i_flags_lock); 1524 return false; 1525 } 1526 1527 /* Scan one incore inode for block preallocations that we can remove. */ 1528 static int 1529 xfs_blockgc_scan_inode( 1530 struct xfs_inode *ip, 1531 struct xfs_icwalk *icw) 1532 { 1533 unsigned int lockflags = 0; 1534 int error; 1535 1536 error = xfs_inode_free_eofblocks(ip, icw, &lockflags); 1537 if (error) 1538 goto unlock; 1539 1540 error = xfs_inode_free_cowblocks(ip, icw, &lockflags); 1541 unlock: 1542 if (lockflags) 1543 xfs_iunlock(ip, lockflags); 1544 xfs_irele(ip); 1545 return error; 1546 } 1547 1548 /* Background worker that trims preallocated space. */ 1549 void 1550 xfs_blockgc_worker( 1551 struct work_struct *work) 1552 { 1553 struct xfs_perag *pag = container_of(to_delayed_work(work), 1554 struct xfs_perag, pag_blockgc_work); 1555 struct xfs_mount *mp = pag_mount(pag); 1556 int error; 1557 1558 trace_xfs_blockgc_worker(mp, __return_address); 1559 1560 error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL); 1561 if (error) 1562 xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", 1563 pag_agno(pag), error); 1564 xfs_blockgc_queue(pag); 1565 } 1566 1567 /* 1568 * Try to free space in the filesystem by purging inactive inodes, eofblocks 1569 * and cowblocks. 1570 */ 1571 int 1572 xfs_blockgc_free_space( 1573 struct xfs_mount *mp, 1574 struct xfs_icwalk *icw) 1575 { 1576 int error; 1577 1578 trace_xfs_blockgc_free_space(mp, icw, _RET_IP_); 1579 1580 error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw); 1581 if (error) 1582 return error; 1583 1584 return xfs_inodegc_flush(mp); 1585 } 1586 1587 /* 1588 * Reclaim all the free space that we can by scheduling the background blockgc 1589 * and inodegc workers immediately and waiting for them all to clear. 1590 */ 1591 int 1592 xfs_blockgc_flush_all( 1593 struct xfs_mount *mp) 1594 { 1595 struct xfs_perag *pag = NULL; 1596 1597 trace_xfs_blockgc_flush_all(mp, __return_address); 1598 1599 /* 1600 * For each blockgc worker, move its queue time up to now. 
If it wasn't 1601 * queued, it will not be requeued. Then flush whatever is left. 1602 */ 1603 while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG))) 1604 mod_delayed_work(mp->m_blockgc_wq, &pag->pag_blockgc_work, 0); 1605 1606 while ((pag = xfs_perag_grab_next_tag(mp, pag, XFS_ICI_BLOCKGC_TAG))) 1607 flush_delayed_work(&pag->pag_blockgc_work); 1608 1609 return xfs_inodegc_flush(mp); 1610 } 1611 1612 /* 1613 * Run cow/eofblocks scans on the supplied dquots. We don't know exactly which 1614 * quota caused an allocation failure, so we make a best effort by including 1615 * each quota under low free space conditions (less than 1% free space) in the 1616 * scan. 1617 * 1618 * Callers must not hold any inode's ILOCK. If requesting a synchronous scan 1619 * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or 1620 * MMAPLOCK. 1621 */ 1622 int 1623 xfs_blockgc_free_dquots( 1624 struct xfs_mount *mp, 1625 struct xfs_dquot *udqp, 1626 struct xfs_dquot *gdqp, 1627 struct xfs_dquot *pdqp, 1628 unsigned int iwalk_flags) 1629 { 1630 struct xfs_icwalk icw = {0}; 1631 bool do_work = false; 1632 1633 if (!udqp && !gdqp && !pdqp) 1634 return 0; 1635 1636 /* 1637 * Run a scan to free blocks using the union filter to cover all 1638 * applicable quotas in a single scan. 1639 */ 1640 icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags; 1641 1642 if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) { 1643 icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id); 1644 icw.icw_flags |= XFS_ICWALK_FLAG_UID; 1645 do_work = true; 1646 } 1647 1648 if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) { 1649 icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id); 1650 icw.icw_flags |= XFS_ICWALK_FLAG_GID; 1651 do_work = true; 1652 } 1653 1654 if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) { 1655 icw.icw_prid = pdqp->q_id; 1656 icw.icw_flags |= XFS_ICWALK_FLAG_PRID; 1657 do_work = true; 1658 } 1659 1660 if (!do_work) 1661 return 0; 1662 1663 return xfs_blockgc_free_space(mp, &icw); 1664 } 1665 1666 /* Run cow/eofblocks scans on the quotas attached to the inode. */ 1667 int 1668 xfs_blockgc_free_quota( 1669 struct xfs_inode *ip, 1670 unsigned int iwalk_flags) 1671 { 1672 return xfs_blockgc_free_dquots(ip->i_mount, 1673 xfs_inode_dquot(ip, XFS_DQTYPE_USER), 1674 xfs_inode_dquot(ip, XFS_DQTYPE_GROUP), 1675 xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags); 1676 } 1677 1678 /* XFS Inode Cache Walking Code */ 1679 1680 /* 1681 * The inode lookup is done in batches to keep the amount of lock traffic and 1682 * radix tree lookups to a minimum. The batch size is a trade off between 1683 * lookup reduction and stack usage. This is in the reclaim path, so we can't 1684 * be too greedy. 1685 */ 1686 #define XFS_LOOKUP_BATCH 32 1687 1688 1689 /* 1690 * Decide if we want to grab this inode in anticipation of doing work towards 1691 * the goal. 1692 */ 1693 static inline bool 1694 xfs_icwalk_igrab( 1695 enum xfs_icwalk_goal goal, 1696 struct xfs_inode *ip, 1697 struct xfs_icwalk *icw) 1698 { 1699 switch (goal) { 1700 case XFS_ICWALK_BLOCKGC: 1701 return xfs_blockgc_igrab(ip); 1702 case XFS_ICWALK_RECLAIM: 1703 return xfs_reclaim_igrab(ip, icw); 1704 default: 1705 return false; 1706 } 1707 } 1708 1709 /* 1710 * Process an inode. Each processing function must handle any state changes 1711 * made by the icwalk igrab function. Return -EAGAIN to skip an inode. 
1712 */ 1713 static inline int 1714 xfs_icwalk_process_inode( 1715 enum xfs_icwalk_goal goal, 1716 struct xfs_inode *ip, 1717 struct xfs_perag *pag, 1718 struct xfs_icwalk *icw) 1719 { 1720 int error = 0; 1721 1722 switch (goal) { 1723 case XFS_ICWALK_BLOCKGC: 1724 error = xfs_blockgc_scan_inode(ip, icw); 1725 break; 1726 case XFS_ICWALK_RECLAIM: 1727 xfs_reclaim_inode(ip, pag); 1728 break; 1729 } 1730 return error; 1731 } 1732 1733 /* 1734 * For a given per-AG structure @pag and a goal, grab qualifying inodes and 1735 * process them in some manner. 1736 */ 1737 static int 1738 xfs_icwalk_ag( 1739 struct xfs_perag *pag, 1740 enum xfs_icwalk_goal goal, 1741 struct xfs_icwalk *icw) 1742 { 1743 struct xfs_mount *mp = pag_mount(pag); 1744 uint32_t first_index; 1745 int last_error = 0; 1746 int skipped; 1747 bool done; 1748 int nr_found; 1749 1750 restart: 1751 done = false; 1752 skipped = 0; 1753 if (goal == XFS_ICWALK_RECLAIM) 1754 first_index = READ_ONCE(pag->pag_ici_reclaim_cursor); 1755 else 1756 first_index = 0; 1757 nr_found = 0; 1758 do { 1759 struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 1760 int error = 0; 1761 int i; 1762 1763 rcu_read_lock(); 1764 1765 nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, 1766 (void **) batch, first_index, 1767 XFS_LOOKUP_BATCH, goal); 1768 if (!nr_found) { 1769 done = true; 1770 rcu_read_unlock(); 1771 break; 1772 } 1773 1774 /* 1775 * Grab the inodes before we drop the lock. if we found 1776 * nothing, nr == 0 and the loop will be skipped. 1777 */ 1778 for (i = 0; i < nr_found; i++) { 1779 struct xfs_inode *ip = batch[i]; 1780 1781 if (done || !xfs_icwalk_igrab(goal, ip, icw)) 1782 batch[i] = NULL; 1783 1784 /* 1785 * Update the index for the next lookup. Catch 1786 * overflows into the next AG range which can occur if 1787 * we have inodes in the last block of the AG and we 1788 * are currently pointing to the last inode. 1789 * 1790 * Because we may see inodes that are from the wrong AG 1791 * due to RCU freeing and reallocation, only update the 1792 * index if it lies in this AG. It was a race that lead 1793 * us to see this inode, so another lookup from the 1794 * same index will not find it again. 1795 */ 1796 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag_agno(pag)) 1797 continue; 1798 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 1799 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 1800 done = true; 1801 } 1802 1803 /* unlock now we've grabbed the inodes. */ 1804 rcu_read_unlock(); 1805 1806 for (i = 0; i < nr_found; i++) { 1807 if (!batch[i]) 1808 continue; 1809 error = xfs_icwalk_process_inode(goal, batch[i], pag, 1810 icw); 1811 if (error == -EAGAIN) { 1812 skipped++; 1813 continue; 1814 } 1815 if (error && last_error != -EFSCORRUPTED) 1816 last_error = error; 1817 } 1818 1819 /* bail out if the filesystem is corrupted. */ 1820 if (error == -EFSCORRUPTED) 1821 break; 1822 1823 cond_resched(); 1824 1825 if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) { 1826 icw->icw_scan_limit -= XFS_LOOKUP_BATCH; 1827 if (icw->icw_scan_limit <= 0) 1828 break; 1829 } 1830 } while (nr_found && !done); 1831 1832 if (goal == XFS_ICWALK_RECLAIM) { 1833 if (done) 1834 first_index = 0; 1835 WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index); 1836 } 1837 1838 if (skipped) { 1839 delay(1); 1840 goto restart; 1841 } 1842 return last_error; 1843 } 1844 1845 /* Walk all incore inodes to achieve a given goal. 
*/ 1846 static int 1847 xfs_icwalk( 1848 struct xfs_mount *mp, 1849 enum xfs_icwalk_goal goal, 1850 struct xfs_icwalk *icw) 1851 { 1852 struct xfs_perag *pag = NULL; 1853 int error = 0; 1854 int last_error = 0; 1855 1856 while ((pag = xfs_perag_grab_next_tag(mp, pag, goal))) { 1857 error = xfs_icwalk_ag(pag, goal, icw); 1858 if (error) { 1859 last_error = error; 1860 if (error == -EFSCORRUPTED) { 1861 xfs_perag_rele(pag); 1862 break; 1863 } 1864 } 1865 } 1866 return last_error; 1867 BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID); 1868 } 1869 1870 #ifdef DEBUG 1871 static void 1872 xfs_check_delalloc( 1873 struct xfs_inode *ip, 1874 int whichfork) 1875 { 1876 struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); 1877 struct xfs_bmbt_irec got; 1878 struct xfs_iext_cursor icur; 1879 1880 if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got)) 1881 return; 1882 do { 1883 if (isnullstartblock(got.br_startblock)) { 1884 xfs_warn(ip->i_mount, 1885 "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]", 1886 ip->i_ino, 1887 whichfork == XFS_DATA_FORK ? "data" : "cow", 1888 got.br_startoff, got.br_blockcount); 1889 } 1890 } while (xfs_iext_next_extent(ifp, &icur, &got)); 1891 } 1892 #else 1893 #define xfs_check_delalloc(ip, whichfork) do { } while (0) 1894 #endif 1895 1896 /* Schedule the inode for reclaim. */ 1897 static void 1898 xfs_inodegc_set_reclaimable( 1899 struct xfs_inode *ip) 1900 { 1901 struct xfs_mount *mp = ip->i_mount; 1902 struct xfs_perag *pag; 1903 1904 if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) { 1905 xfs_check_delalloc(ip, XFS_DATA_FORK); 1906 xfs_check_delalloc(ip, XFS_COW_FORK); 1907 ASSERT(0); 1908 } 1909 1910 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 1911 spin_lock(&pag->pag_ici_lock); 1912 spin_lock(&ip->i_flags_lock); 1913 1914 trace_xfs_inode_set_reclaimable(ip); 1915 ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING); 1916 ip->i_flags |= XFS_IRECLAIMABLE; 1917 xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), 1918 XFS_ICI_RECLAIM_TAG); 1919 1920 spin_unlock(&ip->i_flags_lock); 1921 spin_unlock(&pag->pag_ici_lock); 1922 xfs_perag_put(pag); 1923 } 1924 1925 /* 1926 * Free all speculative preallocations and possibly even the inode itself. 1927 * This is the last chance to make changes to an otherwise unreferenced file 1928 * before incore reclamation happens. 1929 */ 1930 static int 1931 xfs_inodegc_inactivate( 1932 struct xfs_inode *ip) 1933 { 1934 int error; 1935 1936 trace_xfs_inode_inactivating(ip); 1937 error = xfs_inactive(ip); 1938 xfs_inodegc_set_reclaimable(ip); 1939 return error; 1940 1941 } 1942 1943 void 1944 xfs_inodegc_worker( 1945 struct work_struct *work) 1946 { 1947 struct xfs_inodegc *gc = container_of(to_delayed_work(work), 1948 struct xfs_inodegc, work); 1949 struct llist_node *node = llist_del_all(&gc->list); 1950 struct xfs_inode *ip, *n; 1951 struct xfs_mount *mp = gc->mp; 1952 unsigned int nofs_flag; 1953 1954 /* 1955 * Clear the cpu mask bit and ensure that we have seen the latest 1956 * update of the gc structure associated with this CPU. This matches 1957 * with the release semantics used when setting the cpumask bit in 1958 * xfs_inodegc_queue. 1959 */ 1960 cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask); 1961 smp_mb__after_atomic(); 1962 1963 WRITE_ONCE(gc->items, 0); 1964 1965 if (!node) 1966 return; 1967 1968 /* 1969 * We can allocate memory here while doing writeback on behalf of 1970 * memory reclaim. 
To avoid memory allocation deadlocks set the 1971 * task-wide nofs context for the following operations. 1972 */ 1973 nofs_flag = memalloc_nofs_save(); 1974 1975 ip = llist_entry(node, struct xfs_inode, i_gclist); 1976 trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits)); 1977 1978 WRITE_ONCE(gc->shrinker_hits, 0); 1979 llist_for_each_entry_safe(ip, n, node, i_gclist) { 1980 int error; 1981 1982 xfs_iflags_set(ip, XFS_INACTIVATING); 1983 error = xfs_inodegc_inactivate(ip); 1984 if (error && !gc->error) 1985 gc->error = error; 1986 } 1987 1988 memalloc_nofs_restore(nofs_flag); 1989 } 1990 1991 /* 1992 * Expedite all pending inodegc work to run immediately. This does not wait for 1993 * completion of the work. 1994 */ 1995 void 1996 xfs_inodegc_push( 1997 struct xfs_mount *mp) 1998 { 1999 if (!xfs_is_inodegc_enabled(mp)) 2000 return; 2001 trace_xfs_inodegc_push(mp, __return_address); 2002 xfs_inodegc_queue_all(mp); 2003 } 2004 2005 /* 2006 * Force all currently queued inode inactivation work to run immediately and 2007 * wait for the work to finish. 2008 */ 2009 int 2010 xfs_inodegc_flush( 2011 struct xfs_mount *mp) 2012 { 2013 xfs_inodegc_push(mp); 2014 trace_xfs_inodegc_flush(mp, __return_address); 2015 return xfs_inodegc_wait_all(mp); 2016 } 2017 2018 /* 2019 * Flush all the pending work and then disable the inode inactivation background 2020 * workers and wait for them to stop. Caller must hold sb->s_umount to 2021 * coordinate changes in the inodegc_enabled state. 2022 */ 2023 void 2024 xfs_inodegc_stop( 2025 struct xfs_mount *mp) 2026 { 2027 bool rerun; 2028 2029 if (!xfs_clear_inodegc_enabled(mp)) 2030 return; 2031 2032 /* 2033 * Drain all pending inodegc work, including inodes that could be 2034 * queued by racing xfs_inodegc_queue or xfs_inodegc_shrinker_scan 2035 * threads that sample the inodegc state just prior to us clearing it. 2036 * The inodegc flag state prevents new threads from queuing more 2037 * inodes, so we queue pending work items and flush the workqueue until 2038 * all inodegc lists are empty. IOWs, we cannot use drain_workqueue 2039 * here because it does not allow other unserialized mechanisms to 2040 * reschedule inodegc work while this draining is in progress. 2041 */ 2042 xfs_inodegc_queue_all(mp); 2043 do { 2044 flush_workqueue(mp->m_inodegc_wq); 2045 rerun = xfs_inodegc_queue_all(mp); 2046 } while (rerun); 2047 2048 trace_xfs_inodegc_stop(mp, __return_address); 2049 } 2050 2051 /* 2052 * Enable the inode inactivation background workers and schedule deferred inode 2053 * inactivation work if there is any. Caller must hold sb->s_umount to 2054 * coordinate changes in the inodegc_enabled state. 
 */
void
xfs_inodegc_start(
	struct xfs_mount	*mp)
{
	if (xfs_set_inodegc_enabled(mp))
		return;

	trace_xfs_inodegc_start(mp, __return_address);
	xfs_inodegc_queue_all(mp);
}

#ifdef CONFIG_XFS_RT
static inline bool
xfs_inodegc_want_queue_rt_file(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;

	if (!XFS_IS_REALTIME_INODE(ip) || xfs_has_zoned(mp))
		return false;

	if (xfs_compare_freecounter(mp, XC_FREE_RTEXTENTS,
				mp->m_low_rtexts[XFS_LOWSP_5_PCNT],
				XFS_FDBLOCKS_BATCH) < 0)
		return true;

	return false;
}
#else
# define xfs_inodegc_want_queue_rt_file(ip)	(false)
#endif /* CONFIG_XFS_RT */

/*
 * Schedule the inactivation worker when:
 *
 *  - We've accumulated more than one inode cluster buffer's worth of inodes.
 *  - There is less than 5% free space left.
 *  - The inode is a realtime file and free realtime extents are running low.
 *  - Any of the quotas for this inode are near an enforcement limit.
 */
static inline bool
xfs_inodegc_want_queue_work(
	struct xfs_inode	*ip,
	unsigned int		items)
{
	struct xfs_mount	*mp = ip->i_mount;

	if (items > mp->m_ino_geo.inodes_per_cluster)
		return true;

	if (xfs_compare_freecounter(mp, XC_FREE_BLOCKS,
				mp->m_low_space[XFS_LOWSP_5_PCNT],
				XFS_FDBLOCKS_BATCH) < 0)
		return true;

	if (xfs_inodegc_want_queue_rt_file(ip))
		return true;

	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER))
		return true;

	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP))
		return true;

	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ))
		return true;

	return false;
}

/*
 * Upper bound on the number of inodes in each AG that can be queued for
 * inactivation at any given time, to avoid monopolizing the workqueue.
 */
#define XFS_INODEGC_MAX_BACKLOG	(4 * XFS_INODES_PER_CHUNK)

/*
 * Make the frontend wait for inactivations when:
 *
 *  - Memory shrinkers queued the inactivation worker and it hasn't finished.
 *  - The queue depth exceeds the maximum allowable percpu backlog.
 *
 * Note: If we are in a NOFS context here (e.g. the current thread is running
 * a transaction) then we don't want to block here, as inodegc progress may
 * require filesystem resources that we hold, and that could result in a
 * deadlock. Hence we skip out of here if we are in a scoped NOFS context.
 */
static inline bool
xfs_inodegc_want_flush_work(
	struct xfs_inode	*ip,
	unsigned int		items,
	unsigned int		shrinker_hits)
{
	if (current->flags & PF_MEMALLOC_NOFS)
		return false;

	if (shrinker_hits > 0)
		return true;

	if (items > XFS_INODEGC_MAX_BACKLOG)
		return true;

	return false;
}

/*
 * Queue a background inactivation worker if there are inodes that need to be
 * inactivated and higher level xfs code hasn't disabled the background
 * workers.
 */
static void
xfs_inodegc_queue(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_inodegc	*gc;
	int			items;
	unsigned int		shrinker_hits;
	unsigned int		cpu_nr;
	unsigned long		queue_delay = 1;

	trace_xfs_inode_set_need_inactive(ip);
	spin_lock(&ip->i_flags_lock);
	ip->i_flags |= XFS_NEED_INACTIVE;
	spin_unlock(&ip->i_flags_lock);

	cpu_nr = get_cpu();
	gc = this_cpu_ptr(mp->m_inodegc);
	llist_add(&ip->i_gclist, &gc->list);
	items = READ_ONCE(gc->items);
	WRITE_ONCE(gc->items, items + 1);
	shrinker_hits = READ_ONCE(gc->shrinker_hits);

	/*
	 * Ensure the list add is always seen by anyone who finds the cpumask
	 * bit set. This effectively gives the cpumask bit set operation
	 * release ordering semantics.
	 */
	smp_mb__before_atomic();
	if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask))
		cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask);

	/*
	 * We queue the work while holding the current CPU so that the work
	 * is scheduled to run on this CPU.
	 */
	if (!xfs_is_inodegc_enabled(mp)) {
		put_cpu();
		return;
	}

	if (xfs_inodegc_want_queue_work(ip, items))
		queue_delay = 0;

	trace_xfs_inodegc_queue(mp, __return_address);
	mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
			queue_delay);
	put_cpu();

	if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
		trace_xfs_inodegc_throttle(mp, __return_address);
		flush_delayed_work(&gc->work);
	}
}

/*
 * We set the inode flag atomically with the radix tree tag. Once we get tag
 * lookups on the radix tree, this inode flag can go away.
 *
 * We always use background reclaim here because even if the inode is clean,
 * it still may be under IO and hence we have to wait for IO completion to
 * occur before we can reclaim the inode. The background reclaim path handles
 * this more efficiently than we can here, so simply let background reclaim
 * tear down all inodes.
 */
void
xfs_inode_mark_reclaimable(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	bool			need_inactive;

	XFS_STATS_INC(mp, vn_reclaim);

	/*
	 * We should never get here with any of the reclaim flags already set.
	 */
	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));

	need_inactive = xfs_inode_needs_inactive(ip);
	if (need_inactive) {
		xfs_inodegc_queue(ip);
		return;
	}

	/* Going straight to reclaim, so drop the dquots. */
	xfs_qm_dqdetach(ip);
	xfs_inodegc_set_reclaimable(ip);
}

/*
 * Register a phony shrinker so that we can run background inodegc sooner when
 * there's memory pressure. Inactivation does not itself free any memory but
 * it does make inodes reclaimable, which eventually frees memory.
 *
 * The count function, seek value, and batch value are crafted to trigger the
 * scan function during the second round of scanning. Hopefully this means
 * that we reclaimed enough memory that initiating metadata transactions won't
 * make things worse.
 */
#define XFS_INODEGC_SHRINKER_COUNT	(1UL << DEF_PRIORITY)
#define XFS_INODEGC_SHRINKER_BATCH	((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)

static unsigned long
xfs_inodegc_shrinker_count(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_mount	*mp = shrink->private_data;
	struct xfs_inodegc	*gc;
	int			cpu;

	if (!xfs_is_inodegc_enabled(mp))
		return 0;

	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
		gc = per_cpu_ptr(mp->m_inodegc, cpu);
		if (!llist_empty(&gc->list))
			return XFS_INODEGC_SHRINKER_COUNT;
	}

	return 0;
}

static unsigned long
xfs_inodegc_shrinker_scan(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_mount	*mp = shrink->private_data;
	struct xfs_inodegc	*gc;
	int			cpu;
	bool			no_items = true;

	if (!xfs_is_inodegc_enabled(mp))
		return SHRINK_STOP;

	trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);

	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
		gc = per_cpu_ptr(mp->m_inodegc, cpu);
		if (!llist_empty(&gc->list)) {
			unsigned int	h = READ_ONCE(gc->shrinker_hits);

			WRITE_ONCE(gc->shrinker_hits, h + 1);
			mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
			no_items = false;
		}
	}

	/*
	 * If there are no inodes to inactivate, we don't want the shrinker
	 * to think there's deferred work to call us back about.
	 */
	if (no_items)
		return LONG_MAX;

	return SHRINK_STOP;
}

/* Register a shrinker so we can accelerate inodegc and throttle queuing. */
int
xfs_inodegc_register_shrinker(
	struct xfs_mount	*mp)
{
	mp->m_inodegc_shrinker = shrinker_alloc(SHRINKER_NONSLAB,
						"xfs-inodegc:%s",
						mp->m_super->s_id);
	if (!mp->m_inodegc_shrinker)
		return -ENOMEM;

	mp->m_inodegc_shrinker->count_objects = xfs_inodegc_shrinker_count;
	mp->m_inodegc_shrinker->scan_objects = xfs_inodegc_shrinker_scan;
	mp->m_inodegc_shrinker->seeks = 0;
	mp->m_inodegc_shrinker->batch = XFS_INODEGC_SHRINKER_BATCH;
	mp->m_inodegc_shrinker->private_data = mp;

	shrinker_register(mp->m_inodegc_shrinker);

	return 0;
}
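
/*
 * Editor's illustrative sketch, not part of the original file: the natural
 * counterpart to xfs_inodegc_register_shrinker() above is to release the
 * shrinker at teardown with shrinker_free(), the documented companion to
 * shrinker_alloc()/shrinker_register().  The helper name below is
 * hypothetical; the real unmount path may simply call shrinker_free()
 * inline.
 */
static inline void
xfs_inodegc_unregister_shrinker(
	struct xfs_mount	*mp)
{
	/* Unregister and free the shrinker allocated at mount time. */
	shrinker_free(mp->m_inodegc_shrinker);
	mp->m_inodegc_shrinker = NULL;
}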