// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_error.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_iomap.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_refcount.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"
#include "xfs_mru_cache.h"

static void
xfs_open_zone_free_rcu(
	struct callback_head	*cb)
{
	struct xfs_open_zone	*oz = container_of(cb, typeof(*oz), oz_rcu);

	xfs_rtgroup_rele(oz->oz_rtg);
	kfree(oz);
}

void
xfs_open_zone_put(
	struct xfs_open_zone	*oz)
{
	if (atomic_dec_and_test(&oz->oz_ref))
		call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
}

static inline uint32_t
xfs_zone_bucket(
	struct xfs_mount	*mp,
	uint32_t		used_blocks)
{
	return XFS_ZONE_USED_BUCKETS * used_blocks /
		mp->m_groups[XG_TYPE_RTG].blocks;
}

static inline void
xfs_zone_add_to_bucket(
	struct xfs_zone_info	*zi,
	xfs_rgnumber_t		rgno,
	uint32_t		to_bucket)
{
	__set_bit(rgno, zi->zi_used_bucket_bitmap[to_bucket]);
	zi->zi_used_bucket_entries[to_bucket]++;
}

static inline void
xfs_zone_remove_from_bucket(
	struct xfs_zone_info	*zi,
	xfs_rgnumber_t		rgno,
	uint32_t		from_bucket)
{
	__clear_bit(rgno, zi->zi_used_bucket_bitmap[from_bucket]);
	zi->zi_used_bucket_entries[from_bucket]--;
}
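
/*
 * Track a change in the number of used blocks in a zone: move the zone
 * between the used-space buckets, queue now-empty zones for a reset, and
 * wake the GC thread when a previously full zone becomes reclaimable.
 */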
static void
xfs_zone_account_reclaimable(
	struct xfs_rtgroup	*rtg,
	uint32_t		freed)
{
	struct xfs_group	*xg = &rtg->rtg_group;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		used = rtg_rmap(rtg)->i_used_blocks;
	xfs_rgnumber_t		rgno = rtg_rgno(rtg);
	uint32_t		from_bucket = xfs_zone_bucket(mp, used + freed);
	uint32_t		to_bucket = xfs_zone_bucket(mp, used);
	bool			was_full = (used + freed == rtg_blocks(rtg));

	/*
	 * This can be called from log recovery, where the zone_info structure
	 * hasn't been allocated yet.  Skip all work as xfs_mount_zones will
	 * add the zones to the right buckets before the file system becomes
	 * active.
	 */
	if (!zi)
		return;

	if (!used) {
		/*
		 * The zone is now empty, remove it from the bottom bucket and
		 * trigger a reset.
		 */
		trace_xfs_zone_emptied(rtg);

		if (!was_full)
			xfs_group_clear_mark(xg, XFS_RTG_RECLAIMABLE);

		spin_lock(&zi->zi_used_buckets_lock);
		if (!was_full)
			xfs_zone_remove_from_bucket(zi, rgno, from_bucket);
		spin_unlock(&zi->zi_used_buckets_lock);

		spin_lock(&zi->zi_reset_list_lock);
		xg->xg_next_reset = zi->zi_reset_list;
		zi->zi_reset_list = xg;
		spin_unlock(&zi->zi_reset_list_lock);

		if (zi->zi_gc_thread)
			wake_up_process(zi->zi_gc_thread);
	} else if (was_full) {
		/*
		 * The zone transitioned from full, mark it as reclaimable and
		 * wake up GC, which might be waiting for zones to reclaim.
		 */
		spin_lock(&zi->zi_used_buckets_lock);
		xfs_zone_add_to_bucket(zi, rgno, to_bucket);
		spin_unlock(&zi->zi_used_buckets_lock);

		xfs_group_set_mark(xg, XFS_RTG_RECLAIMABLE);
		if (zi->zi_gc_thread && xfs_zoned_need_gc(mp))
			wake_up_process(zi->zi_gc_thread);
	} else if (to_bucket != from_bucket) {
		/*
		 * Move the zone to a new bucket if it dropped below the
		 * threshold.
		 */
		spin_lock(&zi->zi_used_buckets_lock);
		xfs_zone_add_to_bucket(zi, rgno, to_bucket);
		xfs_zone_remove_from_bucket(zi, rgno, from_bucket);
		spin_unlock(&zi->zi_used_buckets_lock);
	}
}

static void
xfs_open_zone_mark_full(
	struct xfs_open_zone	*oz)
{
	struct xfs_rtgroup	*rtg = oz->oz_rtg;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		used = rtg_rmap(rtg)->i_used_blocks;

	trace_xfs_zone_full(rtg);

	WRITE_ONCE(rtg->rtg_open_zone, NULL);

	spin_lock(&zi->zi_open_zones_lock);
	if (oz->oz_is_gc) {
		ASSERT(current == zi->zi_gc_thread);
		zi->zi_open_gc_zone = NULL;
	} else {
		zi->zi_nr_open_zones--;
		list_del_init(&oz->oz_entry);
	}
	spin_unlock(&zi->zi_open_zones_lock);
	xfs_open_zone_put(oz);

	wake_up_all(&zi->zi_zone_wait);
	if (used < rtg_blocks(rtg))
		xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used);
}

static void
xfs_zone_record_blocks(
	struct xfs_trans	*tp,
	struct xfs_open_zone	*oz,
	xfs_fsblock_t		fsbno,
	xfs_filblks_t		len)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_rtgroup	*rtg = oz->oz_rtg;
	struct xfs_inode	*rmapip = rtg_rmap(rtg);

	trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len);

	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
	rmapip->i_used_blocks += len;
	ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
	oz->oz_written += len;
	if (oz->oz_written == rtg_blocks(rtg))
		xfs_open_zone_mark_full(oz);
	xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
}

/*
 * Called for blocks that have been written to disk, but not actually linked to
 * an inode, which can happen when garbage collection races with user data
 * writes to a file.
 */
static void
xfs_zone_skip_blocks(
	struct xfs_open_zone	*oz,
	xfs_filblks_t		len)
{
	struct xfs_rtgroup	*rtg = oz->oz_rtg;

	trace_xfs_zone_skip_blocks(oz, 0, len);

	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	oz->oz_written += len;
	if (oz->oz_written == rtg_blocks(rtg))
		xfs_open_zone_mark_full(oz);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);

	xfs_add_frextents(rtg_mount(rtg), len);
}
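
/*
 * Map a newly written extent into the data fork at I/O completion time.
 *
 * The existing mapping for the file range is unmapped and its blocks are
 * either freed or have their reference count decreased.  For GC writes the
 * caller passes the expected old start block; if a racing write already
 * replaced that data, the new blocks are skipped instead of mapped.
 */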
static int
xfs_zoned_map_extent(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*new,
	struct xfs_open_zone	*oz,
	xfs_fsblock_t		old_startblock)
{
	struct xfs_bmbt_irec	data;
	int			nmaps = 1;
	int			error;

	/* Grab the corresponding mapping in the data fork. */
	error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data,
			&nmaps, 0);
	if (error)
		return error;

	/*
	 * Cap the update to the existing extent in the data fork because we
	 * can only overwrite one extent at a time.
	 */
	ASSERT(new->br_blockcount >= data.br_blockcount);
	new->br_blockcount = data.br_blockcount;

	/*
	 * If a data write raced with this GC write, keep the existing data in
	 * the data fork, mark our newly written GC extent as reclaimable, then
	 * move on to the next extent.
	 *
	 * Note that this can also happen when racing with operations that do
	 * not actually invalidate the data, but just move it to a different
	 * inode (XFS_IOC_EXCHANGE_RANGE), or to a different offset inside the
	 * inode (FALLOC_FL_COLLAPSE_RANGE / FALLOC_FL_INSERT_RANGE).  If the
	 * data was just moved around, GC fails to free the zone, but the zone
	 * becomes a GC candidate again as soon as all previous GC I/O has
	 * finished and these blocks will be moved out eventually.
	 */
	if (old_startblock != NULLFSBLOCK &&
	    old_startblock != data.br_startblock)
		goto skip;

	trace_xfs_reflink_cow_remap_from(ip, new);
	trace_xfs_reflink_cow_remap_to(ip, &data);

	error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
			XFS_IEXT_REFLINK_END_COW_CNT);
	if (error)
		return error;

	if (data.br_startblock != HOLESTARTBLOCK) {
		ASSERT(data.br_startblock != DELAYSTARTBLOCK);
		ASSERT(!isnullstartblock(data.br_startblock));

		xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
		if (xfs_is_reflink_inode(ip)) {
			xfs_refcount_decrease_extent(tp, true, &data);
		} else {
			error = xfs_free_extent_later(tp, data.br_startblock,
					data.br_blockcount, NULL,
					XFS_AG_RESV_NONE,
					XFS_FREE_EXTENT_REALTIME);
			if (error)
				return error;
		}
	}

	xfs_zone_record_blocks(tp, oz, new->br_startblock, new->br_blockcount);

	/* Map the new blocks into the data fork. */
	xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
	return 0;

skip:
	trace_xfs_reflink_cow_remap_skip(ip, new);
	xfs_zone_skip_blocks(oz, new->br_blockcount);
	return 0;
}
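
/*
 * Remap blocks that were just written at @daddr into the data fork of @ip,
 * covering the byte range from @offset to @offset + @count.  Each data fork
 * extent is remapped in its own transaction.
 */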
int
xfs_zoned_end_io(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		count,
	xfs_daddr_t		daddr,
	struct xfs_open_zone	*oz,
	xfs_fsblock_t		old_startblock)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
	struct xfs_bmbt_irec	new = {
		.br_startoff	= XFS_B_TO_FSBT(mp, offset),
		.br_startblock	= xfs_daddr_to_rtb(mp, daddr),
		.br_state	= XFS_EXT_NORM,
	};
	unsigned int		resblks =
		XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
	struct xfs_trans	*tp;
	int			error;

	if (xfs_is_shutdown(mp))
		return -EIO;

	while (new.br_startoff < end_fsb) {
		new.br_blockcount = end_fsb - new.br_startoff;

		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
				XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp);
		if (error)
			return error;
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip, 0);

		error = xfs_zoned_map_extent(tp, ip, &new, oz, old_startblock);
		if (error)
			xfs_trans_cancel(tp);
		else
			error = xfs_trans_commit(tp);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		if (error)
			return error;

		new.br_startoff += new.br_blockcount;
		new.br_startblock += new.br_blockcount;
		if (old_startblock != NULLFSBLOCK)
			old_startblock += new.br_blockcount;
	}

	return 0;
}

/*
 * "Free" blocks allocated in a zone.
 *
 * Just decrement the used blocks counter and report the space as freed.
 */
int
xfs_zone_free_blocks(
	struct xfs_trans	*tp,
	struct xfs_rtgroup	*rtg,
	xfs_fsblock_t		fsbno,
	xfs_filblks_t		len)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_inode	*rmapip = rtg_rmap(rtg);

	xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL);

	if (len > rmapip->i_used_blocks) {
		xfs_err(mp,
"trying to free more blocks (%lld) than used counter (%u).",
			len, rmapip->i_used_blocks);
		ASSERT(len <= rmapip->i_used_blocks);
		xfs_rtginode_mark_sick(rtg, XFS_RTGI_RMAP);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		return -EFSCORRUPTED;
	}

	trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len);

	rmapip->i_used_blocks -= len;
	/*
	 * Don't add open zones to the reclaimable buckets.  The I/O completion
	 * for writing the last block will take care of accounting for already
	 * unused blocks instead.
	 */
	if (!READ_ONCE(rtg->rtg_open_zone))
		xfs_zone_account_reclaimable(rtg, len);
	xfs_add_frextents(mp, len);
	xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
	return 0;
}

static struct xfs_group *
xfs_find_free_zone(
	struct xfs_mount	*mp,
	unsigned long		start,
	unsigned long		end)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	XA_STATE		(xas, &mp->m_groups[XG_TYPE_RTG].xa, start);
	struct xfs_group	*xg;

	xas_lock(&xas);
	xas_for_each_marked(&xas, xg, end, XFS_RTG_FREE)
		if (atomic_inc_not_zero(&xg->xg_active_ref))
			goto found;
	xas_unlock(&xas);
	return NULL;

found:
	xas_clear_mark(&xas, XFS_RTG_FREE);
	atomic_dec(&zi->zi_nr_free_zones);
	zi->zi_free_zone_cursor = xg->xg_gno;
	xas_unlock(&xas);
	return xg;
}
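
/*
 * Set up an open zone structure for @rtg.  Both the allocation and the
 * written pointer start at @write_pointer, and the structure is only
 * published in rtg->rtg_open_zone once it is fully initialized.
 */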
static struct xfs_open_zone *
xfs_init_open_zone(
	struct xfs_rtgroup	*rtg,
	xfs_rgblock_t		write_pointer,
	enum rw_hint		write_hint,
	bool			is_gc)
{
	struct xfs_open_zone	*oz;

	oz = kzalloc(sizeof(*oz), GFP_NOFS | __GFP_NOFAIL);
	spin_lock_init(&oz->oz_alloc_lock);
	atomic_set(&oz->oz_ref, 1);
	oz->oz_rtg = rtg;
	oz->oz_allocated = write_pointer;
	oz->oz_written = write_pointer;
	oz->oz_write_hint = write_hint;
	oz->oz_is_gc = is_gc;

	/*
	 * All dereferences of rtg->rtg_open_zone hold the ILOCK for the rmap
	 * inode, but we don't really want to take that here because we are
	 * under the zone_list_lock.  Ensure the pointer is only set for a
	 * fully initialized open zone structure so that a racy lookup finding
	 * it is fine.
	 */
	WRITE_ONCE(rtg->rtg_open_zone, oz);
	return oz;
}

/*
 * Find a completely free zone, open it, and return a reference.
 */
struct xfs_open_zone *
xfs_open_zone(
	struct xfs_mount	*mp,
	enum rw_hint		write_hint,
	bool			is_gc)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_group	*xg;

	xg = xfs_find_free_zone(mp, zi->zi_free_zone_cursor, ULONG_MAX);
	if (!xg)
		xg = xfs_find_free_zone(mp, 0, zi->zi_free_zone_cursor);
	if (!xg)
		return NULL;

	set_current_state(TASK_RUNNING);
	return xfs_init_open_zone(to_rtg(xg), 0, write_hint, is_gc);
}

static struct xfs_open_zone *
xfs_try_open_zone(
	struct xfs_mount	*mp,
	enum rw_hint		write_hint)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz;

	if (zi->zi_nr_open_zones >= mp->m_max_open_zones - XFS_OPEN_GC_ZONES)
		return NULL;
	if (atomic_read(&zi->zi_nr_free_zones) <
	    XFS_GC_ZONES - XFS_OPEN_GC_ZONES)
		return NULL;

	/*
	 * Increment the open zone count to reserve our slot before dropping
	 * zi_open_zones_lock.
	 */
	zi->zi_nr_open_zones++;
	spin_unlock(&zi->zi_open_zones_lock);
	oz = xfs_open_zone(mp, write_hint, false);
	spin_lock(&zi->zi_open_zones_lock);
	if (!oz) {
		zi->zi_nr_open_zones--;
		return NULL;
	}

	atomic_inc(&oz->oz_ref);
	list_add_tail(&oz->oz_entry, &zi->zi_open_zones);

	/*
	 * If this was the last free zone, other waiters might be waiting
	 * on us to write to it as well.
	 */
	wake_up_all(&zi->zi_zone_wait);

	if (xfs_zoned_need_gc(mp))
		wake_up_process(zi->zi_gc_thread);

	trace_xfs_zone_opened(oz->oz_rtg);
	return oz;
}

enum xfs_zone_alloc_score {
	/* Any open zone will do it, we're desperate */
	XFS_ZONE_ALLOC_ANY	= 0,

	/* It better fit somehow */
	XFS_ZONE_ALLOC_OK	= 1,

	/* Only reuse a zone if it fits really well. */
	XFS_ZONE_ALLOC_GOOD	= 2,
};

/*
 * Life time hint co-location matrix.  Fields not set default to 0
 * aka XFS_ZONE_ALLOC_ANY.
 */
static const unsigned int
xfs_zoned_hint_score[WRITE_LIFE_HINT_NR][WRITE_LIFE_HINT_NR] = {
	[WRITE_LIFE_NOT_SET]	= {
		[WRITE_LIFE_NOT_SET]	= XFS_ZONE_ALLOC_OK,
	},
	[WRITE_LIFE_NONE]	= {
		[WRITE_LIFE_NONE]	= XFS_ZONE_ALLOC_OK,
	},
	[WRITE_LIFE_SHORT]	= {
		[WRITE_LIFE_SHORT]	= XFS_ZONE_ALLOC_GOOD,
	},
	[WRITE_LIFE_MEDIUM]	= {
		[WRITE_LIFE_MEDIUM]	= XFS_ZONE_ALLOC_GOOD,
	},
	[WRITE_LIFE_LONG]	= {
		[WRITE_LIFE_LONG]	= XFS_ZONE_ALLOC_OK,
		[WRITE_LIFE_EXTREME]	= XFS_ZONE_ALLOC_OK,
	},
	[WRITE_LIFE_EXTREME]	= {
		[WRITE_LIFE_LONG]	= XFS_ZONE_ALLOC_OK,
		[WRITE_LIFE_EXTREME]	= XFS_ZONE_ALLOC_OK,
	},
};
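
/*
 * The score for a (zone hint, file hint) pair decides how eagerly an already
 * open zone is reused: for example, data with a WRITE_LIFE_SHORT hint is only
 * considered a GOOD match for a zone already written with WRITE_LIFE_SHORT
 * data, while pairs not listed above score XFS_ZONE_ALLOC_ANY and are only
 * used when the caller accepts any open zone.
 */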
static bool
xfs_try_use_zone(
	struct xfs_zone_info	*zi,
	enum rw_hint		file_hint,
	struct xfs_open_zone	*oz,
	unsigned int		goodness)
{
	if (oz->oz_allocated == rtg_blocks(oz->oz_rtg))
		return false;

	if (xfs_zoned_hint_score[oz->oz_write_hint][file_hint] < goodness)
		return false;

	if (!atomic_inc_not_zero(&oz->oz_ref))
		return false;

	/*
	 * If we have a hint set for the data, use that for the zone even if
	 * some data was written already without any hint set, but don't change
	 * the temperature after that as that would make little sense without
	 * tracking per-temperature class written block counts, which is
	 * probably overkill anyway.
	 */
	if (file_hint != WRITE_LIFE_NOT_SET &&
	    oz->oz_write_hint == WRITE_LIFE_NOT_SET)
		oz->oz_write_hint = file_hint;

	/*
	 * If we couldn't match by inode or lifetime, we just pick the first
	 * zone with enough space above.  For that we want the least busy zone
	 * for some definition of "least" busy.  For now this simple LRU
	 * algorithm that rotates every zone to the end of the list will do it,
	 * even if it isn't exactly cache friendly.
	 */
	if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones))
		list_move_tail(&oz->oz_entry, &zi->zi_open_zones);
	return true;
}

static struct xfs_open_zone *
xfs_select_open_zone_lru(
	struct xfs_zone_info	*zi,
	enum rw_hint		file_hint,
	unsigned int		goodness)
{
	struct xfs_open_zone	*oz;

	lockdep_assert_held(&zi->zi_open_zones_lock);

	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
		if (xfs_try_use_zone(zi, file_hint, oz, goodness))
			return oz;

	cond_resched_lock(&zi->zi_open_zones_lock);
	return NULL;
}

static struct xfs_open_zone *
xfs_select_open_zone_mru(
	struct xfs_zone_info	*zi,
	enum rw_hint		file_hint)
{
	struct xfs_open_zone	*oz;

	lockdep_assert_held(&zi->zi_open_zones_lock);

	list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry)
		if (xfs_try_use_zone(zi, file_hint, oz, XFS_ZONE_ALLOC_ANY))
			return oz;

	cond_resched_lock(&zi->zi_open_zones_lock);
	return NULL;
}

static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip)
{
	if (xfs_has_nolifetime(ip->i_mount))
		return WRITE_LIFE_NOT_SET;
	return VFS_I(ip)->i_write_hint;
}

/*
 * Try to tightly pack small files that are written back after they were closed
 * instead of trying to open new zones for them or spread them to the least
 * recently used zone.  This optimizes the data layout for workloads that untar
 * or copy a lot of small files.  Right now this does not separate multiple such
 * streams.
 */
static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	size_t			zone_capacity =
		XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].blocks);

	/*
	 * Do not tightly pack files that already use a full zone, to avoid
	 * fragmentation.
	 */
	if (i_size_read(VFS_I(ip)) >= zone_capacity)
		return false;

	return !inode_is_open_for_write(VFS_I(ip)) &&
		!(ip->i_diflags & XFS_DIFLAG_APPEND);
}
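
/*
 * Pick an open zone for a write without sleeping.  Preference order: an open
 * zone whose write hint is a GOOD match, the most recently used zone when
 * packing small closed files tightly, a newly opened zone, an open zone that
 * is an OK hint match, and finally the least recently used open zone.
 */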
static struct xfs_open_zone *
xfs_select_zone_nowait(
	struct xfs_mount	*mp,
	enum rw_hint		write_hint,
	bool			pack_tight)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz = NULL;

	if (xfs_is_shutdown(mp))
		return NULL;

	/*
	 * Try to fill up open zones with matching temperature if available.
	 * It is better to try to co-locate data when this is favorable, so we
	 * can activate empty zones when it is statistically better to separate
	 * data.
	 */
	spin_lock(&zi->zi_open_zones_lock);
	oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_GOOD);
	if (oz)
		goto out_unlock;

	if (pack_tight)
		oz = xfs_select_open_zone_mru(zi, write_hint);
	if (oz)
		goto out_unlock;

	/*
	 * See if we can open a new zone and use that so that data for
	 * different files is mixed as little as possible.
	 */
	oz = xfs_try_open_zone(mp, write_hint);
	if (oz)
		goto out_unlock;

	/*
	 * Try to find a zone that is an OK match to co-locate data with.
	 */
	oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK);
	if (oz)
		goto out_unlock;

	/*
	 * Pick the least recently used zone, regardless of hint match.
	 */
	oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_ANY);
out_unlock:
	spin_unlock(&zi->zi_open_zones_lock);
	return oz;
}

static struct xfs_open_zone *
xfs_select_zone(
	struct xfs_mount	*mp,
	enum rw_hint		write_hint,
	bool			pack_tight)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	DEFINE_WAIT		(wait);
	struct xfs_open_zone	*oz;

	oz = xfs_select_zone_nowait(mp, write_hint, pack_tight);
	if (oz)
		return oz;

	for (;;) {
		prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE);
		oz = xfs_select_zone_nowait(mp, write_hint, pack_tight);
		if (oz || xfs_is_shutdown(mp))
			break;
		schedule();
	}
	finish_wait(&zi->zi_zone_wait, &wait);
	return oz;
}
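
/*
 * Allocate up to @count_fsb blocks from the write pointer of an open zone.
 *
 * Returns the number of bytes allocated (possibly less than requested, or
 * zero if the zone is full), the starting sector for the I/O, and whether
 * the zone requires sequential writes, i.e. a zone append operation.
 */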
static unsigned int
xfs_zone_alloc_blocks(
	struct xfs_open_zone	*oz,
	xfs_filblks_t		count_fsb,
	sector_t		*sector,
	bool			*is_seq)
{
	struct xfs_rtgroup	*rtg = oz->oz_rtg;
	struct xfs_mount	*mp = rtg_mount(rtg);
	xfs_rgblock_t		allocated;

	spin_lock(&oz->oz_alloc_lock);
	count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN,
		(xfs_filblks_t)rtg_blocks(rtg) - oz->oz_allocated);
	if (!count_fsb) {
		spin_unlock(&oz->oz_alloc_lock);
		return 0;
	}
	allocated = oz->oz_allocated;
	oz->oz_allocated += count_fsb;
	spin_unlock(&oz->oz_alloc_lock);

	trace_xfs_zone_alloc_blocks(oz, allocated, count_fsb);

	*sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector);
	if (!*is_seq)
		*sector += XFS_FSB_TO_BB(mp, allocated);
	return XFS_FSB_TO_B(mp, count_fsb);
}

void
xfs_mark_rtg_boundary(
	struct iomap_ioend	*ioend)
{
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
	sector_t		sector = ioend->io_bio.bi_iter.bi_sector;

	if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0)
		ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
}

/*
 * Check if we have a cached last open zone available for the inode and
 * if so return a reference to it.
 */
static struct xfs_open_zone *
xfs_get_cached_zone(
	struct xfs_inode	*ip)
{
	struct xfs_open_zone	*oz;

	rcu_read_lock();
	oz = VFS_I(ip)->i_private;
	if (oz) {
		/*
		 * GC only steals open zones at mount time, so no GC zones
		 * should end up in the cache.
		 */
		ASSERT(!oz->oz_is_gc);
		if (!atomic_inc_not_zero(&oz->oz_ref))
			oz = NULL;
	}
	rcu_read_unlock();

	return oz;
}

/*
 * Stash our zone in the inode so that it is reused for future allocations.
 *
 * The open_zone structure will be pinned until either the inode is freed or
 * until the cached open zone is replaced with a different one because the
 * current one was full when we tried to use it.  This means we keep any
 * open zone around forever as long as any inode that used it for the last
 * write is cached, which slightly increases the memory usage of cached inodes
 * that were ever written to, but significantly simplifies the cached zone
 * lookup.  Because the open_zone is clearly marked as full when all data
 * in the underlying RTG was written, the caching is always safe.
 */
static void
xfs_set_cached_zone(
	struct xfs_inode	*ip,
	struct xfs_open_zone	*oz)
{
	struct xfs_open_zone	*old_oz;

	atomic_inc(&oz->oz_ref);
	old_oz = xchg(&VFS_I(ip)->i_private, oz);
	if (old_oz)
		xfs_open_zone_put(old_oz);
}
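
/*
 * Hand an ioend to the block layer.  For zones that require sequential
 * writes the bio is converted to a zone append operation so that the device
 * reports the actual write location on completion; otherwise the precomputed
 * sector is used directly.
 */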
static void
xfs_submit_zoned_bio(
	struct iomap_ioend	*ioend,
	struct xfs_open_zone	*oz,
	bool			is_seq)
{
	ioend->io_bio.bi_iter.bi_sector = ioend->io_sector;
	ioend->io_private = oz;
	atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */

	if (is_seq) {
		ioend->io_bio.bi_opf &= ~REQ_OP_WRITE;
		ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
	} else {
		xfs_mark_rtg_boundary(ioend);
	}

	submit_bio(&ioend->io_bio);
}

void
xfs_zone_alloc_and_submit(
	struct iomap_ioend	*ioend,
	struct xfs_open_zone	**oz)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_mount	*mp = ip->i_mount;
	enum rw_hint		write_hint = xfs_inode_write_hint(ip);
	bool			pack_tight = xfs_zoned_pack_tight(ip);
	unsigned int		alloc_len;
	struct iomap_ioend	*split;
	bool			is_seq;

	if (xfs_is_shutdown(mp))
		goto out_error;

	/*
	 * If we don't have a locally cached zone in this write context, see if
	 * the inode is still associated with a zone and use that if so.
	 */
	if (!*oz)
		*oz = xfs_get_cached_zone(ip);

	if (!*oz) {
select_zone:
		*oz = xfs_select_zone(mp, write_hint, pack_tight);
		if (!*oz)
			goto out_error;
		xfs_set_cached_zone(ip, *oz);
	}

	alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
			&ioend->io_sector, &is_seq);
	if (!alloc_len) {
		xfs_open_zone_put(*oz);
		goto select_zone;
	}

	while ((split = iomap_split_ioend(ioend, alloc_len, is_seq))) {
		if (IS_ERR(split))
			goto out_split_error;
		alloc_len -= split->io_bio.bi_iter.bi_size;
		xfs_submit_zoned_bio(split, *oz, is_seq);
		if (!alloc_len) {
			xfs_open_zone_put(*oz);
			goto select_zone;
		}
	}

	xfs_submit_zoned_bio(ioend, *oz, is_seq);
	return;

out_split_error:
	ioend->io_bio.bi_status = errno_to_blk_status(PTR_ERR(split));
out_error:
	bio_io_error(&ioend->io_bio);
}

/*
 * Wake up all threads waiting for a zoned space allocation when the file
 * system is shut down.
 */
void
xfs_zoned_wake_all(
	struct xfs_mount	*mp)
{
	/*
	 * Don't wake up if there is no m_zone_info.  This is complicated by the
	 * fact that unmount can't atomically clear m_zone_info and thus we need
	 * to check SB_ACTIVE for that, but mount temporarily enables SB_ACTIVE
	 * during log recovery so we can't entirely rely on that either.
	 */
	if ((mp->m_super->s_flags & SB_ACTIVE) && mp->m_zone_info)
		wake_up_all(&mp->m_zone_info->zi_zone_wait);
}

/*
 * Check if @rgbno in @rtg is a potentially valid block.  It might still be
 * unused, but that information is only found in the rmap.
 */
bool
xfs_zone_rgbno_is_valid(
	struct xfs_rtgroup	*rtg,
	xfs_rgnumber_t		rgbno)
{
	lockdep_assert_held(&rtg_rmap(rtg)->i_lock);

	if (rtg->rtg_open_zone)
		return rgbno < rtg->rtg_open_zone->oz_allocated;
	return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa,
			rtg_rgno(rtg), XFS_RTG_FREE);
}

static void
xfs_free_open_zones(
	struct xfs_zone_info	*zi)
{
	struct xfs_open_zone	*oz;

	spin_lock(&zi->zi_open_zones_lock);
	while ((oz = list_first_entry_or_null(&zi->zi_open_zones,
			struct xfs_open_zone, oz_entry))) {
		list_del(&oz->oz_entry);
		xfs_open_zone_put(oz);
	}
	spin_unlock(&zi->zi_open_zones_lock);

	/*
	 * Wait for all open zones to be freed so that they drop the group
	 * references.
	 */
	rcu_barrier();
}
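
/*
 * State for the mount-time zone scan.  xfs_init_zone() adds up the blocks
 * that are still writable and the blocks that GC could reclaim; the totals
 * are used by xfs_mount_zones() to seed the free space counters.
 */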
struct xfs_init_zones {
	struct xfs_mount	*mp;
	uint64_t		available;
	uint64_t		reclaimable;
};

static int
xfs_init_zone(
	struct xfs_init_zones	*iz,
	struct xfs_rtgroup	*rtg,
	struct blk_zone		*zone)
{
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		used = rtg_rmap(rtg)->i_used_blocks;
	xfs_rgblock_t		write_pointer, highest_rgbno;
	int			error;

	if (zone && !xfs_zone_validate(zone, rtg, &write_pointer))
		return -EFSCORRUPTED;

	/*
	 * For sequential write required zones we retrieved the hardware write
	 * pointer above.
	 *
	 * For conventional zones or conventional devices we don't have that
	 * luxury.  Instead query the rmap to find the highest recorded block
	 * and set the write pointer to the block after that.  In case of a
	 * power loss this misses blocks where the data I/O has completed but
	 * not recorded in the rmap yet, and it also rewrites blocks if the most
	 * recently written ones got deleted again before unmount, but this is
	 * the best we can do without hardware support.
	 */
	if (!zone || zone->cond == BLK_ZONE_COND_NOT_WP) {
		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
		highest_rgbno = xfs_rtrmap_highest_rgbno(rtg);
		if (highest_rgbno == NULLRGBLOCK)
			write_pointer = 0;
		else
			write_pointer = highest_rgbno + 1;
		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
	}

	/*
	 * If there are no used blocks, but the zone is not in the empty state
	 * yet, we lost power before the zone reset.  In that case finish the
	 * work here.
	 */
	if (write_pointer == rtg_blocks(rtg) && used == 0) {
		error = xfs_zone_gc_reset_sync(rtg);
		if (error)
			return error;
		write_pointer = 0;
	}

	if (write_pointer == 0) {
		/* zone is empty */
		atomic_inc(&zi->zi_nr_free_zones);
		xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
		iz->available += rtg_blocks(rtg);
	} else if (write_pointer < rtg_blocks(rtg)) {
		/* zone is open */
		struct xfs_open_zone *oz;

		atomic_inc(&rtg_group(rtg)->xg_active_ref);
		oz = xfs_init_open_zone(rtg, write_pointer, WRITE_LIFE_NOT_SET,
				false);
		list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
		zi->zi_nr_open_zones++;

		iz->available += (rtg_blocks(rtg) - write_pointer);
		iz->reclaimable += write_pointer - used;
	} else if (used < rtg_blocks(rtg)) {
		/* zone fully written, but has freed blocks */
		xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used);
		iz->reclaimable += (rtg_blocks(rtg) - used);
	}

	return 0;
}

static int
xfs_get_zone_info_cb(
	struct blk_zone		*zone,
	unsigned int		idx,
	void			*data)
{
	struct xfs_init_zones	*iz = data;
	struct xfs_mount	*mp = iz->mp;
	xfs_fsblock_t		zsbno = xfs_daddr_to_rtb(mp, zone->start);
	xfs_rgnumber_t		rgno;
	struct xfs_rtgroup	*rtg;
	int			error;

	if (xfs_rtb_to_rgbno(mp, zsbno) != 0) {
		xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno);
		return -EFSCORRUPTED;
	}

	rgno = xfs_rtb_to_rgno(mp, zsbno);
	rtg = xfs_rtgroup_grab(mp, rgno);
	if (!rtg) {
		xfs_warn(mp, "realtime group not found for zone %u.", rgno);
		return -EFSCORRUPTED;
	}
	error = xfs_init_zone(iz, rtg, zone);
	xfs_rtgroup_rele(rtg);
	return error;
}

/*
 * Calculate the max open zone limit based on the number of backing zones
 * available.
 */
static inline uint32_t
xfs_max_open_zones(
	struct xfs_mount	*mp)
{
	unsigned int		max_open, max_open_data_zones;

	/*
	 * We need two zones for every open data zone, one in reserve as we
	 * don't reclaim open zones.  One data zone and its spare is included
	 * in XFS_MIN_ZONES to support at least one user data writer.
	 */
	max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1;
	max_open = max_open_data_zones + XFS_OPEN_GC_ZONES;

	/*
	 * Cap the max open limit to 1/4 of available space.  Without this we'd
	 * run out of easy reclaim targets too quickly and storage devices don't
	 * handle huge numbers of concurrent write streams overly well.
	 */
	max_open = min(max_open, mp->m_sb.sb_rgcount / 4);

	return max(XFS_MIN_OPEN_ZONES, max_open);
}

/*
 * Normally we use the open zone limit that the device reports.  If there is
 * none let the user pick one from the command line.
 *
 * If the device doesn't report an open zone limit and there is no override,
 * allow holding about a quarter of the zones open.  In theory we could allow
 * all to be open, but at that point we run into GC deadlocks because we can't
 * reclaim open zones.
 *
 * When used on conventional SSDs a lower open limit is advisable as we'll
 * otherwise overwhelm the FTL just as much as a conventional block allocator.
 *
 * Note: To debug the open zone management code, force max_open to 1 here.
 */
static int
xfs_calc_open_zones(
	struct xfs_mount	*mp)
{
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	unsigned int		bdev_open_zones = bdev_max_open_zones(bdev);

	if (!mp->m_max_open_zones) {
		if (bdev_open_zones)
			mp->m_max_open_zones = bdev_open_zones;
		else
			mp->m_max_open_zones = XFS_DEFAULT_MAX_OPEN_ZONES;
	}

	if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
		xfs_notice(mp, "need at least %u open zones.",
			XFS_MIN_OPEN_ZONES);
		return -EIO;
	}

	if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) {
		mp->m_max_open_zones = bdev_open_zones;
		xfs_info(mp, "limiting open zones to %u due to hardware limit.",
			bdev_open_zones);
	}

	if (mp->m_max_open_zones > xfs_max_open_zones(mp)) {
		mp->m_max_open_zones = xfs_max_open_zones(mp);
		xfs_info(mp,
"limiting open zones to %u due to total zone count (%u)",
			mp->m_max_open_zones, mp->m_sb.sb_rgcount);
	}

	return 0;
}

static unsigned long *
xfs_alloc_bucket_bitmap(
	struct xfs_mount	*mp)
{
	return kvmalloc_array(BITS_TO_LONGS(mp->m_sb.sb_rgcount),
			sizeof(unsigned long), GFP_KERNEL | __GFP_ZERO);
}
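
/*
 * Allocate the per-mount zone allocator state, including one used-space
 * bucket bitmap per bucket, each sized to hold a bit for every realtime
 * group.
 */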
static struct xfs_zone_info *
xfs_alloc_zone_info(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi;
	int			i;

	zi = kzalloc(sizeof(*zi), GFP_KERNEL);
	if (!zi)
		return NULL;
	INIT_LIST_HEAD(&zi->zi_open_zones);
	INIT_LIST_HEAD(&zi->zi_reclaim_reservations);
	spin_lock_init(&zi->zi_reset_list_lock);
	spin_lock_init(&zi->zi_open_zones_lock);
	spin_lock_init(&zi->zi_reservation_lock);
	init_waitqueue_head(&zi->zi_zone_wait);
	spin_lock_init(&zi->zi_used_buckets_lock);
	for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) {
		zi->zi_used_bucket_bitmap[i] = xfs_alloc_bucket_bitmap(mp);
		if (!zi->zi_used_bucket_bitmap[i])
			goto out_free_bitmaps;
	}
	return zi;

out_free_bitmaps:
	/* free all bitmaps allocated so far, including the first one */
	while (--i >= 0)
		kvfree(zi->zi_used_bucket_bitmap[i]);
	kfree(zi);
	return NULL;
}

static void
xfs_free_zone_info(
	struct xfs_zone_info	*zi)
{
	int			i;

	xfs_free_open_zones(zi);
	for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++)
		kvfree(zi->zi_used_bucket_bitmap[i]);
	kfree(zi);
}
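
/*
 * Set up the zoned allocator at mount time: validate the on-disk geometry,
 * compute the open zone limit, build the per-zone state either from a
 * hardware zone report or from the rmap for conventional devices, seed the
 * free space counters, and start garbage collection.
 */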
int
xfs_mount_zones(
	struct xfs_mount	*mp)
{
	struct xfs_init_zones	iz = {
		.mp		= mp,
	};
	struct xfs_buftarg	*bt = mp->m_rtdev_targp;
	int			error;

	if (!bt) {
		xfs_notice(mp, "RT device missing.");
		return -EINVAL;
	}

	if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) {
		xfs_notice(mp, "invalid flag combination.");
		return -EFSCORRUPTED;
	}
	if (mp->m_sb.sb_rextsize != 1) {
		xfs_notice(mp, "zoned file systems do not support rextsize.");
		return -EFSCORRUPTED;
	}
	if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) {
		xfs_notice(mp,
"zoned file systems need to have at least %u zones.", XFS_MIN_ZONES);
		return -EFSCORRUPTED;
	}

	error = xfs_calc_open_zones(mp);
	if (error)
		return error;

	mp->m_zone_info = xfs_alloc_zone_info(mp);
	if (!mp->m_zone_info)
		return -ENOMEM;

	xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
		 mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
		 mp->m_max_open_zones);
	trace_xfs_zones_mount(mp);

	if (bdev_is_zoned(bt->bt_bdev)) {
		error = blkdev_report_zones(bt->bt_bdev,
				XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),
				mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz);
		if (error < 0)
			goto out_free_zone_info;
	} else {
		struct xfs_rtgroup *rtg = NULL;

		while ((rtg = xfs_rtgroup_next(mp, rtg))) {
			error = xfs_init_zone(&iz, rtg, NULL);
			if (error)
				goto out_free_zone_info;
		}
	}

	xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available);
	xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
			iz.available + iz.reclaimable);

	/*
	 * The user may configure GC to free up a percentage of unused blocks.
	 * By default this is 0.  GC will always trigger at the minimum level
	 * for keeping max_open_zones available for data placement.
	 */
	mp->m_zonegc_low_space = 0;

	error = xfs_zone_gc_mount(mp);
	if (error)
		goto out_free_zone_info;
	return 0;

out_free_zone_info:
	xfs_free_zone_info(mp->m_zone_info);
	return error;
}

void
xfs_unmount_zones(
	struct xfs_mount	*mp)
{
	xfs_zone_gc_unmount(mp);
	xfs_free_zone_info(mp->m_zone_info);
}