// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs_platform.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_error.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_iomap.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_refcount.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"
#include "xfs_mru_cache.h"

static void
xfs_open_zone_free_rcu(
	struct callback_head	*cb)
{
	struct xfs_open_zone	*oz = container_of(cb, typeof(*oz), oz_rcu);

	xfs_rtgroup_rele(oz->oz_rtg);
	kfree(oz);
}

void
xfs_open_zone_put(
	struct xfs_open_zone	*oz)
{
	if (atomic_dec_and_test(&oz->oz_ref))
		call_rcu(&oz->oz_rcu, xfs_open_zone_free_rcu);
}

static inline uint32_t
xfs_zone_bucket(
	struct xfs_mount	*mp,
	uint32_t		used_blocks)
{
	return XFS_ZONE_USED_BUCKETS * used_blocks /
			mp->m_groups[XG_TYPE_RTG].blocks;
}

static inline void
xfs_zone_add_to_bucket(
	struct xfs_zone_info	*zi,
	xfs_rgnumber_t		rgno,
	uint32_t		to_bucket)
{
	__set_bit(rgno, zi->zi_used_bucket_bitmap[to_bucket]);
	zi->zi_used_bucket_entries[to_bucket]++;
}

static inline void
xfs_zone_remove_from_bucket(
	struct xfs_zone_info	*zi,
	xfs_rgnumber_t		rgno,
	uint32_t		from_bucket)
{
	__clear_bit(rgno, zi->zi_used_bucket_bitmap[from_bucket]);
	zi->zi_used_bucket_entries[from_bucket]--;
}

static void
xfs_zone_account_reclaimable(
	struct xfs_rtgroup	*rtg,
	uint32_t		freed)
{
	struct xfs_group	*xg = rtg_group(rtg);
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		used = rtg_rmap(rtg)->i_used_blocks;
	xfs_rgnumber_t		rgno = rtg_rgno(rtg);
	uint32_t		from_bucket = xfs_zone_bucket(mp, used + freed);
	uint32_t		to_bucket = xfs_zone_bucket(mp, used);
	bool			was_full = (used + freed == rtg_blocks(rtg));

	/*
	 * This can be called from log recovery, where the zone_info structure
	 * hasn't been allocated yet.  Skip all work as xfs_mount_zones will
	 * add the zones to the right buckets before the file system becomes
	 * active.
	 */
	if (!zi)
		return;

	if (!used) {
		/*
		 * The zone is now empty, remove it from the bottom bucket and
		 * trigger a reset.
		 */
		trace_xfs_zone_emptied(rtg);

		spin_lock(&zi->zi_used_buckets_lock);
		if (!was_full)
			xfs_zone_remove_from_bucket(zi, rgno, from_bucket);
		spin_unlock(&zi->zi_used_buckets_lock);

		spin_lock(&zi->zi_reset_list_lock);
		xg->xg_next_reset = zi->zi_reset_list;
		zi->zi_reset_list = xg;
		spin_unlock(&zi->zi_reset_list_lock);

		if (zi->zi_gc_thread)
			wake_up_process(zi->zi_gc_thread);
	} else if (was_full) {
		/*
		 * The zone transitioned from full, mark it as reclaimable and
		 * wake up GC, which might be waiting for zones to reclaim.
		 */
		spin_lock(&zi->zi_used_buckets_lock);
		xfs_zone_add_to_bucket(zi, rgno, to_bucket);
		spin_unlock(&zi->zi_used_buckets_lock);

		if (zi->zi_gc_thread && xfs_zoned_need_gc(mp))
			wake_up_process(zi->zi_gc_thread);
	} else if (to_bucket != from_bucket) {
		/*
		 * Move the zone to a new bucket if it dropped below the
		 * threshold.
		 */
		spin_lock(&zi->zi_used_buckets_lock);
		xfs_zone_add_to_bucket(zi, rgno, to_bucket);
		xfs_zone_remove_from_bucket(zi, rgno, from_bucket);
		spin_unlock(&zi->zi_used_buckets_lock);
	}
}

/*
 * Check if we have any zones that can be reclaimed by looking at the entry
 * counters for the zone buckets.
 */
bool
xfs_zoned_have_reclaimable(
	struct xfs_zone_info	*zi)
{
	int			i;

	spin_lock(&zi->zi_used_buckets_lock);
	for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) {
		if (zi->zi_used_bucket_entries[i]) {
			spin_unlock(&zi->zi_used_buckets_lock);
			return true;
		}
	}
	spin_unlock(&zi->zi_used_buckets_lock);

	return false;
}

static void
xfs_open_zone_mark_full(
	struct xfs_open_zone	*oz)
{
	struct xfs_rtgroup	*rtg = oz->oz_rtg;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		used = rtg_rmap(rtg)->i_used_blocks;

	trace_xfs_zone_full(rtg);

	WRITE_ONCE(rtg->rtg_open_zone, NULL);

	spin_lock(&zi->zi_open_zones_lock);
	if (oz->oz_is_gc)
		zi->zi_nr_open_gc_zones--;
	else
		zi->zi_nr_open_zones--;
	list_del_init(&oz->oz_entry);
	spin_unlock(&zi->zi_open_zones_lock);

	if (oz->oz_is_gc)
		wake_up_process(zi->zi_gc_thread);
	else
		wake_up_all(&zi->zi_zone_wait);

	if (used < rtg_blocks(rtg))
		xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used);
	xfs_open_zone_put(oz);
}

static inline void
xfs_zone_inc_written(
	struct xfs_open_zone	*oz,
	xfs_filblks_t		len)
{
	xfs_assert_ilocked(rtg_rmap(oz->oz_rtg), XFS_ILOCK_EXCL);

	oz->oz_written += len;
	if (oz->oz_written == rtg_blocks(oz->oz_rtg))
		xfs_open_zone_mark_full(oz);
}

/*
 * Called for blocks that have been written to disk, but not actually linked to
 * an inode, which can happen when garbage collection races with user data
 * writes to a file.
 */
static void
xfs_zone_skip_blocks(
	struct xfs_open_zone	*oz,
	xfs_filblks_t		len)
{
	struct xfs_rtgroup	*rtg = oz->oz_rtg;

	trace_xfs_zone_skip_blocks(oz, 0, len);

	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	xfs_zone_inc_written(oz, len);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);

	xfs_add_frextents(rtg_mount(rtg), len);
}

static int
xfs_zoned_map_extent(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*new,
	struct xfs_open_zone	*oz,
	xfs_fsblock_t		old_startblock)
{
	struct xfs_bmbt_irec	data;
	struct xfs_rtgroup	*rtg = oz->oz_rtg;
	struct xfs_inode	*rmapip = rtg_rmap(rtg);
	int			nmaps = 1;
	int			error;

	/* Grab the corresponding mapping in the data fork. */
	error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data,
			&nmaps, 0);
	if (error)
		return error;

	/*
	 * Cap the update to the existing extent in the data fork because we
	 * can only overwrite one extent at a time.
	 */
	ASSERT(new->br_blockcount >= data.br_blockcount);
	new->br_blockcount = data.br_blockcount;

	/*
	 * If a data write raced with this GC write, keep the existing data in
	 * the data fork, mark our newly written GC extent as reclaimable, then
	 * move on to the next extent.
	 *
	 * Note that this can also happen when racing with operations that do
	 * not actually invalidate the data, but just move it to a different
	 * inode (XFS_IOC_EXCHANGE_RANGE), or to a different offset inside the
	 * inode (FALLOC_FL_COLLAPSE_RANGE / FALLOC_FL_INSERT_RANGE).  If the
	 * data was just moved around, GC fails to free the zone, but the zone
	 * becomes a GC candidate again as soon as all previous GC I/O has
	 * finished and these blocks will be moved out eventually.
	 */
	if (old_startblock != NULLFSBLOCK &&
	    old_startblock != data.br_startblock)
		goto skip;

	trace_xfs_reflink_cow_remap_from(ip, new);
	trace_xfs_reflink_cow_remap_to(ip, &data);

	error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
			XFS_IEXT_REFLINK_END_COW_CNT);
	if (error)
		return error;

	if (data.br_startblock != HOLESTARTBLOCK) {
		ASSERT(data.br_startblock != DELAYSTARTBLOCK);
		ASSERT(!isnullstartblock(data.br_startblock));

		xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
		if (xfs_is_reflink_inode(ip)) {
			xfs_refcount_decrease_extent(tp, true, &data);
		} else {
			error = xfs_free_extent_later(tp, data.br_startblock,
					data.br_blockcount, NULL,
					XFS_AG_RESV_NONE,
					XFS_FREE_EXTENT_REALTIME);
			if (error)
				return error;
		}
	}

	trace_xfs_zone_record_blocks(oz,
			xfs_rtb_to_rgbno(tp->t_mountp, new->br_startblock),
			new->br_blockcount);
	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
	rmapip->i_used_blocks += new->br_blockcount;
	ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
	xfs_zone_inc_written(oz, new->br_blockcount);
	xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);

	/* Map the new blocks into the data fork. */
	xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
	return 0;

skip:
	trace_xfs_reflink_cow_remap_skip(ip, new);
	xfs_zone_skip_blocks(oz, new->br_blockcount);
	return 0;
}

int
xfs_zoned_end_io(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		count,
	xfs_daddr_t		daddr,
	struct xfs_open_zone	*oz,
	xfs_fsblock_t		old_startblock)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
	struct xfs_bmbt_irec	new = {
		.br_startoff	= XFS_B_TO_FSBT(mp, offset),
		.br_startblock	= xfs_daddr_to_rtb(mp, daddr),
		.br_state	= XFS_EXT_NORM,
	};
	unsigned int		resblks =
		XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
	struct xfs_trans	*tp;
	int			error;

	if (xfs_is_shutdown(mp))
		return -EIO;

	while (new.br_startoff < end_fsb) {
		new.br_blockcount = end_fsb - new.br_startoff;

		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
				XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp);
		if (error)
			return error;
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip, 0);

		error = xfs_zoned_map_extent(tp, ip, &new, oz, old_startblock);
		if (error)
			xfs_trans_cancel(tp);
		else
			error = xfs_trans_commit(tp);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		if (error)
			return error;

		new.br_startoff += new.br_blockcount;
		new.br_startblock += new.br_blockcount;
		if (old_startblock != NULLFSBLOCK)
			old_startblock += new.br_blockcount;
	}

	return 0;
}

/*
 * "Free" blocks allocated in a zone.
 *
 * Just decrement the used blocks counter and report the space as freed.
 */
int
xfs_zone_free_blocks(
	struct xfs_trans	*tp,
	struct xfs_rtgroup	*rtg,
	xfs_fsblock_t		fsbno,
	xfs_filblks_t		len)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_inode	*rmapip = rtg_rmap(rtg);

	xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL);

	if (len > rmapip->i_used_blocks) {
		xfs_err(mp,
"trying to free more blocks (%lld) than used counter (%u).",
			len, rmapip->i_used_blocks);
		ASSERT(len <= rmapip->i_used_blocks);
		xfs_rtginode_mark_sick(rtg, XFS_RTGI_RMAP);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		return -EFSCORRUPTED;
	}

	trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len);

	rmapip->i_used_blocks -= len;
	/*
	 * Don't add open zones to the reclaimable buckets.  The I/O completion
	 * for writing the last block will take care of accounting for already
	 * unused blocks instead.
	 */
	if (!READ_ONCE(rtg->rtg_open_zone))
		xfs_zone_account_reclaimable(rtg, len);
	xfs_add_frextents(mp, len);
	xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
	return 0;
}

static struct xfs_open_zone *
xfs_init_open_zone(
	struct xfs_rtgroup	*rtg,
	xfs_rgblock_t		write_pointer,
	enum rw_hint		write_hint,
	bool			is_gc)
{
	struct xfs_open_zone	*oz;

	oz = kzalloc_obj(*oz, GFP_NOFS | __GFP_NOFAIL);
	spin_lock_init(&oz->oz_alloc_lock);
	atomic_set(&oz->oz_ref, 1);
	oz->oz_rtg = rtg;
	oz->oz_allocated = write_pointer;
	oz->oz_written = write_pointer;
	oz->oz_write_hint = write_hint;
	oz->oz_is_gc = is_gc;

	/*
	 * All dereferences of rtg->rtg_open_zone hold the ILOCK for the rmap
	 * inode, but we don't really want to take that here because we are
	 * under the zone_list_lock.  Ensure the pointer is only set for a
	 * fully initialized open zone structure so that a racy lookup finding
	 * it is fine.
	 */
	WRITE_ONCE(rtg->rtg_open_zone, oz);
	return oz;
}

/*
 * Find a completely free zone, open it, and return a reference.
 */
struct xfs_open_zone *
xfs_open_zone(
	struct xfs_mount	*mp,
	enum rw_hint		write_hint,
	bool			is_gc)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	XA_STATE		(xas, &mp->m_groups[XG_TYPE_RTG].xa, 0);
	struct xfs_group	*xg;

	/*
	 * Pick the free zone with the lowest index.  Zones at the beginning of
	 * the address space typically provide higher bandwidth than those at
	 * the end of the address space on HDDs.
	 */
	xas_lock(&xas);
	xas_for_each_marked(&xas, xg, ULONG_MAX, XFS_RTG_FREE)
		if (atomic_inc_not_zero(&xg->xg_active_ref))
			goto found;
	xas_unlock(&xas);
	return NULL;

found:
	xas_clear_mark(&xas, XFS_RTG_FREE);
	atomic_dec(&zi->zi_nr_free_zones);
	xas_unlock(&xas);

	set_current_state(TASK_RUNNING);
	return xfs_init_open_zone(to_rtg(xg), 0, write_hint, is_gc);
}

static struct xfs_open_zone *
xfs_try_open_zone(
	struct xfs_mount	*mp,
	enum rw_hint		write_hint)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz;

	if (zi->zi_nr_open_zones >= mp->m_max_open_zones - XFS_OPEN_GC_ZONES)
		return NULL;
	if (atomic_read(&zi->zi_nr_free_zones) <
	    XFS_GC_ZONES - XFS_OPEN_GC_ZONES)
		return NULL;

	/*
	 * Increment the open zone count to reserve our slot before dropping
	 * zi_open_zones_lock.
	 */
	zi->zi_nr_open_zones++;
	spin_unlock(&zi->zi_open_zones_lock);
	oz = xfs_open_zone(mp, write_hint, false);
	spin_lock(&zi->zi_open_zones_lock);
	if (!oz) {
		zi->zi_nr_open_zones--;
		return NULL;
	}

	atomic_inc(&oz->oz_ref);
	list_add_tail(&oz->oz_entry, &zi->zi_open_zones);

	/*
	 * If this was the last free zone, other waiters might be waiting
	 * on us to write to it as well.
	 */
	wake_up_all(&zi->zi_zone_wait);

	if (xfs_zoned_need_gc(mp))
		wake_up_process(zi->zi_gc_thread);

	trace_xfs_zone_opened(oz->oz_rtg);
	return oz;
}

enum xfs_zone_alloc_score {
	/* Any open zone will do, we're desperate */
	XFS_ZONE_ALLOC_ANY	= 0,

	/* It better fit somehow */
	XFS_ZONE_ALLOC_OK	= 1,

	/* Only reuse a zone if it fits really well. */
	XFS_ZONE_ALLOC_GOOD	= 2,
};

/*
 * Life time hint co-location matrix.  Fields not set default to 0
 * aka XFS_ZONE_ALLOC_ANY.
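 *
 * The first index is the write hint already assigned to an open zone, the
 * second index is the write hint of the incoming data (see the lookup in
 * xfs_try_use_zone()).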
 */
static const unsigned int
xfs_zoned_hint_score[WRITE_LIFE_HINT_NR][WRITE_LIFE_HINT_NR] = {
	[WRITE_LIFE_NOT_SET]	= {
		[WRITE_LIFE_NOT_SET]	= XFS_ZONE_ALLOC_OK,
	},
	[WRITE_LIFE_NONE]	= {
		[WRITE_LIFE_NONE]	= XFS_ZONE_ALLOC_OK,
	},
	[WRITE_LIFE_SHORT]	= {
		[WRITE_LIFE_SHORT]	= XFS_ZONE_ALLOC_GOOD,
	},
	[WRITE_LIFE_MEDIUM]	= {
		[WRITE_LIFE_MEDIUM]	= XFS_ZONE_ALLOC_GOOD,
	},
	[WRITE_LIFE_LONG]	= {
		[WRITE_LIFE_LONG]	= XFS_ZONE_ALLOC_OK,
		[WRITE_LIFE_EXTREME]	= XFS_ZONE_ALLOC_OK,
	},
	[WRITE_LIFE_EXTREME]	= {
		[WRITE_LIFE_LONG]	= XFS_ZONE_ALLOC_OK,
		[WRITE_LIFE_EXTREME]	= XFS_ZONE_ALLOC_OK,
	},
};

static bool
xfs_try_use_zone(
	struct xfs_zone_info	*zi,
	enum rw_hint		file_hint,
	struct xfs_open_zone	*oz,
	unsigned int		goodness)
{
	if (oz->oz_is_gc)
		return false;

	if (oz->oz_allocated == rtg_blocks(oz->oz_rtg))
		return false;

	if (xfs_zoned_hint_score[oz->oz_write_hint][file_hint] < goodness)
		return false;

	if (!atomic_inc_not_zero(&oz->oz_ref))
		return false;

	/*
	 * If we have a hint set for the data, use that for the zone even if
	 * some data was written already without any hint set, but don't change
	 * the temperature after that as that would make little sense without
	 * tracking per-temperature class written block counts, which is
	 * probably overkill anyway.
	 */
	if (file_hint != WRITE_LIFE_NOT_SET &&
	    oz->oz_write_hint == WRITE_LIFE_NOT_SET)
		oz->oz_write_hint = file_hint;

	/*
	 * If we couldn't match by inode or life time we just pick the first
	 * zone with enough space above.  For that we want the least busy zone
	 * for some definition of "least" busy.  For now this simple LRU
	 * algorithm that rotates every zone to the end of the list will do it,
	 * even if it isn't exactly cache friendly.
	 */
	if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones))
		list_move_tail(&oz->oz_entry, &zi->zi_open_zones);
	return true;
}

static struct xfs_open_zone *
xfs_select_open_zone_lru(
	struct xfs_zone_info	*zi,
	enum rw_hint		file_hint,
	unsigned int		goodness)
{
	struct xfs_open_zone	*oz;

	lockdep_assert_held(&zi->zi_open_zones_lock);

	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
		if (xfs_try_use_zone(zi, file_hint, oz, goodness))
			return oz;

	cond_resched_lock(&zi->zi_open_zones_lock);
	return NULL;
}

static struct xfs_open_zone *
xfs_select_open_zone_mru(
	struct xfs_zone_info	*zi,
	enum rw_hint		file_hint)
{
	struct xfs_open_zone	*oz;

	lockdep_assert_held(&zi->zi_open_zones_lock);

	list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry)
		if (xfs_try_use_zone(zi, file_hint, oz, XFS_ZONE_ALLOC_OK))
			return oz;

	cond_resched_lock(&zi->zi_open_zones_lock);
	return NULL;
}

static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip)
{
	if (xfs_has_nolifetime(ip->i_mount))
		return WRITE_LIFE_NOT_SET;
	return VFS_I(ip)->i_write_hint;
}

/*
 * Try to tightly pack small files that are written back after they were closed
 * instead of trying to open new zones for them or spread them to the least
 * recently used zone.  This optimizes the data layout for workloads that untar
 * or copy a lot of small files.  Right now this does not separate multiple
 * such streams.
 */
static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	size_t			zone_capacity =
		XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].blocks);

	/*
	 * Do not pack files that already fill an entire zone, to avoid
	 * fragmentation.
	 */
	if (i_size_read(VFS_I(ip)) >= zone_capacity)
		return false;

	return !inode_is_open_for_write(VFS_I(ip)) &&
		!(ip->i_diflags & XFS_DIFLAG_APPEND);
}

static struct xfs_open_zone *
xfs_select_zone_nowait(
	struct xfs_mount	*mp,
	enum rw_hint		write_hint,
	bool			pack_tight)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz = NULL;

	if (xfs_is_shutdown(mp))
		return NULL;

	/*
	 * Try to fill up open zones with matching temperature if available.
	 * It is better to try to co-locate data when this is favorable, so we
	 * can activate empty zones when it is statistically better to separate
	 * data.
	 */
	spin_lock(&zi->zi_open_zones_lock);
	oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_GOOD);
	if (oz)
		goto out_unlock;

	if (pack_tight) {
		oz = xfs_select_open_zone_mru(zi, write_hint);
		if (oz)
			goto out_unlock;
	}

	/*
	 * See if we can open a new zone and use that so that data for
	 * different files is mixed as little as possible.
	 */
	oz = xfs_try_open_zone(mp, write_hint);
	if (oz)
		goto out_unlock;

	/*
	 * Try to find a zone that is an ok match to colocate data with.
	 */
	oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_OK);
	if (oz)
		goto out_unlock;

	/*
	 * Pick the least recently used zone, regardless of hint match.
	 */
	oz = xfs_select_open_zone_lru(zi, write_hint, XFS_ZONE_ALLOC_ANY);
out_unlock:
	spin_unlock(&zi->zi_open_zones_lock);
	return oz;
}

static struct xfs_open_zone *
xfs_select_zone(
	struct xfs_mount	*mp,
	enum rw_hint		write_hint,
	bool			pack_tight)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	DEFINE_WAIT		(wait);
	struct xfs_open_zone	*oz;

	oz = xfs_select_zone_nowait(mp, write_hint, pack_tight);
	if (oz)
		return oz;

	for (;;) {
		prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE);
		oz = xfs_select_zone_nowait(mp, write_hint, pack_tight);
		if (oz || xfs_is_shutdown(mp))
			break;
		schedule();
	}
	finish_wait(&zi->zi_zone_wait, &wait);
	return oz;
}

static unsigned int
xfs_zone_alloc_blocks(
	struct xfs_open_zone	*oz,
	xfs_filblks_t		count_fsb,
	sector_t		*sector,
	bool			*is_seq)
{
	struct xfs_rtgroup	*rtg = oz->oz_rtg;
	struct xfs_mount	*mp = rtg_mount(rtg);
	xfs_rgblock_t		allocated;

	spin_lock(&oz->oz_alloc_lock);
	count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN,
		(xfs_filblks_t)rtg_blocks(rtg) - oz->oz_allocated);
	if (!count_fsb) {
		spin_unlock(&oz->oz_alloc_lock);
		return 0;
	}
	allocated = oz->oz_allocated;
	oz->oz_allocated += count_fsb;
	spin_unlock(&oz->oz_alloc_lock);

	trace_xfs_zone_alloc_blocks(oz, allocated, count_fsb);

	*sector = xfs_gbno_to_daddr(rtg_group(rtg), 0);
	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector);
	if (!*is_seq)
		*sector += XFS_FSB_TO_BB(mp, allocated);
	return XFS_FSB_TO_B(mp, count_fsb);
}

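/*
 * Flag ioends that start at the first block of a zone (realtime group) with
 * IOMAP_IOEND_BOUNDARY so that they are not merged with I/O for the preceding
 * zone.
 */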
void
xfs_mark_rtg_boundary(
	struct iomap_ioend	*ioend)
{
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
	sector_t		sector = ioend->io_bio.bi_iter.bi_sector;

	if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0)
		ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
}

/*
 * Check if we have a cached last open zone available for the inode and
 * if so return a reference to it.
 */
static struct xfs_open_zone *
xfs_get_cached_zone(
	struct xfs_inode	*ip)
{
	struct xfs_open_zone	*oz;

	rcu_read_lock();
	oz = VFS_I(ip)->i_private;
	if (oz) {
		/*
		 * GC only steals open zones at mount time, so no GC zones
		 * should end up in the cache.
		 */
		ASSERT(!oz->oz_is_gc);
		if (!atomic_inc_not_zero(&oz->oz_ref))
			oz = NULL;
	}
	rcu_read_unlock();

	return oz;
}

/*
 * Stash our zone in the inode so that it is reused for future allocations.
 *
 * The open_zone structure will be pinned until either the inode is freed or
 * until the cached open zone is replaced with a different one because the
 * current one was full when we tried to use it.  This means we keep any open
 * zone around as long as any inode that used it for the last write is cached,
 * which slightly increases the memory use of cached inodes that were ever
 * written to, but significantly simplifies the cached zone lookup.  Because
 * the open_zone is clearly marked as full when all data in the underlying RTG
 * was written, the caching is always safe.
 */
static void
xfs_set_cached_zone(
	struct xfs_inode	*ip,
	struct xfs_open_zone	*oz)
{
	struct xfs_open_zone	*old_oz;

	atomic_inc(&oz->oz_ref);
	old_oz = xchg(&VFS_I(ip)->i_private, oz);
	if (old_oz)
		xfs_open_zone_put(old_oz);
}

static void
xfs_submit_zoned_bio(
	struct iomap_ioend	*ioend,
	struct xfs_open_zone	*oz,
	bool			is_seq)
{
	ioend->io_bio.bi_iter.bi_sector = ioend->io_sector;
	ioend->io_private = oz;
	atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */

	if (is_seq) {
		ioend->io_bio.bi_opf &= ~REQ_OP_WRITE;
		ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
	} else {
		xfs_mark_rtg_boundary(ioend);
	}

	submit_bio(&ioend->io_bio);
}

void
xfs_zone_alloc_and_submit(
	struct iomap_ioend	*ioend,
	struct xfs_open_zone	**oz)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_mount	*mp = ip->i_mount;
	enum rw_hint		write_hint = xfs_inode_write_hint(ip);
	bool			pack_tight = xfs_zoned_pack_tight(ip);
	unsigned int		alloc_len;
	struct iomap_ioend	*split;
	bool			is_seq;

	if (xfs_is_shutdown(mp))
		goto out_error;

	/*
	 * If we don't have a locally cached zone in this write context, see if
	 * the inode is still associated with a zone and use that if so.
	 */
	if (!*oz)
		*oz = xfs_get_cached_zone(ip);

	if (!*oz) {
select_zone:
		*oz = xfs_select_zone(mp, write_hint, pack_tight);
		if (!*oz)
			goto out_error;
		xfs_set_cached_zone(ip, *oz);
	}

	alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
			&ioend->io_sector, &is_seq);
	if (!alloc_len) {
		xfs_open_zone_put(*oz);
		goto select_zone;
	}

	while ((split = iomap_split_ioend(ioend, alloc_len, is_seq))) {
		if (IS_ERR(split))
			goto out_split_error;
		alloc_len -= split->io_bio.bi_iter.bi_size;
		xfs_submit_zoned_bio(split, *oz, is_seq);
		if (!alloc_len) {
			xfs_open_zone_put(*oz);
			goto select_zone;
		}
	}

	xfs_submit_zoned_bio(ioend, *oz, is_seq);
	return;

out_split_error:
	ioend->io_bio.bi_status = errno_to_blk_status(PTR_ERR(split));
out_error:
	bio_io_error(&ioend->io_bio);
}

/*
 * Wake up all threads waiting for a zoned space allocation when the file
 * system is shut down.
 */
void
xfs_zoned_wake_all(
	struct xfs_mount	*mp)
{
	/*
	 * Don't wake up if there is no m_zone_info.  This is complicated by
	 * the fact that unmount can't atomically clear m_zone_info and thus we
	 * need to check SB_ACTIVE for that, but mount temporarily enables
	 * SB_ACTIVE during log recovery so we can't entirely rely on that
	 * either.
	 */
	if ((mp->m_super->s_flags & SB_ACTIVE) && mp->m_zone_info)
		wake_up_all(&mp->m_zone_info->zi_zone_wait);
}

/*
 * Check if @rgbno in @rtg is a potentially valid block.  It might still be
 * unused, but that information is only found in the rmap.
 */
bool
xfs_zone_rgbno_is_valid(
	struct xfs_rtgroup	*rtg,
	xfs_rgblock_t		rgbno)
{
	lockdep_assert_held(&rtg_rmap(rtg)->i_lock);

	if (rtg->rtg_open_zone)
		return rgbno < rtg->rtg_open_zone->oz_allocated;
	return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa,
			rtg_rgno(rtg), XFS_RTG_FREE);
}

static void
xfs_free_open_zones(
	struct xfs_zone_info	*zi)
{
	struct xfs_open_zone	*oz;

	spin_lock(&zi->zi_open_zones_lock);
	while ((oz = list_first_entry_or_null(&zi->zi_open_zones,
			struct xfs_open_zone, oz_entry))) {
		list_del(&oz->oz_entry);
		xfs_open_zone_put(oz);
	}
	spin_unlock(&zi->zi_open_zones_lock);

	/*
	 * Wait for all open zones to be freed so that they drop the group
	 * references.
	 */
	rcu_barrier();
}

struct xfs_init_zones {
	uint32_t		zone_size;
	uint32_t		zone_capacity;
	uint64_t		available;
	uint64_t		reclaimable;
};

/*
 * For sequential write required zones, we restart writing at the hardware
 * write pointer returned by xfs_validate_blk_zone().
 *
 * For conventional zones or conventional devices we have to query the rmap to
 * find the highest recorded block and set the write pointer to the block after
 * that.  In case of a power loss this misses blocks where the data I/O has
 * completed but was not yet recorded in the rmap, and it also rewrites blocks
 * if the most recently written ones got deleted again before unmount, but this
 * is the best we can do without hardware support.
 */
static int
xfs_query_write_pointer(
	struct xfs_init_zones	*iz,
	struct xfs_rtgroup	*rtg,
	xfs_rgblock_t		*write_pointer)
{
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	sector_t		start = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
	xfs_rgblock_t		highest_rgbno;
	struct blk_zone		zone = {};
	int			error;

	if (bdev_is_zoned(bdev)) {
		error = blkdev_get_zone_info(bdev, start, &zone);
		if (error)
			return error;
		if (zone.start != start) {
			xfs_warn(mp, "mismatched zone start: 0x%llx/0x%llx.",
					zone.start, start);
			return -EFSCORRUPTED;
		}

		if (!xfs_validate_blk_zone(mp, &zone, rtg_rgno(rtg),
				iz->zone_size, iz->zone_capacity,
				write_pointer))
			return -EFSCORRUPTED;

		/*
		 * Use the hardware write pointer returned by
		 * xfs_validate_blk_zone for sequential write required zones,
		 * else fall through to the rmap-based estimation below.
		 */
		if (zone.cond != BLK_ZONE_COND_NOT_WP)
			return 0;
	}

	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	highest_rgbno = xfs_rtrmap_highest_rgbno(rtg);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);

	if (highest_rgbno == NULLRGBLOCK)
		*write_pointer = 0;
	else
		*write_pointer = highest_rgbno + 1;
	return 0;
}

static int
xfs_init_zone(
	struct xfs_init_zones	*iz,
	struct xfs_rtgroup	*rtg,
	xfs_rgblock_t		write_pointer)
{
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		used = rtg_rmap(rtg)->i_used_blocks;
	int			error;

	if (write_pointer > rtg->rtg_extents) {
		xfs_warn(mp, "zone %u has invalid write pointer (0x%x).",
			rtg_rgno(rtg), write_pointer);
		return -EFSCORRUPTED;
	}

	if (used > rtg->rtg_extents) {
		xfs_warn(mp,
"zone %u has used counter (0x%x) larger than zone capacity (0x%llx).",
			rtg_rgno(rtg), used, rtg->rtg_extents);
		return -EFSCORRUPTED;
	}

	if (used > write_pointer) {
		xfs_warn(mp,
"zone %u has used counter (0x%x) larger than write pointer (0x%x).",
			rtg_rgno(rtg), used, write_pointer);
		return -EFSCORRUPTED;
	}

	if (write_pointer == 0 && used != 0) {
		xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).",
			rtg_rgno(rtg), used);
		return -EFSCORRUPTED;
	}

	/*
	 * If there are no used blocks, but the zone is not in the empty state
	 * yet, we lost power before the zone reset completed.  In that case
	 * finish the work here.
	 */
	if (write_pointer == rtg_blocks(rtg) && used == 0) {
		error = xfs_zone_gc_reset_sync(rtg);
		if (error)
			return error;
		write_pointer = 0;
	}

	if (write_pointer == 0) {
		/* zone is empty */
		atomic_inc(&zi->zi_nr_free_zones);
		xfs_group_set_mark(rtg_group(rtg), XFS_RTG_FREE);
		iz->available += rtg_blocks(rtg);
	} else if (write_pointer < rtg_blocks(rtg)) {
		/* zone is open */
		struct xfs_open_zone *oz;

		atomic_inc(&rtg_group(rtg)->xg_active_ref);
		oz = xfs_init_open_zone(rtg, write_pointer, WRITE_LIFE_NOT_SET,
				false);
		list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
		zi->zi_nr_open_zones++;

		iz->available += (rtg_blocks(rtg) - write_pointer);
		iz->reclaimable += write_pointer - used;
	} else if (used < rtg_blocks(rtg)) {
		/* zone fully written, but has freed blocks */
		xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used);
		iz->reclaimable += (rtg_blocks(rtg) - used);
	}

	return 0;
}

/*
 * Calculate the max open zone limit based on the number of backing zones
 * available.
 */
static inline uint32_t
xfs_max_open_zones(
	struct xfs_mount	*mp)
{
	unsigned int		max_open, max_open_data_zones;

	/*
	 * We need two zones for every open data zone, one in reserve as we
	 * don't reclaim open zones.  One data zone and its spare are included
	 * in XFS_MIN_ZONES to support at least one user data writer.
	 */
	max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1;
	max_open = max_open_data_zones + XFS_OPEN_GC_ZONES;

	/*
	 * Cap the max open limit to 1/4 of the available zones.  Without this
	 * we'd run out of easy reclaim targets too quickly, and storage
	 * devices don't handle huge numbers of concurrent write streams overly
	 * well.
	 */
	max_open = min(max_open, mp->m_sb.sb_rgcount / 4);

	return max(XFS_MIN_OPEN_ZONES, max_open);
}

/*
 * Normally we use the open zone limit that the device reports.  If there is
 * none, let the user pick one from the command line.
 *
 * If the device doesn't report an open zone limit and there is no override,
 * allow holding about a quarter of the zones open.  In theory we could allow
 * all to be open, but at that point we run into GC deadlocks because we can't
 * reclaim open zones.
 *
 * When used on conventional SSDs a lower open limit is advisable as we'll
 * otherwise overwhelm the FTL just as much as a conventional block allocator.
 *
 * Note: To debug the open zone management code, force max_open to 1 here.
 */
static int
xfs_calc_open_zones(
	struct xfs_mount	*mp)
{
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	unsigned int		bdev_open_zones = bdev_max_open_zones(bdev);

	if (!mp->m_max_open_zones) {
		if (bdev_open_zones)
			mp->m_max_open_zones = bdev_open_zones;
		else
			mp->m_max_open_zones = XFS_DEFAULT_MAX_OPEN_ZONES;
	}

	if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
		xfs_notice(mp, "need at least %u open zones.",
			XFS_MIN_OPEN_ZONES);
		return -EIO;
	}

	if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) {
		mp->m_max_open_zones = bdev_open_zones;
		xfs_info(mp, "limiting open zones to %u due to hardware limit.",
			bdev_open_zones);
	}

	if (mp->m_max_open_zones > xfs_max_open_zones(mp)) {
		mp->m_max_open_zones = xfs_max_open_zones(mp);
		xfs_info(mp,
"limiting open zones to %u due to total zone count (%u)",
			mp->m_max_open_zones, mp->m_sb.sb_rgcount);
	}

	return 0;
}

static unsigned long *
xfs_alloc_bucket_bitmap(
	struct xfs_mount	*mp)
{
	return kvmalloc_array(BITS_TO_LONGS(mp->m_sb.sb_rgcount),
			sizeof(unsigned long), GFP_KERNEL | __GFP_ZERO);
}

static struct xfs_zone_info *
xfs_alloc_zone_info(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi;
	int			i;

	zi = kzalloc_obj(*zi, GFP_KERNEL);
	if (!zi)
		return NULL;
	INIT_LIST_HEAD(&zi->zi_open_zones);
	INIT_LIST_HEAD(&zi->zi_reclaim_reservations);
	spin_lock_init(&zi->zi_reset_list_lock);
	spin_lock_init(&zi->zi_open_zones_lock);
	spin_lock_init(&zi->zi_reservation_lock);
	init_waitqueue_head(&zi->zi_zone_wait);
	spin_lock_init(&zi->zi_used_buckets_lock);
	for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) {
		zi->zi_used_bucket_bitmap[i] = xfs_alloc_bucket_bitmap(mp);
		if (!zi->zi_used_bucket_bitmap[i])
			goto out_free_bitmaps;
	}
	return zi;

out_free_bitmaps:
	while (--i >= 0)
		kvfree(zi->zi_used_bucket_bitmap[i]);
	kfree(zi);
	return NULL;
}

static void
xfs_free_zone_info(
	struct xfs_zone_info	*zi)
{
	int			i;

	xfs_free_open_zones(zi);
	for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++)
		kvfree(zi->zi_used_bucket_bitmap[i]);
	kfree(zi);
}

static int
xfs_report_zones(
	struct xfs_mount	*mp,
	struct xfs_init_zones	*iz)
{
	struct xfs_rtgroup	*rtg = NULL;

	while ((rtg = xfs_rtgroup_next(mp, rtg))) {
		xfs_rgblock_t	write_pointer;
		int		error;

		error = xfs_query_write_pointer(iz, rtg, &write_pointer);
		if (!error)
			error = xfs_init_zone(iz, rtg, write_pointer);
		if (error) {
			xfs_rtgroup_rele(rtg);
			return error;
		}
	}

	return 0;
}

static inline bool
xfs_zone_is_conv(
	struct xfs_rtgroup	*rtg)
{
	return !bdev_zone_is_seq(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
			xfs_gbno_to_daddr(rtg_group(rtg), 0));
}

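/*
 * Return the conventional open zone with the most blocks allocated, or NULL
 * if there is no conventional open zone.
 */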
static struct xfs_open_zone *
xfs_find_fullest_conventional_open_zone(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*found = NULL, *oz;

	spin_lock(&zi->zi_open_zones_lock);
	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
		if (!xfs_zone_is_conv(oz->oz_rtg))
			continue;
		if (!found || oz->oz_allocated > found->oz_allocated)
			found = oz;
	}
	spin_unlock(&zi->zi_open_zones_lock);

	return found;
}

/*
 * Find the fullest conventional zones and remove them from the open zone pool
 * until we are at the open zone limit.
 *
 * We can end up with spurious "open" zones when the last blocks in a fully
 * written zone were invalidated, as there is no write pointer for conventional
 * zones.
 *
 * If we are still over the limit when there is no conventional open zone left,
 * the user overrode the max open zones limit using the max_open_zones mount
 * option and we have to fail.
 */
static int
xfs_finish_spurious_open_zones(
	struct xfs_mount	*mp,
	struct xfs_init_zones	*iz)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;

	while (zi->zi_nr_open_zones > mp->m_max_open_zones) {
		struct xfs_open_zone	*oz;
		xfs_filblks_t		adjust;

		oz = xfs_find_fullest_conventional_open_zone(mp);
		if (!oz) {
			xfs_err(mp,
"too many open zones for max_open_zones limit (%u/%u)",
				zi->zi_nr_open_zones, mp->m_max_open_zones);
			return -EINVAL;
		}

		xfs_rtgroup_lock(oz->oz_rtg, XFS_RTGLOCK_RMAP);
		adjust = rtg_blocks(oz->oz_rtg) - oz->oz_written;
		trace_xfs_zone_spurious_open(oz, oz->oz_written, adjust);
		oz->oz_written = rtg_blocks(oz->oz_rtg);
		xfs_open_zone_mark_full(oz);
		xfs_rtgroup_unlock(oz->oz_rtg, XFS_RTGLOCK_RMAP);
		iz->available -= adjust;
		iz->reclaimable += adjust;
	}

	return 0;
}

int
xfs_mount_zones(
	struct xfs_mount	*mp)
{
	struct xfs_init_zones	iz = {
		.zone_capacity	= mp->m_groups[XG_TYPE_RTG].blocks,
		.zone_size	= xfs_rtgroup_raw_size(mp),
	};
	int			error;

	if (!mp->m_rtdev_targp) {
		xfs_notice(mp, "RT device missing.");
		return -EINVAL;
	}

	if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) {
		xfs_notice(mp, "invalid flag combination.");
		return -EFSCORRUPTED;
	}
	if (mp->m_sb.sb_rextsize != 1) {
		xfs_notice(mp, "zoned file systems do not support rextsize.");
		return -EFSCORRUPTED;
	}
	if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) {
		xfs_notice(mp,
"zoned file systems need to have at least %u zones.", XFS_MIN_ZONES);
		return -EFSCORRUPTED;
	}

	error = xfs_calc_open_zones(mp);
	if (error)
		return error;

	mp->m_zone_info = xfs_alloc_zone_info(mp);
	if (!mp->m_zone_info)
		return -ENOMEM;

	error = xfs_report_zones(mp, &iz);
	if (error)
		goto out_free_zone_info;

	error = xfs_finish_spurious_open_zones(mp, &iz);
	if (error)
		goto out_free_zone_info;

	xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available);
	xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
			iz.available + iz.reclaimable);

	/*
	 * The writeback code switches between inodes regularly to provide
	 * fairness.  The default lower bound is 4MiB, but for zoned file
	 * systems we want to increase that, both to reduce seeks and, more
	 * importantly, so that workloads that write files in multiples of the
	 * zone size do not get fragmented and require garbage collection when
	 * they shouldn't.  Increase it to the zone size, capped by the max
	 * extent length.
	 *
	 * Note that because s_min_writeback_pages is a superblock field, this
	 * value also gets applied to non-zoned files on the data device if
	 * there are any.  On a typical zoned setup all data is on the RT
	 * device because using the more efficient sequential write required
	 * zones is the reason for using the zone allocator, and either the RT
	 * device and the (meta)data device are on the same block device, or
	 * the (meta)data device is on a fast SSD while the data on the RT
	 * device is on an SMR HDD.  In any combination of the above cases
	 * enforcing the higher min_writeback_pages for non-RT inodes is either
	 * a no-op or beneficial.
	 */
	mp->m_super->s_min_writeback_pages =
		XFS_FSB_TO_B(mp, min(iz.zone_capacity, XFS_MAX_BMBT_EXTLEN)) >>
			PAGE_SHIFT;

	/*
	 * The user may configure GC to free up a percentage of unused blocks.
	 * By default this is 0.  GC will always trigger at the minimum level
	 * for keeping max_open_zones available for data placement.
	 */
	mp->m_zonegc_low_space = 0;

	error = xfs_zone_gc_mount(mp);
	if (error)
		goto out_free_zone_info;

	xfs_info(mp, "%u zones of %u blocks (%u max open zones)",
		 mp->m_sb.sb_rgcount, iz.zone_capacity, mp->m_max_open_zones);
	trace_xfs_zones_mount(mp);
	return 0;

out_free_zone_info:
	xfs_free_zone_info(mp->m_zone_info);
	return error;
}

void
xfs_unmount_zones(
	struct xfs_mount	*mp)
{
	xfs_zone_gc_unmount(mp);
	xfs_free_zone_info(mp->m_zone_info);
}