// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_error.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_iomap.h"
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
#include "xfs_refcount.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"
#include "xfs_mru_cache.h"

void
xfs_open_zone_put(
	struct xfs_open_zone	*oz)
{
	if (atomic_dec_and_test(&oz->oz_ref)) {
		xfs_rtgroup_rele(oz->oz_rtg);
		kfree(oz);
	}
}

static inline uint32_t
xfs_zone_bucket(
	struct xfs_mount	*mp,
	uint32_t		used_blocks)
{
	return XFS_ZONE_USED_BUCKETS * used_blocks /
		mp->m_groups[XG_TYPE_RTG].blocks;
}

static inline void
xfs_zone_add_to_bucket(
	struct xfs_zone_info	*zi,
	xfs_rgnumber_t		rgno,
	uint32_t		to_bucket)
{
	__set_bit(rgno, zi->zi_used_bucket_bitmap[to_bucket]);
	zi->zi_used_bucket_entries[to_bucket]++;
}

static inline void
xfs_zone_remove_from_bucket(
	struct xfs_zone_info	*zi,
	xfs_rgnumber_t		rgno,
	uint32_t		from_bucket)
{
	__clear_bit(rgno, zi->zi_used_bucket_bitmap[from_bucket]);
	zi->zi_used_bucket_entries[from_bucket]--;
}

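/*
 * Account for a change in the number of used blocks in a zone.
 *
 * Partially used zones are tracked in buckets keyed by their fraction of used
 * blocks.  A zone that just became empty is queued for a zone reset instead,
 * and a zone that just stopped being full is newly marked as reclaimable.
 */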
static void
xfs_zone_account_reclaimable(
	struct xfs_rtgroup	*rtg,
	uint32_t		freed)
{
	struct xfs_group	*xg = &rtg->rtg_group;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		used = rtg_rmap(rtg)->i_used_blocks;
	xfs_rgnumber_t		rgno = rtg_rgno(rtg);
	uint32_t		from_bucket = xfs_zone_bucket(mp, used + freed);
	uint32_t		to_bucket = xfs_zone_bucket(mp, used);
	bool			was_full = (used + freed == rtg_blocks(rtg));

	/*
	 * This can be called from log recovery, where the zone_info structure
	 * hasn't been allocated yet.  Skip all work as xfs_mount_zones will
	 * add the zones to the right buckets before the file system becomes
	 * active.
	 */
	if (!zi)
		return;

	if (!used) {
		/*
		 * The zone is now empty, remove it from the bottom bucket and
		 * trigger a reset.
		 */
		trace_xfs_zone_emptied(rtg);

		if (!was_full)
			xfs_group_clear_mark(xg, XFS_RTG_RECLAIMABLE);

		spin_lock(&zi->zi_used_buckets_lock);
		if (!was_full)
			xfs_zone_remove_from_bucket(zi, rgno, from_bucket);
		spin_unlock(&zi->zi_used_buckets_lock);

		spin_lock(&zi->zi_reset_list_lock);
		xg->xg_next_reset = zi->zi_reset_list;
		zi->zi_reset_list = xg;
		spin_unlock(&zi->zi_reset_list_lock);

		if (zi->zi_gc_thread)
			wake_up_process(zi->zi_gc_thread);
	} else if (was_full) {
		/*
		 * The zone transitioned from full, mark it as reclaimable and
		 * wake up GC, which might be waiting for zones to reclaim.
		 */
		spin_lock(&zi->zi_used_buckets_lock);
		xfs_zone_add_to_bucket(zi, rgno, to_bucket);
		spin_unlock(&zi->zi_used_buckets_lock);

		xfs_group_set_mark(xg, XFS_RTG_RECLAIMABLE);
		if (zi->zi_gc_thread && xfs_zoned_need_gc(mp))
			wake_up_process(zi->zi_gc_thread);
	} else if (to_bucket != from_bucket) {
		/*
		 * Move the zone to a new bucket if it dropped below the
		 * threshold.
		 */
		spin_lock(&zi->zi_used_buckets_lock);
		xfs_zone_add_to_bucket(zi, rgno, to_bucket);
		xfs_zone_remove_from_bucket(zi, rgno, from_bucket);
		spin_unlock(&zi->zi_used_buckets_lock);
	}
}

static void
xfs_open_zone_mark_full(
	struct xfs_open_zone	*oz)
{
	struct xfs_rtgroup	*rtg = oz->oz_rtg;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		used = rtg_rmap(rtg)->i_used_blocks;

	trace_xfs_zone_full(rtg);

	WRITE_ONCE(rtg->rtg_open_zone, NULL);

	spin_lock(&zi->zi_open_zones_lock);
	if (oz->oz_is_gc) {
		ASSERT(current == zi->zi_gc_thread);
		zi->zi_open_gc_zone = NULL;
	} else {
		zi->zi_nr_open_zones--;
		list_del_init(&oz->oz_entry);
	}
	spin_unlock(&zi->zi_open_zones_lock);
	xfs_open_zone_put(oz);

	wake_up_all(&zi->zi_zone_wait);
	if (used < rtg_blocks(rtg))
		xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used);
}

static void
xfs_zone_record_blocks(
	struct xfs_trans	*tp,
	struct xfs_open_zone	*oz,
	xfs_fsblock_t		fsbno,
	xfs_filblks_t		len)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_rtgroup	*rtg = oz->oz_rtg;
	struct xfs_inode	*rmapip = rtg_rmap(rtg);

	trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len);

	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP);
	rmapip->i_used_blocks += len;
	ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg));
	oz->oz_written += len;
	if (oz->oz_written == rtg_blocks(rtg))
		xfs_open_zone_mark_full(oz);
	xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
}

/*
 * Called for blocks that have been written to disk, but not actually linked to
 * an inode, which can happen when garbage collection races with user data
 * writes to a file.
 */
static void
xfs_zone_skip_blocks(
	struct xfs_open_zone	*oz,
	xfs_filblks_t		len)
{
	struct xfs_rtgroup	*rtg = oz->oz_rtg;

	trace_xfs_zone_skip_blocks(oz, 0, len);

	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	oz->oz_written += len;
	if (oz->oz_written == rtg_blocks(rtg))
		xfs_open_zone_mark_full(oz);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);

	xfs_add_frextents(rtg_mount(rtg), len);
}

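/*
 * Map a range of freshly written blocks from @oz into the data fork of @ip,
 * unmapping the old blocks and either dropping their refcount (for reflink
 * inodes) or freeing them.  For GC writes @old_startblock holds the block the
 * data was read from; if the mapping changed in the meantime a user write
 * raced with us and the newly written copy is skipped instead.
 */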
static int
xfs_zoned_map_extent(
	struct xfs_trans	*tp,
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*new,
	struct xfs_open_zone	*oz,
	xfs_fsblock_t		old_startblock)
{
	struct xfs_bmbt_irec	data;
	int			nmaps = 1;
	int			error;

	/* Grab the corresponding mapping in the data fork. */
	error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data,
			&nmaps, 0);
	if (error)
		return error;

	/*
	 * Cap the update to the existing extent in the data fork because we
	 * can only overwrite one extent at a time.
	 */
	ASSERT(new->br_blockcount >= data.br_blockcount);
	new->br_blockcount = data.br_blockcount;

	/*
	 * If a data write raced with this GC write, keep the existing data in
	 * the data fork, mark our newly written GC extent as reclaimable, then
	 * move on to the next extent.
	 */
	if (old_startblock != NULLFSBLOCK &&
	    old_startblock != data.br_startblock)
		goto skip;

	trace_xfs_reflink_cow_remap_from(ip, new);
	trace_xfs_reflink_cow_remap_to(ip, &data);

	error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK,
			XFS_IEXT_REFLINK_END_COW_CNT);
	if (error)
		return error;

	if (data.br_startblock != HOLESTARTBLOCK) {
		ASSERT(data.br_startblock != DELAYSTARTBLOCK);
		ASSERT(!isnullstartblock(data.br_startblock));

		xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
		if (xfs_is_reflink_inode(ip)) {
			xfs_refcount_decrease_extent(tp, true, &data);
		} else {
			error = xfs_free_extent_later(tp, data.br_startblock,
					data.br_blockcount, NULL,
					XFS_AG_RESV_NONE,
					XFS_FREE_EXTENT_REALTIME);
			if (error)
				return error;
		}
	}

	xfs_zone_record_blocks(tp, oz, new->br_startblock, new->br_blockcount);

	/* Map the new blocks into the data fork. */
	xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new);
	return 0;

skip:
	trace_xfs_reflink_cow_remap_skip(ip, new);
	xfs_zone_skip_blocks(oz, new->br_blockcount);
	return 0;
}

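/*
 * I/O completion handler for zoned writes: map the blocks written at @daddr
 * into the data fork, one existing data fork extent per transaction, until the
 * whole range from @offset to @offset + @count has been processed.
 */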
int
xfs_zoned_end_io(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		count,
	xfs_daddr_t		daddr,
	struct xfs_open_zone	*oz,
	xfs_fsblock_t		old_startblock)
{
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
	struct xfs_bmbt_irec	new = {
		.br_startoff	= XFS_B_TO_FSBT(mp, offset),
		.br_startblock	= xfs_daddr_to_rtb(mp, daddr),
		.br_state	= XFS_EXT_NORM,
	};
	unsigned int		resblks =
		XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
	struct xfs_trans	*tp;
	int			error;

	if (xfs_is_shutdown(mp))
		return -EIO;

	while (new.br_startoff < end_fsb) {
		new.br_blockcount = end_fsb - new.br_startoff;

		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
				XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp);
		if (error)
			return error;
		xfs_ilock(ip, XFS_ILOCK_EXCL);
		xfs_trans_ijoin(tp, ip, 0);

		error = xfs_zoned_map_extent(tp, ip, &new, oz, old_startblock);
		if (error)
			xfs_trans_cancel(tp);
		else
			error = xfs_trans_commit(tp);
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		if (error)
			return error;

		new.br_startoff += new.br_blockcount;
		new.br_startblock += new.br_blockcount;
		if (old_startblock != NULLFSBLOCK)
			old_startblock += new.br_blockcount;
	}

	return 0;
}

/*
 * "Free" blocks allocated in a zone.
 *
 * Just decrement the used blocks counter and report the space as freed.
 */
int
xfs_zone_free_blocks(
	struct xfs_trans	*tp,
	struct xfs_rtgroup	*rtg,
	xfs_fsblock_t		fsbno,
	xfs_filblks_t		len)
{
	struct xfs_mount	*mp = tp->t_mountp;
	struct xfs_inode	*rmapip = rtg_rmap(rtg);

	xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL);

	if (len > rmapip->i_used_blocks) {
		xfs_err(mp,
			"trying to free more blocks (%lld) than used counter (%u).",
			len, rmapip->i_used_blocks);
		ASSERT(len <= rmapip->i_used_blocks);
		xfs_rtginode_mark_sick(rtg, XFS_RTGI_RMAP);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
		return -EFSCORRUPTED;
	}

	trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len);

	rmapip->i_used_blocks -= len;
	/*
	 * Don't add open zones to the reclaimable buckets.  The I/O completion
	 * for writing the last block will take care of accounting for already
	 * unused blocks instead.
	 */
	if (!READ_ONCE(rtg->rtg_open_zone))
		xfs_zone_account_reclaimable(rtg, len);
	xfs_add_frextents(mp, len);
	xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE);
	return 0;
}

static struct xfs_group *
xfs_find_free_zone(
	struct xfs_mount	*mp,
	unsigned long		start,
	unsigned long		end)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	XA_STATE		(xas, &mp->m_groups[XG_TYPE_RTG].xa, start);
	struct xfs_group	*xg;

	xas_lock(&xas);
	xas_for_each_marked(&xas, xg, end, XFS_RTG_FREE)
		if (atomic_inc_not_zero(&xg->xg_active_ref))
			goto found;
	xas_unlock(&xas);
	return NULL;

found:
	xas_clear_mark(&xas, XFS_RTG_FREE);
	atomic_dec(&zi->zi_nr_free_zones);
	zi->zi_free_zone_cursor = xg->xg_gno;
	xas_unlock(&xas);
	return xg;
}

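/*
 * Allocate and initialize an open zone structure for @rtg, with both the
 * allocation and write cursors set to @write_pointer, and publish it in
 * rtg->rtg_open_zone.  The group reference held by the caller is owned by the
 * open zone from now on and released by xfs_open_zone_put().
 */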
static struct xfs_open_zone *
xfs_init_open_zone(
	struct xfs_rtgroup	*rtg,
	xfs_rgblock_t		write_pointer,
	enum rw_hint		write_hint,
	bool			is_gc)
{
	struct xfs_open_zone	*oz;

	oz = kzalloc(sizeof(*oz), GFP_NOFS | __GFP_NOFAIL);
	spin_lock_init(&oz->oz_alloc_lock);
	atomic_set(&oz->oz_ref, 1);
	oz->oz_rtg = rtg;
	oz->oz_allocated = write_pointer;
	oz->oz_written = write_pointer;
	oz->oz_write_hint = write_hint;
	oz->oz_is_gc = is_gc;

	/*
	 * All dereferences of rtg->rtg_open_zone hold the ILOCK for the rmap
	 * inode, but we don't really want to take that here because we are
	 * under the zone_list_lock.  Ensure the pointer is only set for a
	 * fully initialized open zone structure so that a racy lookup finding
	 * it is fine.
	 */
	WRITE_ONCE(rtg->rtg_open_zone, oz);
	return oz;
}

/*
 * Find a completely free zone, open it, and return a reference.
 */
struct xfs_open_zone *
xfs_open_zone(
	struct xfs_mount	*mp,
	enum rw_hint		write_hint,
	bool			is_gc)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_group	*xg;

	xg = xfs_find_free_zone(mp, zi->zi_free_zone_cursor, ULONG_MAX);
	if (!xg)
		xg = xfs_find_free_zone(mp, 0, zi->zi_free_zone_cursor);
	if (!xg)
		return NULL;

	set_current_state(TASK_RUNNING);
	return xfs_init_open_zone(to_rtg(xg), 0, write_hint, is_gc);
}

static struct xfs_open_zone *
xfs_try_open_zone(
	struct xfs_mount	*mp,
	enum rw_hint		write_hint)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz;

	if (zi->zi_nr_open_zones >= mp->m_max_open_zones - XFS_OPEN_GC_ZONES)
		return NULL;
	if (atomic_read(&zi->zi_nr_free_zones) <
	    XFS_GC_ZONES - XFS_OPEN_GC_ZONES)
		return NULL;

	/*
	 * Increment the open zone count to reserve our slot before dropping
	 * zi_open_zones_lock.
	 */
	zi->zi_nr_open_zones++;
	spin_unlock(&zi->zi_open_zones_lock);
	oz = xfs_open_zone(mp, write_hint, false);
	spin_lock(&zi->zi_open_zones_lock);
	if (!oz) {
		zi->zi_nr_open_zones--;
		return NULL;
	}

	atomic_inc(&oz->oz_ref);
	list_add_tail(&oz->oz_entry, &zi->zi_open_zones);

	/*
	 * If this was the last free zone, other waiters might be waiting
	 * on us to write to it as well.
	 */
	wake_up_all(&zi->zi_zone_wait);

	if (xfs_zoned_need_gc(mp))
		wake_up_process(zi->zi_gc_thread);

	trace_xfs_zone_opened(oz->oz_rtg);
	return oz;
}

/*
 * For data with short or medium lifetime, try to colocate it into an
 * already open zone with a matching temperature.
 */
static bool
xfs_colocate_eagerly(
	enum rw_hint		file_hint)
{
	switch (file_hint) {
	case WRITE_LIFE_MEDIUM:
	case WRITE_LIFE_SHORT:
	case WRITE_LIFE_NONE:
		return true;
	default:
		return false;
	}
}

static bool
xfs_good_hint_match(
	struct xfs_open_zone	*oz,
	enum rw_hint		file_hint)
{
	switch (oz->oz_write_hint) {
	case WRITE_LIFE_LONG:
	case WRITE_LIFE_EXTREME:
		/* colocate long and extreme */
		if (file_hint == WRITE_LIFE_LONG ||
		    file_hint == WRITE_LIFE_EXTREME)
			return true;
		break;
	case WRITE_LIFE_MEDIUM:
		/* colocate medium with medium */
		if (file_hint == WRITE_LIFE_MEDIUM)
			return true;
		break;
	case WRITE_LIFE_SHORT:
	case WRITE_LIFE_NONE:
	case WRITE_LIFE_NOT_SET:
		/* colocate short and none */
		if (file_hint <= WRITE_LIFE_SHORT)
			return true;
		break;
	}
	return false;
}

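/*
 * Try to use an already open zone for a new allocation: skip zones that are
 * fully allocated and, unless we are running low on space, zones whose write
 * hint does not match the data.  On success a reference to the zone is taken
 * and it is moved to the tail of the open zone list.
 */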
static bool
xfs_try_use_zone(
	struct xfs_zone_info	*zi,
	enum rw_hint		file_hint,
	struct xfs_open_zone	*oz,
	bool			lowspace)
{
	if (oz->oz_allocated == rtg_blocks(oz->oz_rtg))
		return false;
	if (!lowspace && !xfs_good_hint_match(oz, file_hint))
		return false;
	if (!atomic_inc_not_zero(&oz->oz_ref))
		return false;

	/*
	 * If we have a hint set for the data, use that for the zone even if
	 * some data was written already without any hint set, but don't change
	 * the temperature after that as that would make little sense without
	 * tracking per-temperature class written block counts, which is
	 * probably overkill anyway.
	 */
	if (file_hint != WRITE_LIFE_NOT_SET &&
	    oz->oz_write_hint == WRITE_LIFE_NOT_SET)
		oz->oz_write_hint = file_hint;

	/*
	 * If we couldn't match by inode or lifetime we just pick the first
	 * zone with enough space above.  For that we want the least busy zone
	 * for some definition of "least" busy.  For now this simple LRU
	 * algorithm that rotates every zone to the end of the list will do it,
	 * even if it isn't exactly cache friendly.
	 */
	if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones))
		list_move_tail(&oz->oz_entry, &zi->zi_open_zones);
	return true;
}

static struct xfs_open_zone *
xfs_select_open_zone_lru(
	struct xfs_zone_info	*zi,
	enum rw_hint		file_hint,
	bool			lowspace)
{
	struct xfs_open_zone	*oz;

	lockdep_assert_held(&zi->zi_open_zones_lock);

	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry)
		if (xfs_try_use_zone(zi, file_hint, oz, lowspace))
			return oz;

	cond_resched_lock(&zi->zi_open_zones_lock);
	return NULL;
}

static struct xfs_open_zone *
xfs_select_open_zone_mru(
	struct xfs_zone_info	*zi,
	enum rw_hint		file_hint)
{
	struct xfs_open_zone	*oz;

	lockdep_assert_held(&zi->zi_open_zones_lock);

	list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry)
		if (xfs_try_use_zone(zi, file_hint, oz, false))
			return oz;

	cond_resched_lock(&zi->zi_open_zones_lock);
	return NULL;
}

static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip)
{
	if (xfs_has_nolifetime(ip->i_mount))
		return WRITE_LIFE_NOT_SET;
	return VFS_I(ip)->i_write_hint;
}

/*
 * Try to tightly pack inodes that are written back after they were closed
 * instead of opening new zones for them or spreading them to the least
 * recently used zone.  This optimizes the data layout for workloads that
 * untar or copy a lot of small files.  Right now this does not separate
 * multiple such streams.
 */
static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip)
{
	return !inode_is_open_for_write(VFS_I(ip)) &&
		!(ip->i_diflags & XFS_DIFLAG_APPEND);
}

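/*
 * Select an open zone for a write without blocking.  Short-lived data is
 * eagerly co-located with data of a similar temperature, closed files are
 * packed into the most recently used zone, and otherwise we prefer opening a
 * new zone before falling back to the least recently used open zone.
 */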
static struct xfs_open_zone *
xfs_select_zone_nowait(
	struct xfs_mount	*mp,
	enum rw_hint		write_hint,
	bool			pack_tight)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz = NULL;

	if (xfs_is_shutdown(mp))
		return NULL;

	/*
	 * Try to fill up open zones with matching temperature if available.
	 * It is better to try to co-locate data when this is favorable, so we
	 * can activate empty zones when it is statistically better to separate
	 * data.
	 */
	spin_lock(&zi->zi_open_zones_lock);
	if (xfs_colocate_eagerly(write_hint))
		oz = xfs_select_open_zone_lru(zi, write_hint, false);
	else if (pack_tight)
		oz = xfs_select_open_zone_mru(zi, write_hint);
	if (oz)
		goto out_unlock;

	/*
	 * See if we can open a new zone and use that so that data for
	 * different files is mixed as little as possible.
	 */
	oz = xfs_try_open_zone(mp, write_hint);
	if (oz)
		goto out_unlock;

	/*
	 * Try to colocate cold data with other cold data if we failed to open
	 * a new zone for it.
	 */
	if (write_hint != WRITE_LIFE_NOT_SET &&
	    !xfs_colocate_eagerly(write_hint))
		oz = xfs_select_open_zone_lru(zi, write_hint, false);
	if (!oz)
		oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false);
	if (!oz)
		oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true);
out_unlock:
	spin_unlock(&zi->zi_open_zones_lock);
	return oz;
}

static struct xfs_open_zone *
xfs_select_zone(
	struct xfs_mount	*mp,
	enum rw_hint		write_hint,
	bool			pack_tight)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	DEFINE_WAIT		(wait);
	struct xfs_open_zone	*oz;

	oz = xfs_select_zone_nowait(mp, write_hint, pack_tight);
	if (oz)
		return oz;

	for (;;) {
		prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE);
		oz = xfs_select_zone_nowait(mp, write_hint, pack_tight);
		if (oz || xfs_is_shutdown(mp))
			break;
		schedule();
	}
	finish_wait(&zi->zi_zone_wait, &wait);
	return oz;
}

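/*
 * Allocate up to @count_fsb blocks at the current allocation cursor of @oz and
 * return the allocated size in bytes.  For conventional zones *@sector points
 * at the exact start of the allocation; for sequential write required zones it
 * points at the zone start, as the actual position is only determined when the
 * zone append command completes.
 */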
static unsigned int
xfs_zone_alloc_blocks(
	struct xfs_open_zone	*oz,
	xfs_filblks_t		count_fsb,
	sector_t		*sector,
	bool			*is_seq)
{
	struct xfs_rtgroup	*rtg = oz->oz_rtg;
	struct xfs_mount	*mp = rtg_mount(rtg);
	xfs_rgblock_t		allocated;

	spin_lock(&oz->oz_alloc_lock);
	count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN,
		(xfs_filblks_t)rtg_blocks(rtg) - oz->oz_allocated);
	if (!count_fsb) {
		spin_unlock(&oz->oz_alloc_lock);
		return 0;
	}
	allocated = oz->oz_allocated;
	oz->oz_allocated += count_fsb;
	spin_unlock(&oz->oz_alloc_lock);

	trace_xfs_zone_alloc_blocks(oz, allocated, count_fsb);

	*sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector);
	if (!*is_seq)
		*sector += XFS_FSB_TO_BB(mp, allocated);
	return XFS_FSB_TO_B(mp, count_fsb);
}

void
xfs_mark_rtg_boundary(
	struct iomap_ioend	*ioend)
{
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
	sector_t		sector = ioend->io_bio.bi_iter.bi_sector;

	if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0)
		ioend->io_flags |= IOMAP_IOEND_BOUNDARY;
}

/*
 * Cache the last zone written to for an inode so that it is considered first
 * for subsequent writes.
 */
struct xfs_zone_cache_item {
	struct xfs_mru_cache_elem	mru;
	struct xfs_open_zone		*oz;
};

static inline struct xfs_zone_cache_item *
xfs_zone_cache_item(struct xfs_mru_cache_elem *mru)
{
	return container_of(mru, struct xfs_zone_cache_item, mru);
}

static void
xfs_zone_cache_free_func(
	void			*data,
	struct xfs_mru_cache_elem *mru)
{
	struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru);

	xfs_open_zone_put(item->oz);
	kfree(item);
}

/*
 * Check if we have a cached last open zone available for the inode and
 * if yes return a reference to it.
 */
static struct xfs_open_zone *
xfs_cached_zone(
	struct xfs_mount	*mp,
	struct xfs_inode	*ip)
{
	struct xfs_mru_cache_elem *mru;
	struct xfs_open_zone	*oz;

	mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
	if (!mru)
		return NULL;
	oz = xfs_zone_cache_item(mru)->oz;
	if (oz) {
		/*
		 * GC only steals open zones at mount time, so no GC zones
		 * should end up in the cache.
		 */
		ASSERT(!oz->oz_is_gc);
		ASSERT(atomic_read(&oz->oz_ref) > 0);
		atomic_inc(&oz->oz_ref);
	}
	xfs_mru_cache_done(mp->m_zone_cache);
	return oz;
}

/*
 * Update the last used zone cache for a given inode.
 *
 * The caller must have a reference on the open zone.
 */
static void
xfs_zone_cache_create_association(
	struct xfs_inode	*ip,
	struct xfs_open_zone	*oz)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_zone_cache_item *item = NULL;
	struct xfs_mru_cache_elem *mru;

	ASSERT(atomic_read(&oz->oz_ref) > 0);
	atomic_inc(&oz->oz_ref);

	mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino);
	if (mru) {
		/*
		 * If we have an association already, update it to point to the
		 * new zone.
		 */
		item = xfs_zone_cache_item(mru);
		xfs_open_zone_put(item->oz);
		item->oz = oz;
		xfs_mru_cache_done(mp->m_zone_cache);
		return;
	}

	item = kmalloc(sizeof(*item), GFP_KERNEL);
	if (!item) {
		xfs_open_zone_put(oz);
		return;
	}
	item->oz = oz;
	xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru);
}

static void
xfs_submit_zoned_bio(
	struct iomap_ioend	*ioend,
	struct xfs_open_zone	*oz,
	bool			is_seq)
{
	ioend->io_bio.bi_iter.bi_sector = ioend->io_sector;
	ioend->io_private = oz;
	atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */

	if (is_seq) {
		ioend->io_bio.bi_opf &= ~REQ_OP_WRITE;
		ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND;
	} else {
		xfs_mark_rtg_boundary(ioend);
	}

	submit_bio(&ioend->io_bio);
}

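/*
 * Pick a zone for the ioend, allocate space from it and submit the I/O,
 * splitting the ioend as needed when it does not fit into the space remaining
 * in the selected zone.  The chosen zone is cached in *@oz and in the
 * per-inode MRU cache so that subsequent writes of the same inode land in the
 * same zone.
 */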
void
xfs_zone_alloc_and_submit(
	struct iomap_ioend	*ioend,
	struct xfs_open_zone	**oz)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_mount	*mp = ip->i_mount;
	enum rw_hint		write_hint = xfs_inode_write_hint(ip);
	bool			pack_tight = xfs_zoned_pack_tight(ip);
	unsigned int		alloc_len;
	struct iomap_ioend	*split;
	bool			is_seq;

	if (xfs_is_shutdown(mp))
		goto out_error;

	/*
	 * If we don't have a locally cached zone in this write context, see if
	 * the inode is still associated with a zone and use that if so.
	 */
	if (!*oz)
		*oz = xfs_cached_zone(mp, ip);

	if (!*oz) {
select_zone:
		*oz = xfs_select_zone(mp, write_hint, pack_tight);
		if (!*oz)
			goto out_error;

		xfs_zone_cache_create_association(ip, *oz);
	}

	alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size),
			&ioend->io_sector, &is_seq);
	if (!alloc_len) {
		xfs_open_zone_put(*oz);
		goto select_zone;
	}

	while ((split = iomap_split_ioend(ioend, alloc_len, is_seq))) {
		if (IS_ERR(split))
			goto out_split_error;
		alloc_len -= split->io_bio.bi_iter.bi_size;
		xfs_submit_zoned_bio(split, *oz, is_seq);
		if (!alloc_len) {
			xfs_open_zone_put(*oz);
			goto select_zone;
		}
	}

	xfs_submit_zoned_bio(ioend, *oz, is_seq);
	return;

out_split_error:
	ioend->io_bio.bi_status = errno_to_blk_status(PTR_ERR(split));
out_error:
	bio_io_error(&ioend->io_bio);
}

/*
 * Wake up all threads waiting for a zoned space allocation when the file
 * system is shut down.
 */
void
xfs_zoned_wake_all(
	struct xfs_mount	*mp)
{
	/*
	 * Don't wake up if there is no m_zone_info.  This is complicated by
	 * the fact that unmount can't atomically clear m_zone_info and thus we
	 * need to check SB_ACTIVE for that, but mount temporarily enables
	 * SB_ACTIVE during log recovery so we can't entirely rely on that
	 * either.
	 */
	if ((mp->m_super->s_flags & SB_ACTIVE) && mp->m_zone_info)
		wake_up_all(&mp->m_zone_info->zi_zone_wait);
}

/*
 * Check if @rgbno in @rtg is a potentially valid block.  It might still be
 * unused, but that information is only found in the rmap.
 */
bool
xfs_zone_rgbno_is_valid(
	struct xfs_rtgroup	*rtg,
	xfs_rgnumber_t		rgbno)
{
	lockdep_assert_held(&rtg_rmap(rtg)->i_lock);

	if (rtg->rtg_open_zone)
		return rgbno < rtg->rtg_open_zone->oz_allocated;
	return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa,
			rtg_rgno(rtg), XFS_RTG_FREE);
}

static void
xfs_free_open_zones(
	struct xfs_zone_info	*zi)
{
	struct xfs_open_zone	*oz;

	spin_lock(&zi->zi_open_zones_lock);
	while ((oz = list_first_entry_or_null(&zi->zi_open_zones,
			struct xfs_open_zone, oz_entry))) {
		list_del(&oz->oz_entry);
		xfs_open_zone_put(oz);
	}
	spin_unlock(&zi->zi_open_zones_lock);
}

struct xfs_init_zones {
	struct xfs_mount	*mp;
	uint64_t		available;
	uint64_t		reclaimable;
};

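/*
 * Set up the in-memory state for a single zone at mount time: determine the
 * write pointer and classify the zone as free, open or fully written, updating
 * the free and reclaimable block counts in @iz accordingly.
 */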
static int
xfs_init_zone(
	struct xfs_init_zones	*iz,
	struct xfs_rtgroup	*rtg,
	struct blk_zone		*zone)
{
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		used = rtg_rmap(rtg)->i_used_blocks;
	xfs_rgblock_t		write_pointer, highest_rgbno;
	int			error;

	if (zone && !xfs_zone_validate(zone, rtg, &write_pointer))
		return -EFSCORRUPTED;

	/*
	 * For sequential write required zones we retrieved the hardware write
	 * pointer above.
	 *
	 * For conventional zones or conventional devices we don't have that
	 * luxury.  Instead query the rmap to find the highest recorded block
	 * and set the write pointer to the block after that.  In case of a
	 * power loss this misses blocks where the data I/O has completed but
	 * was not recorded in the rmap yet, and it also rewrites blocks if the
	 * most recently written ones got deleted again before unmount, but
	 * this is the best we can do without hardware support.
	 */
	if (!zone || zone->cond == BLK_ZONE_COND_NOT_WP) {
		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
		highest_rgbno = xfs_rtrmap_highest_rgbno(rtg);
		if (highest_rgbno == NULLRGBLOCK)
			write_pointer = 0;
		else
			write_pointer = highest_rgbno + 1;
		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
	}

	/*
	 * If there are no used blocks, but the zone is not in empty state yet,
	 * we lost power before the zone reset.  In that case finish the work
	 * here.
	 */
	if (write_pointer == rtg_blocks(rtg) && used == 0) {
		error = xfs_zone_gc_reset_sync(rtg);
		if (error)
			return error;
		write_pointer = 0;
	}

	if (write_pointer == 0) {
		/* zone is empty */
		atomic_inc(&zi->zi_nr_free_zones);
		xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
		iz->available += rtg_blocks(rtg);
	} else if (write_pointer < rtg_blocks(rtg)) {
		/* zone is open */
		struct xfs_open_zone *oz;

		atomic_inc(&rtg_group(rtg)->xg_active_ref);
		oz = xfs_init_open_zone(rtg, write_pointer, WRITE_LIFE_NOT_SET,
				false);
		list_add_tail(&oz->oz_entry, &zi->zi_open_zones);
		zi->zi_nr_open_zones++;

		iz->available += (rtg_blocks(rtg) - write_pointer);
		iz->reclaimable += write_pointer - used;
	} else if (used < rtg_blocks(rtg)) {
		/* zone fully written, but has freed blocks */
		xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used);
		iz->reclaimable += (rtg_blocks(rtg) - used);
	}

	return 0;
}

static int
xfs_get_zone_info_cb(
	struct blk_zone		*zone,
	unsigned int		idx,
	void			*data)
{
	struct xfs_init_zones	*iz = data;
	struct xfs_mount	*mp = iz->mp;
	xfs_fsblock_t		zsbno = xfs_daddr_to_rtb(mp, zone->start);
	xfs_rgnumber_t		rgno;
	struct xfs_rtgroup	*rtg;
	int			error;

	if (xfs_rtb_to_rgbno(mp, zsbno) != 0) {
		xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno);
		return -EFSCORRUPTED;
	}

	rgno = xfs_rtb_to_rgno(mp, zsbno);
	rtg = xfs_rtgroup_grab(mp, rgno);
	if (!rtg) {
		xfs_warn(mp, "realtime group not found for zone %u.", rgno);
		return -EFSCORRUPTED;
	}
	error = xfs_init_zone(iz, rtg, zone);
	xfs_rtgroup_rele(rtg);
	return error;
}

/*
 * Calculate the max open zone limit based on the number of backing zones
 * available.
 */
static inline uint32_t
xfs_max_open_zones(
	struct xfs_mount	*mp)
{
	unsigned int		max_open, max_open_data_zones;

	/*
	 * We need two zones for every open data zone, one in reserve as we
	 * don't reclaim open zones.  One data zone and its spare is included
	 * in XFS_MIN_ZONES to support at least one user data writer.
	 */
	max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1;
	max_open = max_open_data_zones + XFS_OPEN_GC_ZONES;

	/*
	 * Cap the max open limit to 1/4 of available space.  Without this
	 * we'd run out of easy reclaim targets too quickly and storage devices
	 * don't handle huge numbers of concurrent write streams overly well.
	 */
	max_open = min(max_open, mp->m_sb.sb_rgcount / 4);

	return max(XFS_MIN_OPEN_ZONES, max_open);
}

/*
 * Normally we use the open zone limit that the device reports.  If there is
 * none, let the user pick one from the command line.
 *
 * If the device doesn't report an open zone limit and there is no override,
 * allow holding about a quarter of the zones open.  In theory we could allow
 * all to be open, but at that point we run into GC deadlocks because we can't
 * reclaim open zones.
 *
 * When used on conventional SSDs a lower open limit is advisable as we'll
 * otherwise overwhelm the FTL just as much as a conventional block allocator.
 *
 * Note: To debug the open zone management code, force max_open to 1 here.
 */
static int
xfs_calc_open_zones(
	struct xfs_mount	*mp)
{
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	unsigned int		bdev_open_zones = bdev_max_open_zones(bdev);

	if (!mp->m_max_open_zones) {
		if (bdev_open_zones)
			mp->m_max_open_zones = bdev_open_zones;
		else
			mp->m_max_open_zones = xfs_max_open_zones(mp);
	}

	if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) {
		xfs_notice(mp, "need at least %u open zones.",
			XFS_MIN_OPEN_ZONES);
		return -EIO;
	}

	if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) {
		mp->m_max_open_zones = bdev_open_zones;
		xfs_info(mp, "limiting open zones to %u due to hardware limit.\n",
			bdev_open_zones);
	}

	if (mp->m_max_open_zones > xfs_max_open_zones(mp)) {
		mp->m_max_open_zones = xfs_max_open_zones(mp);
		xfs_info(mp,
			"limiting open zones to %u due to total zone count (%u)",
			mp->m_max_open_zones, mp->m_sb.sb_rgcount);
	}

	return 0;
}

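/*
 * Each used block bucket tracks its member zones in a bitmap indexed by
 * rtgroup number, so allocate one bit per rtgroup.
 */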
static unsigned long *
xfs_alloc_bucket_bitmap(
	struct xfs_mount	*mp)
{
	return kvmalloc_array(BITS_TO_LONGS(mp->m_sb.sb_rgcount),
			sizeof(unsigned long), GFP_KERNEL | __GFP_ZERO);
}

static struct xfs_zone_info *
xfs_alloc_zone_info(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi;
	int			i;

	zi = kzalloc(sizeof(*zi), GFP_KERNEL);
	if (!zi)
		return NULL;
	INIT_LIST_HEAD(&zi->zi_open_zones);
	INIT_LIST_HEAD(&zi->zi_reclaim_reservations);
	spin_lock_init(&zi->zi_reset_list_lock);
	spin_lock_init(&zi->zi_open_zones_lock);
	spin_lock_init(&zi->zi_reservation_lock);
	init_waitqueue_head(&zi->zi_zone_wait);
	spin_lock_init(&zi->zi_used_buckets_lock);
	for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) {
		zi->zi_used_bucket_bitmap[i] = xfs_alloc_bucket_bitmap(mp);
		if (!zi->zi_used_bucket_bitmap[i])
			goto out_free_bitmaps;
	}
	return zi;

out_free_bitmaps:
	while (--i >= 0)
		kvfree(zi->zi_used_bucket_bitmap[i]);
	kfree(zi);
	return NULL;
}

static void
xfs_free_zone_info(
	struct xfs_zone_info	*zi)
{
	int			i;

	xfs_free_open_zones(zi);
	for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++)
		kvfree(zi->zi_used_bucket_bitmap[i]);
	kfree(zi);
}

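/*
 * Set up the zone allocator at mount time: validate the zoned geometry,
 * calculate the open zone limit, build the zone_info structure and populate
 * the per-zone state either from a zone report or, for conventional devices,
 * from the rmap.
 */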
int
xfs_mount_zones(
	struct xfs_mount	*mp)
{
	struct xfs_init_zones	iz = {
		.mp		= mp,
	};
	struct xfs_buftarg	*bt = mp->m_rtdev_targp;
	int			error;

	if (!bt) {
		xfs_notice(mp, "RT device missing.");
		return -EINVAL;
	}

	if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) {
		xfs_notice(mp, "invalid flag combination.");
		return -EFSCORRUPTED;
	}
	if (mp->m_sb.sb_rextsize != 1) {
		xfs_notice(mp, "zoned file systems do not support rextsize.");
		return -EFSCORRUPTED;
	}
	if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) {
		xfs_notice(mp,
			"zoned file systems need to have at least %u zones.",
			XFS_MIN_ZONES);
		return -EFSCORRUPTED;
	}

	error = xfs_calc_open_zones(mp);
	if (error)
		return error;

	mp->m_zone_info = xfs_alloc_zone_info(mp);
	if (!mp->m_zone_info)
		return -ENOMEM;

	xfs_info(mp, "%u zones of %u blocks size (%u max open)",
		 mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks,
		 mp->m_max_open_zones);
	trace_xfs_zones_mount(mp);

	if (bdev_is_zoned(bt->bt_bdev)) {
		error = blkdev_report_zones(bt->bt_bdev,
				XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart),
				mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz);
		if (error < 0)
			goto out_free_zone_info;
	} else {
		struct xfs_rtgroup	*rtg = NULL;

		while ((rtg = xfs_rtgroup_next(mp, rtg))) {
			error = xfs_init_zone(&iz, rtg, NULL);
			if (error)
				goto out_free_zone_info;
		}
	}

	xfs_set_freecounter(mp, XC_FREE_RTAVAILABLE, iz.available);
	xfs_set_freecounter(mp, XC_FREE_RTEXTENTS,
			iz.available + iz.reclaimable);

	/*
	 * The user may configure GC to free up a percentage of unused blocks.
	 * By default this is 0.  GC will always trigger at the minimum level
	 * for keeping max_open_zones available for data placement.
	 */
	mp->m_zonegc_low_space = 0;

	error = xfs_zone_gc_mount(mp);
	if (error)
		goto out_free_zone_info;

	/*
	 * Set up an mru cache to track inode to open zone mappings for data
	 * placement purposes.  The magic values for group count and life time
	 * are the same as the defaults for file streams, which seems sane
	 * enough.
	 */
	xfs_mru_cache_create(&mp->m_zone_cache, mp,
			5000, 10, xfs_zone_cache_free_func);
	return 0;

out_free_zone_info:
	xfs_free_zone_info(mp->m_zone_info);
	return error;
}

void
xfs_unmount_zones(
	struct xfs_mount	*mp)
{
	xfs_zone_gc_unmount(mp);
	xfs_free_zone_info(mp->m_zone_info);
	xfs_mru_cache_destroy(mp->m_zone_cache);
}