// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include <linux/atomic.h>
#include <linux/vmalloc.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
#include "disk-io.h"
#include "block-group.h"
#include "dev-replace.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
#include "bio.h"
#include "transaction.h"
#include "sysfs.h"

/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES		4096
/* Invalid allocation pointer value for missing devices */
#define WP_MISSING_DEV			((u64)-1)
/* Pseudo write pointer value for conventional zone */
#define WP_CONVENTIONAL			((u64)-2)

/*
 * Location of the first zone of superblock logging zone pairs.
 *
 * - primary superblock: 0B (zone 0)
 * - first copy: 512G (zone starting at that offset)
 * - second copy: 4T (zone starting at that offset)
 */
#define BTRFS_SB_LOG_PRIMARY_OFFSET	(0ULL)
#define BTRFS_SB_LOG_FIRST_OFFSET	(512ULL * SZ_1G)
#define BTRFS_SB_LOG_SECOND_OFFSET	(4096ULL * SZ_1G)

#define BTRFS_SB_LOG_FIRST_SHIFT	const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
#define BTRFS_SB_LOG_SECOND_SHIFT	const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)

/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES		2

/* Default number of max active zones when the device has no limits. */
#define BTRFS_DEFAULT_MAX_ACTIVE_ZONES	128

/*
 * Minimum number of active zones we need:
 *
 * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
 * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
 * - 1 zone for tree-log dedicated block group
 * - 1 zone for relocation
 */
#define BTRFS_MIN_ACTIVE_ZONES		(BTRFS_SUPER_MIRROR_MAX + 5)

/*
 * Minimum / maximum supported zone size. Currently, SMR disks have a zone
 * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
 * We do not expect the zone size to become larger than 8GiB or smaller than
 * 4MiB in the near future.
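 *
 * The zone size must also be a power of two, since the zone calculations
 * below rely on shift arithmetic (see the is_power_of_two_u64() check in
 * btrfs_get_dev_zone_info()).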
 */
#define BTRFS_MAX_ZONE_SIZE		SZ_8G
#define BTRFS_MIN_ZONE_SIZE		SZ_4M

#define SUPER_INFO_SECTORS	((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)

static void wait_eb_writebacks(struct btrfs_block_group *block_group);
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written);

static inline bool sb_zone_is_full(const struct blk_zone *zone)
{
	return (zone->cond == BLK_ZONE_COND_FULL) ||
		(zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
}

static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
	struct blk_zone *zones = data;

	memcpy(&zones[idx], zone, sizeof(*zone));

	return 0;
}

static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
			    u64 *wp_ret)
{
	bool empty[BTRFS_NR_SB_LOG_ZONES];
	bool full[BTRFS_NR_SB_LOG_ZONES];
	sector_t sector;

	for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
		ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
		empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
		full[i] = sb_zone_is_full(&zones[i]);
	}

	/*
	 * Possible states of log buffer zones
	 *
	 *           Empty[0]  In use[0]  Full[0]
	 * Empty[1]         *          0        1
	 * In use[1]        x          x        1
	 * Full[1]          0          0        C
	 *
	 * Log position:
	 *   *: Special case, no superblock is written
	 *   0: Use write pointer of zones[0]
	 *   1: Use write pointer of zones[1]
	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
	 *      one determined by generation
	 *   x: Invalid state
	 */

	if (empty[0] && empty[1]) {
		/* Special case to distinguish no superblock to read */
		*wp_ret = zones[0].start << SECTOR_SHIFT;
		return -ENOENT;
	} else if (full[0] && full[1]) {
		/* Compare two super blocks */
		struct address_space *mapping = bdev->bd_mapping;
		struct page *page[BTRFS_NR_SB_LOG_ZONES];
		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];

		for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
			u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
			u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
				     BTRFS_SUPER_INFO_SIZE;

			page[i] = read_cache_page_gfp(mapping,
						      bytenr >> PAGE_SHIFT, GFP_NOFS);
			if (IS_ERR(page[i])) {
				if (i == 1)
					btrfs_release_disk_super(super[0]);
				return PTR_ERR(page[i]);
			}
			super[i] = page_address(page[i]);
		}

		if (btrfs_super_generation(super[0]) >
		    btrfs_super_generation(super[1]))
			sector = zones[1].start;
		else
			sector = zones[0].start;

		for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
			btrfs_release_disk_super(super[i]);
	} else if (!full[0] && (empty[1] || full[1])) {
		sector = zones[0].wp;
	} else if (full[0]) {
		sector = zones[1].wp;
	} else {
		return -EUCLEAN;
	}
	*wp_ret = sector << SECTOR_SHIFT;
	return 0;
}

/*
 * Get the first zone number of the superblock mirror
 */
static inline u32 sb_zone_number(int shift, int mirror)
{
	u64 zone = U64_MAX;

	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
	switch (mirror) {
	case 0: zone = 0; break;
	case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
	case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
	}

	ASSERT(zone <= U32_MAX);

	return (u32)zone;
}

static inline sector_t zone_start_sector(u32 zone_number,
					 struct block_device *bdev)
{
	return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
}

static inline u64 zone_start_physical(u32 zone_number,
				      struct btrfs_zoned_device_info *zone_info)
{
	return (u64)zone_number << zone_info->zone_size_shift;
}

/*
 * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
 * device into static-sized chunks and fakes a conventional zone on each of
 * them.
 */
static int emulate_report_zones(struct btrfs_device *device, u64 pos,
				struct blk_zone *zones, unsigned int nr_zones)
{
	const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
	sector_t bdev_size = bdev_nr_sectors(device->bdev);
	unsigned int i;

	pos >>= SECTOR_SHIFT;
	for (i = 0; i < nr_zones; i++) {
		zones[i].start = i * zone_sectors + pos;
		zones[i].len = zone_sectors;
		zones[i].capacity = zone_sectors;
		zones[i].wp = zones[i].start + zone_sectors;
		zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
		zones[i].cond = BLK_ZONE_COND_NOT_WP;

		if (zones[i].wp >= bdev_size) {
			i++;
			break;
		}
	}

	return i;
}

static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
			       struct blk_zone *zones, unsigned int *nr_zones)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	int ret;

	if (!*nr_zones)
		return 0;

	if (!bdev_is_zoned(device->bdev)) {
		ret = emulate_report_zones(device, pos, zones, *nr_zones);
		*nr_zones = ret;
		return 0;
	}

	/* Check cache */
	if (zinfo->zone_cache) {
		unsigned int i;
		u32 zno;

		ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
		zno = pos >> zinfo->zone_size_shift;
		/*
		 * We cannot report zones beyond the zone end, so it is OK to
		 * cap *nr_zones at the number of remaining zones.
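		 *
		 * For example, with zinfo->nr_zones == 100 and a report
		 * starting at zone 98, at most 2 zones are returned.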
		 */
		*nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);

		for (i = 0; i < *nr_zones; i++) {
			struct blk_zone *zone_info;

			zone_info = &zinfo->zone_cache[zno + i];
			if (!zone_info->len)
				break;
		}

		if (i == *nr_zones) {
			/* Cache hit on all the zones */
			memcpy(zones, zinfo->zone_cache + zno,
			       sizeof(*zinfo->zone_cache) * *nr_zones);
			return 0;
		}
	}

	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
				  copy_zone_info_cb, zones);
	if (ret < 0) {
		btrfs_err(device->fs_info,
			  "zoned: failed to read zone %llu on %s (devid %llu)",
			  pos, rcu_dereference(device->name),
			  device->devid);
		return ret;
	}
	*nr_zones = ret;
	if (unlikely(!ret))
		return -EIO;

	/* Populate cache */
	if (zinfo->zone_cache) {
		u32 zno = pos >> zinfo->zone_size_shift;

		memcpy(zinfo->zone_cache + zno, zones,
		       sizeof(*zinfo->zone_cache) * *nr_zones);
	}

	return 0;
}

/* The emulated zone size is determined from the size of a device extent. */
static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
{
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_dev_extent *dext;
	int ret = 0;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			return ret;
		/* No dev extents at all? Not good */
		if (unlikely(ret > 0))
			return -EUCLEAN;
	}

	leaf = path->nodes[0];
	dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
	fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
	return 0;
}

int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int ret = 0;

	/* fs_info->zone_size might not be set yet. Use the incompat flag here. */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		/* We can skip reading of zone info for missing devices */
		if (!device->bdev)
			continue;

		ret = btrfs_get_dev_zone_info(device, true);
		if (ret)
			break;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_zoned_device_info *zone_info = NULL;
	struct block_device *bdev = device->bdev;
	unsigned int max_active_zones;
	unsigned int nactive;
	sector_t nr_sectors;
	sector_t sector = 0;
	struct blk_zone *zones = NULL;
	unsigned int i, nreported = 0, nr_zones;
	sector_t zone_sectors;
	char *model, *emulated;
	int ret;

	/*
	 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
	 * yet be set.
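	 * Check the ZONED incompat flag instead; it comes straight from the
	 * on-disk super block and is already valid at this point.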
	 */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	if (device->zone_info)
		return 0;

	zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
	if (!zone_info)
		return -ENOMEM;

	device->zone_info = zone_info;

	if (!bdev_is_zoned(bdev)) {
		if (!fs_info->zone_size) {
			ret = calculate_emulated_zone_size(fs_info);
			if (ret)
				goto out;
		}

		ASSERT(fs_info->zone_size);
		zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
	} else {
		zone_sectors = bdev_zone_sectors(bdev);
	}

	ASSERT(is_power_of_two_u64(zone_sectors));
	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;

	/* We reject devices with a zone size larger than 8GB */
	if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
		btrfs_err(fs_info,
			  "zoned: %s: zone size %llu larger than supported maximum %llu",
			  rcu_dereference(device->name),
			  zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
		ret = -EINVAL;
		goto out;
	} else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
		btrfs_err(fs_info,
			  "zoned: %s: zone size %llu smaller than supported minimum %u",
			  rcu_dereference(device->name),
			  zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
		ret = -EINVAL;
		goto out;
	}

	nr_sectors = bdev_nr_sectors(bdev);
	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
	if (!IS_ALIGNED(nr_sectors, zone_sectors))
		zone_info->nr_zones++;

	max_active_zones = min_not_zero(bdev_max_active_zones(bdev),
					bdev_max_open_zones(bdev));
	if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES)
		max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES;
	if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
		btrfs_err(fs_info,
			  "zoned: %s: max active zones %u is too small, need at least %u active zones",
			  rcu_dereference(device->name), max_active_zones,
			  BTRFS_MIN_ACTIVE_ZONES);
		ret = -EINVAL;
		goto out;
	}
	zone_info->max_active_zones = max_active_zones;

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->empty_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->active_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
	if (!zones) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Enable zone cache only for a zoned device. On a non-zoned device, we
	 * fill the zone info with emulated CONVENTIONAL zones, so no need to
	 * use the cache.
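	 *
	 * The cache is filled by btrfs_get_dev_zones() as zones are reported
	 * and avoids repeated report-zones commands when block groups are
	 * loaded later on.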
	 */
	if (populate_cache && bdev_is_zoned(device->bdev)) {
		zone_info->zone_cache = vcalloc(zone_info->nr_zones,
						sizeof(struct blk_zone));
		if (!zone_info->zone_cache) {
			btrfs_err(device->fs_info,
				  "zoned: failed to allocate zone cache for %s",
				  rcu_dereference(device->name));
			ret = -ENOMEM;
			goto out;
		}
	}

	/* Get zone types */
	nactive = 0;
	while (sector < nr_sectors) {
		nr_zones = BTRFS_REPORT_NR_ZONES;
		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
					  &nr_zones);
		if (ret)
			goto out;

		for (i = 0; i < nr_zones; i++) {
			if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
				__set_bit(nreported, zone_info->seq_zones);
			switch (zones[i].cond) {
			case BLK_ZONE_COND_EMPTY:
				__set_bit(nreported, zone_info->empty_zones);
				break;
			case BLK_ZONE_COND_IMP_OPEN:
			case BLK_ZONE_COND_EXP_OPEN:
			case BLK_ZONE_COND_CLOSED:
				__set_bit(nreported, zone_info->active_zones);
				nactive++;
				break;
			}
			nreported++;
		}
		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
	}

	if (unlikely(nreported != zone_info->nr_zones)) {
		btrfs_err(device->fs_info,
			  "inconsistent number of zones on %s (%u/%u)",
			  rcu_dereference(device->name), nreported,
			  zone_info->nr_zones);
		ret = -EIO;
		goto out;
	}

	if (max_active_zones) {
		if (unlikely(nactive > max_active_zones)) {
			if (bdev_max_active_zones(bdev) == 0) {
				max_active_zones = 0;
				zone_info->max_active_zones = 0;
				goto validate;
			}
			btrfs_err(device->fs_info,
				  "zoned: %u active zones on %s exceeds max_active_zones %u",
				  nactive, rcu_dereference(device->name),
				  max_active_zones);
			ret = -EIO;
			goto out;
		}
		atomic_set(&zone_info->active_zones_left,
			   max_active_zones - nactive);
		set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
	}

validate:
	/* Validate superblock log */
	nr_zones = BTRFS_NR_SB_LOG_ZONES;
	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		u32 sb_zone;
		u64 sb_wp;
		int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;

		sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
		if (sb_zone + 1 >= zone_info->nr_zones)
			continue;

		ret = btrfs_get_dev_zones(device,
					  zone_start_physical(sb_zone, zone_info),
					  &zone_info->sb_zones[sb_pos],
					  &nr_zones);
		if (ret)
			goto out;

		if (unlikely(nr_zones != BTRFS_NR_SB_LOG_ZONES)) {
			btrfs_err(device->fs_info,
				  "zoned: failed to read super block log zone info at devid %llu zone %u",
				  device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}

		/*
		 * If zones[0] is conventional, always use the beginning of the
		 * zone to record superblock. No need to validate in that case.
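		 * sb_log_location() simply returns the zone start for a
		 * conventional zone, so only sequential zones are checked here.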
		 */
		if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
		    BLK_ZONE_TYPE_CONVENTIONAL)
			continue;

		ret = sb_write_pointer(device->bdev,
				       &zone_info->sb_zones[sb_pos], &sb_wp);
		if (unlikely(ret != -ENOENT && ret)) {
			btrfs_err(device->fs_info,
				  "zoned: super block log zone corrupted devid %llu zone %u",
				  device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}
	}

	kvfree(zones);

	if (bdev_is_zoned(bdev)) {
		model = "host-managed zoned";
		emulated = "";
	} else {
		model = "regular";
		emulated = "emulated ";
	}

	btrfs_info(fs_info,
		   "%s block device %s, %u %szones of %llu bytes",
		   model, rcu_dereference(device->name), zone_info->nr_zones,
		   emulated, zone_info->zone_size);

	return 0;

out:
	kvfree(zones);
	btrfs_destroy_dev_zone_info(device);
	return ret;
}

void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;

	if (!zone_info)
		return;

	bitmap_free(zone_info->active_zones);
	bitmap_free(zone_info->seq_zones);
	bitmap_free(zone_info->empty_zones);
	vfree(zone_info->zone_cache);
	kfree(zone_info);
	device->zone_info = NULL;
}

struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev)
{
	struct btrfs_zoned_device_info *zone_info;

	zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL);
	if (!zone_info)
		return NULL;

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones)
		goto out;

	bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones,
		    zone_info->nr_zones);

	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->empty_zones)
		goto out;

	bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones,
		    zone_info->nr_zones);

	zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->active_zones)
		goto out;

	bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones,
		    zone_info->nr_zones);
	zone_info->zone_cache = NULL;

	return zone_info;

out:
	bitmap_free(zone_info->seq_zones);
	bitmap_free(zone_info->empty_zones);
	bitmap_free(zone_info->active_zones);
	kfree(zone_info);
	return NULL;
}

static int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone)
{
	unsigned int nr_zones = 1;
	int ret;

	ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
	if (ret != 0 || !nr_zones)
		return ret ? ret : -EIO;

	return 0;
}

static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
{
	struct btrfs_device *device;

	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
		if (device->bdev && bdev_is_zoned(device->bdev)) {
			btrfs_err(fs_info,
				  "zoned: mode not enabled but zoned device found: %pg",
				  device->bdev);
			return -EINVAL;
		}
	}

	return 0;
}

int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
	struct queue_limits *lim = &fs_info->limits;
	struct btrfs_device *device;
	u64 zone_size = 0;
	int ret;

	/*
	 * Host-Managed devices can't be used without the ZONED flag. With the
	 * ZONED flag, all devices can be used, using zone emulation if
	 * required.
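	 *
	 * For emulated devices, the zone size is derived from the size of the
	 * first device extent (see calculate_emulated_zone_size()).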
	 */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return btrfs_check_for_zoned_device(fs_info);

	blk_set_stacking_limits(lim);

	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
		struct btrfs_zoned_device_info *zone_info = device->zone_info;

		if (!device->bdev)
			continue;

		if (!zone_size) {
			zone_size = zone_info->zone_size;
		} else if (zone_info->zone_size != zone_size) {
			btrfs_err(fs_info,
				  "zoned: unequal block device zone sizes: have %llu found %llu",
				  zone_info->zone_size, zone_size);
			return -EINVAL;
		}

		/*
		 * With the zoned emulation, we can have a non-zoned device in
		 * zoned mode. In this case, we don't have a valid max zone
		 * append size.
		 */
		if (bdev_is_zoned(device->bdev))
			blk_stack_limits(lim, bdev_limits(device->bdev), 0);
	}

	ret = blk_validate_limits(lim);
	if (ret) {
		btrfs_err(fs_info, "zoned: failed to validate queue limits");
		return ret;
	}

	/*
	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
	 * btrfs_create_chunk(). Since we want stripe_len == zone_size,
	 * check the alignment here.
	 */
	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
		btrfs_err(fs_info,
			  "zoned: zone size %llu not aligned to stripe %u",
			  zone_size, BTRFS_STRIPE_LEN);
		return -EINVAL;
	}

	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		btrfs_err(fs_info, "zoned: mixed block groups not supported");
		return -EINVAL;
	}

	fs_info->zone_size = zone_size;
	/*
	 * Also limit max_zone_append_size by max_segments * PAGE_SIZE.
	 * Technically, we can have multiple pages per segment. But, since
	 * we add the pages one by one to a bio, and cannot increase the
	 * metadata reservation even if it increases the number of extents, it
	 * is safe to stick with the limit.
	 */
	fs_info->max_zone_append_size = ALIGN_DOWN(
		min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT,
		     (u64)lim->max_sectors << SECTOR_SHIFT,
		     (u64)lim->max_segments << PAGE_SHIFT),
		fs_info->sectorsize);
	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;

	fs_info->max_extent_size = min_not_zero(fs_info->max_extent_size,
						fs_info->max_zone_append_size);

	/*
	 * Check mount options here, because we might change fs_info->zoned
	 * from fs_info->zone_size.
	 */
	ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt);
	if (ret)
		return ret;

	btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
	return 0;
}

int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info,
				unsigned long long *mount_opt)
{
	if (!btrfs_is_zoned(info))
		return 0;

	/*
	 * Space cache writing is not COWed. Disable that to avoid write errors
	 * in sequential zones.
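	 * The free space tree (space cache v2) is regular COW metadata and is
	 * not affected by this.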
	 */
	if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
		btrfs_err(info, "zoned: space cache v1 is not supported");
		return -EINVAL;
	}

	if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) {
		btrfs_err(info, "zoned: NODATACOW not supported");
		return -EINVAL;
	}

	if (btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC)) {
		btrfs_info(info,
			   "zoned: async discard ignored and disabled for zoned mode");
		btrfs_clear_opt(*mount_opt, DISCARD_ASYNC);
	}

	return 0;
}

static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
			   int rw, u64 *bytenr_ret)
{
	u64 wp;
	int ret;

	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
		return 0;
	}

	ret = sb_write_pointer(bdev, zones, &wp);
	if (ret != -ENOENT && ret < 0)
		return ret;

	if (rw == WRITE) {
		struct blk_zone *reset = NULL;

		if (wp == zones[0].start << SECTOR_SHIFT)
			reset = &zones[0];
		else if (wp == zones[1].start << SECTOR_SHIFT)
			reset = &zones[1];

		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
			unsigned int nofs_flags;

			ASSERT(sb_zone_is_full(reset));

			nofs_flags = memalloc_nofs_save();
			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
					       reset->start, reset->len);
			memalloc_nofs_restore(nofs_flags);
			if (ret)
				return ret;

			reset->cond = BLK_ZONE_COND_EMPTY;
			reset->wp = reset->start;
		}
	} else if (ret != -ENOENT) {
		/*
		 * For READ, we want the previous one. Move the write pointer
		 * to the end of a zone, if it is at the head of a zone.
		 */
		u64 zone_end = 0;

		if (wp == zones[0].start << SECTOR_SHIFT)
			zone_end = zones[1].start + zones[1].capacity;
		else if (wp == zones[1].start << SECTOR_SHIFT)
			zone_end = zones[0].start + zones[0].capacity;
		if (zone_end)
			wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
					BTRFS_SUPER_INFO_SIZE);

		wp -= BTRFS_SUPER_INFO_SIZE;
	}

	*bytenr_ret = wp;
	return 0;
}

int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
			       u64 *bytenr_ret)
{
	struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
	sector_t zone_sectors;
	u32 sb_zone;
	int ret;
	u8 zone_sectors_shift;
	sector_t nr_sectors;
	u32 nr_zones;

	if (!bdev_is_zoned(bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	ASSERT(rw == READ || rw == WRITE);

	zone_sectors = bdev_zone_sectors(bdev);
	if (!is_power_of_2(zone_sectors))
		return -EINVAL;
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
				  zones);
	if (ret < 0)
		return ret;
	if (unlikely(ret != BTRFS_NR_SB_LOG_ZONES))
		return -EIO;

	return sb_log_location(bdev, zones, rw, bytenr_ret);
}

int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
			  u64 *bytenr_ret)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	u32 zone_num;

	/*
	 * For a zoned filesystem on a non-zoned block device, use the same
	 * super block locations as a regular filesystem. Doing so, the super
	 * block can always be retrieved and the zoned flag of the volume
	 * detected from the super block information.
	 */
	if (!bdev_is_zoned(device->bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return -ENOENT;

	return sb_log_location(device->bdev,
			       &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
			       rw, bytenr_ret);
}

static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
				  int mirror)
{
	u32 zone_num;

	if (!zinfo)
		return false;

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return false;

	if (!test_bit(zone_num, zinfo->seq_zones))
		return false;

	return true;
}

int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	struct blk_zone *zone;
	int i;

	if (!is_sb_log_zone(zinfo, mirror))
		return 0;

	zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
	for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
		/* Advance to the next zone */
		if (zone->cond == BLK_ZONE_COND_FULL) {
			zone++;
			continue;
		}

		if (zone->cond == BLK_ZONE_COND_EMPTY)
			zone->cond = BLK_ZONE_COND_IMP_OPEN;

		zone->wp += SUPER_INFO_SECTORS;

		if (sb_zone_is_full(zone)) {
			/*
			 * No room left to write a new superblock. Since the
			 * superblock is written with REQ_SYNC, it is safe to
			 * finish the zone now.
			 *
			 * If the write pointer is exactly at the capacity,
			 * an explicit ZONE_FINISH is not necessary.
			 */
			if (zone->wp != zone->start + zone->capacity) {
				unsigned int nofs_flags;
				int ret;

				nofs_flags = memalloc_nofs_save();
				ret = blkdev_zone_mgmt(device->bdev,
						       REQ_OP_ZONE_FINISH, zone->start,
						       zone->len);
				memalloc_nofs_restore(nofs_flags);
				if (ret)
					return ret;
			}

			zone->wp = zone->start + zone->len;
			zone->cond = BLK_ZONE_COND_FULL;
		}
		return 0;
	}

	/* All the zones are FULL. Should not reach here. */
	DEBUG_WARN("unexpected state, all zones full");
	return -EIO;
}

int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
	unsigned int nofs_flags;
	sector_t zone_sectors;
	sector_t nr_sectors;
	u8 zone_sectors_shift;
	u32 sb_zone;
	u32 nr_zones;
	int ret;

	zone_sectors = bdev_zone_sectors(bdev);
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	nofs_flags = memalloc_nofs_save();
	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
			       zone_start_sector(sb_zone, bdev),
			       zone_sectors * BTRFS_NR_SB_LOG_ZONES);
	memalloc_nofs_restore(nofs_flags);
	return ret;
}

/*
 * Find allocatable zones within a given region.
 *
 * @device:	the device to allocate a region on
 * @hole_start:	the position of the hole to allocate the region
 * @hole_end:	the end of the hole
 * @num_bytes:	size of wanted region
 * @return:	position of allocatable zones
 *
 * Allocatable region should not contain any superblock locations.
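 *
 * For example, with a 256MiB zone size, super block mirror 1 logs to zones
 * 2048 and 2049 (the 512GiB offset), so a candidate region overlapping them
 * is advanced to the start of zone 2050.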
 */
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
				 u64 hole_end, u64 num_bytes)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	u64 nzones = num_bytes >> shift;
	u64 pos = hole_start;
	u64 begin, end;
	bool have_sb;
	int i;

	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));

	while (pos < hole_end) {
		begin = pos >> shift;
		end = begin + nzones;

		if (end > zinfo->nr_zones)
			return hole_end;

		/* Check if zones in the region are all empty */
		if (btrfs_dev_is_sequential(device, pos) &&
		    !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) {
			pos += zinfo->zone_size;
			continue;
		}

		have_sb = false;
		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
			u32 sb_zone;
			u64 sb_pos;

			sb_zone = sb_zone_number(shift, i);
			if (!(end <= sb_zone ||
			      sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
				have_sb = true;
				pos = zone_start_physical(
					sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
				break;
			}

			/* We also need to exclude regular superblock positions */
			sb_pos = btrfs_sb_offset(i);
			if (!(pos + num_bytes <= sb_pos ||
			      sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
				have_sb = true;
				pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
					    zinfo->zone_size);
				break;
			}
		}
		if (!have_sb)
			break;
	}

	return pos;
}

static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;
	unsigned int zno = (pos >> zone_info->zone_size_shift);

	/* We can use any number of zones */
	if (zone_info->max_active_zones == 0)
		return true;

	if (!test_bit(zno, zone_info->active_zones)) {
		/* Active zone left? */
		if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
			return false;
		if (test_and_set_bit(zno, zone_info->active_zones)) {
			/* Someone already set the bit */
			atomic_inc(&zone_info->active_zones_left);
		}
	}

	return true;
}

static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;
	unsigned int zno = (pos >> zone_info->zone_size_shift);

	/* We can use any number of zones */
	if (zone_info->max_active_zones == 0)
		return;

	if (test_and_clear_bit(zno, zone_info->active_zones))
		atomic_inc(&zone_info->active_zones_left);
}

int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
			    u64 length, u64 *bytes)
{
	unsigned int nofs_flags;
	int ret;

	*bytes = 0;
	nofs_flags = memalloc_nofs_save();
	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT);
	memalloc_nofs_restore(nofs_flags);
	if (ret)
		return ret;

	*bytes = length;
	while (length) {
		btrfs_dev_set_zone_empty(device, physical);
		btrfs_dev_clear_active_zone(device, physical);
		physical += device->zone_info->zone_size;
		length -= device->zone_info->zone_size;
	}

	return 0;
}

int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	unsigned long begin = start >> shift;
	unsigned long nbits = size >> shift;
	u64 pos;
	int ret;

	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(size, zinfo->zone_size));

	if (begin + nbits > zinfo->nr_zones)
		return -ERANGE;

	/* All the zones are conventional */
	if (bitmap_test_range_all_zero(zinfo->seq_zones, begin, nbits))
		return 0;

	/* All the zones are sequential and empty */
	if (bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) &&
	    bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits))
		return 0;

	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
		u64 reset_bytes;

		if (!btrfs_dev_is_sequential(device, pos) ||
		    btrfs_dev_is_empty_zone(device, pos))
			continue;

		/* Free regions should be empty */
		btrfs_warn(device->fs_info,
			   "zoned: resetting device %s (devid %llu) zone %llu for allocation",
			   rcu_dereference(device->name), device->devid, pos >> shift);
		WARN_ON_ONCE(1);

		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
					      &reset_bytes);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Calculate an allocation pointer from the extent allocation information
 * for a block group consisting of conventional zones. The pointer is set to
 * the end of the highest-addressed extent in the block group and used as the
 * allocation offset.
 */
static int calculate_alloc_pointer(struct btrfs_block_group *cache,
				   u64 *offset_ret, bool new)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_root *root;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	u64 length;

	/*
	 * Avoid tree lookups for a new block group, there's no use for it.
	 * It must always be 0.
	 *
	 * Also, we have a lock chain of extent buffer lock -> chunk mutex.
	 * For a new block group, this function is called from
	 * btrfs_make_block_group() which is already taking the chunk mutex.
	 * Thus, we cannot call calculate_alloc_pointer() which takes extent
	 * buffer locks to avoid deadlock.
	 */
	if (new) {
		*offset_ret = 0;
		return 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = cache->start + cache->length;
	key.type = 0;
	key.offset = 0;

	root = btrfs_extent_root(fs_info, key.objectid);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	/* We should not find the exact match */
	if (unlikely(!ret))
		ret = -EUCLEAN;
	if (ret < 0)
		return ret;

	ret = btrfs_previous_extent_item(root, path, cache->start);
	if (ret) {
		if (ret == 1) {
			ret = 0;
			*offset_ret = 0;
		}
		return ret;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

	if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
		length = found_key.offset;
	else
		length = fs_info->nodesize;

	if (unlikely(!(found_key.objectid >= cache->start &&
		       found_key.objectid + length <= cache->start + cache->length))) {
		return -EUCLEAN;
	}
	*offset_ret = found_key.objectid + length - cache->start;
	return 0;
}

struct zone_info {
	u64 physical;
	u64 capacity;
	u64 alloc_offset;
};

static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
				struct zone_info *info, unsigned long *active,
				struct btrfs_chunk_map *map, bool new)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct btrfs_device *device;
	int dev_replace_is_ongoing = 0;
	unsigned int nofs_flag;
	struct blk_zone zone;
	int ret;

	info->physical = map->stripes[zone_idx].physical;

	down_read(&dev_replace->rwsem);
	device = map->stripes[zone_idx].dev;

	if (!device->bdev) {
		up_read(&dev_replace->rwsem);
		info->alloc_offset = WP_MISSING_DEV;
		return 0;
	}

	/* Consider a zone as active if we can allow any number of active zones. */
	if (!device->zone_info->max_active_zones)
		__set_bit(zone_idx, active);

	if (!btrfs_dev_is_sequential(device, info->physical)) {
		up_read(&dev_replace->rwsem);
		info->alloc_offset = WP_CONVENTIONAL;
		info->capacity = device->zone_info->zone_size;
		return 0;
	}

	ASSERT(!new || btrfs_dev_is_empty_zone(device, info->physical));

	/* This zone will be used for allocation, so mark this zone non-empty. */
	btrfs_dev_clear_zone_empty(device, info->physical);

	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
		btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical);

	/*
	 * The group is mapped to a sequential zone. Get the zone write pointer
	 * to determine the allocation offset within the zone.
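	 *
	 * A freshly created block group starts at an empty zone, so only the
	 * zone capacity needs to be queried in that case.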
	 */
	WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));

	if (new) {
		sector_t capacity;

		capacity = bdev_zone_capacity(device->bdev, info->physical >> SECTOR_SHIFT);
		up_read(&dev_replace->rwsem);
		info->alloc_offset = 0;
		info->capacity = capacity << SECTOR_SHIFT;

		return 0;
	}

	nofs_flag = memalloc_nofs_save();
	ret = btrfs_get_dev_zone(device, info->physical, &zone);
	memalloc_nofs_restore(nofs_flag);
	if (ret) {
		up_read(&dev_replace->rwsem);
		if (ret != -EIO && ret != -EOPNOTSUPP)
			return ret;
		info->alloc_offset = WP_MISSING_DEV;
		return 0;
	}

	if (unlikely(zone.type == BLK_ZONE_TYPE_CONVENTIONAL)) {
		btrfs_err(fs_info,
			  "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
			  zone.start << SECTOR_SHIFT, rcu_dereference(device->name),
			  device->devid);
		up_read(&dev_replace->rwsem);
		return -EIO;
	}

	info->capacity = (zone.capacity << SECTOR_SHIFT);

	switch (zone.cond) {
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
		btrfs_err(fs_info,
			  "zoned: offline/readonly zone %llu on device %s (devid %llu)",
			  (info->physical >> device->zone_info->zone_size_shift),
			  rcu_dereference(device->name), device->devid);
		info->alloc_offset = WP_MISSING_DEV;
		break;
	case BLK_ZONE_COND_EMPTY:
		info->alloc_offset = 0;
		break;
	case BLK_ZONE_COND_FULL:
		info->alloc_offset = info->capacity;
		break;
	default:
		/* Partially used zone. */
		info->alloc_offset = ((zone.wp - zone.start) << SECTOR_SHIFT);
		__set_bit(zone_idx, active);
		break;
	}

	up_read(&dev_replace->rwsem);

	return 0;
}

static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
					 struct zone_info *info,
					 unsigned long *active)
{
	if (unlikely(info->alloc_offset == WP_MISSING_DEV)) {
		btrfs_err(bg->fs_info,
			  "zoned: cannot recover write pointer for zone %llu",
			  info->physical);
		return -EIO;
	}

	bg->alloc_offset = info->alloc_offset;
	bg->zone_capacity = info->capacity;
	if (test_bit(0, active))
		set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
	return 0;
}

static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
				      struct btrfs_chunk_map *map,
				      struct zone_info *zone_info,
				      unsigned long *active,
				      u64 last_alloc)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree");
		return -EINVAL;
	}

	bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);

	if (unlikely(zone_info[0].alloc_offset == WP_MISSING_DEV)) {
		btrfs_err(bg->fs_info,
			  "zoned: cannot recover write pointer for zone %llu",
			  zone_info[0].physical);
		return -EIO;
	}
	if (unlikely(zone_info[1].alloc_offset == WP_MISSING_DEV)) {
		btrfs_err(bg->fs_info,
			  "zoned: cannot recover write pointer for zone %llu",
			  zone_info[1].physical);
		return -EIO;
	}

	if (zone_info[0].alloc_offset == WP_CONVENTIONAL)
		zone_info[0].alloc_offset = last_alloc;

	if (zone_info[1].alloc_offset == WP_CONVENTIONAL)
		zone_info[1].alloc_offset = last_alloc;

	if (unlikely(zone_info[0].alloc_offset != zone_info[1].alloc_offset)) {
		btrfs_err(bg->fs_info,
			  "zoned: write pointer offset mismatch of zones in DUP profile");
		return -EIO;
	}

	if (test_bit(0, active) != test_bit(1, active)) {
		if (unlikely(!btrfs_zone_activate(bg)))
			return -EIO;
	} else if (test_bit(0, active)) {
		set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
	}

	bg->alloc_offset = zone_info[0].alloc_offset;
	return 0;
}

static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
					struct btrfs_chunk_map *map,
					struct zone_info *zone_info,
					unsigned long *active,
					u64 last_alloc)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	int i;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));
		return -EINVAL;
	}

	/* In case a device is missing we have a cap of 0, so don't use it. */
	bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);

	for (i = 0; i < map->num_stripes; i++) {
		if (zone_info[i].alloc_offset == WP_MISSING_DEV)
			continue;

		if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
			zone_info[i].alloc_offset = last_alloc;

		if (unlikely((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
			     !btrfs_test_opt(fs_info, DEGRADED))) {
			btrfs_err(fs_info,
				  "zoned: write pointer offset mismatch of zones in %s profile",
				  btrfs_bg_type_to_raid_name(map->type));
			return -EIO;
		}
		if (test_bit(0, active) != test_bit(i, active)) {
			if (unlikely(!btrfs_test_opt(fs_info, DEGRADED) &&
				     !btrfs_zone_activate(bg))) {
				return -EIO;
			}
		} else {
			if (test_bit(0, active))
				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
		}
	}

	if (zone_info[0].alloc_offset != WP_MISSING_DEV)
		bg->alloc_offset = zone_info[0].alloc_offset;
	else
		bg->alloc_offset = zone_info[i - 1].alloc_offset;

	return 0;
}

static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
					struct btrfs_chunk_map *map,
					struct zone_info *zone_info,
					unsigned long *active,
					u64 last_alloc)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	u64 stripe_nr = 0, stripe_offset = 0;
	u32 stripe_index = 0;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));
		return -EINVAL;
	}

	if (last_alloc) {
		u32 factor = map->num_stripes;

		stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT;
		stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK;
		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
	}

	for (int i = 0; i < map->num_stripes; i++) {
		if (zone_info[i].alloc_offset == WP_MISSING_DEV)
			continue;

		if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
			zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);

			if (stripe_index > i)
				zone_info[i].alloc_offset += BTRFS_STRIPE_LEN;
			else if (stripe_index == i)
				zone_info[i].alloc_offset += stripe_offset;
		}

		if (test_bit(0, active) != test_bit(i, active)) {
			if (unlikely(!btrfs_zone_activate(bg)))
				return -EIO;
		} else {
			if (test_bit(0, active))
				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
		}
		bg->zone_capacity += zone_info[i].capacity;
		bg->alloc_offset += zone_info[i].alloc_offset;
	}

	return 0;
}

static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
					 struct btrfs_chunk_map *map,
					 struct zone_info *zone_info,
					 unsigned long *active,
					 u64 last_alloc)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	u64 stripe_nr = 0, stripe_offset = 0;
	u32 stripe_index = 0;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));
		return -EINVAL;
	}

	if (last_alloc) {
		u32 factor = map->num_stripes / map->sub_stripes;

		stripe_nr = last_alloc >> BTRFS_STRIPE_LEN_SHIFT;
		stripe_offset = last_alloc & BTRFS_STRIPE_LEN_MASK;
		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
	}

	for (int i = 0; i < map->num_stripes; i++) {
		if (zone_info[i].alloc_offset == WP_MISSING_DEV)
			continue;

		if (test_bit(0, active) != test_bit(i, active)) {
			if (unlikely(!btrfs_zone_activate(bg)))
				return -EIO;
		} else {
			if (test_bit(0, active))
				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
		}

		if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
			zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);

			if (stripe_index > (i / map->sub_stripes))
				zone_info[i].alloc_offset += BTRFS_STRIPE_LEN;
			else if (stripe_index == (i / map->sub_stripes))
				zone_info[i].alloc_offset += stripe_offset;
		}

		if ((i % map->sub_stripes) == 0) {
			bg->zone_capacity += zone_info[i].capacity;
			bg->alloc_offset += zone_info[i].alloc_offset;
		}
	}

	return 0;
}

int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_chunk_map *map;
	u64 logical = cache->start;
	u64 length = cache->length;
	struct zone_info *zone_info = NULL;
	int ret;
	int i;
	unsigned long *active = NULL;
	u64 last_alloc = 0;
	u32 num_sequential = 0, num_conventional = 0;
	u64 profile;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	/* Sanity check */
	if (unlikely(!IS_ALIGNED(length, fs_info->zone_size))) {
		btrfs_err(fs_info,
			  "zoned: block group %llu len %llu unaligned to zone size %llu",
			  logical, length, fs_info->zone_size);
		return -EIO;
	}

	map = btrfs_find_chunk_map(fs_info, logical, length);
	if (!map)
		return -EINVAL;

	cache->physical_map = map;

	zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS);
	if (!zone_info) {
		ret = -ENOMEM;
		goto out;
	}

	active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
	if (!active) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map, new);
		if (ret)
			goto out;

		if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
			num_conventional++;
		else
			num_sequential++;
	}

	if (num_sequential > 0)
		set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);

	if (num_conventional > 0) {
		ret = calculate_alloc_pointer(cache, &last_alloc, new);
		if (ret) {
			btrfs_err(fs_info,
				  "zoned: failed to determine allocation offset of bg %llu",
				  cache->start);
			goto out;
		} else if (map->num_stripes == num_conventional) {
			cache->alloc_offset = last_alloc;
			cache->zone_capacity = cache->length;
			set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
			goto out;
		}
	}

	profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
	switch (profile) {
	case 0: /* single */
		ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
		break;
	case BTRFS_BLOCK_GROUP_DUP:
		ret = btrfs_load_block_group_dup(cache, map, zone_info, active,
						 last_alloc);
		break;
	case BTRFS_BLOCK_GROUP_RAID1:
	case BTRFS_BLOCK_GROUP_RAID1C3:
	case BTRFS_BLOCK_GROUP_RAID1C4:
		ret = btrfs_load_block_group_raid1(cache, map, zone_info,
						   active, last_alloc);
		break;
	case BTRFS_BLOCK_GROUP_RAID0:
		ret = btrfs_load_block_group_raid0(cache, map, zone_info,
						   active, last_alloc);
		break;
	case BTRFS_BLOCK_GROUP_RAID10:
		ret = btrfs_load_block_group_raid10(cache, map, zone_info,
						    active, last_alloc);
		break;
	case BTRFS_BLOCK_GROUP_RAID5:
	case BTRFS_BLOCK_GROUP_RAID6:
	default:
		btrfs_err(fs_info, "zoned: profile %s not yet supported",
			  btrfs_bg_type_to_raid_name(map->type));
		ret = -EINVAL;
		goto out;
	}

	if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 &&
	    profile != BTRFS_BLOCK_GROUP_RAID10) {
		/*
		 * Detected a broken write pointer. Make this block group
		 * unallocatable by setting the allocation pointer at the end of
		 * the allocatable region. Relocating this block group will fix
		 * the mismatch.
		 *
		 * Currently, we cannot handle the RAID0 or RAID10 case like
		 * this because we don't have a proper zone_capacity value. But
		 * reading from this block group won't work anyway because of a
		 * missing stripe.
		 */
		cache->alloc_offset = cache->zone_capacity;
	}

out:
	/* Reject non-SINGLE data profiles without RST */
	if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
	    (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
	    !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));
		ret = -EINVAL;
	}

	if (unlikely(cache->alloc_offset > cache->zone_capacity)) {
		btrfs_err(fs_info,
			  "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
			  cache->alloc_offset, cache->zone_capacity,
			  cache->start);
		ret = -EIO;
	}

	/* An extent is allocated after the write pointer */
	if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
		btrfs_err(fs_info,
			  "zoned: got wrong write pointer in BG %llu: %llu > %llu",
			  logical, last_alloc, cache->alloc_offset);
		ret = -EIO;
	}

	if (!ret) {
		cache->meta_write_pointer = cache->alloc_offset + cache->start;
		if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
			btrfs_get_block_group(cache);
			spin_lock(&fs_info->zone_active_bgs_lock);
			list_add_tail(&cache->active_bg_list,
				      &fs_info->zone_active_bgs);
			spin_unlock(&fs_info->zone_active_bgs_lock);
		}
	} else {
		btrfs_free_chunk_map(cache->physical_map);
		cache->physical_map = NULL;
	}
	bitmap_free(active);
	kfree(zone_info);

	return ret;
}

void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
{
	u64 unusable, free;

	if (!btrfs_is_zoned(cache->fs_info))
		return;

	WARN_ON(cache->bytes_super != 0);
	unusable = (cache->alloc_offset - cache->used) +
		   (cache->length - cache->zone_capacity);
	free = cache->zone_capacity - cache->alloc_offset;

	/* We only need ->free_space in ALLOC_SEQ block groups */
	cache->cached = BTRFS_CACHE_FINISHED;
	cache->free_space_ctl->free_space = free;
	cache->zone_unusable = unusable;
}

bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
	u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	struct btrfs_block_group *cache;
	bool ret = false;

	if (!btrfs_is_zoned(fs_info))
		return false;

	if (!inode || !is_data_inode(inode))
		return false;

	if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
		return false;

	/*
	 * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
	 * extent layout the relocation code has.
	 * Furthermore, we have set aside our own block group from which only
	 * the relocation "process" can allocate, and we make sure only one
	 * process at a time can add pages to an extent that gets relocated, so
	 * it's safe to use a regular REQ_OP_WRITE for this special case.
	 */
	if (btrfs_is_data_reloc_root(inode->root))
		return false;

	cache = btrfs_lookup_block_group(fs_info, start);
	ASSERT(cache);
	if (!cache)
		return false;

	ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
	btrfs_put_block_group(cache);

	return ret;
}

void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
{
	const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	struct btrfs_ordered_sum *sum = bbio->sums;

	if (physical < bbio->orig_physical)
		sum->logical -= bbio->orig_physical - physical;
	else
		sum->logical += physical - bbio->orig_physical;
}

static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
					u64 logical)
{
	struct extent_map_tree *em_tree = &ordered->inode->extent_tree;
	struct extent_map *em;

	ordered->disk_bytenr = logical;

	write_lock(&em_tree->lock);
	em = btrfs_search_extent_mapping(em_tree, ordered->file_offset,
					 ordered->num_bytes);
	/* The em should be a new COW extent, thus it should not have an offset. */
	ASSERT(em->offset == 0);
	em->disk_bytenr = logical;
	btrfs_free_extent_map(em);
	write_unlock(&em_tree->lock);
}

static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
				      u64 logical, u64 len)
{
	struct btrfs_ordered_extent *new;

	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
	    btrfs_split_extent_map(ordered->inode, ordered->file_offset,
				   ordered->num_bytes, len, logical))
		return false;

	new = btrfs_split_ordered_extent(ordered, len);
	if (IS_ERR(new))
		return false;
	new->disk_bytenr = logical;
	btrfs_finish_one_ordered(new);
	return true;
}

void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_ordered_sum *sum;
	u64 logical, len;

	/*
	 * A write to a pre-allocated region is for data relocation and so uses
	 * a regular WRITE operation. No split/rewrite is necessary.
	 */
	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
		return;

	ASSERT(!list_empty(&ordered->list));
	/* The ordered->list would only be empty in the pre-alloc case handled above. */
	sum = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list);
	logical = sum->logical;
	len = sum->len;

	while (len < ordered->disk_num_bytes) {
		sum = list_next_entry(sum, list);
		if (sum->logical == logical + len) {
			len += sum->len;
			continue;
		}
		if (!btrfs_zoned_split_ordered(ordered, logical, len)) {
			set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
			btrfs_err(fs_info, "failed to split ordered extent");
			goto out;
		}
		logical = sum->logical;
		len = sum->len;
	}

	if (ordered->disk_bytenr != logical)
		btrfs_rewrite_logical_zoned(ordered, logical);

out:
	/*
	 * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures
	 * were allocated by btrfs_alloc_dummy_sum only to record the logical
	 * addresses and don't contain actual checksums. We thus must free them
	 * here so that we don't attempt to log the csums later.
static bool check_bg_is_active(struct btrfs_eb_write_context *ctx,
			       struct btrfs_block_group **active_bg)
{
	const struct writeback_control *wbc = ctx->wbc;
	struct btrfs_block_group *block_group = ctx->zoned_bg;
	struct btrfs_fs_info *fs_info = block_group->fs_info;

	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
		return true;

	if (fs_info->treelog_bg == block_group->start) {
		if (!btrfs_zone_activate(block_group)) {
			int ret_fin = btrfs_zone_finish_one_bg(fs_info);

			if (ret_fin != 1 || !btrfs_zone_activate(block_group))
				return false;
		}
	} else if (*active_bg != block_group) {
		struct btrfs_block_group *tgt = *active_bg;

		/* zoned_meta_io_lock protects fs_info->active_{meta,system}_bg. */
		lockdep_assert_held(&fs_info->zoned_meta_io_lock);

		if (tgt) {
			/*
			 * If there are unsent IOs left in the allocated area,
			 * we cannot wait for them, as it may cause a deadlock.
			 */
			if (tgt->meta_write_pointer < tgt->start + tgt->alloc_offset) {
				if (wbc->sync_mode == WB_SYNC_NONE ||
				    (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync))
					return false;
			}

			/* Pivot active metadata/system block group. */
			btrfs_zoned_meta_io_unlock(fs_info);
			wait_eb_writebacks(tgt);
			do_zone_finish(tgt, true);
			btrfs_zoned_meta_io_lock(fs_info);
			if (*active_bg == tgt) {
				btrfs_put_block_group(tgt);
				*active_bg = NULL;
			}
		}
		if (!btrfs_zone_activate(block_group))
			return false;
		if (*active_bg != block_group) {
			ASSERT(*active_bg == NULL);
			*active_bg = block_group;
			btrfs_get_block_group(block_group);
		}
	}

	return true;
}

/*
 * Check if @ctx->eb is aligned to the write pointer.
 *
 * Return:
 *   0:       @ctx->eb is at the write pointer. You can write it.
 *   -EAGAIN: There is a hole. The caller should handle the case.
 *   -EBUSY:  There is a hole, but the caller can just bail out.
 */
int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
				   struct btrfs_eb_write_context *ctx)
{
	const struct writeback_control *wbc = ctx->wbc;
	const struct extent_buffer *eb = ctx->eb;
	struct btrfs_block_group *block_group = ctx->zoned_bg;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	if (block_group) {
		if (block_group->start > eb->start ||
		    block_group->start + block_group->length <= eb->start) {
			btrfs_put_block_group(block_group);
			block_group = NULL;
			ctx->zoned_bg = NULL;
		}
	}

	if (!block_group) {
		block_group = btrfs_lookup_block_group(fs_info, eb->start);
		if (!block_group)
			return 0;
		ctx->zoned_bg = block_group;
	}

	if (block_group->meta_write_pointer == eb->start) {
		struct btrfs_block_group **tgt;

		if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
			return 0;

		if (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)
			tgt = &fs_info->active_system_bg;
		else
			tgt = &fs_info->active_meta_bg;
		if (check_bg_is_active(ctx, tgt))
			return 0;
	}

	/*
	 * Since we may release fs_info->zoned_meta_io_lock, someone else may
	 * already have started writing this eb. In that case, we can just
	 * bail out.
	 */
	if (block_group->meta_write_pointer > eb->start)
		return -EBUSY;

	/* If for_sync, this hole will be filled with transaction commit. */
	if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
		return -EAGAIN;
	return -EBUSY;
}

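/*
 * Write zeros to a physical range on a zoned device.
 *
 * The range must lie in a sequential write required zone; for a
 * conventional zone -EOPNOTSUPP is returned.  This is used below to advance
 * a zone's write pointer by filling the gap with zeros.
 */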
int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
{
	if (!btrfs_dev_is_sequential(device, physical))
		return -EOPNOTSUPP;

	return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
				    length >> SECTOR_SHIFT, GFP_NOFS, 0);
}

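/*
 * Read the zone information backing @logical.
 *
 * The logical address is mapped to its mirrors and the first present,
 * non-failing device is asked for a zone report.  RAID56 profiles are
 * rejected with -EINVAL.
 */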
static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
			  struct blk_zone *zone)
{
	struct btrfs_io_context *bioc = NULL;
	u64 mapped_length = PAGE_SIZE;
	unsigned int nofs_flag;
	int nmirrors;
	int i, ret;

	ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
			      &mapped_length, &bioc, NULL, NULL);
	if (unlikely(ret || !bioc || mapped_length < PAGE_SIZE)) {
		ret = -EIO;
		goto out_put_bioc;
	}

	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ret = -EINVAL;
		goto out_put_bioc;
	}

	nofs_flag = memalloc_nofs_save();
	nmirrors = (int)bioc->num_stripes;
	for (i = 0; i < nmirrors; i++) {
		u64 physical = bioc->stripes[i].physical;
		struct btrfs_device *dev = bioc->stripes[i].dev;

		/* Missing device */
		if (!dev->bdev)
			continue;

		ret = btrfs_get_dev_zone(dev, physical, zone);
		/* Failing device */
		if (ret == -EIO || ret == -EOPNOTSUPP)
			continue;
		break;
	}
	memalloc_nofs_restore(nofs_flag);
out_put_bioc:
	btrfs_put_bioc(bioc);
	return ret;
}

/*
 * Synchronize the write pointer in the zone at @physical_start on @tgt_dev by
 * filling zeros from @physical_pos up to the write pointer of the dev-replace
 * source device.
 */
int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
				  u64 physical_start, u64 physical_pos)
{
	struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
	struct blk_zone zone;
	u64 length;
	u64 wp;
	int ret;

	if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
		return 0;

	ret = read_zone_info(fs_info, logical, &zone);
	if (ret)
		return ret;

	wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);

	if (physical_pos == wp)
		return 0;

	if (unlikely(physical_pos > wp))
		return -EUCLEAN;

	length = wp - physical_pos;
	return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
}

/*
 * Activate block group and underlying device zones
 *
 * @block_group: the block group to activate
 *
 * Return: true on success, false otherwise
 */
bool btrfs_zone_activate(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_chunk_map *map;
	struct btrfs_device *device;
	u64 physical;
	const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA);
	bool ret;
	int i;

	if (!btrfs_is_zoned(block_group->fs_info))
		return true;

	map = block_group->physical_map;

	spin_lock(&fs_info->zone_active_bgs_lock);
	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
		ret = true;
		goto out_unlock;
	}

	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) {
		/* The caller should check if the block group is full. */
		if (WARN_ON_ONCE(btrfs_zoned_bg_is_full(block_group))) {
			ret = false;
			goto out_unlock;
		}
	} else {
		/* Since it is already written, it should have been active. */
		WARN_ON_ONCE(block_group->meta_write_pointer != block_group->start);
	}

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_zoned_device_info *zinfo;
		int reserved = 0;

		device = map->stripes[i].dev;
		physical = map->stripes[i].physical;
		zinfo = device->zone_info;

		if (!device->bdev)
			continue;

		if (zinfo->max_active_zones == 0)
			continue;

		if (is_data)
			reserved = zinfo->reserved_active_zones;
		/*
		 * For the data block group, leave active zones for one
		 * metadata block group and one system block group.
		 */
		if (atomic_read(&zinfo->active_zones_left) <= reserved) {
			ret = false;
			goto out_unlock;
		}

		if (!btrfs_dev_set_active_zone(device, physical)) {
			/* Cannot activate the zone */
			ret = false;
			goto out_unlock;
		}
		if (!is_data)
			zinfo->reserved_active_zones--;
	}

	/* Successfully activated all the zones */
	set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
	spin_unlock(&block_group->lock);

	/* For the active block group list */
	btrfs_get_block_group(block_group);
	list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
	spin_unlock(&fs_info->zone_active_bgs_lock);

	return true;

out_unlock:
	spin_unlock(&block_group->lock);
	spin_unlock(&fs_info->zone_active_bgs_lock);
	return ret;
}

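/*
 * Wait for writeback of all extent buffers that belong to @block_group by
 * walking the buffer_tree over the block group's logical range.
 */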
static void wait_eb_writebacks(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	const u64 end = block_group->start + block_group->length;
	struct extent_buffer *eb;
	unsigned long index, start = (block_group->start >> fs_info->nodesize_bits);

	rcu_read_lock();
	xa_for_each_start(&fs_info->buffer_tree, index, eb, start) {
		if (eb->start < block_group->start)
			continue;
		if (eb->start >= end)
			break;
		rcu_read_unlock();
		wait_on_extent_buffer_writeback(eb);
		rcu_read_lock();
	}
	rcu_read_unlock();
}

static int call_zone_finish(struct btrfs_block_group *block_group,
			    struct btrfs_io_stripe *stripe)
{
	struct btrfs_device *device = stripe->dev;
	const u64 physical = stripe->physical;
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	int ret;

	if (!device->bdev)
		return 0;

	if (zinfo->max_active_zones == 0)
		return 0;

	if (btrfs_dev_is_sequential(device, physical)) {
		unsigned int nofs_flags;

		nofs_flags = memalloc_nofs_save();
		ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
				       physical >> SECTOR_SHIFT,
				       zinfo->zone_size >> SECTOR_SHIFT);
		memalloc_nofs_restore(nofs_flags);

		if (ret)
			return ret;
	}

	if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
		zinfo->reserved_active_zones++;
	btrfs_dev_clear_active_zone(device, physical);

	return 0;
}

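/*
 * Finish the zones backing @block_group and deactivate the block group.
 *
 * When @fully_written is false, the block group may still have outstanding
 * allocations or writes, so it is temporarily marked read-only and all
 * ordered extents and extent buffer writebacks are waited for before the
 * zones are finished.  When @fully_written is true, the caller guarantees
 * that the last usable block has already been written, so no waiting is
 * needed.
 *
 * Returns 0 on success, -EAGAIN when the block group cannot be finished
 * yet, or another negative errno on failure.
 */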
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_chunk_map *map;
	const bool is_metadata = (block_group->flags &
				  (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	int ret = 0;
	int i;

	spin_lock(&block_group->lock);
	if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}

	/* Check if we have unwritten allocated space */
	if (is_metadata &&
	    block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
		spin_unlock(&block_group->lock);
		return -EAGAIN;
	}

	/*
	 * If we are sure that the block group is full (= no more room left for
	 * new allocation) and the IO for the last usable block is completed, we
	 * don't need to wait for the other IOs. This holds because we ensure
	 * sequential IO submission using the ZONE_APPEND command for data and
	 * block_group->meta_write_pointer for metadata.
	 */
	if (!fully_written) {
		if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			return -EAGAIN;
		}
		spin_unlock(&block_group->lock);

		ret = btrfs_inc_block_group_ro(block_group, false);
		if (ret)
			return ret;

		/* Ensure all writes in this block group finish */
		btrfs_wait_block_group_reservations(block_group);
		/* No need to wait for NOCOW writers. Zoned mode does not allow that. */
		btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group);
		/* Wait for extent buffers to be written. */
		if (is_metadata)
			wait_eb_writebacks(block_group);

		spin_lock(&block_group->lock);

		/*
		 * Bail out if someone already deactivated the block group, or
		 * allocated space is left in the block group.
		 */
		if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
			      &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			btrfs_dec_block_group_ro(block_group);
			return 0;
		}

		if (block_group->reserved ||
		    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
			     &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			btrfs_dec_block_group_ro(block_group);
			return -EAGAIN;
		}
	}

	clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
	block_group->alloc_offset = block_group->zone_capacity;
	if (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))
		block_group->meta_write_pointer = block_group->start +
						  block_group->zone_capacity;
	block_group->free_space_ctl->free_space = 0;
	btrfs_clear_treelog_bg(block_group);
	btrfs_clear_data_reloc_bg(block_group);
	spin_unlock(&block_group->lock);

	down_read(&dev_replace->rwsem);
	map = block_group->physical_map;
	for (i = 0; i < map->num_stripes; i++) {
		ret = call_zone_finish(block_group, &map->stripes[i]);
		if (ret) {
			up_read(&dev_replace->rwsem);
			return ret;
		}
	}
	up_read(&dev_replace->rwsem);

	if (!fully_written)
		btrfs_dec_block_group_ro(block_group);

	spin_lock(&fs_info->zone_active_bgs_lock);
	ASSERT(!list_empty(&block_group->active_bg_list));
	list_del_init(&block_group->active_bg_list);
	spin_unlock(&fs_info->zone_active_bgs_lock);

	/* For active_bg_list */
	btrfs_put_block_group(block_group);

	clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);

	return 0;
}

int btrfs_zone_finish(struct btrfs_block_group *block_group)
{
	if (!btrfs_is_zoned(block_group->fs_info))
		return 0;

	return do_zone_finish(block_group, false);
}

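/*
 * Check whether a block group with the given profile @flags could still be
 * activated on some device in @fs_devices.
 *
 * Returns true if at least one device has enough active zones left, taking
 * the metadata/system reservation into account for data block groups.  When
 * no device qualifies, BTRFS_FS_NEED_ZONE_FINISH is set so that writers know
 * a zone finish is required first.
 */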
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
{
	struct btrfs_fs_info *fs_info = fs_devices->fs_info;
	struct btrfs_device *device;
	bool ret = false;

	if (!btrfs_is_zoned(fs_info))
		return true;

	if (test_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags))
		return false;

	/* Check if there is a device with active zones left */
	mutex_lock(&fs_info->chunk_mutex);
	spin_lock(&fs_info->zone_active_bgs_lock);
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		struct btrfs_zoned_device_info *zinfo = device->zone_info;
		int reserved = 0;

		if (!device->bdev)
			continue;

		if (!zinfo->max_active_zones) {
			ret = true;
			break;
		}

		if (flags & BTRFS_BLOCK_GROUP_DATA)
			reserved = zinfo->reserved_active_zones;

		switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
		case 0: /* single */
			ret = (atomic_read(&zinfo->active_zones_left) >= (1 + reserved));
			break;
		case BTRFS_BLOCK_GROUP_DUP:
			ret = (atomic_read(&zinfo->active_zones_left) >= (2 + reserved));
			break;
		}
		if (ret)
			break;
	}
	spin_unlock(&fs_info->zone_active_bgs_lock);
	mutex_unlock(&fs_info->chunk_mutex);

	if (!ret)
		set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);

	return ret;
}

int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
{
	struct btrfs_block_group *block_group;
	u64 min_alloc_bytes;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	block_group = btrfs_lookup_block_group(fs_info, logical);
	if (WARN_ON_ONCE(!block_group))
		return -ENOENT;

	/* No MIXED_BG on zoned btrfs. */
	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
		min_alloc_bytes = fs_info->sectorsize;
	else
		min_alloc_bytes = fs_info->nodesize;

	/* Bail out if we can allocate more data from this block group. */
	if (logical + length + min_alloc_bytes <=
	    block_group->start + block_group->zone_capacity)
		goto out;

	do_zone_finish(block_group, true);

out:
	btrfs_put_block_group(block_group);
	return 0;
}

static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
{
	int ret;
	struct btrfs_block_group *bg =
		container_of(work, struct btrfs_block_group, zone_finish_work);

	wait_on_extent_buffer_writeback(bg->last_eb);
	free_extent_buffer(bg->last_eb);
	ret = do_zone_finish(bg, true);
	if (ret)
		btrfs_handle_fs_error(bg->fs_info, ret,
				      "Failed to finish block-group's zone");
	btrfs_put_block_group(bg);
}

void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
				   struct extent_buffer *eb)
{
	if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) ||
	    eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
		return;

	if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
		btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
			  bg->start);
		return;
	}

	/* For the work */
	btrfs_get_block_group(bg);
	refcount_inc(&eb->refs);
	bg->last_eb = eb;
	INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
	queue_work(system_dfl_wq, &bg->zone_finish_work);
}

void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	spin_lock(&fs_info->relocation_bg_lock);
	if (fs_info->data_reloc_bg == bg->start)
		fs_info->data_reloc_bg = 0;
	spin_unlock(&fs_info->relocation_bg_lock);
}

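/*
 * Make sure there is a block group dedicated to data relocation.
 *
 * Scan the data space_info for an empty block group (skipping the first one
 * found), migrate it into the data relocation sub space_info, record it as
 * fs_info->data_reloc_bg and activate it.  If no suitable block group
 * exists, force-allocate a new chunk in the relocation space_info and try
 * again.
 */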
void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
{
	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
	struct btrfs_space_info *space_info = data_sinfo;
	struct btrfs_trans_handle *trans;
	struct btrfs_block_group *bg;
	struct list_head *bg_list;
	u64 alloc_flags;
	bool first = true;
	bool did_chunk_alloc = false;
	int index;
	int ret;

	if (!btrfs_is_zoned(fs_info))
		return;

	if (fs_info->data_reloc_bg)
		return;

	if (sb_rdonly(fs_info->sb))
		return;

	alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
	index = btrfs_bg_flags_to_raid_index(alloc_flags);

	/* Scan the data space_info to find empty block groups. Take the second one. */
again:
	bg_list = &space_info->block_groups[index];
	list_for_each_entry(bg, bg_list, list) {
		if (bg->alloc_offset != 0)
			continue;

		if (first) {
			first = false;
			continue;
		}

		if (space_info == data_sinfo) {
			/* Migrate the block group to the data relocation space_info. */
			struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0];
			int factor;

			ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
			factor = btrfs_bg_type_to_factor(bg->flags);

			down_write(&space_info->groups_sem);
			list_del_init(&bg->list);
			/* We can assume this as we choose the second empty one. */
			ASSERT(!list_empty(&space_info->block_groups[index]));
			up_write(&space_info->groups_sem);

			spin_lock(&space_info->lock);
			space_info->total_bytes -= bg->length;
			space_info->disk_total -= bg->length * factor;
			space_info->disk_total -= bg->zone_unusable;
			/* No allocation has ever happened in this block group. */
			ASSERT(bg->used == 0);
			/* No super block in a block group on the zoned setup. */
			ASSERT(bg->bytes_super == 0);
			spin_unlock(&space_info->lock);

			bg->space_info = reloc_sinfo;
			if (reloc_sinfo->block_group_kobjs[index] == NULL)
				btrfs_sysfs_add_block_group_type(bg);

			btrfs_add_bg_to_space_info(fs_info, bg);
		}

		fs_info->data_reloc_bg = bg->start;
		set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags);
		btrfs_zone_activate(bg);

		return;
	}

	if (did_chunk_alloc)
		return;

	trans = btrfs_join_transaction(fs_info->tree_root);
	if (IS_ERR(trans))
		return;

	/* Allocate new BG in the data relocation space_info. */
	space_info = data_sinfo->sub_group[0];
	ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
	ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
	btrfs_end_transaction(trans);
	if (ret == 1) {
		/*
		 * We allocated a new block group in the data relocation
		 * space_info. We can take that one.
		 */
		first = false;
		did_chunk_alloc = true;
		goto again;
	}
}

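/*
 * Release the cached zone reports of all devices.  The cache is only needed
 * while the zones are scanned (e.g. at mount time), so it can be freed
 * afterwards to save memory.
 */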
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;

	if (!btrfs_is_zoned(fs_info))
		return;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (device->zone_info) {
			vfree(device->zone_info->zone_cache);
			device->zone_info->zone_cache = NULL;
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);
}

bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 total = btrfs_super_total_bytes(fs_info->super_copy);
	u64 used = 0;
	u64 factor;

	ASSERT(btrfs_is_zoned(fs_info));

	if (fs_info->bg_reclaim_threshold == 0)
		return false;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (!device->bdev)
			continue;

		used += device->bytes_used;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	factor = div64_u64(used * 100, total);
	return factor >= fs_info->bg_reclaim_threshold;
}

void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
				       u64 length)
{
	struct btrfs_block_group *block_group;

	if (!btrfs_is_zoned(fs_info))
		return;

	block_group = btrfs_lookup_block_group(fs_info, logical);
	/* It should be called on a previous data relocation block group. */
	ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));

	spin_lock(&block_group->lock);
	if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
		goto out;

	/* All relocation extents are written. */
	if (block_group->start + block_group->alloc_offset == logical + length) {
		/*
		 * Now, release this block group for further allocations and
		 * zone finish.
		 */
		clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
			  &block_group->runtime_flags);
	}

out:
	spin_unlock(&block_group->lock);
	btrfs_put_block_group(block_group);
}

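/*
 * Pick the active data block group with the least remaining space and
 * finish it to make its active zone(s) available again.
 *
 * Returns 1 if a block group was finished, 0 if there was no candidate, or
 * a negative errno on failure.
 */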
int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group;
	struct btrfs_block_group *min_bg = NULL;
	u64 min_avail = U64_MAX;
	int ret;

	spin_lock(&fs_info->zone_active_bgs_lock);
	list_for_each_entry(block_group, &fs_info->zone_active_bgs,
			    active_bg_list) {
		u64 avail;

		spin_lock(&block_group->lock);
		if (block_group->reserved || block_group->alloc_offset == 0 ||
		    !(block_group->flags & BTRFS_BLOCK_GROUP_DATA) ||
		    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			continue;
		}

		avail = block_group->zone_capacity - block_group->alloc_offset;
		if (min_avail > avail) {
			if (min_bg)
				btrfs_put_block_group(min_bg);
			min_bg = block_group;
			min_avail = avail;
			btrfs_get_block_group(min_bg);
		}
		spin_unlock(&block_group->lock);
	}
	spin_unlock(&fs_info->zone_active_bgs_lock);

	if (!min_bg)
		return 0;

	ret = btrfs_zone_finish(min_bg);
	btrfs_put_block_group(min_bg);

	return ret < 0 ? ret : 1;
}

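/*
 * Try to activate one metadata/system block group in @space_info.
 *
 * If none can be activated and @do_finish is true, finish one active block
 * group to free up active zones and retry.
 *
 * Returns 1 if a block group was activated, 0 if none could be activated,
 * or a negative errno on failure.
 */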
int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				bool do_finish)
{
	struct btrfs_block_group *bg;
	int index;

	if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
		return 0;

	for (;;) {
		int ret;
		bool need_finish = false;

		down_read(&space_info->groups_sem);
		for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
			list_for_each_entry(bg, &space_info->block_groups[index],
					    list) {
				if (!spin_trylock(&bg->lock))
					continue;
				if (btrfs_zoned_bg_is_full(bg) ||
				    test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
					     &bg->runtime_flags)) {
					spin_unlock(&bg->lock);
					continue;
				}
				spin_unlock(&bg->lock);

				if (btrfs_zone_activate(bg)) {
					up_read(&space_info->groups_sem);
					return 1;
				}

				need_finish = true;
			}
		}
		up_read(&space_info->groups_sem);

		if (!do_finish || !need_finish)
			break;

		ret = btrfs_zone_finish_one_bg(fs_info);
		if (ret == 0)
			break;
		if (ret < 0)
			return ret;
	}

	return 0;
}

/*
 * Reserve zones for one metadata block group, one tree-log block group, and one
 * system block group.
 */
void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_block_group *block_group;
	struct btrfs_device *device;
	/* Reserve zones for normal SINGLE metadata and tree-log block group. */
	unsigned int metadata_reserve = 2;
	/* Reserve a zone for SINGLE system block group. */
	unsigned int system_reserve = 1;

	if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
		return;

	/*
	 * This function is called from the mount context. So, there is no
	 * parallel process touching the bits. No need for read_seqretry().
	 */
	if (fs_info->avail_metadata_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
		metadata_reserve = 4;
	if (fs_info->avail_system_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
		system_reserve = 2;

	/* Apply the reservation on all the devices. */
	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (!device->bdev)
			continue;

		device->zone_info->reserved_active_zones =
			metadata_reserve + system_reserve;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/* Release reservation for currently active block groups. */
	spin_lock(&fs_info->zone_active_bgs_lock);
	list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
		struct btrfs_chunk_map *map = block_group->physical_map;

		if (!(block_group->flags &
		      (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
			continue;

		for (int i = 0; i < map->num_stripes; i++)
			map->stripes[i].dev->zone_info->reserved_active_zones--;
	}
	spin_unlock(&fs_info->zone_active_bgs_lock);
}

/*
 * Reset the zones of unused block groups to reclaim space accounted in
 * @space_info->bytes_zone_unusable.
 *
 * @space_info:	the space to work on
 * @num_bytes:	target number of bytes to reclaim
 *
 * This one resets the zones of a block group, so we can reuse the region
 * without removing the block group. On the other hand, btrfs_delete_unused_bgs()
 * just removes a block group and frees up the underlying zones. So, we still
 * need to allocate a new block group to reuse the zones.
 *
 * Resetting is faster than deleting/recreating a block group. It is similar
 * to freeing the logical space in regular (non-zoned) mode. However, we cannot
 * change the block group's profile with this operation.
 */
int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes)
{
	struct btrfs_fs_info *fs_info = space_info->fs_info;
	const sector_t zone_size_sectors = fs_info->zone_size >> SECTOR_SHIFT;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	while (num_bytes > 0) {
		struct btrfs_chunk_map *map;
		struct btrfs_block_group *bg = NULL;
		bool found = false;
		u64 reclaimed = 0;

		/*
		 * Here, we choose a fully zone_unusable block group. It's
		 * technically possible to reset a partly zone_unusable block
		 * group, which still has some free space left. However,
		 * handling that needs to cope with the allocation side, which
		 * makes the logic more complex. So, let's handle the easy case
		 * for now.
		 */
		spin_lock(&fs_info->unused_bgs_lock);
		list_for_each_entry(bg, &fs_info->unused_bgs, bg_list) {
			if ((bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != space_info->flags)
				continue;

			/*
			 * Use trylock to avoid locking order violation. In
			 * btrfs_reclaim_bgs_work(), the lock order is
			 * &bg->lock -> &fs_info->unused_bgs_lock. We skip a
			 * block group if we cannot take its lock.
			 */
			if (!spin_trylock(&bg->lock))
				continue;
			if (btrfs_is_block_group_used(bg) || bg->zone_unusable < bg->length) {
				spin_unlock(&bg->lock);
				continue;
			}
			spin_unlock(&bg->lock);
			found = true;
			break;
		}
		if (!found) {
			spin_unlock(&fs_info->unused_bgs_lock);
			return 0;
		}

		list_del_init(&bg->bg_list);
		btrfs_put_block_group(bg);
		spin_unlock(&fs_info->unused_bgs_lock);

		/*
		 * Since the block group is fully zone_unusable and we cannot
		 * allocate from this block group anymore, we don't need to set
		 * this block group read-only.
		 */

		down_read(&fs_info->dev_replace.rwsem);
		map = bg->physical_map;
		for (int i = 0; i < map->num_stripes; i++) {
			struct btrfs_io_stripe *stripe = &map->stripes[i];
			unsigned int nofs_flags;
			int ret;

			nofs_flags = memalloc_nofs_save();
			ret = blkdev_zone_mgmt(stripe->dev->bdev, REQ_OP_ZONE_RESET,
					       stripe->physical >> SECTOR_SHIFT,
					       zone_size_sectors);
			memalloc_nofs_restore(nofs_flags);

			if (ret) {
				up_read(&fs_info->dev_replace.rwsem);
				return ret;
			}
		}
		up_read(&fs_info->dev_replace.rwsem);

		spin_lock(&space_info->lock);
		spin_lock(&bg->lock);
		ASSERT(!btrfs_is_block_group_used(bg));
		if (bg->ro) {
			spin_unlock(&bg->lock);
			spin_unlock(&space_info->lock);
			continue;
		}

		reclaimed = bg->alloc_offset;
		bg->zone_unusable = bg->length - bg->zone_capacity;
		bg->alloc_offset = 0;
		/*
		 * This holds because we currently only reset block groups
		 * that were fully used and then freed.
		 */
		ASSERT(reclaimed == bg->zone_capacity);
		bg->free_space_ctl->free_space += reclaimed;
		space_info->bytes_zone_unusable -= reclaimed;
		spin_unlock(&bg->lock);
		btrfs_return_free_space(space_info, reclaimed);
		spin_unlock(&space_info->lock);

		if (num_bytes <= reclaimed)
			break;
		num_bytes -= reclaimed;
	}

	return 0;
}