// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include <linux/atomic.h>
#include <linux/vmalloc.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
#include "disk-io.h"
#include "block-group.h"
#include "dev-replace.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
#include "bio.h"
#include "transaction.h"
#include "sysfs.h"

/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES		4096
/* Invalid allocation pointer value for missing devices */
#define WP_MISSING_DEV			((u64)-1)
/* Pseudo write pointer value for conventional zone */
#define WP_CONVENTIONAL			((u64)-2)

/*
 * Location of the first zone of superblock logging zone pairs.
 *
 * - primary superblock: 0B (zone 0)
 * - first copy: 512G (zone starting at that offset)
 * - second copy: 4T (zone starting at that offset)
 */
#define BTRFS_SB_LOG_PRIMARY_OFFSET	(0ULL)
#define BTRFS_SB_LOG_FIRST_OFFSET	(512ULL * SZ_1G)
#define BTRFS_SB_LOG_SECOND_OFFSET	(4096ULL * SZ_1G)

#define BTRFS_SB_LOG_FIRST_SHIFT	const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
#define BTRFS_SB_LOG_SECOND_SHIFT	const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)

/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES		2

/* Default number of max active zones when the device has no limits. */
#define BTRFS_DEFAULT_MAX_ACTIVE_ZONES	128

/*
 * Minimum number of active zones we need:
 *
 * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
 * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
 * - 1 zone for tree-log dedicated block group
 * - 1 zone for relocation
 */
#define BTRFS_MIN_ACTIVE_ZONES		(BTRFS_SUPER_MIRROR_MAX + 5)

/*
 * Minimum / maximum supported zone size. Currently, SMR disks have a zone
 * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
 * We do not expect the zone size to become larger than 8GiB or smaller than
 * 4MiB in the near future.
 */
#define BTRFS_MAX_ZONE_SIZE		SZ_8G
#define BTRFS_MIN_ZONE_SIZE		SZ_4M

#define SUPER_INFO_SECTORS	((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)

static void wait_eb_writebacks(struct btrfs_block_group *block_group);
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written);

static inline bool sb_zone_is_full(const struct blk_zone *zone)
{
	return (zone->cond == BLK_ZONE_COND_FULL) ||
		(zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
}

static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
	struct blk_zone *zones = data;

	memcpy(&zones[idx], zone, sizeof(*zone));

	return 0;
}

static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
			    u64 *wp_ret)
{
	bool empty[BTRFS_NR_SB_LOG_ZONES];
	bool full[BTRFS_NR_SB_LOG_ZONES];
	sector_t sector;

	for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
		ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
		empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
		full[i] = sb_zone_is_full(&zones[i]);
	}

	/*
	 * Possible states of log buffer zones
	 *
	 *           Empty[0]  In use[0]  Full[0]
	 * Empty[1]         *          0        1
	 * In use[1]        x          x        1
	 * Full[1]          0          0        C
	 *
	 * Log position:
	 *   *: Special case, no superblock is written
	 *   0: Use write pointer of zones[0]
	 *   1: Use write pointer of zones[1]
	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
	 *      one determined by generation
	 *   x: Invalid state
	 */

	if (empty[0] && empty[1]) {
		/* Special case to distinguish no superblock to read */
		*wp_ret = zones[0].start << SECTOR_SHIFT;
		return -ENOENT;
	} else if (full[0] && full[1]) {
		/* Compare two super blocks */
		struct address_space *mapping = bdev->bd_mapping;
		struct page *page[BTRFS_NR_SB_LOG_ZONES];
		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];

		for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
			u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
			u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
				     BTRFS_SUPER_INFO_SIZE;

			page[i] = read_cache_page_gfp(mapping,
						      bytenr >> PAGE_SHIFT, GFP_NOFS);
			if (IS_ERR(page[i])) {
				if (i == 1)
					btrfs_release_disk_super(super[0]);
				return PTR_ERR(page[i]);
			}
			super[i] = page_address(page[i]);
		}

		if (btrfs_super_generation(super[0]) >
		    btrfs_super_generation(super[1]))
			sector = zones[1].start;
		else
			sector = zones[0].start;

		for (int i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
			btrfs_release_disk_super(super[i]);
	} else if (!full[0] && (empty[1] || full[1])) {
		sector = zones[0].wp;
	} else if (full[0]) {
		sector = zones[1].wp;
	} else {
		return -EUCLEAN;
	}
	*wp_ret = sector << SECTOR_SHIFT;
	return 0;
}

/*
 * Get the first zone number of the superblock mirror
 */
static inline u32 sb_zone_number(int shift, int mirror)
{
	u64 zone = U64_MAX;

	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
	switch (mirror) {
	case 0: zone = 0; break;
	case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
	case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
	}

	ASSERT(zone <= U32_MAX);

	return (u32)zone;
}
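/*
 * Illustrative example for sb_zone_number() (values chosen for a typical
 * SMR drive, not part of the definitions above): with a 256MiB zone size the
 * callers pass shift = ilog2(256MiB) = 28. BTRFS_SB_LOG_FIRST_SHIFT is
 * ilog2(512GiB) = 39 and BTRFS_SB_LOG_SECOND_SHIFT is ilog2(4TiB) = 42, so
 * the superblock log zone pairs start at zone 0 for mirror 0, at zone
 * 1 << (39 - 28) = 2048 (512GiB / 256MiB) for mirror 1, and at zone
 * 1 << (42 - 28) = 16384 (4TiB / 256MiB) for mirror 2.
 */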
static inline sector_t zone_start_sector(u32 zone_number,
					 struct block_device *bdev)
{
	return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
}

static inline u64 zone_start_physical(u32 zone_number,
				      struct btrfs_zoned_device_info *zone_info)
{
	return (u64)zone_number << zone_info->zone_size_shift;
}

/*
 * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
 * device into fixed-size chunks and fakes a conventional zone on each of
 * them.
 */
static int emulate_report_zones(struct btrfs_device *device, u64 pos,
				struct blk_zone *zones, unsigned int nr_zones)
{
	const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
	sector_t bdev_size = bdev_nr_sectors(device->bdev);
	unsigned int i;

	pos >>= SECTOR_SHIFT;
	for (i = 0; i < nr_zones; i++) {
		zones[i].start = i * zone_sectors + pos;
		zones[i].len = zone_sectors;
		zones[i].capacity = zone_sectors;
		zones[i].wp = zones[i].start + zone_sectors;
		zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
		zones[i].cond = BLK_ZONE_COND_NOT_WP;

		if (zones[i].wp >= bdev_size) {
			i++;
			break;
		}
	}

	return i;
}

static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
			       struct blk_zone *zones, unsigned int *nr_zones)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	int ret;

	if (!*nr_zones)
		return 0;

	if (!bdev_is_zoned(device->bdev)) {
		ret = emulate_report_zones(device, pos, zones, *nr_zones);
		*nr_zones = ret;
		return 0;
	}

	/* Check cache */
	if (zinfo->zone_cache) {
		unsigned int i;
		u32 zno;

		ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
		zno = pos >> zinfo->zone_size_shift;
		/*
		 * We cannot report zones beyond the zone end. So, it is OK to
		 * cap *nr_zones at the end.
		 */
		*nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);

		for (i = 0; i < *nr_zones; i++) {
			struct blk_zone *zone_info;

			zone_info = &zinfo->zone_cache[zno + i];
			if (!zone_info->len)
				break;
		}

		if (i == *nr_zones) {
			/* Cache hit on all the zones */
			memcpy(zones, zinfo->zone_cache + zno,
			       sizeof(*zinfo->zone_cache) * *nr_zones);
			return 0;
		}
	}

	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
				  copy_zone_info_cb, zones);
	if (ret < 0) {
		btrfs_err(device->fs_info,
			  "zoned: failed to read zone %llu on %s (devid %llu)",
			  pos, rcu_dereference(device->name),
			  device->devid);
		return ret;
	}
	*nr_zones = ret;
	if (unlikely(!ret))
		return -EIO;

	/* Populate cache */
	if (zinfo->zone_cache) {
		u32 zno = pos >> zinfo->zone_size_shift;

		memcpy(zinfo->zone_cache + zno, zones,
		       sizeof(*zinfo->zone_cache) * *nr_zones);
	}

	return 0;
}

/* The emulated zone size is determined from the size of device extent */
static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
{
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_dev_extent *dext;
	int ret = 0;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			return ret;
		/* No dev extents at all? Not good */
		if (unlikely(ret > 0))
			return -EUCLEAN;
	}

	leaf = path->nodes[0];
	dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
	fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
	return 0;
}

int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int ret = 0;

	/* fs_info->zone_size might not be set yet. Use the incompat flag here. */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		/* We can skip reading of zone info for missing devices */
		if (!device->bdev)
			continue;

		ret = btrfs_get_dev_zone_info(device, true);
		if (ret)
			break;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_zoned_device_info *zone_info = NULL;
	struct block_device *bdev = device->bdev;
	unsigned int max_active_zones;
	unsigned int nactive;
	sector_t nr_sectors;
	sector_t sector = 0;
	struct blk_zone *zones = NULL;
	unsigned int i, nreported = 0, nr_zones;
	sector_t zone_sectors;
	char *model, *emulated;
	int ret;

	/*
	 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
	 * yet be set.
	 */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	if (device->zone_info)
		return 0;

	zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
	if (!zone_info)
		return -ENOMEM;

	device->zone_info = zone_info;

	if (!bdev_is_zoned(bdev)) {
		if (!fs_info->zone_size) {
			ret = calculate_emulated_zone_size(fs_info);
			if (ret)
				goto out;
		}

		ASSERT(fs_info->zone_size);
		zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
	} else {
		zone_sectors = bdev_zone_sectors(bdev);
	}

	ASSERT(is_power_of_two_u64(zone_sectors));
	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;

	/* We reject devices with a zone size larger than 8GiB */
	if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
		btrfs_err(fs_info,
			  "zoned: %s: zone size %llu larger than supported maximum %llu",
			  rcu_dereference(device->name),
			  zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
		ret = -EINVAL;
		goto out;
	} else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
		btrfs_err(fs_info,
			  "zoned: %s: zone size %llu smaller than supported minimum %u",
			  rcu_dereference(device->name),
			  zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
		ret = -EINVAL;
		goto out;
	}

	nr_sectors = bdev_nr_sectors(bdev);
	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
	if (!IS_ALIGNED(nr_sectors, zone_sectors))
		zone_info->nr_zones++;

	max_active_zones = min_not_zero(bdev_max_active_zones(bdev),
					bdev_max_open_zones(bdev));
	if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES)
		max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES;
	if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
		btrfs_err(fs_info,
			  "zoned: %s: max active zones %u is too small, need at least %u active zones",
			  rcu_dereference(device->name), max_active_zones,
			  BTRFS_MIN_ACTIVE_ZONES);
		ret = -EINVAL;
		goto out;
	}
	zone_info->max_active_zones = max_active_zones;

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->empty_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->active_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
	if (!zones) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Enable zone cache only for a zoned device. On a non-zoned device, we
	 * fill the zone info with emulated CONVENTIONAL zones, so no need to
	 * use the cache.
	 */
	if (populate_cache && bdev_is_zoned(device->bdev)) {
		zone_info->zone_cache = vcalloc(zone_info->nr_zones,
						sizeof(struct blk_zone));
		if (!zone_info->zone_cache) {
			btrfs_err(device->fs_info,
				  "zoned: failed to allocate zone cache for %s",
				  rcu_dereference(device->name));
			ret = -ENOMEM;
			goto out;
		}
	}

	/* Get zones type */
	nactive = 0;
	while (sector < nr_sectors) {
		nr_zones = BTRFS_REPORT_NR_ZONES;
		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
					  &nr_zones);
		if (ret)
			goto out;

		for (i = 0; i < nr_zones; i++) {
			if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
				__set_bit(nreported, zone_info->seq_zones);
			switch (zones[i].cond) {
			case BLK_ZONE_COND_EMPTY:
				__set_bit(nreported, zone_info->empty_zones);
				break;
			case BLK_ZONE_COND_IMP_OPEN:
			case BLK_ZONE_COND_EXP_OPEN:
			case BLK_ZONE_COND_CLOSED:
				__set_bit(nreported, zone_info->active_zones);
				nactive++;
				break;
			}
			nreported++;
		}
		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
	}

	if (unlikely(nreported != zone_info->nr_zones)) {
		btrfs_err(device->fs_info,
			  "inconsistent number of zones on %s (%u/%u)",
			  rcu_dereference(device->name), nreported,
			  zone_info->nr_zones);
		ret = -EIO;
		goto out;
	}

	if (max_active_zones) {
		if (unlikely(nactive > max_active_zones)) {
			if (bdev_max_active_zones(bdev) == 0) {
				max_active_zones = 0;
				zone_info->max_active_zones = 0;
				goto validate;
			}
			btrfs_err(device->fs_info,
				  "zoned: %u active zones on %s exceeds max_active_zones %u",
				  nactive, rcu_dereference(device->name),
				  max_active_zones);
			ret = -EIO;
			goto out;
		}
		atomic_set(&zone_info->active_zones_left,
			   max_active_zones - nactive);
		set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
	}

validate:
	/* Validate superblock log */
	nr_zones = BTRFS_NR_SB_LOG_ZONES;
	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		u32 sb_zone;
		u64 sb_wp;
		int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;

		sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
		if (sb_zone + 1 >= zone_info->nr_zones)
			continue;

		ret = btrfs_get_dev_zones(device,
					  zone_start_physical(sb_zone, zone_info),
					  &zone_info->sb_zones[sb_pos],
					  &nr_zones);
		if (ret)
			goto out;

		if (unlikely(nr_zones != BTRFS_NR_SB_LOG_ZONES)) {
			btrfs_err(device->fs_info,
				  "zoned: failed to read super block log zone info at devid %llu zone %u",
				  device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}

		/*
		 * If zones[0] is conventional, always use the beginning of the
		 * zone to record superblock. No need to validate in that case.
		 */
		if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
		    BLK_ZONE_TYPE_CONVENTIONAL)
			continue;

		ret = sb_write_pointer(device->bdev,
				       &zone_info->sb_zones[sb_pos], &sb_wp);
		if (unlikely(ret != -ENOENT && ret)) {
			btrfs_err(device->fs_info,
				  "zoned: super block log zone corrupted devid %llu zone %u",
				  device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}
	}

	kvfree(zones);

	if (bdev_is_zoned(bdev)) {
		model = "host-managed zoned";
		emulated = "";
	} else {
		model = "regular";
		emulated = "emulated ";
	}

	btrfs_info(fs_info,
		   "%s block device %s, %u %szones of %llu bytes",
		   model, rcu_dereference(device->name), zone_info->nr_zones,
		   emulated, zone_info->zone_size);

	return 0;

out:
	kvfree(zones);
	btrfs_destroy_dev_zone_info(device);
	return ret;
}

void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;

	if (!zone_info)
		return;

	bitmap_free(zone_info->active_zones);
	bitmap_free(zone_info->seq_zones);
	bitmap_free(zone_info->empty_zones);
	vfree(zone_info->zone_cache);
	kfree(zone_info);
	device->zone_info = NULL;
}

struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev)
{
	struct btrfs_zoned_device_info *zone_info;

	zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL);
	if (!zone_info)
		return NULL;

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones)
		goto out;

	bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones,
		    zone_info->nr_zones);

	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->empty_zones)
		goto out;

	bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones,
		    zone_info->nr_zones);

	zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->active_zones)
		goto out;

	bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones,
		    zone_info->nr_zones);
	zone_info->zone_cache = NULL;

	return zone_info;

out:
	bitmap_free(zone_info->seq_zones);
	bitmap_free(zone_info->empty_zones);
	bitmap_free(zone_info->active_zones);
	kfree(zone_info);
	return NULL;
}

static int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone)
{
	unsigned int nr_zones = 1;
	int ret;

	ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
	if (ret != 0 || !nr_zones)
		return ret ? ret : -EIO;

	return 0;
}

static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
{
	struct btrfs_device *device;

	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
		if (device->bdev && bdev_is_zoned(device->bdev)) {
			btrfs_err(fs_info,
				  "zoned: mode not enabled but zoned device found: %pg",
				  device->bdev);
			return -EINVAL;
		}
	}

	return 0;
}

int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
	struct queue_limits *lim = &fs_info->limits;
	struct btrfs_device *device;
	u64 zone_size = 0;
	int ret;

	/*
	 * Host-Managed devices can't be used without the ZONED flag. With the
	 * ZONED flag, all devices can be used, using zone emulation if required.
	 */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return btrfs_check_for_zoned_device(fs_info);

	blk_set_stacking_limits(lim);

	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
		struct btrfs_zoned_device_info *zone_info = device->zone_info;

		if (!device->bdev)
			continue;

		if (!zone_size) {
			zone_size = zone_info->zone_size;
		} else if (zone_info->zone_size != zone_size) {
			btrfs_err(fs_info,
				  "zoned: unequal block device zone sizes: have %llu found %llu",
				  zone_info->zone_size, zone_size);
			return -EINVAL;
		}

		/*
		 * With zone emulation, we can have a non-zoned device in zoned
		 * mode. In this case, we don't have a valid max zone append
		 * size.
		 */
		if (bdev_is_zoned(device->bdev))
			blk_stack_limits(lim, bdev_limits(device->bdev), 0);
	}

	ret = blk_validate_limits(lim);
	if (ret) {
		btrfs_err(fs_info, "zoned: failed to validate queue limits");
		return ret;
	}

	/*
	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
	 * btrfs_create_chunk(). Since we want stripe_len == zone_size,
	 * check the alignment here.
	 */
	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
		btrfs_err(fs_info,
			  "zoned: zone size %llu not aligned to stripe %u",
			  zone_size, BTRFS_STRIPE_LEN);
		return -EINVAL;
	}

	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		btrfs_err(fs_info, "zoned: mixed block groups not supported");
		return -EINVAL;
	}

	fs_info->zone_size = zone_size;
	/*
	 * Also limit max_zone_append_size by max_segments * PAGE_SIZE.
	 * Technically, we can have multiple pages per segment. But, since
	 * we add the pages one by one to a bio, and cannot increase the
	 * metadata reservation even if it increases the number of extents, it
	 * is safe to stick with the limit.
	 */
	fs_info->max_zone_append_size = ALIGN_DOWN(
		min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT,
		     (u64)lim->max_sectors << SECTOR_SHIFT,
		     (u64)lim->max_segments << PAGE_SHIFT),
		fs_info->sectorsize);
	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;

	fs_info->max_extent_size = min_not_zero(fs_info->max_extent_size,
						fs_info->max_zone_append_size);

	/*
	 * Check mount options here, because we might change fs_info->zoned
	 * from fs_info->zone_size.
	 */
	ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt);
	if (ret)
		return ret;

	btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
	return 0;
}
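/*
 * Worked example for the max_zone_append_size computation above (purely
 * illustrative, not real device limits): a device reporting
 * max_zone_append_sectors = 512 (256KiB), max_sectors = 2560 (1280KiB) and
 * max_segments = 64 with 4KiB pages gives
 * min3(256KiB, 1280KiB, 256KiB) = 256KiB, which is already aligned to a
 * 4KiB sectorsize, so fs_info->max_zone_append_size ends up as 256KiB.
 */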
int btrfs_check_mountopts_zoned(const struct btrfs_fs_info *info,
				unsigned long long *mount_opt)
{
	if (!btrfs_is_zoned(info))
		return 0;

	/*
	 * Space cache writing is not COWed. Disable that to avoid write errors
	 * in sequential zones.
	 */
	if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
		btrfs_err(info, "zoned: space cache v1 is not supported");
		return -EINVAL;
	}

	if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) {
		btrfs_err(info, "zoned: NODATACOW not supported");
		return -EINVAL;
	}

	if (btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC)) {
		btrfs_info(info,
			   "zoned: async discard ignored and disabled for zoned mode");
		btrfs_clear_opt(*mount_opt, DISCARD_ASYNC);
	}

	return 0;
}

static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
			   int rw, u64 *bytenr_ret)
{
	u64 wp;
	int ret;

	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
		return 0;
	}

	ret = sb_write_pointer(bdev, zones, &wp);
	if (ret != -ENOENT && ret < 0)
		return ret;

	if (rw == WRITE) {
		struct blk_zone *reset = NULL;

		if (wp == zones[0].start << SECTOR_SHIFT)
			reset = &zones[0];
		else if (wp == zones[1].start << SECTOR_SHIFT)
			reset = &zones[1];

		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
			unsigned int nofs_flags;

			ASSERT(sb_zone_is_full(reset));

			nofs_flags = memalloc_nofs_save();
			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
					       reset->start, reset->len);
			memalloc_nofs_restore(nofs_flags);
			if (ret)
				return ret;

			reset->cond = BLK_ZONE_COND_EMPTY;
			reset->wp = reset->start;
		}
	} else if (ret != -ENOENT) {
		/*
		 * For READ, we want the previous one. Move write pointer to
		 * the end of a zone, if it is at the head of a zone.
		 */
		u64 zone_end = 0;

		if (wp == zones[0].start << SECTOR_SHIFT)
			zone_end = zones[1].start + zones[1].capacity;
		else if (wp == zones[1].start << SECTOR_SHIFT)
			zone_end = zones[0].start + zones[0].capacity;
		if (zone_end)
			wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
					BTRFS_SUPER_INFO_SIZE);

		wp -= BTRFS_SUPER_INFO_SIZE;
	}

	*bytenr_ret = wp;
	return 0;
}

int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
			       u64 *bytenr_ret)
{
	struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
	sector_t zone_sectors;
	u32 sb_zone;
	int ret;
	u8 zone_sectors_shift;
	sector_t nr_sectors;
	u32 nr_zones;

	if (!bdev_is_zoned(bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	ASSERT(rw == READ || rw == WRITE);

	zone_sectors = bdev_zone_sectors(bdev);
	if (!is_power_of_2(zone_sectors))
		return -EINVAL;
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
				  zones);
	if (ret < 0)
		return ret;
	if (unlikely(ret != BTRFS_NR_SB_LOG_ZONES))
		return -EIO;

	return sb_log_location(bdev, zones, rw, bytenr_ret);
}

int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
			  u64 *bytenr_ret)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	u32 zone_num;

	/*
	 * For a zoned filesystem on a non-zoned block device, use the same
	 * super block locations as a regular filesystem. Doing so, the super
	 * block can always be retrieved and the zoned flag of the volume
	 * detected from the super block information.
	 */
	if (!bdev_is_zoned(device->bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return -ENOENT;

	return sb_log_location(device->bdev,
			       &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
			       rw, bytenr_ret);
}

static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
				  int mirror)
{
	u32 zone_num;

	if (!zinfo)
		return false;

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return false;

	if (!test_bit(zone_num, zinfo->seq_zones))
		return false;

	return true;
}

int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	struct blk_zone *zone;
	int i;

	if (!is_sb_log_zone(zinfo, mirror))
		return 0;

	zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
	for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
		/* Advance the next zone */
		if (zone->cond == BLK_ZONE_COND_FULL) {
			zone++;
			continue;
		}

		if (zone->cond == BLK_ZONE_COND_EMPTY)
			zone->cond = BLK_ZONE_COND_IMP_OPEN;

		zone->wp += SUPER_INFO_SECTORS;

		if (sb_zone_is_full(zone)) {
			/*
			 * No room left to write new superblock. Since
			 * superblock is written with REQ_SYNC, it is safe to
			 * finish the zone now.
			 *
			 * If the write pointer is exactly at the capacity,
			 * explicit ZONE_FINISH is not necessary.
			 */
			if (zone->wp != zone->start + zone->capacity) {
				unsigned int nofs_flags;
				int ret;

				nofs_flags = memalloc_nofs_save();
				ret = blkdev_zone_mgmt(device->bdev,
						       REQ_OP_ZONE_FINISH, zone->start,
						       zone->len);
				memalloc_nofs_restore(nofs_flags);
				if (ret)
					return ret;
			}

			zone->wp = zone->start + zone->len;
			zone->cond = BLK_ZONE_COND_FULL;
		}
		return 0;
	}

	/* All the zones are FULL. Should not reach here. */
	DEBUG_WARN("unexpected state, all zones full");
	return -EIO;
}

int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
	unsigned int nofs_flags;
	sector_t zone_sectors;
	sector_t nr_sectors;
	u8 zone_sectors_shift;
	u32 sb_zone;
	u32 nr_zones;
	int ret;

	zone_sectors = bdev_zone_sectors(bdev);
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	nofs_flags = memalloc_nofs_save();
	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
			       zone_start_sector(sb_zone, bdev),
			       zone_sectors * BTRFS_NR_SB_LOG_ZONES);
	memalloc_nofs_restore(nofs_flags);
	return ret;
}

/*
 * Find allocatable zones within a given region.
 *
 * @device:	the device to allocate a region on
 * @hole_start:	the position of the hole to allocate the region
 * @num_bytes:	size of wanted region
 * @hole_end:	the end of the hole
 * @return:	position of allocatable zones
 *
 * Allocatable region should not contain any superblock locations.
 */
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
				 u64 hole_end, u64 num_bytes)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	u64 nzones = num_bytes >> shift;
	u64 pos = hole_start;
	u64 begin, end;
	bool have_sb;
	int i;

	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));

	while (pos < hole_end) {
		begin = pos >> shift;
		end = begin + nzones;

		if (end > zinfo->nr_zones)
			return hole_end;

		/* Check if zones in the region are all empty */
		if (btrfs_dev_is_sequential(device, pos) &&
		    !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) {
			pos += zinfo->zone_size;
			continue;
		}

		have_sb = false;
		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
			u32 sb_zone;
			u64 sb_pos;

			sb_zone = sb_zone_number(shift, i);
			if (!(end <= sb_zone ||
			      sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
				have_sb = true;
				pos = zone_start_physical(
					sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
				break;
			}

			/* We also need to exclude regular superblock positions */
			sb_pos = btrfs_sb_offset(i);
			if (!(pos + num_bytes <= sb_pos ||
			      sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
				have_sb = true;
				pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
					    zinfo->zone_size);
				break;
			}
		}
		if (!have_sb)
			break;
	}

	return pos;
}

static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;
	unsigned int zno = (pos >> zone_info->zone_size_shift);

	/* We can use any number of zones */
	if (zone_info->max_active_zones == 0)
		return true;

	if (!test_bit(zno, zone_info->active_zones)) {
		/* Active zone left? */
		if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
			return false;
		if (test_and_set_bit(zno, zone_info->active_zones)) {
			/* Someone already set the bit */
			atomic_inc(&zone_info->active_zones_left);
		}
	}

	return true;
}

static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;
	unsigned int zno = (pos >> zone_info->zone_size_shift);

	/* We can use any number of zones */
	if (zone_info->max_active_zones == 0)
		return;

	if (test_and_clear_bit(zno, zone_info->active_zones))
		atomic_inc(&zone_info->active_zones_left);
}

int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
			    u64 length, u64 *bytes)
{
	unsigned int nofs_flags;
	int ret;

	*bytes = 0;
	nofs_flags = memalloc_nofs_save();
	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT);
	memalloc_nofs_restore(nofs_flags);
	if (ret)
		return ret;

	*bytes = length;
	while (length) {
		btrfs_dev_set_zone_empty(device, physical);
		btrfs_dev_clear_active_zone(device, physical);
		physical += device->zone_info->zone_size;
		length -= device->zone_info->zone_size;
	}

	return 0;
}

int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	unsigned long begin = start >> shift;
	unsigned long nbits = size >> shift;
	u64 pos;
	int ret;

	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(size, zinfo->zone_size));

	if (begin + nbits > zinfo->nr_zones)
		return -ERANGE;

	/* All the zones are conventional */
	if (bitmap_test_range_all_zero(zinfo->seq_zones, begin, nbits))
		return 0;

	/* All the zones are sequential and empty */
	if (bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) &&
	    bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits))
		return 0;

	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
		u64 reset_bytes;

		if (!btrfs_dev_is_sequential(device, pos) ||
		    btrfs_dev_is_empty_zone(device, pos))
			continue;

		/* Free regions should be empty */
		btrfs_warn(
			device->fs_info,
			"zoned: resetting device %s (devid %llu) zone %llu for allocation",
			rcu_dereference(device->name), device->devid, pos >> shift);
		WARN_ON_ONCE(1);

		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
					      &reset_bytes);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Calculate an allocation pointer from the extent allocation information
 * for a block group consisting of conventional zones. The pointer is set to
 * the end of the highest-addressed extent in the block group and used as the
 * allocation offset.
 */
static int calculate_alloc_pointer(struct btrfs_block_group *cache,
				   u64 *offset_ret, bool new)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_root *root;
	BTRFS_PATH_AUTO_FREE(path);
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	u64 length;

	/*
	 * Avoid tree lookups for a new block group, there's no use for it.
	 * It must always be 0.
	 *
	 * Also, we have a lock chain of extent buffer lock -> chunk mutex.
	 * For a new block group, this function is called from
	 * btrfs_make_block_group() which is already taking the chunk mutex.
	 * Thus, we cannot call calculate_alloc_pointer() which takes extent
	 * buffer locks to avoid deadlock.
	 */
	if (new) {
		*offset_ret = 0;
		return 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = cache->start + cache->length;
	key.type = 0;
	key.offset = 0;

	root = btrfs_extent_root(fs_info, key.objectid);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	/* We should not find the exact match */
	if (unlikely(!ret))
		ret = -EUCLEAN;
	if (ret < 0)
		return ret;

	ret = btrfs_previous_extent_item(root, path, cache->start);
	if (ret) {
		if (ret == 1) {
			ret = 0;
			*offset_ret = 0;
		}
		return ret;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

	if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
		length = found_key.offset;
	else
		length = fs_info->nodesize;

	if (unlikely(!(found_key.objectid >= cache->start &&
		       found_key.objectid + length <= cache->start + cache->length))) {
		return -EUCLEAN;
	}
	*offset_ret = found_key.objectid + length - cache->start;
	return 0;
}
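/*
 * Example of the calculation above (numbers are illustrative only): for a
 * block group starting at logical 1GiB with length 256MiB, if the
 * highest-addressed extent item found is a 16MiB data extent at logical
 * 1GiB + 200MiB, the returned allocation pointer is
 * (1GiB + 200MiB) + 16MiB - 1GiB = 216MiB into the block group.
 */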
struct zone_info {
	u64 physical;
	u64 capacity;
	u64 alloc_offset;
};

static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
				struct zone_info *info, unsigned long *active,
				struct btrfs_chunk_map *map, bool new)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct btrfs_device *device;
	int dev_replace_is_ongoing = 0;
	unsigned int nofs_flag;
	struct blk_zone zone;
	int ret;

	info->physical = map->stripes[zone_idx].physical;

	down_read(&dev_replace->rwsem);
	device = map->stripes[zone_idx].dev;

	if (!device->bdev) {
		up_read(&dev_replace->rwsem);
		info->alloc_offset = WP_MISSING_DEV;
		return 0;
	}

	/* Consider a zone as active if we can allow any number of active zones. */
	if (!device->zone_info->max_active_zones)
		__set_bit(zone_idx, active);

	if (!btrfs_dev_is_sequential(device, info->physical)) {
		up_read(&dev_replace->rwsem);
		info->alloc_offset = WP_CONVENTIONAL;
		return 0;
	}

	ASSERT(!new || btrfs_dev_is_empty_zone(device, info->physical));

	/* This zone will be used for allocation, so mark this zone non-empty. */
	btrfs_dev_clear_zone_empty(device, info->physical);

	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
		btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical);

	/*
	 * The group is mapped to a sequential zone. Get the zone write pointer
	 * to determine the allocation offset within the zone.
	 */
	WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));

	if (new) {
		sector_t capacity;

		capacity = bdev_zone_capacity(device->bdev, info->physical >> SECTOR_SHIFT);
		up_read(&dev_replace->rwsem);
		info->alloc_offset = 0;
		info->capacity = capacity << SECTOR_SHIFT;

		return 0;
	}

	nofs_flag = memalloc_nofs_save();
	ret = btrfs_get_dev_zone(device, info->physical, &zone);
	memalloc_nofs_restore(nofs_flag);
	if (ret) {
		up_read(&dev_replace->rwsem);
		if (ret != -EIO && ret != -EOPNOTSUPP)
			return ret;
		info->alloc_offset = WP_MISSING_DEV;
		return 0;
	}

	if (unlikely(zone.type == BLK_ZONE_TYPE_CONVENTIONAL)) {
		btrfs_err(fs_info,
			  "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
			  zone.start << SECTOR_SHIFT, rcu_dereference(device->name),
			  device->devid);
		up_read(&dev_replace->rwsem);
		return -EIO;
	}

	info->capacity = (zone.capacity << SECTOR_SHIFT);

	switch (zone.cond) {
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
		btrfs_err(fs_info,
			  "zoned: offline/readonly zone %llu on device %s (devid %llu)",
			  (info->physical >> device->zone_info->zone_size_shift),
			  rcu_dereference(device->name), device->devid);
		info->alloc_offset = WP_MISSING_DEV;
		break;
	case BLK_ZONE_COND_EMPTY:
		info->alloc_offset = 0;
		break;
	case BLK_ZONE_COND_FULL:
		info->alloc_offset = info->capacity;
		break;
	default:
		/* Partially used zone. */
		info->alloc_offset = ((zone.wp - zone.start) << SECTOR_SHIFT);
		__set_bit(zone_idx, active);
		break;
	}

	up_read(&dev_replace->rwsem);

	return 0;
}

static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
					 struct zone_info *info,
					 unsigned long *active)
{
	if (unlikely(info->alloc_offset == WP_MISSING_DEV)) {
		btrfs_err(bg->fs_info,
			  "zoned: cannot recover write pointer for zone %llu",
			  info->physical);
		return -EIO;
	}

	bg->alloc_offset = info->alloc_offset;
	bg->zone_capacity = info->capacity;
	if (test_bit(0, active))
		set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
	return 0;
}

static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
				      struct btrfs_chunk_map *map,
				      struct zone_info *zone_info,
				      unsigned long *active,
				      u64 last_alloc)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree");
		return -EINVAL;
	}

	bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);

	if (unlikely(zone_info[0].alloc_offset == WP_MISSING_DEV)) {
		btrfs_err(bg->fs_info,
			  "zoned: cannot recover write pointer for zone %llu",
			  zone_info[0].physical);
		return -EIO;
	}
	if (unlikely(zone_info[1].alloc_offset == WP_MISSING_DEV)) {
		btrfs_err(bg->fs_info,
			  "zoned: cannot recover write pointer for zone %llu",
			  zone_info[1].physical);
		return -EIO;
	}

	if (zone_info[0].alloc_offset == WP_CONVENTIONAL)
		zone_info[0].alloc_offset = last_alloc;

	if (zone_info[1].alloc_offset == WP_CONVENTIONAL)
		zone_info[1].alloc_offset = last_alloc;
	if (unlikely(zone_info[0].alloc_offset != zone_info[1].alloc_offset)) {
		btrfs_err(bg->fs_info,
			  "zoned: write pointer offset mismatch of zones in DUP profile");
		return -EIO;
	}

	if (test_bit(0, active) != test_bit(1, active)) {
		if (unlikely(!btrfs_zone_activate(bg)))
			return -EIO;
	} else if (test_bit(0, active)) {
		set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
	}

	bg->alloc_offset = zone_info[0].alloc_offset;
	return 0;
}

static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
					struct btrfs_chunk_map *map,
					struct zone_info *zone_info,
					unsigned long *active,
					u64 last_alloc)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	int i;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));
		return -EINVAL;
	}

	/* In case a device is missing we have a cap of 0, so don't use it. */
	bg->zone_capacity = min_not_zero(zone_info[0].capacity, zone_info[1].capacity);

	for (i = 0; i < map->num_stripes; i++) {
		if (zone_info[i].alloc_offset == WP_MISSING_DEV)
			continue;

		if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
			zone_info[i].alloc_offset = last_alloc;

		if (unlikely((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
			     !btrfs_test_opt(fs_info, DEGRADED))) {
			btrfs_err(fs_info,
				  "zoned: write pointer offset mismatch of zones in %s profile",
				  btrfs_bg_type_to_raid_name(map->type));
			return -EIO;
		}
		if (test_bit(0, active) != test_bit(i, active)) {
			if (unlikely(!btrfs_test_opt(fs_info, DEGRADED) &&
				     !btrfs_zone_activate(bg))) {
				return -EIO;
			}
		} else {
			if (test_bit(0, active))
				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
		}
	}

	if (zone_info[0].alloc_offset != WP_MISSING_DEV)
		bg->alloc_offset = zone_info[0].alloc_offset;
	else
		bg->alloc_offset = zone_info[i - 1].alloc_offset;

	return 0;
}

static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
					struct btrfs_chunk_map *map,
					struct zone_info *zone_info,
					unsigned long *active,
					u64 last_alloc)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));
		return -EINVAL;
	}

	for (int i = 0; i < map->num_stripes; i++) {
		if (zone_info[i].alloc_offset == WP_MISSING_DEV)
			continue;

		if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
			u64 stripe_nr, full_stripe_nr;
			u64 stripe_offset;
			int stripe_index;

			stripe_nr = div64_u64(last_alloc, map->stripe_size);
			stripe_offset = stripe_nr * map->stripe_size;
			full_stripe_nr = div_u64(stripe_nr, map->num_stripes);
			div_u64_rem(stripe_nr, map->num_stripes, &stripe_index);

			zone_info[i].alloc_offset =
				full_stripe_nr * map->stripe_size;

			if (stripe_index > i)
				zone_info[i].alloc_offset += map->stripe_size;
			else if (stripe_index == i)
				zone_info[i].alloc_offset +=
					(last_alloc - stripe_offset);
		}

		if (test_bit(0, active) != test_bit(i, active)) {
			if (unlikely(!btrfs_zone_activate(bg)))
				return -EIO;
		} else {
			if (test_bit(0, active))
				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
		}
		bg->zone_capacity += zone_info[i].capacity;
		bg->alloc_offset += zone_info[i].alloc_offset;
	}

	return 0;
}
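/*
 * Worked example for the RAID0 write pointer reconstruction above (values
 * are illustrative only): with num_stripes = 2, stripe_size = 64KiB and
 * last_alloc = 160KiB, we get stripe_nr = 2, stripe_offset = 128KiB,
 * full_stripe_nr = 1 and stripe_index = 0. Stripe 0 therefore gets
 * 64KiB + (160KiB - 128KiB) = 96KiB and stripe 1 gets 64KiB, which adds up
 * to the 160KiB already allocated in the block group.
 */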
static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
					 struct btrfs_chunk_map *map,
					 struct zone_info *zone_info,
					 unsigned long *active,
					 u64 last_alloc)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));
		return -EINVAL;
	}

	for (int i = 0; i < map->num_stripes; i++) {
		if (zone_info[i].alloc_offset == WP_MISSING_DEV)
			continue;

		if (test_bit(0, active) != test_bit(i, active)) {
			if (unlikely(!btrfs_zone_activate(bg)))
				return -EIO;
		} else {
			if (test_bit(0, active))
				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
		}

		if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
			u64 stripe_nr, full_stripe_nr;
			u64 stripe_offset;
			int stripe_index;

			stripe_nr = div64_u64(last_alloc, map->stripe_size);
			stripe_offset = stripe_nr * map->stripe_size;
			full_stripe_nr = div_u64(stripe_nr,
						 map->num_stripes / map->sub_stripes);
			div_u64_rem(stripe_nr,
				    (map->num_stripes / map->sub_stripes),
				    &stripe_index);

			zone_info[i].alloc_offset =
				full_stripe_nr * map->stripe_size;

			if (stripe_index > (i / map->sub_stripes))
				zone_info[i].alloc_offset += map->stripe_size;
			else if (stripe_index == (i / map->sub_stripes))
				zone_info[i].alloc_offset +=
					(last_alloc - stripe_offset);
		}

		if ((i % map->sub_stripes) == 0) {
			bg->zone_capacity += zone_info[i].capacity;
			bg->alloc_offset += zone_info[i].alloc_offset;
		}
	}

	return 0;
}

int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_chunk_map *map;
	u64 logical = cache->start;
	u64 length = cache->length;
	struct zone_info *zone_info = NULL;
	int ret;
	int i;
	unsigned long *active = NULL;
	u64 last_alloc = 0;
	u32 num_sequential = 0, num_conventional = 0;
	u64 profile;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	/* Sanity check */
	if (unlikely(!IS_ALIGNED(length, fs_info->zone_size))) {
		btrfs_err(fs_info,
			  "zoned: block group %llu len %llu unaligned to zone size %llu",
			  logical, length, fs_info->zone_size);
		return -EIO;
	}

	map = btrfs_find_chunk_map(fs_info, logical, length);
	if (!map)
		return -EINVAL;

	cache->physical_map = map;

	zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS);
	if (!zone_info) {
		ret = -ENOMEM;
		goto out;
	}

	active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
	if (!active) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map, new);
		if (ret)
			goto out;

		if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
			num_conventional++;
		else
			num_sequential++;
	}

	if (num_sequential > 0)
		set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
	if (num_conventional > 0) {
		/* Zone capacity is always zone size in emulation */
		cache->zone_capacity = cache->length;
		ret = calculate_alloc_pointer(cache, &last_alloc, new);
		if (ret) {
			btrfs_err(fs_info,
				  "zoned: failed to determine allocation offset of bg %llu",
				  cache->start);
			goto out;
		} else if (map->num_stripes == num_conventional) {
			cache->alloc_offset = last_alloc;
			set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags);
			goto out;
		}
	}

	profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
	switch (profile) {
	case 0: /* single */
		ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
		break;
	case BTRFS_BLOCK_GROUP_DUP:
		ret = btrfs_load_block_group_dup(cache, map, zone_info, active,
						 last_alloc);
		break;
	case BTRFS_BLOCK_GROUP_RAID1:
	case BTRFS_BLOCK_GROUP_RAID1C3:
	case BTRFS_BLOCK_GROUP_RAID1C4:
		ret = btrfs_load_block_group_raid1(cache, map, zone_info,
						   active, last_alloc);
		break;
	case BTRFS_BLOCK_GROUP_RAID0:
		ret = btrfs_load_block_group_raid0(cache, map, zone_info,
						   active, last_alloc);
		break;
	case BTRFS_BLOCK_GROUP_RAID10:
		ret = btrfs_load_block_group_raid10(cache, map, zone_info,
						    active, last_alloc);
		break;
	case BTRFS_BLOCK_GROUP_RAID5:
	case BTRFS_BLOCK_GROUP_RAID6:
	default:
		btrfs_err(fs_info, "zoned: profile %s not yet supported",
			  btrfs_bg_type_to_raid_name(map->type));
		ret = -EINVAL;
		goto out;
	}

	if (ret == -EIO && profile != 0 && profile != BTRFS_BLOCK_GROUP_RAID0 &&
	    profile != BTRFS_BLOCK_GROUP_RAID10) {
		/*
		 * Detected broken write pointer. Make this block group
		 * unallocatable by setting the allocation pointer at the end of
		 * allocatable region. Relocating this block group will fix the
		 * mismatch.
		 *
		 * Currently, we cannot handle RAID0 or RAID10 case like this
		 * because we don't have a proper zone_capacity value. But,
		 * reading from this block group won't work anyway because of
		 * the missing stripe.
		 */
		cache->alloc_offset = cache->zone_capacity;
	}

out:
	/* Reject non SINGLE data profiles without RST */
	if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
	    (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
	    !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));
		return -EINVAL;
	}

	if (unlikely(cache->alloc_offset > cache->zone_capacity)) {
		btrfs_err(fs_info,
			  "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
			  cache->alloc_offset, cache->zone_capacity,
			  cache->start);
		ret = -EIO;
	}

	/* An extent is allocated after the write pointer */
	if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
		btrfs_err(fs_info,
			  "zoned: got wrong write pointer in BG %llu: %llu > %llu",
			  logical, last_alloc, cache->alloc_offset);
		ret = -EIO;
	}

	if (!ret) {
		cache->meta_write_pointer = cache->alloc_offset + cache->start;
		if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
			btrfs_get_block_group(cache);
			spin_lock(&fs_info->zone_active_bgs_lock);
			list_add_tail(&cache->active_bg_list,
				      &fs_info->zone_active_bgs);
			spin_unlock(&fs_info->zone_active_bgs_lock);
		}
	} else {
		btrfs_free_chunk_map(cache->physical_map);
		cache->physical_map = NULL;
	}
	bitmap_free(active);
	kfree(zone_info);

	return ret;
}

void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
{
	u64 unusable, free;

	if (!btrfs_is_zoned(cache->fs_info))
		return;

	WARN_ON(cache->bytes_super != 0);
	unusable = (cache->alloc_offset - cache->used) +
		   (cache->length - cache->zone_capacity);
	free = cache->zone_capacity - cache->alloc_offset;

	/* We only need ->free_space in ALLOC_SEQ block groups */
	cache->cached = BTRFS_CACHE_FINISHED;
	cache->free_space_ctl->free_space = free;
	cache->zone_unusable = unusable;
}
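/*
 * Example of the accounting above (illustrative numbers): a SINGLE block
 * group of length 1GiB backed by a zone with zone_capacity 768MiB, with
 * alloc_offset 512MiB and used 256MiB, gets
 *   unusable = (512MiB - 256MiB) + (1GiB - 768MiB) = 512MiB
 *   free     = 768MiB - 512MiB = 256MiB
 * so used + free + unusable still adds up to the block group length.
 */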
bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
	u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	struct btrfs_block_group *cache;
	bool ret = false;

	if (!btrfs_is_zoned(fs_info))
		return false;

	if (!inode || !is_data_inode(inode))
		return false;

	if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
		return false;

	/*
	 * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
	 * extent layout the relocation code has.
	 * Furthermore we have set aside our own block group from which only the
	 * relocation "process" can allocate and make sure only one process at a
	 * time can add pages to an extent that gets relocated, so it's safe to
	 * use regular REQ_OP_WRITE for this special case.
	 */
	if (btrfs_is_data_reloc_root(inode->root))
		return false;

	cache = btrfs_lookup_block_group(fs_info, start);
	ASSERT(cache);
	if (!cache)
		return false;

	ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
	btrfs_put_block_group(cache);

	return ret;
}

void btrfs_record_physical_zoned(struct btrfs_bio *bbio)
{
	const u64 physical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
	struct btrfs_ordered_sum *sum = bbio->sums;

	if (physical < bbio->orig_physical)
		sum->logical -= bbio->orig_physical - physical;
	else
		sum->logical += physical - bbio->orig_physical;
}

static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
					u64 logical)
{
	struct extent_map_tree *em_tree = &ordered->inode->extent_tree;
	struct extent_map *em;

	ordered->disk_bytenr = logical;

	write_lock(&em_tree->lock);
	em = btrfs_search_extent_mapping(em_tree, ordered->file_offset,
					 ordered->num_bytes);
	/* The em should be a new COW extent, thus it should not have an offset. */
	ASSERT(em->offset == 0);
	em->disk_bytenr = logical;
	btrfs_free_extent_map(em);
	write_unlock(&em_tree->lock);
}

static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
				      u64 logical, u64 len)
{
	struct btrfs_ordered_extent *new;

	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
	    btrfs_split_extent_map(ordered->inode, ordered->file_offset,
				   ordered->num_bytes, len, logical))
		return false;

	new = btrfs_split_ordered_extent(ordered, len);
	if (IS_ERR(new))
		return false;
	new->disk_bytenr = logical;
	btrfs_finish_one_ordered(new);
	return true;
}

void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_ordered_sum *sum;
	u64 logical, len;

	/*
	 * A write to a pre-allocated region is for data relocation, and so it
	 * should use the WRITE operation. No split/rewrite are necessary.
	 */
	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
		return;

	ASSERT(!list_empty(&ordered->list));
	/* The ordered->list can be empty in the above pre-alloc case. */
	sum = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list);
	logical = sum->logical;
	len = sum->len;

	while (len < ordered->disk_num_bytes) {
		sum = list_next_entry(sum, list);
		if (sum->logical == logical + len) {
			len += sum->len;
			continue;
		}
		if (!btrfs_zoned_split_ordered(ordered, logical, len)) {
			set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
			btrfs_err(fs_info, "failed to split ordered extent");
			goto out;
		}
		logical = sum->logical;
		len = sum->len;
	}

	if (ordered->disk_bytenr != logical)
		btrfs_rewrite_logical_zoned(ordered, logical);

out:
	/*
	 * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures
	 * were allocated by btrfs_alloc_dummy_sum only to record the logical
	 * addresses and don't contain actual checksums. We thus must free them
	 * here so that we don't attempt to log the csums later.
1942 */
1943 if ((inode->flags & BTRFS_INODE_NODATASUM) ||
1944 test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) {
1945 while ((sum = list_first_entry_or_null(&ordered->list,
1946 typeof(*sum), list))) {
1947 list_del(&sum->list);
1948 kfree(sum);
1949 }
1950 }
1951 }
1952 
1953 static bool check_bg_is_active(struct btrfs_eb_write_context *ctx,
1954 struct btrfs_block_group **active_bg)
1955 {
1956 const struct writeback_control *wbc = ctx->wbc;
1957 struct btrfs_block_group *block_group = ctx->zoned_bg;
1958 struct btrfs_fs_info *fs_info = block_group->fs_info;
1959 
1960 if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
1961 return true;
1962 
1963 if (fs_info->treelog_bg == block_group->start) {
1964 if (!btrfs_zone_activate(block_group)) {
1965 int ret_fin = btrfs_zone_finish_one_bg(fs_info);
1966 
1967 if (ret_fin != 1 || !btrfs_zone_activate(block_group))
1968 return false;
1969 }
1970 } else if (*active_bg != block_group) {
1971 struct btrfs_block_group *tgt = *active_bg;
1972 
1973 /* zoned_meta_io_lock protects fs_info->active_{meta,system}_bg. */
1974 lockdep_assert_held(&fs_info->zoned_meta_io_lock);
1975 
1976 if (tgt) {
1977 /*
1978 * If there are unsent IOs left in the allocated area, we
1979 * cannot wait for them, as doing so may cause a deadlock.
1980 */
1981 if (tgt->meta_write_pointer < tgt->start + tgt->alloc_offset) {
1982 if (wbc->sync_mode == WB_SYNC_NONE ||
1983 (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync))
1984 return false;
1985 }
1986 
1987 /* Pivot the active metadata/system block group. */
1988 btrfs_zoned_meta_io_unlock(fs_info);
1989 wait_eb_writebacks(tgt);
1990 do_zone_finish(tgt, true);
1991 btrfs_zoned_meta_io_lock(fs_info);
1992 if (*active_bg == tgt) {
1993 btrfs_put_block_group(tgt);
1994 *active_bg = NULL;
1995 }
1996 }
1997 if (!btrfs_zone_activate(block_group))
1998 return false;
1999 if (*active_bg != block_group) {
2000 ASSERT(*active_bg == NULL);
2001 *active_bg = block_group;
2002 btrfs_get_block_group(block_group);
2003 }
2004 }
2005 
2006 return true;
2007 }
2008 
2009 /*
2010 * Check if @ctx->eb is aligned to the write pointer.
2011 *
2012 * Return:
2013 * 0: @ctx->eb is at the write pointer. You can write it.
2014 * -EAGAIN: There is a hole. The caller should handle the case.
2015 * -EBUSY: There is a hole, but the caller can just bail out.
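 * (As implemented below, -EAGAIN is only returned for WB_SYNC_ALL writeback
 * that is not for_sync; every other case with a hole returns -EBUSY.)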
2016 */ 2017 int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, 2018 struct btrfs_eb_write_context *ctx) 2019 { 2020 const struct writeback_control *wbc = ctx->wbc; 2021 const struct extent_buffer *eb = ctx->eb; 2022 struct btrfs_block_group *block_group = ctx->zoned_bg; 2023 2024 if (!btrfs_is_zoned(fs_info)) 2025 return 0; 2026 2027 if (block_group) { 2028 if (block_group->start > eb->start || 2029 block_group->start + block_group->length <= eb->start) { 2030 btrfs_put_block_group(block_group); 2031 block_group = NULL; 2032 ctx->zoned_bg = NULL; 2033 } 2034 } 2035 2036 if (!block_group) { 2037 block_group = btrfs_lookup_block_group(fs_info, eb->start); 2038 if (!block_group) 2039 return 0; 2040 ctx->zoned_bg = block_group; 2041 } 2042 2043 if (block_group->meta_write_pointer == eb->start) { 2044 struct btrfs_block_group **tgt; 2045 2046 if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags)) 2047 return 0; 2048 2049 if (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) 2050 tgt = &fs_info->active_system_bg; 2051 else 2052 tgt = &fs_info->active_meta_bg; 2053 if (check_bg_is_active(ctx, tgt)) 2054 return 0; 2055 } 2056 2057 /* 2058 * Since we may release fs_info->zoned_meta_io_lock, someone can already 2059 * start writing this eb. In that case, we can just bail out. 2060 */ 2061 if (block_group->meta_write_pointer > eb->start) 2062 return -EBUSY; 2063 2064 /* If for_sync, this hole will be filled with transaction commit. */ 2065 if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) 2066 return -EAGAIN; 2067 return -EBUSY; 2068 } 2069 2070 int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length) 2071 { 2072 if (!btrfs_dev_is_sequential(device, physical)) 2073 return -EOPNOTSUPP; 2074 2075 return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT, 2076 length >> SECTOR_SHIFT, GFP_NOFS, 0); 2077 } 2078 2079 static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, 2080 struct blk_zone *zone) 2081 { 2082 struct btrfs_io_context *bioc = NULL; 2083 u64 mapped_length = PAGE_SIZE; 2084 unsigned int nofs_flag; 2085 int nmirrors; 2086 int i, ret; 2087 2088 ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, 2089 &mapped_length, &bioc, NULL, NULL); 2090 if (unlikely(ret || !bioc || mapped_length < PAGE_SIZE)) { 2091 ret = -EIO; 2092 goto out_put_bioc; 2093 } 2094 2095 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 2096 ret = -EINVAL; 2097 goto out_put_bioc; 2098 } 2099 2100 nofs_flag = memalloc_nofs_save(); 2101 nmirrors = (int)bioc->num_stripes; 2102 for (i = 0; i < nmirrors; i++) { 2103 u64 physical = bioc->stripes[i].physical; 2104 struct btrfs_device *dev = bioc->stripes[i].dev; 2105 2106 /* Missing device */ 2107 if (!dev->bdev) 2108 continue; 2109 2110 ret = btrfs_get_dev_zone(dev, physical, zone); 2111 /* Failing device */ 2112 if (ret == -EIO || ret == -EOPNOTSUPP) 2113 continue; 2114 break; 2115 } 2116 memalloc_nofs_restore(nofs_flag); 2117 out_put_bioc: 2118 btrfs_put_bioc(bioc); 2119 return ret; 2120 } 2121 2122 /* 2123 * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by 2124 * filling zeros between @physical_pos to a write pointer of dev-replace 2125 * source device. 
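 * Returns 0 when nothing needs to be done (conventional zone or the write
 * pointers already match), -EUCLEAN if the target position is already past
 * the source write pointer, or another negative errno on failure.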
2126 */ 2127 int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, 2128 u64 physical_start, u64 physical_pos) 2129 { 2130 struct btrfs_fs_info *fs_info = tgt_dev->fs_info; 2131 struct blk_zone zone; 2132 u64 length; 2133 u64 wp; 2134 int ret; 2135 2136 if (!btrfs_dev_is_sequential(tgt_dev, physical_pos)) 2137 return 0; 2138 2139 ret = read_zone_info(fs_info, logical, &zone); 2140 if (ret) 2141 return ret; 2142 2143 wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT); 2144 2145 if (physical_pos == wp) 2146 return 0; 2147 2148 if (unlikely(physical_pos > wp)) 2149 return -EUCLEAN; 2150 2151 length = wp - physical_pos; 2152 return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); 2153 } 2154 2155 /* 2156 * Activate block group and underlying device zones 2157 * 2158 * @block_group: the block group to activate 2159 * 2160 * Return: true on success, false otherwise 2161 */ 2162 bool btrfs_zone_activate(struct btrfs_block_group *block_group) 2163 { 2164 struct btrfs_fs_info *fs_info = block_group->fs_info; 2165 struct btrfs_chunk_map *map; 2166 struct btrfs_device *device; 2167 u64 physical; 2168 const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA); 2169 bool ret; 2170 int i; 2171 2172 if (!btrfs_is_zoned(block_group->fs_info)) 2173 return true; 2174 2175 map = block_group->physical_map; 2176 2177 spin_lock(&fs_info->zone_active_bgs_lock); 2178 spin_lock(&block_group->lock); 2179 if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { 2180 ret = true; 2181 goto out_unlock; 2182 } 2183 2184 if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) { 2185 /* The caller should check if the block group is full. */ 2186 if (WARN_ON_ONCE(btrfs_zoned_bg_is_full(block_group))) { 2187 ret = false; 2188 goto out_unlock; 2189 } 2190 } else { 2191 /* Since it is already written, it should have been active. */ 2192 WARN_ON_ONCE(block_group->meta_write_pointer != block_group->start); 2193 } 2194 2195 for (i = 0; i < map->num_stripes; i++) { 2196 struct btrfs_zoned_device_info *zinfo; 2197 int reserved = 0; 2198 2199 device = map->stripes[i].dev; 2200 physical = map->stripes[i].physical; 2201 zinfo = device->zone_info; 2202 2203 if (!device->bdev) 2204 continue; 2205 2206 if (zinfo->max_active_zones == 0) 2207 continue; 2208 2209 if (is_data) 2210 reserved = zinfo->reserved_active_zones; 2211 /* 2212 * For the data block group, leave active zones for one 2213 * metadata block group and one system block group. 
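 * If taking one more active zone would dip into that reservation, the
 * check below fails the activation.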
2214 */ 2215 if (atomic_read(&zinfo->active_zones_left) <= reserved) { 2216 ret = false; 2217 goto out_unlock; 2218 } 2219 2220 if (!btrfs_dev_set_active_zone(device, physical)) { 2221 /* Cannot activate the zone */ 2222 ret = false; 2223 goto out_unlock; 2224 } 2225 if (!is_data) 2226 zinfo->reserved_active_zones--; 2227 } 2228 2229 /* Successfully activated all the zones */ 2230 set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); 2231 spin_unlock(&block_group->lock); 2232 2233 /* For the active block group list */ 2234 btrfs_get_block_group(block_group); 2235 list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); 2236 spin_unlock(&fs_info->zone_active_bgs_lock); 2237 2238 return true; 2239 2240 out_unlock: 2241 spin_unlock(&block_group->lock); 2242 spin_unlock(&fs_info->zone_active_bgs_lock); 2243 return ret; 2244 } 2245 2246 static void wait_eb_writebacks(struct btrfs_block_group *block_group) 2247 { 2248 struct btrfs_fs_info *fs_info = block_group->fs_info; 2249 const u64 end = block_group->start + block_group->length; 2250 struct extent_buffer *eb; 2251 unsigned long index, start = (block_group->start >> fs_info->nodesize_bits); 2252 2253 rcu_read_lock(); 2254 xa_for_each_start(&fs_info->buffer_tree, index, eb, start) { 2255 if (eb->start < block_group->start) 2256 continue; 2257 if (eb->start >= end) 2258 break; 2259 rcu_read_unlock(); 2260 wait_on_extent_buffer_writeback(eb); 2261 rcu_read_lock(); 2262 } 2263 rcu_read_unlock(); 2264 } 2265 2266 static int call_zone_finish(struct btrfs_block_group *block_group, 2267 struct btrfs_io_stripe *stripe) 2268 { 2269 struct btrfs_device *device = stripe->dev; 2270 const u64 physical = stripe->physical; 2271 struct btrfs_zoned_device_info *zinfo = device->zone_info; 2272 int ret; 2273 2274 if (!device->bdev) 2275 return 0; 2276 2277 if (zinfo->max_active_zones == 0) 2278 return 0; 2279 2280 if (btrfs_dev_is_sequential(device, physical)) { 2281 unsigned int nofs_flags; 2282 2283 nofs_flags = memalloc_nofs_save(); 2284 ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, 2285 physical >> SECTOR_SHIFT, 2286 zinfo->zone_size >> SECTOR_SHIFT); 2287 memalloc_nofs_restore(nofs_flags); 2288 2289 if (ret) 2290 return ret; 2291 } 2292 2293 if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) 2294 zinfo->reserved_active_zones++; 2295 btrfs_dev_clear_active_zone(device, physical); 2296 2297 return 0; 2298 } 2299 2300 static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) 2301 { 2302 struct btrfs_fs_info *fs_info = block_group->fs_info; 2303 struct btrfs_chunk_map *map; 2304 const bool is_metadata = (block_group->flags & 2305 (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); 2306 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 2307 int ret = 0; 2308 int i; 2309 2310 spin_lock(&block_group->lock); 2311 if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { 2312 spin_unlock(&block_group->lock); 2313 return 0; 2314 } 2315 2316 /* Check if we have unwritten allocated space */ 2317 if (is_metadata && 2318 block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) { 2319 spin_unlock(&block_group->lock); 2320 return -EAGAIN; 2321 } 2322 2323 /* 2324 * If we are sure that the block group is full (= no more room left for 2325 * new allocation) and the IO for the last usable block is completed, we 2326 * don't need to wait for the other IOs. 
This holds because we ensure 2327 * the sequential IO submissions using the ZONE_APPEND command for data 2328 * and block_group->meta_write_pointer for metadata. 2329 */ 2330 if (!fully_written) { 2331 if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { 2332 spin_unlock(&block_group->lock); 2333 return -EAGAIN; 2334 } 2335 spin_unlock(&block_group->lock); 2336 2337 ret = btrfs_inc_block_group_ro(block_group, false); 2338 if (ret) 2339 return ret; 2340 2341 /* Ensure all writes in this block group finish */ 2342 btrfs_wait_block_group_reservations(block_group); 2343 /* No need to wait for NOCOW writers. Zoned mode does not allow that */ 2344 btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group); 2345 /* Wait for extent buffers to be written. */ 2346 if (is_metadata) 2347 wait_eb_writebacks(block_group); 2348 2349 spin_lock(&block_group->lock); 2350 2351 /* 2352 * Bail out if someone already deactivated the block group, or 2353 * allocated space is left in the block group. 2354 */ 2355 if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, 2356 &block_group->runtime_flags)) { 2357 spin_unlock(&block_group->lock); 2358 btrfs_dec_block_group_ro(block_group); 2359 return 0; 2360 } 2361 2362 if (block_group->reserved || 2363 test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, 2364 &block_group->runtime_flags)) { 2365 spin_unlock(&block_group->lock); 2366 btrfs_dec_block_group_ro(block_group); 2367 return -EAGAIN; 2368 } 2369 } 2370 2371 clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); 2372 block_group->alloc_offset = block_group->zone_capacity; 2373 if (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) 2374 block_group->meta_write_pointer = block_group->start + 2375 block_group->zone_capacity; 2376 block_group->free_space_ctl->free_space = 0; 2377 btrfs_clear_treelog_bg(block_group); 2378 btrfs_clear_data_reloc_bg(block_group); 2379 spin_unlock(&block_group->lock); 2380 2381 down_read(&dev_replace->rwsem); 2382 map = block_group->physical_map; 2383 for (i = 0; i < map->num_stripes; i++) { 2384 2385 ret = call_zone_finish(block_group, &map->stripes[i]); 2386 if (ret) { 2387 up_read(&dev_replace->rwsem); 2388 return ret; 2389 } 2390 } 2391 up_read(&dev_replace->rwsem); 2392 2393 if (!fully_written) 2394 btrfs_dec_block_group_ro(block_group); 2395 2396 spin_lock(&fs_info->zone_active_bgs_lock); 2397 ASSERT(!list_empty(&block_group->active_bg_list)); 2398 list_del_init(&block_group->active_bg_list); 2399 spin_unlock(&fs_info->zone_active_bgs_lock); 2400 2401 /* For active_bg_list */ 2402 btrfs_put_block_group(block_group); 2403 2404 clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); 2405 2406 return 0; 2407 } 2408 2409 int btrfs_zone_finish(struct btrfs_block_group *block_group) 2410 { 2411 if (!btrfs_is_zoned(block_group->fs_info)) 2412 return 0; 2413 2414 return do_zone_finish(block_group, false); 2415 } 2416 2417 bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) 2418 { 2419 struct btrfs_fs_info *fs_info = fs_devices->fs_info; 2420 struct btrfs_device *device; 2421 bool ret = false; 2422 2423 if (!btrfs_is_zoned(fs_info)) 2424 return true; 2425 2426 if (test_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags)) 2427 return false; 2428 2429 /* Check if there is a device with active zones left */ 2430 mutex_lock(&fs_info->chunk_mutex); 2431 spin_lock(&fs_info->zone_active_bgs_lock); 2432 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 2433 struct btrfs_zoned_device_info 
*zinfo = device->zone_info; 2434 int reserved = 0; 2435 2436 if (!device->bdev) 2437 continue; 2438 2439 if (!zinfo->max_active_zones) { 2440 ret = true; 2441 break; 2442 } 2443 2444 if (flags & BTRFS_BLOCK_GROUP_DATA) 2445 reserved = zinfo->reserved_active_zones; 2446 2447 switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 2448 case 0: /* single */ 2449 ret = (atomic_read(&zinfo->active_zones_left) >= (1 + reserved)); 2450 break; 2451 case BTRFS_BLOCK_GROUP_DUP: 2452 ret = (atomic_read(&zinfo->active_zones_left) >= (2 + reserved)); 2453 break; 2454 } 2455 if (ret) 2456 break; 2457 } 2458 spin_unlock(&fs_info->zone_active_bgs_lock); 2459 mutex_unlock(&fs_info->chunk_mutex); 2460 2461 if (!ret) 2462 set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); 2463 2464 return ret; 2465 } 2466 2467 int btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) 2468 { 2469 struct btrfs_block_group *block_group; 2470 u64 min_alloc_bytes; 2471 2472 if (!btrfs_is_zoned(fs_info)) 2473 return 0; 2474 2475 block_group = btrfs_lookup_block_group(fs_info, logical); 2476 if (WARN_ON_ONCE(!block_group)) 2477 return -ENOENT; 2478 2479 /* No MIXED_BG on zoned btrfs. */ 2480 if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) 2481 min_alloc_bytes = fs_info->sectorsize; 2482 else 2483 min_alloc_bytes = fs_info->nodesize; 2484 2485 /* Bail out if we can allocate more data from this block group. */ 2486 if (logical + length + min_alloc_bytes <= 2487 block_group->start + block_group->zone_capacity) 2488 goto out; 2489 2490 do_zone_finish(block_group, true); 2491 2492 out: 2493 btrfs_put_block_group(block_group); 2494 return 0; 2495 } 2496 2497 static void btrfs_zone_finish_endio_workfn(struct work_struct *work) 2498 { 2499 int ret; 2500 struct btrfs_block_group *bg = 2501 container_of(work, struct btrfs_block_group, zone_finish_work); 2502 2503 wait_on_extent_buffer_writeback(bg->last_eb); 2504 free_extent_buffer(bg->last_eb); 2505 ret = do_zone_finish(bg, true); 2506 if (ret) 2507 btrfs_handle_fs_error(bg->fs_info, ret, 2508 "Failed to finish block-group's zone"); 2509 btrfs_put_block_group(bg); 2510 } 2511 2512 void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, 2513 struct extent_buffer *eb) 2514 { 2515 if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) || 2516 eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) 2517 return; 2518 2519 if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) { 2520 btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing", 2521 bg->start); 2522 return; 2523 } 2524 2525 /* For the work */ 2526 btrfs_get_block_group(bg); 2527 refcount_inc(&eb->refs); 2528 bg->last_eb = eb; 2529 INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn); 2530 queue_work(system_dfl_wq, &bg->zone_finish_work); 2531 } 2532 2533 void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) 2534 { 2535 struct btrfs_fs_info *fs_info = bg->fs_info; 2536 2537 spin_lock(&fs_info->relocation_bg_lock); 2538 if (fs_info->data_reloc_bg == bg->start) 2539 fs_info->data_reloc_bg = 0; 2540 spin_unlock(&fs_info->relocation_bg_lock); 2541 } 2542 2543 void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) 2544 { 2545 struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; 2546 struct btrfs_space_info *space_info = data_sinfo; 2547 struct btrfs_trans_handle *trans; 2548 struct btrfs_block_group *bg; 2549 struct list_head *bg_list; 2550 u64 alloc_flags; 2551 bool first = true; 2552 bool did_chunk_alloc = false; 2553 int 
index;
2554 int ret;
2555 
2556 if (!btrfs_is_zoned(fs_info))
2557 return;
2558 
2559 if (fs_info->data_reloc_bg)
2560 return;
2561 
2562 if (sb_rdonly(fs_info->sb))
2563 return;
2564 
2565 alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
2566 index = btrfs_bg_flags_to_raid_index(alloc_flags);
2567 
2568 /* Scan the data space_info to find empty block groups. Take the second one. */
2569 again:
2570 bg_list = &space_info->block_groups[index];
2571 list_for_each_entry(bg, bg_list, list) {
2572 if (bg->alloc_offset != 0)
2573 continue;
2574 
2575 if (first) {
2576 first = false;
2577 continue;
2578 }
2579 
2580 if (space_info == data_sinfo) {
2581 /* Migrate the block group to the data relocation space_info. */
2582 struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0];
2583 int factor;
2584 
2585 ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
2586 factor = btrfs_bg_type_to_factor(bg->flags);
2587 
2588 down_write(&space_info->groups_sem);
2589 list_del_init(&bg->list);
2590 /* The list stays non-empty: we took the second empty block group, so at least the first empty one remains. */
2591 ASSERT(!list_empty(&space_info->block_groups[index]));
2592 up_write(&space_info->groups_sem);
2593 
2594 spin_lock(&space_info->lock);
2595 space_info->total_bytes -= bg->length;
2596 space_info->disk_total -= bg->length * factor;
2597 space_info->disk_total -= bg->zone_unusable;
2598 /* No allocation has ever happened in this block group. */
2599 ASSERT(bg->used == 0);
2600 /* No super block is placed inside a block group on the zoned setup. */
2601 ASSERT(bg->bytes_super == 0);
2602 spin_unlock(&space_info->lock);
2603 
2604 bg->space_info = reloc_sinfo;
2605 if (reloc_sinfo->block_group_kobjs[index] == NULL)
2606 btrfs_sysfs_add_block_group_type(bg);
2607 
2608 btrfs_add_bg_to_space_info(fs_info, bg);
2609 }
2610 
2611 fs_info->data_reloc_bg = bg->start;
2612 set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags);
2613 btrfs_zone_activate(bg);
2614 
2615 return;
2616 }
2617 
2618 if (did_chunk_alloc)
2619 return;
2620 
2621 trans = btrfs_join_transaction(fs_info->tree_root);
2622 if (IS_ERR(trans))
2623 return;
2624 
2625 /* Allocate a new BG in the data relocation space_info. */
2626 space_info = data_sinfo->sub_group[0];
2627 ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
2628 ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
2629 btrfs_end_transaction(trans);
2630 if (ret == 1) {
2631 /*
2632 * We allocated a new block group in the data relocation space_info. We
2633 * can take that one.
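 * Since 'first' is cleared before retrying, the rescan will pick the first
 * empty block group it finds in the relocation space_info.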
2634 */ 2635 first = false; 2636 did_chunk_alloc = true; 2637 goto again; 2638 } 2639 } 2640 2641 void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) 2642 { 2643 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2644 struct btrfs_device *device; 2645 2646 if (!btrfs_is_zoned(fs_info)) 2647 return; 2648 2649 mutex_lock(&fs_devices->device_list_mutex); 2650 list_for_each_entry(device, &fs_devices->devices, dev_list) { 2651 if (device->zone_info) { 2652 vfree(device->zone_info->zone_cache); 2653 device->zone_info->zone_cache = NULL; 2654 } 2655 } 2656 mutex_unlock(&fs_devices->device_list_mutex); 2657 } 2658 2659 bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info) 2660 { 2661 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2662 struct btrfs_device *device; 2663 u64 total = btrfs_super_total_bytes(fs_info->super_copy); 2664 u64 used = 0; 2665 u64 factor; 2666 2667 ASSERT(btrfs_is_zoned(fs_info)); 2668 2669 if (fs_info->bg_reclaim_threshold == 0) 2670 return false; 2671 2672 mutex_lock(&fs_devices->device_list_mutex); 2673 list_for_each_entry(device, &fs_devices->devices, dev_list) { 2674 if (!device->bdev) 2675 continue; 2676 2677 used += device->bytes_used; 2678 } 2679 mutex_unlock(&fs_devices->device_list_mutex); 2680 2681 factor = div64_u64(used * 100, total); 2682 return factor >= fs_info->bg_reclaim_threshold; 2683 } 2684 2685 void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, 2686 u64 length) 2687 { 2688 struct btrfs_block_group *block_group; 2689 2690 if (!btrfs_is_zoned(fs_info)) 2691 return; 2692 2693 block_group = btrfs_lookup_block_group(fs_info, logical); 2694 /* It should be called on a previous data relocation block group. */ 2695 ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)); 2696 2697 spin_lock(&block_group->lock); 2698 if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) 2699 goto out; 2700 2701 /* All relocation extents are written. */ 2702 if (block_group->start + block_group->alloc_offset == logical + length) { 2703 /* 2704 * Now, release this block group for further allocations and 2705 * zone finish. 2706 */ 2707 clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, 2708 &block_group->runtime_flags); 2709 } 2710 2711 out: 2712 spin_unlock(&block_group->lock); 2713 btrfs_put_block_group(block_group); 2714 } 2715 2716 int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) 2717 { 2718 struct btrfs_block_group *block_group; 2719 struct btrfs_block_group *min_bg = NULL; 2720 u64 min_avail = U64_MAX; 2721 int ret; 2722 2723 spin_lock(&fs_info->zone_active_bgs_lock); 2724 list_for_each_entry(block_group, &fs_info->zone_active_bgs, 2725 active_bg_list) { 2726 u64 avail; 2727 2728 spin_lock(&block_group->lock); 2729 if (block_group->reserved || block_group->alloc_offset == 0 || 2730 !(block_group->flags & BTRFS_BLOCK_GROUP_DATA) || 2731 test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { 2732 spin_unlock(&block_group->lock); 2733 continue; 2734 } 2735 2736 avail = block_group->zone_capacity - block_group->alloc_offset; 2737 if (min_avail > avail) { 2738 if (min_bg) 2739 btrfs_put_block_group(min_bg); 2740 min_bg = block_group; 2741 min_avail = avail; 2742 btrfs_get_block_group(min_bg); 2743 } 2744 spin_unlock(&block_group->lock); 2745 } 2746 spin_unlock(&fs_info->zone_active_bgs_lock); 2747 2748 if (!min_bg) 2749 return 0; 2750 2751 ret = btrfs_zone_finish(min_bg); 2752 btrfs_put_block_group(min_bg); 2753 2754 return ret < 0 ? 
ret : 1; 2755 } 2756 2757 int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, 2758 struct btrfs_space_info *space_info, 2759 bool do_finish) 2760 { 2761 struct btrfs_block_group *bg; 2762 int index; 2763 2764 if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) 2765 return 0; 2766 2767 for (;;) { 2768 int ret; 2769 bool need_finish = false; 2770 2771 down_read(&space_info->groups_sem); 2772 for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) { 2773 list_for_each_entry(bg, &space_info->block_groups[index], 2774 list) { 2775 if (!spin_trylock(&bg->lock)) 2776 continue; 2777 if (btrfs_zoned_bg_is_full(bg) || 2778 test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, 2779 &bg->runtime_flags)) { 2780 spin_unlock(&bg->lock); 2781 continue; 2782 } 2783 spin_unlock(&bg->lock); 2784 2785 if (btrfs_zone_activate(bg)) { 2786 up_read(&space_info->groups_sem); 2787 return 1; 2788 } 2789 2790 need_finish = true; 2791 } 2792 } 2793 up_read(&space_info->groups_sem); 2794 2795 if (!do_finish || !need_finish) 2796 break; 2797 2798 ret = btrfs_zone_finish_one_bg(fs_info); 2799 if (ret == 0) 2800 break; 2801 if (ret < 0) 2802 return ret; 2803 } 2804 2805 return 0; 2806 } 2807 2808 /* 2809 * Reserve zones for one metadata block group, one tree-log block group, and one 2810 * system block group. 2811 */ 2812 void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) 2813 { 2814 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 2815 struct btrfs_block_group *block_group; 2816 struct btrfs_device *device; 2817 /* Reserve zones for normal SINGLE metadata and tree-log block group. */ 2818 unsigned int metadata_reserve = 2; 2819 /* Reserve a zone for SINGLE system block group. */ 2820 unsigned int system_reserve = 1; 2821 2822 if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags)) 2823 return; 2824 2825 /* 2826 * This function is called from the mount context. So, there is no 2827 * parallel process touching the bits. No need for read_seqretry(). 2828 */ 2829 if (fs_info->avail_metadata_alloc_bits & BTRFS_BLOCK_GROUP_DUP) 2830 metadata_reserve = 4; 2831 if (fs_info->avail_system_alloc_bits & BTRFS_BLOCK_GROUP_DUP) 2832 system_reserve = 2; 2833 2834 /* Apply the reservation on all the devices. */ 2835 mutex_lock(&fs_devices->device_list_mutex); 2836 list_for_each_entry(device, &fs_devices->devices, dev_list) { 2837 if (!device->bdev) 2838 continue; 2839 2840 device->zone_info->reserved_active_zones = 2841 metadata_reserve + system_reserve; 2842 } 2843 mutex_unlock(&fs_devices->device_list_mutex); 2844 2845 /* Release reservation for currently active block groups. */ 2846 spin_lock(&fs_info->zone_active_bgs_lock); 2847 list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { 2848 struct btrfs_chunk_map *map = block_group->physical_map; 2849 2850 if (!(block_group->flags & 2851 (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) 2852 continue; 2853 2854 for (int i = 0; i < map->num_stripes; i++) 2855 map->stripes[i].dev->zone_info->reserved_active_zones--; 2856 } 2857 spin_unlock(&fs_info->zone_active_bgs_lock); 2858 } 2859 2860 /* 2861 * Reset the zones of unused block groups from @space_info->bytes_zone_unusable. 2862 * 2863 * @space_info: the space to work on 2864 * @num_bytes: targeting reclaim bytes 2865 * 2866 * This one resets the zones of a block group, so we can reuse the region 2867 * without removing the block group. 
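 * (The reset itself is a REQ_OP_ZONE_RESET issued to every stripe of the
 * chosen block group, see the loop below.)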
On the other hand, btrfs_delete_unused_bgs() 2868 * just removes a block group and frees up the underlying zones. So, we still 2869 * need to allocate a new block group to reuse the zones. 2870 * 2871 * Resetting is faster than deleting/recreating a block group. It is similar 2872 * to freeing the logical space on the regular mode. However, we cannot change 2873 * the block group's profile with this operation. 2874 */ 2875 int btrfs_reset_unused_block_groups(struct btrfs_space_info *space_info, u64 num_bytes) 2876 { 2877 struct btrfs_fs_info *fs_info = space_info->fs_info; 2878 const sector_t zone_size_sectors = fs_info->zone_size >> SECTOR_SHIFT; 2879 2880 if (!btrfs_is_zoned(fs_info)) 2881 return 0; 2882 2883 while (num_bytes > 0) { 2884 struct btrfs_chunk_map *map; 2885 struct btrfs_block_group *bg = NULL; 2886 bool found = false; 2887 u64 reclaimed = 0; 2888 2889 /* 2890 * Here, we choose a fully zone_unusable block group. It's 2891 * technically possible to reset a partly zone_unusable block 2892 * group, which still has some free space left. However, 2893 * handling that needs to cope with the allocation side, which 2894 * makes the logic more complex. So, let's handle the easy case 2895 * for now. 2896 */ 2897 spin_lock(&fs_info->unused_bgs_lock); 2898 list_for_each_entry(bg, &fs_info->unused_bgs, bg_list) { 2899 if ((bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) != space_info->flags) 2900 continue; 2901 2902 /* 2903 * Use trylock to avoid locking order violation. In 2904 * btrfs_reclaim_bgs_work(), the lock order is 2905 * &bg->lock -> &fs_info->unused_bgs_lock. We skip a 2906 * block group if we cannot take its lock. 2907 */ 2908 if (!spin_trylock(&bg->lock)) 2909 continue; 2910 if (btrfs_is_block_group_used(bg) || bg->zone_unusable < bg->length) { 2911 spin_unlock(&bg->lock); 2912 continue; 2913 } 2914 spin_unlock(&bg->lock); 2915 found = true; 2916 break; 2917 } 2918 if (!found) { 2919 spin_unlock(&fs_info->unused_bgs_lock); 2920 return 0; 2921 } 2922 2923 list_del_init(&bg->bg_list); 2924 btrfs_put_block_group(bg); 2925 spin_unlock(&fs_info->unused_bgs_lock); 2926 2927 /* 2928 * Since the block group is fully zone_unusable and we cannot 2929 * allocate from this block group anymore, we don't need to set 2930 * this block group read-only. 2931 */ 2932 2933 down_read(&fs_info->dev_replace.rwsem); 2934 map = bg->physical_map; 2935 for (int i = 0; i < map->num_stripes; i++) { 2936 struct btrfs_io_stripe *stripe = &map->stripes[i]; 2937 unsigned int nofs_flags; 2938 int ret; 2939 2940 nofs_flags = memalloc_nofs_save(); 2941 ret = blkdev_zone_mgmt(stripe->dev->bdev, REQ_OP_ZONE_RESET, 2942 stripe->physical >> SECTOR_SHIFT, 2943 zone_size_sectors); 2944 memalloc_nofs_restore(nofs_flags); 2945 2946 if (ret) { 2947 up_read(&fs_info->dev_replace.rwsem); 2948 return ret; 2949 } 2950 } 2951 up_read(&fs_info->dev_replace.rwsem); 2952 2953 spin_lock(&space_info->lock); 2954 spin_lock(&bg->lock); 2955 ASSERT(!btrfs_is_block_group_used(bg)); 2956 if (bg->ro) { 2957 spin_unlock(&bg->lock); 2958 spin_unlock(&space_info->lock); 2959 continue; 2960 } 2961 2962 reclaimed = bg->alloc_offset; 2963 bg->zone_unusable = bg->length - bg->zone_capacity; 2964 bg->alloc_offset = 0; 2965 /* 2966 * This holds because we currently reset fully used then freed 2967 * block group. 
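 * In other words, alloc_offset must have reached zone_capacity by the time
 * the block group became unused, which is what the assertion below checks.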
2968 */ 2969 ASSERT(reclaimed == bg->zone_capacity); 2970 bg->free_space_ctl->free_space += reclaimed; 2971 space_info->bytes_zone_unusable -= reclaimed; 2972 spin_unlock(&bg->lock); 2973 btrfs_return_free_space(space_info, reclaimed); 2974 spin_unlock(&space_info->lock); 2975 2976 if (num_bytes <= reclaimed) 2977 break; 2978 num_bytes -= reclaimed; 2979 } 2980 2981 return 0; 2982 } 2983