1 // SPDX-License-Identifier: GPL-2.0 2 3 #include <linux/bitops.h> 4 #include <linux/slab.h> 5 #include <linux/blkdev.h> 6 #include <linux/sched/mm.h> 7 #include <linux/atomic.h> 8 #include <linux/vmalloc.h> 9 #include "ctree.h" 10 #include "volumes.h" 11 #include "zoned.h" 12 #include "rcu-string.h" 13 #include "disk-io.h" 14 #include "block-group.h" 15 #include "transaction.h" 16 #include "dev-replace.h" 17 #include "space-info.h" 18 19 /* Maximum number of zones to report per blkdev_report_zones() call */ 20 #define BTRFS_REPORT_NR_ZONES 4096 21 /* Invalid allocation pointer value for missing devices */ 22 #define WP_MISSING_DEV ((u64)-1) 23 /* Pseudo write pointer value for conventional zone */ 24 #define WP_CONVENTIONAL ((u64)-2) 25 26 /* 27 * Location of the first zone of superblock logging zone pairs. 28 * 29 * - primary superblock: 0B (zone 0) 30 * - first copy: 512G (zone starting at that offset) 31 * - second copy: 4T (zone starting at that offset) 32 */ 33 #define BTRFS_SB_LOG_PRIMARY_OFFSET (0ULL) 34 #define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G) 35 #define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G) 36 37 #define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET) 38 #define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET) 39 40 /* Number of superblock log zones */ 41 #define BTRFS_NR_SB_LOG_ZONES 2 42 43 /* 44 * Minimum of active zones we need: 45 * 46 * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors 47 * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group 48 * - 1 zone for tree-log dedicated block group 49 * - 1 zone for relocation 50 */ 51 #define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5) 52 53 /* 54 * Minimum / maximum supported zone size. Currently, SMR disks have a zone 55 * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. 56 * We do not expect the zone size to become larger than 8GiB or smaller than 57 * 4MiB in the near future. 
58 */ 59 #define BTRFS_MAX_ZONE_SIZE SZ_8G 60 #define BTRFS_MIN_ZONE_SIZE SZ_4M 61 62 #define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT) 63 64 static inline bool sb_zone_is_full(const struct blk_zone *zone) 65 { 66 return (zone->cond == BLK_ZONE_COND_FULL) || 67 (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity); 68 } 69 70 static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data) 71 { 72 struct blk_zone *zones = data; 73 74 memcpy(&zones[idx], zone, sizeof(*zone)); 75 76 return 0; 77 } 78 79 static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, 80 u64 *wp_ret) 81 { 82 bool empty[BTRFS_NR_SB_LOG_ZONES]; 83 bool full[BTRFS_NR_SB_LOG_ZONES]; 84 sector_t sector; 85 int i; 86 87 for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { 88 ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL); 89 empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY); 90 full[i] = sb_zone_is_full(&zones[i]); 91 } 92 93 /* 94 * Possible states of log buffer zones 95 * 96 * Empty[0] In use[0] Full[0] 97 * Empty[1] * 0 1 98 * In use[1] x x 1 99 * Full[1] 0 0 C 100 * 101 * Log position: 102 * *: Special case, no superblock is written 103 * 0: Use write pointer of zones[0] 104 * 1: Use write pointer of zones[1] 105 * C: Compare super blocks from zones[0] and zones[1], use the latest 106 * one determined by generation 107 * x: Invalid state 108 */ 109 110 if (empty[0] && empty[1]) { 111 /* Special case to distinguish no superblock to read */ 112 *wp_ret = zones[0].start << SECTOR_SHIFT; 113 return -ENOENT; 114 } else if (full[0] && full[1]) { 115 /* Compare two super blocks */ 116 struct address_space *mapping = bdev->bd_inode->i_mapping; 117 struct page *page[BTRFS_NR_SB_LOG_ZONES]; 118 struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES]; 119 int i; 120 121 for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { 122 u64 bytenr; 123 124 bytenr = ((zones[i].start + zones[i].len) 125 << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE; 126 127 page[i] = read_cache_page_gfp(mapping, 128 bytenr >> PAGE_SHIFT, GFP_NOFS); 129 if (IS_ERR(page[i])) { 130 if (i == 1) 131 btrfs_release_disk_super(super[0]); 132 return PTR_ERR(page[i]); 133 } 134 super[i] = page_address(page[i]); 135 } 136 137 if (super[0]->generation > super[1]->generation) 138 sector = zones[1].start; 139 else 140 sector = zones[0].start; 141 142 for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) 143 btrfs_release_disk_super(super[i]); 144 } else if (!full[0] && (empty[1] || full[1])) { 145 sector = zones[0].wp; 146 } else if (full[0]) { 147 sector = zones[1].wp; 148 } else { 149 return -EUCLEAN; 150 } 151 *wp_ret = sector << SECTOR_SHIFT; 152 return 0; 153 } 154 155 /* 156 * Get the first zone number of the superblock mirror 157 */ 158 static inline u32 sb_zone_number(int shift, int mirror) 159 { 160 u64 zone; 161 162 ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); 163 switch (mirror) { 164 case 0: zone = 0; break; 165 case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break; 166 case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break; 167 } 168 169 ASSERT(zone <= U32_MAX); 170 171 return (u32)zone; 172 } 173 174 static inline sector_t zone_start_sector(u32 zone_number, 175 struct block_device *bdev) 176 { 177 return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev)); 178 } 179 180 static inline u64 zone_start_physical(u32 zone_number, 181 struct btrfs_zoned_device_info *zone_info) 182 { 183 return (u64)zone_number << zone_info->zone_size_shift; 184 } 185 186 /* 187 * Emulate blkdev_report_zones() for a 
non-zoned device. It slices up the block 188 * device into static sized chunks and fake a conventional zone on each of 189 * them. 190 */ 191 static int emulate_report_zones(struct btrfs_device *device, u64 pos, 192 struct blk_zone *zones, unsigned int nr_zones) 193 { 194 const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT; 195 sector_t bdev_size = bdev_nr_sectors(device->bdev); 196 unsigned int i; 197 198 pos >>= SECTOR_SHIFT; 199 for (i = 0; i < nr_zones; i++) { 200 zones[i].start = i * zone_sectors + pos; 201 zones[i].len = zone_sectors; 202 zones[i].capacity = zone_sectors; 203 zones[i].wp = zones[i].start + zone_sectors; 204 zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL; 205 zones[i].cond = BLK_ZONE_COND_NOT_WP; 206 207 if (zones[i].wp >= bdev_size) { 208 i++; 209 break; 210 } 211 } 212 213 return i; 214 } 215 216 static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos, 217 struct blk_zone *zones, unsigned int *nr_zones) 218 { 219 struct btrfs_zoned_device_info *zinfo = device->zone_info; 220 u32 zno; 221 int ret; 222 223 if (!*nr_zones) 224 return 0; 225 226 if (!bdev_is_zoned(device->bdev)) { 227 ret = emulate_report_zones(device, pos, zones, *nr_zones); 228 *nr_zones = ret; 229 return 0; 230 } 231 232 /* Check cache */ 233 if (zinfo->zone_cache) { 234 unsigned int i; 235 236 ASSERT(IS_ALIGNED(pos, zinfo->zone_size)); 237 zno = pos >> zinfo->zone_size_shift; 238 /* 239 * We cannot report zones beyond the zone end. So, it is OK to 240 * cap *nr_zones to at the end. 241 */ 242 *nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno); 243 244 for (i = 0; i < *nr_zones; i++) { 245 struct blk_zone *zone_info; 246 247 zone_info = &zinfo->zone_cache[zno + i]; 248 if (!zone_info->len) 249 break; 250 } 251 252 if (i == *nr_zones) { 253 /* Cache hit on all the zones */ 254 memcpy(zones, zinfo->zone_cache + zno, 255 sizeof(*zinfo->zone_cache) * *nr_zones); 256 return 0; 257 } 258 } 259 260 ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones, 261 copy_zone_info_cb, zones); 262 if (ret < 0) { 263 btrfs_err_in_rcu(device->fs_info, 264 "zoned: failed to read zone %llu on %s (devid %llu)", 265 pos, rcu_str_deref(device->name), 266 device->devid); 267 return ret; 268 } 269 *nr_zones = ret; 270 if (!ret) 271 return -EIO; 272 273 /* Populate cache */ 274 if (zinfo->zone_cache) 275 memcpy(zinfo->zone_cache + zno, zones, 276 sizeof(*zinfo->zone_cache) * *nr_zones); 277 278 return 0; 279 } 280 281 /* The emulated zone size is determined from the size of device extent */ 282 static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) 283 { 284 struct btrfs_path *path; 285 struct btrfs_root *root = fs_info->dev_root; 286 struct btrfs_key key; 287 struct extent_buffer *leaf; 288 struct btrfs_dev_extent *dext; 289 int ret = 0; 290 291 key.objectid = 1; 292 key.type = BTRFS_DEV_EXTENT_KEY; 293 key.offset = 0; 294 295 path = btrfs_alloc_path(); 296 if (!path) 297 return -ENOMEM; 298 299 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 300 if (ret < 0) 301 goto out; 302 303 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { 304 ret = btrfs_next_leaf(root, path); 305 if (ret < 0) 306 goto out; 307 /* No dev extents at all? 
Not good */ 308 if (ret > 0) { 309 ret = -EUCLEAN; 310 goto out; 311 } 312 } 313 314 leaf = path->nodes[0]; 315 dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); 316 fs_info->zone_size = btrfs_dev_extent_length(leaf, dext); 317 ret = 0; 318 319 out: 320 btrfs_free_path(path); 321 322 return ret; 323 } 324 325 int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info) 326 { 327 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 328 struct btrfs_device *device; 329 int ret = 0; 330 331 /* fs_info->zone_size might not set yet. Use the incomapt flag here. */ 332 if (!btrfs_fs_incompat(fs_info, ZONED)) 333 return 0; 334 335 mutex_lock(&fs_devices->device_list_mutex); 336 list_for_each_entry(device, &fs_devices->devices, dev_list) { 337 /* We can skip reading of zone info for missing devices */ 338 if (!device->bdev) 339 continue; 340 341 ret = btrfs_get_dev_zone_info(device, true); 342 if (ret) 343 break; 344 } 345 mutex_unlock(&fs_devices->device_list_mutex); 346 347 return ret; 348 } 349 350 int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) 351 { 352 struct btrfs_fs_info *fs_info = device->fs_info; 353 struct btrfs_zoned_device_info *zone_info = NULL; 354 struct block_device *bdev = device->bdev; 355 unsigned int max_active_zones; 356 unsigned int nactive; 357 sector_t nr_sectors; 358 sector_t sector = 0; 359 struct blk_zone *zones = NULL; 360 unsigned int i, nreported = 0, nr_zones; 361 sector_t zone_sectors; 362 char *model, *emulated; 363 int ret; 364 365 /* 366 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not 367 * yet be set. 368 */ 369 if (!btrfs_fs_incompat(fs_info, ZONED)) 370 return 0; 371 372 if (device->zone_info) 373 return 0; 374 375 zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL); 376 if (!zone_info) 377 return -ENOMEM; 378 379 device->zone_info = zone_info; 380 381 if (!bdev_is_zoned(bdev)) { 382 if (!fs_info->zone_size) { 383 ret = calculate_emulated_zone_size(fs_info); 384 if (ret) 385 goto out; 386 } 387 388 ASSERT(fs_info->zone_size); 389 zone_sectors = fs_info->zone_size >> SECTOR_SHIFT; 390 } else { 391 zone_sectors = bdev_zone_sectors(bdev); 392 } 393 394 /* Check if it's power of 2 (see is_power_of_2) */ 395 ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); 396 zone_info->zone_size = zone_sectors << SECTOR_SHIFT; 397 398 /* We reject devices with a zone size larger than 8GB */ 399 if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) { 400 btrfs_err_in_rcu(fs_info, 401 "zoned: %s: zone size %llu larger than supported maximum %llu", 402 rcu_str_deref(device->name), 403 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); 404 ret = -EINVAL; 405 goto out; 406 } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) { 407 btrfs_err_in_rcu(fs_info, 408 "zoned: %s: zone size %llu smaller than supported minimum %u", 409 rcu_str_deref(device->name), 410 zone_info->zone_size, BTRFS_MIN_ZONE_SIZE); 411 ret = -EINVAL; 412 goto out; 413 } 414 415 nr_sectors = bdev_nr_sectors(bdev); 416 zone_info->zone_size_shift = ilog2(zone_info->zone_size); 417 zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); 418 /* 419 * We limit max_zone_append_size also by max_segments * 420 * PAGE_SIZE. Technically, we can have multiple pages per segment. But, 421 * since btrfs adds the pages one by one to a bio, and btrfs cannot 422 * increase the metadata reservation even if it increases the number of 423 * extents, it is safe to stick with the limit. 
424 * 425 * With the zoned emulation, we can have non-zoned device on the zoned 426 * mode. In this case, we don't have a valid max zone append size. So, 427 * use max_segments * PAGE_SIZE as the pseudo max_zone_append_size. 428 */ 429 if (bdev_is_zoned(bdev)) { 430 zone_info->max_zone_append_size = min_t(u64, 431 (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, 432 (u64)bdev_max_segments(bdev) << PAGE_SHIFT); 433 } else { 434 zone_info->max_zone_append_size = 435 (u64)bdev_max_segments(bdev) << PAGE_SHIFT; 436 } 437 if (!IS_ALIGNED(nr_sectors, zone_sectors)) 438 zone_info->nr_zones++; 439 440 max_active_zones = bdev_max_active_zones(bdev); 441 if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { 442 btrfs_err_in_rcu(fs_info, 443 "zoned: %s: max active zones %u is too small, need at least %u active zones", 444 rcu_str_deref(device->name), max_active_zones, 445 BTRFS_MIN_ACTIVE_ZONES); 446 ret = -EINVAL; 447 goto out; 448 } 449 zone_info->max_active_zones = max_active_zones; 450 451 zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); 452 if (!zone_info->seq_zones) { 453 ret = -ENOMEM; 454 goto out; 455 } 456 457 zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); 458 if (!zone_info->empty_zones) { 459 ret = -ENOMEM; 460 goto out; 461 } 462 463 zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); 464 if (!zone_info->active_zones) { 465 ret = -ENOMEM; 466 goto out; 467 } 468 469 zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); 470 if (!zones) { 471 ret = -ENOMEM; 472 goto out; 473 } 474 475 /* 476 * Enable zone cache only for a zoned device. On a non-zoned device, we 477 * fill the zone info with emulated CONVENTIONAL zones, so no need to 478 * use the cache. 
479 */ 480 if (populate_cache && bdev_is_zoned(device->bdev)) { 481 zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) * 482 zone_info->nr_zones); 483 if (!zone_info->zone_cache) { 484 btrfs_err_in_rcu(device->fs_info, 485 "zoned: failed to allocate zone cache for %s", 486 rcu_str_deref(device->name)); 487 ret = -ENOMEM; 488 goto out; 489 } 490 } 491 492 /* Get zones type */ 493 nactive = 0; 494 while (sector < nr_sectors) { 495 nr_zones = BTRFS_REPORT_NR_ZONES; 496 ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones, 497 &nr_zones); 498 if (ret) 499 goto out; 500 501 for (i = 0; i < nr_zones; i++) { 502 if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ) 503 __set_bit(nreported, zone_info->seq_zones); 504 switch (zones[i].cond) { 505 case BLK_ZONE_COND_EMPTY: 506 __set_bit(nreported, zone_info->empty_zones); 507 break; 508 case BLK_ZONE_COND_IMP_OPEN: 509 case BLK_ZONE_COND_EXP_OPEN: 510 case BLK_ZONE_COND_CLOSED: 511 __set_bit(nreported, zone_info->active_zones); 512 nactive++; 513 break; 514 } 515 nreported++; 516 } 517 sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len; 518 } 519 520 if (nreported != zone_info->nr_zones) { 521 btrfs_err_in_rcu(device->fs_info, 522 "inconsistent number of zones on %s (%u/%u)", 523 rcu_str_deref(device->name), nreported, 524 zone_info->nr_zones); 525 ret = -EIO; 526 goto out; 527 } 528 529 if (max_active_zones) { 530 if (nactive > max_active_zones) { 531 btrfs_err_in_rcu(device->fs_info, 532 "zoned: %u active zones on %s exceeds max_active_zones %u", 533 nactive, rcu_str_deref(device->name), 534 max_active_zones); 535 ret = -EIO; 536 goto out; 537 } 538 atomic_set(&zone_info->active_zones_left, 539 max_active_zones - nactive); 540 } 541 542 /* Validate superblock log */ 543 nr_zones = BTRFS_NR_SB_LOG_ZONES; 544 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 545 u32 sb_zone; 546 u64 sb_wp; 547 int sb_pos = BTRFS_NR_SB_LOG_ZONES * i; 548 549 sb_zone = sb_zone_number(zone_info->zone_size_shift, i); 550 if (sb_zone + 1 >= zone_info->nr_zones) 551 continue; 552 553 ret = btrfs_get_dev_zones(device, 554 zone_start_physical(sb_zone, zone_info), 555 &zone_info->sb_zones[sb_pos], 556 &nr_zones); 557 if (ret) 558 goto out; 559 560 if (nr_zones != BTRFS_NR_SB_LOG_ZONES) { 561 btrfs_err_in_rcu(device->fs_info, 562 "zoned: failed to read super block log zone info at devid %llu zone %u", 563 device->devid, sb_zone); 564 ret = -EUCLEAN; 565 goto out; 566 } 567 568 /* 569 * If zones[0] is conventional, always use the beginning of the 570 * zone to record superblock. No need to validate in that case. 
571 */ 572 if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type == 573 BLK_ZONE_TYPE_CONVENTIONAL) 574 continue; 575 576 ret = sb_write_pointer(device->bdev, 577 &zone_info->sb_zones[sb_pos], &sb_wp); 578 if (ret != -ENOENT && ret) { 579 btrfs_err_in_rcu(device->fs_info, 580 "zoned: super block log zone corrupted devid %llu zone %u", 581 device->devid, sb_zone); 582 ret = -EUCLEAN; 583 goto out; 584 } 585 } 586 587 588 kfree(zones); 589 590 switch (bdev_zoned_model(bdev)) { 591 case BLK_ZONED_HM: 592 model = "host-managed zoned"; 593 emulated = ""; 594 break; 595 case BLK_ZONED_HA: 596 model = "host-aware zoned"; 597 emulated = ""; 598 break; 599 case BLK_ZONED_NONE: 600 model = "regular"; 601 emulated = "emulated "; 602 break; 603 default: 604 /* Just in case */ 605 btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s", 606 bdev_zoned_model(bdev), 607 rcu_str_deref(device->name)); 608 ret = -EOPNOTSUPP; 609 goto out_free_zone_info; 610 } 611 612 btrfs_info_in_rcu(fs_info, 613 "%s block device %s, %u %szones of %llu bytes", 614 model, rcu_str_deref(device->name), zone_info->nr_zones, 615 emulated, zone_info->zone_size); 616 617 return 0; 618 619 out: 620 kfree(zones); 621 out_free_zone_info: 622 btrfs_destroy_dev_zone_info(device); 623 624 return ret; 625 } 626 627 void btrfs_destroy_dev_zone_info(struct btrfs_device *device) 628 { 629 struct btrfs_zoned_device_info *zone_info = device->zone_info; 630 631 if (!zone_info) 632 return; 633 634 bitmap_free(zone_info->active_zones); 635 bitmap_free(zone_info->seq_zones); 636 bitmap_free(zone_info->empty_zones); 637 vfree(zone_info->zone_cache); 638 kfree(zone_info); 639 device->zone_info = NULL; 640 } 641 642 int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, 643 struct blk_zone *zone) 644 { 645 unsigned int nr_zones = 1; 646 int ret; 647 648 ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones); 649 if (ret != 0 || !nr_zones) 650 return ret ? ret : -EIO; 651 652 return 0; 653 } 654 655 int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) 656 { 657 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; 658 struct btrfs_device *device; 659 u64 zoned_devices = 0; 660 u64 nr_devices = 0; 661 u64 zone_size = 0; 662 u64 max_zone_append_size = 0; 663 const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED); 664 int ret = 0; 665 666 /* Count zoned devices */ 667 list_for_each_entry(device, &fs_devices->devices, dev_list) { 668 enum blk_zoned_model model; 669 670 if (!device->bdev) 671 continue; 672 673 model = bdev_zoned_model(device->bdev); 674 /* 675 * A Host-Managed zoned device must be used as a zoned device. 676 * A Host-Aware zoned device and a non-zoned devices can be 677 * treated as a zoned device, if ZONED flag is enabled in the 678 * superblock. 
679 */ 680 if (model == BLK_ZONED_HM || 681 (model == BLK_ZONED_HA && incompat_zoned) || 682 (model == BLK_ZONED_NONE && incompat_zoned)) { 683 struct btrfs_zoned_device_info *zone_info; 684 685 zone_info = device->zone_info; 686 zoned_devices++; 687 if (!zone_size) { 688 zone_size = zone_info->zone_size; 689 } else if (zone_info->zone_size != zone_size) { 690 btrfs_err(fs_info, 691 "zoned: unequal block device zone sizes: have %llu found %llu", 692 device->zone_info->zone_size, 693 zone_size); 694 ret = -EINVAL; 695 goto out; 696 } 697 if (!max_zone_append_size || 698 (zone_info->max_zone_append_size && 699 zone_info->max_zone_append_size < max_zone_append_size)) 700 max_zone_append_size = 701 zone_info->max_zone_append_size; 702 } 703 nr_devices++; 704 } 705 706 if (!zoned_devices && !incompat_zoned) 707 goto out; 708 709 if (!zoned_devices && incompat_zoned) { 710 /* No zoned block device found on ZONED filesystem */ 711 btrfs_err(fs_info, 712 "zoned: no zoned devices found on a zoned filesystem"); 713 ret = -EINVAL; 714 goto out; 715 } 716 717 if (zoned_devices && !incompat_zoned) { 718 btrfs_err(fs_info, 719 "zoned: mode not enabled but zoned device found"); 720 ret = -EINVAL; 721 goto out; 722 } 723 724 if (zoned_devices != nr_devices) { 725 btrfs_err(fs_info, 726 "zoned: cannot mix zoned and regular devices"); 727 ret = -EINVAL; 728 goto out; 729 } 730 731 /* 732 * stripe_size is always aligned to BTRFS_STRIPE_LEN in 733 * btrfs_create_chunk(). Since we want stripe_len == zone_size, 734 * check the alignment here. 735 */ 736 if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) { 737 btrfs_err(fs_info, 738 "zoned: zone size %llu not aligned to stripe %u", 739 zone_size, BTRFS_STRIPE_LEN); 740 ret = -EINVAL; 741 goto out; 742 } 743 744 if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { 745 btrfs_err(fs_info, "zoned: mixed block groups not supported"); 746 ret = -EINVAL; 747 goto out; 748 } 749 750 fs_info->zone_size = zone_size; 751 fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size, 752 fs_info->sectorsize); 753 fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; 754 if (fs_info->max_zone_append_size < fs_info->max_extent_size) 755 fs_info->max_extent_size = fs_info->max_zone_append_size; 756 757 /* 758 * Check mount options here, because we might change fs_info->zoned 759 * from fs_info->zone_size. 760 */ 761 ret = btrfs_check_mountopts_zoned(fs_info); 762 if (ret) 763 goto out; 764 765 btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size); 766 out: 767 return ret; 768 } 769 770 int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) 771 { 772 if (!btrfs_is_zoned(info)) 773 return 0; 774 775 /* 776 * Space cache writing is not COWed. Disable that to avoid write errors 777 * in sequential zones. 
778 */ 779 if (btrfs_test_opt(info, SPACE_CACHE)) { 780 btrfs_err(info, "zoned: space cache v1 is not supported"); 781 return -EINVAL; 782 } 783 784 if (btrfs_test_opt(info, NODATACOW)) { 785 btrfs_err(info, "zoned: NODATACOW not supported"); 786 return -EINVAL; 787 } 788 789 return 0; 790 } 791 792 static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, 793 int rw, u64 *bytenr_ret) 794 { 795 u64 wp; 796 int ret; 797 798 if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) { 799 *bytenr_ret = zones[0].start << SECTOR_SHIFT; 800 return 0; 801 } 802 803 ret = sb_write_pointer(bdev, zones, &wp); 804 if (ret != -ENOENT && ret < 0) 805 return ret; 806 807 if (rw == WRITE) { 808 struct blk_zone *reset = NULL; 809 810 if (wp == zones[0].start << SECTOR_SHIFT) 811 reset = &zones[0]; 812 else if (wp == zones[1].start << SECTOR_SHIFT) 813 reset = &zones[1]; 814 815 if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { 816 ASSERT(sb_zone_is_full(reset)); 817 818 ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, 819 reset->start, reset->len, 820 GFP_NOFS); 821 if (ret) 822 return ret; 823 824 reset->cond = BLK_ZONE_COND_EMPTY; 825 reset->wp = reset->start; 826 } 827 } else if (ret != -ENOENT) { 828 /* 829 * For READ, we want the previous one. Move write pointer to 830 * the end of a zone, if it is at the head of a zone. 831 */ 832 u64 zone_end = 0; 833 834 if (wp == zones[0].start << SECTOR_SHIFT) 835 zone_end = zones[1].start + zones[1].capacity; 836 else if (wp == zones[1].start << SECTOR_SHIFT) 837 zone_end = zones[0].start + zones[0].capacity; 838 if (zone_end) 839 wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT, 840 BTRFS_SUPER_INFO_SIZE); 841 842 wp -= BTRFS_SUPER_INFO_SIZE; 843 } 844 845 *bytenr_ret = wp; 846 return 0; 847 848 } 849 850 int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, 851 u64 *bytenr_ret) 852 { 853 struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES]; 854 sector_t zone_sectors; 855 u32 sb_zone; 856 int ret; 857 u8 zone_sectors_shift; 858 sector_t nr_sectors; 859 u32 nr_zones; 860 861 if (!bdev_is_zoned(bdev)) { 862 *bytenr_ret = btrfs_sb_offset(mirror); 863 return 0; 864 } 865 866 ASSERT(rw == READ || rw == WRITE); 867 868 zone_sectors = bdev_zone_sectors(bdev); 869 if (!is_power_of_2(zone_sectors)) 870 return -EINVAL; 871 zone_sectors_shift = ilog2(zone_sectors); 872 nr_sectors = bdev_nr_sectors(bdev); 873 nr_zones = nr_sectors >> zone_sectors_shift; 874 875 sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror); 876 if (sb_zone + 1 >= nr_zones) 877 return -ENOENT; 878 879 ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev), 880 BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb, 881 zones); 882 if (ret < 0) 883 return ret; 884 if (ret != BTRFS_NR_SB_LOG_ZONES) 885 return -EIO; 886 887 return sb_log_location(bdev, zones, rw, bytenr_ret); 888 } 889 890 int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, 891 u64 *bytenr_ret) 892 { 893 struct btrfs_zoned_device_info *zinfo = device->zone_info; 894 u32 zone_num; 895 896 /* 897 * For a zoned filesystem on a non-zoned block device, use the same 898 * super block locations as regular filesystem. Doing so, the super 899 * block can always be retrieved and the zoned flag of the volume 900 * detected from the super block information. 
901 */ 902 if (!bdev_is_zoned(device->bdev)) { 903 *bytenr_ret = btrfs_sb_offset(mirror); 904 return 0; 905 } 906 907 zone_num = sb_zone_number(zinfo->zone_size_shift, mirror); 908 if (zone_num + 1 >= zinfo->nr_zones) 909 return -ENOENT; 910 911 return sb_log_location(device->bdev, 912 &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror], 913 rw, bytenr_ret); 914 } 915 916 static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo, 917 int mirror) 918 { 919 u32 zone_num; 920 921 if (!zinfo) 922 return false; 923 924 zone_num = sb_zone_number(zinfo->zone_size_shift, mirror); 925 if (zone_num + 1 >= zinfo->nr_zones) 926 return false; 927 928 if (!test_bit(zone_num, zinfo->seq_zones)) 929 return false; 930 931 return true; 932 } 933 934 int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) 935 { 936 struct btrfs_zoned_device_info *zinfo = device->zone_info; 937 struct blk_zone *zone; 938 int i; 939 940 if (!is_sb_log_zone(zinfo, mirror)) 941 return 0; 942 943 zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror]; 944 for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { 945 /* Advance the next zone */ 946 if (zone->cond == BLK_ZONE_COND_FULL) { 947 zone++; 948 continue; 949 } 950 951 if (zone->cond == BLK_ZONE_COND_EMPTY) 952 zone->cond = BLK_ZONE_COND_IMP_OPEN; 953 954 zone->wp += SUPER_INFO_SECTORS; 955 956 if (sb_zone_is_full(zone)) { 957 /* 958 * No room left to write new superblock. Since 959 * superblock is written with REQ_SYNC, it is safe to 960 * finish the zone now. 961 * 962 * If the write pointer is exactly at the capacity, 963 * explicit ZONE_FINISH is not necessary. 964 */ 965 if (zone->wp != zone->start + zone->capacity) { 966 int ret; 967 968 ret = blkdev_zone_mgmt(device->bdev, 969 REQ_OP_ZONE_FINISH, zone->start, 970 zone->len, GFP_NOFS); 971 if (ret) 972 return ret; 973 } 974 975 zone->wp = zone->start + zone->len; 976 zone->cond = BLK_ZONE_COND_FULL; 977 } 978 return 0; 979 } 980 981 /* All the zones are FULL. Should not reach here. */ 982 ASSERT(0); 983 return -EIO; 984 } 985 986 int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) 987 { 988 sector_t zone_sectors; 989 sector_t nr_sectors; 990 u8 zone_sectors_shift; 991 u32 sb_zone; 992 u32 nr_zones; 993 994 zone_sectors = bdev_zone_sectors(bdev); 995 zone_sectors_shift = ilog2(zone_sectors); 996 nr_sectors = bdev_nr_sectors(bdev); 997 nr_zones = nr_sectors >> zone_sectors_shift; 998 999 sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror); 1000 if (sb_zone + 1 >= nr_zones) 1001 return -ENOENT; 1002 1003 return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, 1004 zone_start_sector(sb_zone, bdev), 1005 zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); 1006 } 1007 1008 /** 1009 * btrfs_find_allocatable_zones - find allocatable zones within a given region 1010 * 1011 * @device: the device to allocate a region on 1012 * @hole_start: the position of the hole to allocate the region 1013 * @num_bytes: size of wanted region 1014 * @hole_end: the end of the hole 1015 * @return: position of allocatable zones 1016 * 1017 * Allocatable region should not contain any superblock locations. 
1018 */ 1019 u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, 1020 u64 hole_end, u64 num_bytes) 1021 { 1022 struct btrfs_zoned_device_info *zinfo = device->zone_info; 1023 const u8 shift = zinfo->zone_size_shift; 1024 u64 nzones = num_bytes >> shift; 1025 u64 pos = hole_start; 1026 u64 begin, end; 1027 bool have_sb; 1028 int i; 1029 1030 ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size)); 1031 ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size)); 1032 1033 while (pos < hole_end) { 1034 begin = pos >> shift; 1035 end = begin + nzones; 1036 1037 if (end > zinfo->nr_zones) 1038 return hole_end; 1039 1040 /* Check if zones in the region are all empty */ 1041 if (btrfs_dev_is_sequential(device, pos) && 1042 find_next_zero_bit(zinfo->empty_zones, end, begin) != end) { 1043 pos += zinfo->zone_size; 1044 continue; 1045 } 1046 1047 have_sb = false; 1048 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { 1049 u32 sb_zone; 1050 u64 sb_pos; 1051 1052 sb_zone = sb_zone_number(shift, i); 1053 if (!(end <= sb_zone || 1054 sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) { 1055 have_sb = true; 1056 pos = zone_start_physical( 1057 sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo); 1058 break; 1059 } 1060 1061 /* We also need to exclude regular superblock positions */ 1062 sb_pos = btrfs_sb_offset(i); 1063 if (!(pos + num_bytes <= sb_pos || 1064 sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) { 1065 have_sb = true; 1066 pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE, 1067 zinfo->zone_size); 1068 break; 1069 } 1070 } 1071 if (!have_sb) 1072 break; 1073 } 1074 1075 return pos; 1076 } 1077 1078 static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos) 1079 { 1080 struct btrfs_zoned_device_info *zone_info = device->zone_info; 1081 unsigned int zno = (pos >> zone_info->zone_size_shift); 1082 1083 /* We can use any number of zones */ 1084 if (zone_info->max_active_zones == 0) 1085 return true; 1086 1087 if (!test_bit(zno, zone_info->active_zones)) { 1088 /* Active zone left? 
*/ 1089 if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0) 1090 return false; 1091 if (test_and_set_bit(zno, zone_info->active_zones)) { 1092 /* Someone already set the bit */ 1093 atomic_inc(&zone_info->active_zones_left); 1094 } 1095 } 1096 1097 return true; 1098 } 1099 1100 static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos) 1101 { 1102 struct btrfs_zoned_device_info *zone_info = device->zone_info; 1103 unsigned int zno = (pos >> zone_info->zone_size_shift); 1104 1105 /* We can use any number of zones */ 1106 if (zone_info->max_active_zones == 0) 1107 return; 1108 1109 if (test_and_clear_bit(zno, zone_info->active_zones)) 1110 atomic_inc(&zone_info->active_zones_left); 1111 } 1112 1113 int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, 1114 u64 length, u64 *bytes) 1115 { 1116 int ret; 1117 1118 *bytes = 0; 1119 ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET, 1120 physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, 1121 GFP_NOFS); 1122 if (ret) 1123 return ret; 1124 1125 *bytes = length; 1126 while (length) { 1127 btrfs_dev_set_zone_empty(device, physical); 1128 btrfs_dev_clear_active_zone(device, physical); 1129 physical += device->zone_info->zone_size; 1130 length -= device->zone_info->zone_size; 1131 } 1132 1133 return 0; 1134 } 1135 1136 int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size) 1137 { 1138 struct btrfs_zoned_device_info *zinfo = device->zone_info; 1139 const u8 shift = zinfo->zone_size_shift; 1140 unsigned long begin = start >> shift; 1141 unsigned long end = (start + size) >> shift; 1142 u64 pos; 1143 int ret; 1144 1145 ASSERT(IS_ALIGNED(start, zinfo->zone_size)); 1146 ASSERT(IS_ALIGNED(size, zinfo->zone_size)); 1147 1148 if (end > zinfo->nr_zones) 1149 return -ERANGE; 1150 1151 /* All the zones are conventional */ 1152 if (find_next_bit(zinfo->seq_zones, begin, end) == end) 1153 return 0; 1154 1155 /* All the zones are sequential and empty */ 1156 if (find_next_zero_bit(zinfo->seq_zones, begin, end) == end && 1157 find_next_zero_bit(zinfo->empty_zones, begin, end) == end) 1158 return 0; 1159 1160 for (pos = start; pos < start + size; pos += zinfo->zone_size) { 1161 u64 reset_bytes; 1162 1163 if (!btrfs_dev_is_sequential(device, pos) || 1164 btrfs_dev_is_empty_zone(device, pos)) 1165 continue; 1166 1167 /* Free regions should be empty */ 1168 btrfs_warn_in_rcu( 1169 device->fs_info, 1170 "zoned: resetting device %s (devid %llu) zone %llu for allocation", 1171 rcu_str_deref(device->name), device->devid, pos >> shift); 1172 WARN_ON_ONCE(1); 1173 1174 ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size, 1175 &reset_bytes); 1176 if (ret) 1177 return ret; 1178 } 1179 1180 return 0; 1181 } 1182 1183 /* 1184 * Calculate an allocation pointer from the extent allocation information 1185 * for a block group consist of conventional zones. It is pointed to the 1186 * end of the highest addressed extent in the block group as an allocation 1187 * offset. 1188 */ 1189 static int calculate_alloc_pointer(struct btrfs_block_group *cache, 1190 u64 *offset_ret, bool new) 1191 { 1192 struct btrfs_fs_info *fs_info = cache->fs_info; 1193 struct btrfs_root *root; 1194 struct btrfs_path *path; 1195 struct btrfs_key key; 1196 struct btrfs_key found_key; 1197 int ret; 1198 u64 length; 1199 1200 /* 1201 * Avoid tree lookups for a new block group, there's no use for it. 1202 * It must always be 0. 1203 * 1204 * Also, we have a lock chain of extent buffer lock -> chunk mutex. 
1205 * For new a block group, this function is called from 1206 * btrfs_make_block_group() which is already taking the chunk mutex. 1207 * Thus, we cannot call calculate_alloc_pointer() which takes extent 1208 * buffer locks to avoid deadlock. 1209 */ 1210 if (new) { 1211 *offset_ret = 0; 1212 return 0; 1213 } 1214 1215 path = btrfs_alloc_path(); 1216 if (!path) 1217 return -ENOMEM; 1218 1219 key.objectid = cache->start + cache->length; 1220 key.type = 0; 1221 key.offset = 0; 1222 1223 root = btrfs_extent_root(fs_info, key.objectid); 1224 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 1225 /* We should not find the exact match */ 1226 if (!ret) 1227 ret = -EUCLEAN; 1228 if (ret < 0) 1229 goto out; 1230 1231 ret = btrfs_previous_extent_item(root, path, cache->start); 1232 if (ret) { 1233 if (ret == 1) { 1234 ret = 0; 1235 *offset_ret = 0; 1236 } 1237 goto out; 1238 } 1239 1240 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); 1241 1242 if (found_key.type == BTRFS_EXTENT_ITEM_KEY) 1243 length = found_key.offset; 1244 else 1245 length = fs_info->nodesize; 1246 1247 if (!(found_key.objectid >= cache->start && 1248 found_key.objectid + length <= cache->start + cache->length)) { 1249 ret = -EUCLEAN; 1250 goto out; 1251 } 1252 *offset_ret = found_key.objectid + length - cache->start; 1253 ret = 0; 1254 1255 out: 1256 btrfs_free_path(path); 1257 return ret; 1258 } 1259 1260 int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) 1261 { 1262 struct btrfs_fs_info *fs_info = cache->fs_info; 1263 struct extent_map_tree *em_tree = &fs_info->mapping_tree; 1264 struct extent_map *em; 1265 struct map_lookup *map; 1266 struct btrfs_device *device; 1267 u64 logical = cache->start; 1268 u64 length = cache->length; 1269 int ret; 1270 int i; 1271 unsigned int nofs_flag; 1272 u64 *alloc_offsets = NULL; 1273 u64 *caps = NULL; 1274 u64 *physical = NULL; 1275 unsigned long *active = NULL; 1276 u64 last_alloc = 0; 1277 u32 num_sequential = 0, num_conventional = 0; 1278 1279 if (!btrfs_is_zoned(fs_info)) 1280 return 0; 1281 1282 /* Sanity check */ 1283 if (!IS_ALIGNED(length, fs_info->zone_size)) { 1284 btrfs_err(fs_info, 1285 "zoned: block group %llu len %llu unaligned to zone size %llu", 1286 logical, length, fs_info->zone_size); 1287 return -EIO; 1288 } 1289 1290 /* Get the chunk mapping */ 1291 read_lock(&em_tree->lock); 1292 em = lookup_extent_mapping(em_tree, logical, length); 1293 read_unlock(&em_tree->lock); 1294 1295 if (!em) 1296 return -EINVAL; 1297 1298 map = em->map_lookup; 1299 1300 cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS); 1301 if (!cache->physical_map) { 1302 ret = -ENOMEM; 1303 goto out; 1304 } 1305 1306 alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS); 1307 if (!alloc_offsets) { 1308 ret = -ENOMEM; 1309 goto out; 1310 } 1311 1312 caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS); 1313 if (!caps) { 1314 ret = -ENOMEM; 1315 goto out; 1316 } 1317 1318 physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS); 1319 if (!physical) { 1320 ret = -ENOMEM; 1321 goto out; 1322 } 1323 1324 active = bitmap_zalloc(map->num_stripes, GFP_NOFS); 1325 if (!active) { 1326 ret = -ENOMEM; 1327 goto out; 1328 } 1329 1330 for (i = 0; i < map->num_stripes; i++) { 1331 bool is_sequential; 1332 struct blk_zone zone; 1333 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; 1334 int dev_replace_is_ongoing = 0; 1335 1336 device = map->stripes[i].dev; 1337 physical[i] = 
map->stripes[i].physical; 1338 1339 if (device->bdev == NULL) { 1340 alloc_offsets[i] = WP_MISSING_DEV; 1341 continue; 1342 } 1343 1344 is_sequential = btrfs_dev_is_sequential(device, physical[i]); 1345 if (is_sequential) 1346 num_sequential++; 1347 else 1348 num_conventional++; 1349 1350 /* 1351 * Consider a zone as active if we can allow any number of 1352 * active zones. 1353 */ 1354 if (!device->zone_info->max_active_zones) 1355 __set_bit(i, active); 1356 1357 if (!is_sequential) { 1358 alloc_offsets[i] = WP_CONVENTIONAL; 1359 continue; 1360 } 1361 1362 /* 1363 * This zone will be used for allocation, so mark this zone 1364 * non-empty. 1365 */ 1366 btrfs_dev_clear_zone_empty(device, physical[i]); 1367 1368 down_read(&dev_replace->rwsem); 1369 dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); 1370 if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) 1371 btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]); 1372 up_read(&dev_replace->rwsem); 1373 1374 /* 1375 * The group is mapped to a sequential zone. Get the zone write 1376 * pointer to determine the allocation offset within the zone. 1377 */ 1378 WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size)); 1379 nofs_flag = memalloc_nofs_save(); 1380 ret = btrfs_get_dev_zone(device, physical[i], &zone); 1381 memalloc_nofs_restore(nofs_flag); 1382 if (ret == -EIO || ret == -EOPNOTSUPP) { 1383 ret = 0; 1384 alloc_offsets[i] = WP_MISSING_DEV; 1385 continue; 1386 } else if (ret) { 1387 goto out; 1388 } 1389 1390 if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { 1391 btrfs_err_in_rcu(fs_info, 1392 "zoned: unexpected conventional zone %llu on device %s (devid %llu)", 1393 zone.start << SECTOR_SHIFT, 1394 rcu_str_deref(device->name), device->devid); 1395 ret = -EIO; 1396 goto out; 1397 } 1398 1399 caps[i] = (zone.capacity << SECTOR_SHIFT); 1400 1401 switch (zone.cond) { 1402 case BLK_ZONE_COND_OFFLINE: 1403 case BLK_ZONE_COND_READONLY: 1404 btrfs_err(fs_info, 1405 "zoned: offline/readonly zone %llu on device %s (devid %llu)", 1406 physical[i] >> device->zone_info->zone_size_shift, 1407 rcu_str_deref(device->name), device->devid); 1408 alloc_offsets[i] = WP_MISSING_DEV; 1409 break; 1410 case BLK_ZONE_COND_EMPTY: 1411 alloc_offsets[i] = 0; 1412 break; 1413 case BLK_ZONE_COND_FULL: 1414 alloc_offsets[i] = caps[i]; 1415 break; 1416 default: 1417 /* Partially used zone */ 1418 alloc_offsets[i] = 1419 ((zone.wp - zone.start) << SECTOR_SHIFT); 1420 __set_bit(i, active); 1421 break; 1422 } 1423 } 1424 1425 if (num_sequential > 0) 1426 cache->seq_zone = true; 1427 1428 if (num_conventional > 0) { 1429 /* Zone capacity is always zone size in emulation */ 1430 cache->zone_capacity = cache->length; 1431 ret = calculate_alloc_pointer(cache, &last_alloc, new); 1432 if (ret) { 1433 btrfs_err(fs_info, 1434 "zoned: failed to determine allocation offset of bg %llu", 1435 cache->start); 1436 goto out; 1437 } else if (map->num_stripes == num_conventional) { 1438 cache->alloc_offset = last_alloc; 1439 cache->zone_is_active = 1; 1440 goto out; 1441 } 1442 } 1443 1444 switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { 1445 case 0: /* single */ 1446 if (alloc_offsets[0] == WP_MISSING_DEV) { 1447 btrfs_err(fs_info, 1448 "zoned: cannot recover write pointer for zone %llu", 1449 physical[0]); 1450 ret = -EIO; 1451 goto out; 1452 } 1453 cache->alloc_offset = alloc_offsets[0]; 1454 cache->zone_capacity = caps[0]; 1455 cache->zone_is_active = test_bit(0, active); 1456 break; 1457 case BTRFS_BLOCK_GROUP_DUP: 1458 if (map->type & 
BTRFS_BLOCK_GROUP_DATA) { 1459 btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg"); 1460 ret = -EINVAL; 1461 goto out; 1462 } 1463 if (alloc_offsets[0] == WP_MISSING_DEV) { 1464 btrfs_err(fs_info, 1465 "zoned: cannot recover write pointer for zone %llu", 1466 physical[0]); 1467 ret = -EIO; 1468 goto out; 1469 } 1470 if (alloc_offsets[1] == WP_MISSING_DEV) { 1471 btrfs_err(fs_info, 1472 "zoned: cannot recover write pointer for zone %llu", 1473 physical[1]); 1474 ret = -EIO; 1475 goto out; 1476 } 1477 if (alloc_offsets[0] != alloc_offsets[1]) { 1478 btrfs_err(fs_info, 1479 "zoned: write pointer offset mismatch of zones in DUP profile"); 1480 ret = -EIO; 1481 goto out; 1482 } 1483 if (test_bit(0, active) != test_bit(1, active)) { 1484 if (!btrfs_zone_activate(cache)) { 1485 ret = -EIO; 1486 goto out; 1487 } 1488 } else { 1489 cache->zone_is_active = test_bit(0, active); 1490 } 1491 cache->alloc_offset = alloc_offsets[0]; 1492 cache->zone_capacity = min(caps[0], caps[1]); 1493 break; 1494 case BTRFS_BLOCK_GROUP_RAID1: 1495 case BTRFS_BLOCK_GROUP_RAID0: 1496 case BTRFS_BLOCK_GROUP_RAID10: 1497 case BTRFS_BLOCK_GROUP_RAID5: 1498 case BTRFS_BLOCK_GROUP_RAID6: 1499 /* non-single profiles are not supported yet */ 1500 default: 1501 btrfs_err(fs_info, "zoned: profile %s not yet supported", 1502 btrfs_bg_type_to_raid_name(map->type)); 1503 ret = -EINVAL; 1504 goto out; 1505 } 1506 1507 out: 1508 if (cache->alloc_offset > fs_info->zone_size) { 1509 btrfs_err(fs_info, 1510 "zoned: invalid write pointer %llu in block group %llu", 1511 cache->alloc_offset, cache->start); 1512 ret = -EIO; 1513 } 1514 1515 if (cache->alloc_offset > cache->zone_capacity) { 1516 btrfs_err(fs_info, 1517 "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", 1518 cache->alloc_offset, cache->zone_capacity, 1519 cache->start); 1520 ret = -EIO; 1521 } 1522 1523 /* An extent is allocated after the write pointer */ 1524 if (!ret && num_conventional && last_alloc > cache->alloc_offset) { 1525 btrfs_err(fs_info, 1526 "zoned: got wrong write pointer in BG %llu: %llu > %llu", 1527 logical, last_alloc, cache->alloc_offset); 1528 ret = -EIO; 1529 } 1530 1531 if (!ret) { 1532 cache->meta_write_pointer = cache->alloc_offset + cache->start; 1533 if (cache->zone_is_active) { 1534 btrfs_get_block_group(cache); 1535 spin_lock(&fs_info->zone_active_bgs_lock); 1536 list_add_tail(&cache->active_bg_list, 1537 &fs_info->zone_active_bgs); 1538 spin_unlock(&fs_info->zone_active_bgs_lock); 1539 } 1540 } else { 1541 kfree(cache->physical_map); 1542 cache->physical_map = NULL; 1543 } 1544 bitmap_free(active); 1545 kfree(physical); 1546 kfree(caps); 1547 kfree(alloc_offsets); 1548 free_extent_map(em); 1549 1550 return ret; 1551 } 1552 1553 void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) 1554 { 1555 u64 unusable, free; 1556 1557 if (!btrfs_is_zoned(cache->fs_info)) 1558 return; 1559 1560 WARN_ON(cache->bytes_super != 0); 1561 unusable = (cache->alloc_offset - cache->used) + 1562 (cache->length - cache->zone_capacity); 1563 free = cache->zone_capacity - cache->alloc_offset; 1564 1565 /* We only need ->free_space in ALLOC_SEQ block groups */ 1566 cache->last_byte_to_unpin = (u64)-1; 1567 cache->cached = BTRFS_CACHE_FINISHED; 1568 cache->free_space_ctl->free_space = free; 1569 cache->zone_unusable = unusable; 1570 } 1571 1572 void btrfs_redirty_list_add(struct btrfs_transaction *trans, 1573 struct extent_buffer *eb) 1574 { 1575 struct btrfs_fs_info *fs_info = eb->fs_info; 1576 1577 if 
(!btrfs_is_zoned(fs_info) || 1578 btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) || 1579 !list_empty(&eb->release_list)) 1580 return; 1581 1582 set_extent_buffer_dirty(eb); 1583 set_extent_bits_nowait(&trans->dirty_pages, eb->start, 1584 eb->start + eb->len - 1, EXTENT_DIRTY); 1585 memzero_extent_buffer(eb, 0, eb->len); 1586 set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); 1587 1588 spin_lock(&trans->releasing_ebs_lock); 1589 list_add_tail(&eb->release_list, &trans->releasing_ebs); 1590 spin_unlock(&trans->releasing_ebs_lock); 1591 atomic_inc(&eb->refs); 1592 } 1593 1594 void btrfs_free_redirty_list(struct btrfs_transaction *trans) 1595 { 1596 spin_lock(&trans->releasing_ebs_lock); 1597 while (!list_empty(&trans->releasing_ebs)) { 1598 struct extent_buffer *eb; 1599 1600 eb = list_first_entry(&trans->releasing_ebs, 1601 struct extent_buffer, release_list); 1602 list_del_init(&eb->release_list); 1603 free_extent_buffer(eb); 1604 } 1605 spin_unlock(&trans->releasing_ebs_lock); 1606 } 1607 1608 bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) 1609 { 1610 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1611 struct btrfs_block_group *cache; 1612 bool ret = false; 1613 1614 if (!btrfs_is_zoned(fs_info)) 1615 return false; 1616 1617 if (!is_data_inode(&inode->vfs_inode)) 1618 return false; 1619 1620 /* 1621 * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the 1622 * extent layout the relocation code has. 1623 * Furthermore we have set aside own block-group from which only the 1624 * relocation "process" can allocate and make sure only one process at a 1625 * time can add pages to an extent that gets relocated, so it's safe to 1626 * use regular REQ_OP_WRITE for this special case. 1627 */ 1628 if (btrfs_is_data_reloc_root(inode->root)) 1629 return false; 1630 1631 cache = btrfs_lookup_block_group(fs_info, start); 1632 ASSERT(cache); 1633 if (!cache) 1634 return false; 1635 1636 ret = cache->seq_zone; 1637 btrfs_put_block_group(cache); 1638 1639 return ret; 1640 } 1641 1642 void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, 1643 struct bio *bio) 1644 { 1645 struct btrfs_ordered_extent *ordered; 1646 const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; 1647 1648 if (bio_op(bio) != REQ_OP_ZONE_APPEND) 1649 return; 1650 1651 ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset); 1652 if (WARN_ON(!ordered)) 1653 return; 1654 1655 ordered->physical = physical; 1656 ordered->bdev = bio->bi_bdev; 1657 1658 btrfs_put_ordered_extent(ordered); 1659 } 1660 1661 void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered) 1662 { 1663 struct btrfs_inode *inode = BTRFS_I(ordered->inode); 1664 struct btrfs_fs_info *fs_info = inode->root->fs_info; 1665 struct extent_map_tree *em_tree; 1666 struct extent_map *em; 1667 struct btrfs_ordered_sum *sum; 1668 u64 orig_logical = ordered->disk_bytenr; 1669 u64 *logical = NULL; 1670 int nr, stripe_len; 1671 1672 /* Zoned devices should not have partitions. 
So, we can assume it is 0 */ 1673 ASSERT(!bdev_is_partition(ordered->bdev)); 1674 if (WARN_ON(!ordered->bdev)) 1675 return; 1676 1677 if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev, 1678 ordered->physical, &logical, &nr, 1679 &stripe_len))) 1680 goto out; 1681 1682 WARN_ON(nr != 1); 1683 1684 if (orig_logical == *logical) 1685 goto out; 1686 1687 ordered->disk_bytenr = *logical; 1688 1689 em_tree = &inode->extent_tree; 1690 write_lock(&em_tree->lock); 1691 em = search_extent_mapping(em_tree, ordered->file_offset, 1692 ordered->num_bytes); 1693 em->block_start = *logical; 1694 free_extent_map(em); 1695 write_unlock(&em_tree->lock); 1696 1697 list_for_each_entry(sum, &ordered->list, list) { 1698 if (*logical < orig_logical) 1699 sum->bytenr -= orig_logical - *logical; 1700 else 1701 sum->bytenr += *logical - orig_logical; 1702 } 1703 1704 out: 1705 kfree(logical); 1706 } 1707 1708 bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, 1709 struct extent_buffer *eb, 1710 struct btrfs_block_group **cache_ret) 1711 { 1712 struct btrfs_block_group *cache; 1713 bool ret = true; 1714 1715 if (!btrfs_is_zoned(fs_info)) 1716 return true; 1717 1718 cache = btrfs_lookup_block_group(fs_info, eb->start); 1719 if (!cache) 1720 return true; 1721 1722 if (cache->meta_write_pointer != eb->start) { 1723 btrfs_put_block_group(cache); 1724 cache = NULL; 1725 ret = false; 1726 } else { 1727 cache->meta_write_pointer = eb->start + eb->len; 1728 } 1729 1730 *cache_ret = cache; 1731 1732 return ret; 1733 } 1734 1735 void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, 1736 struct extent_buffer *eb) 1737 { 1738 if (!btrfs_is_zoned(eb->fs_info) || !cache) 1739 return; 1740 1741 ASSERT(cache->meta_write_pointer == eb->start + eb->len); 1742 cache->meta_write_pointer = eb->start; 1743 } 1744 1745 int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length) 1746 { 1747 if (!btrfs_dev_is_sequential(device, physical)) 1748 return -EOPNOTSUPP; 1749 1750 return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT, 1751 length >> SECTOR_SHIFT, GFP_NOFS, 0); 1752 } 1753 1754 static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, 1755 struct blk_zone *zone) 1756 { 1757 struct btrfs_io_context *bioc = NULL; 1758 u64 mapped_length = PAGE_SIZE; 1759 unsigned int nofs_flag; 1760 int nmirrors; 1761 int i, ret; 1762 1763 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, 1764 &mapped_length, &bioc); 1765 if (ret || !bioc || mapped_length < PAGE_SIZE) { 1766 ret = -EIO; 1767 goto out_put_bioc; 1768 } 1769 1770 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { 1771 ret = -EINVAL; 1772 goto out_put_bioc; 1773 } 1774 1775 nofs_flag = memalloc_nofs_save(); 1776 nmirrors = (int)bioc->num_stripes; 1777 for (i = 0; i < nmirrors; i++) { 1778 u64 physical = bioc->stripes[i].physical; 1779 struct btrfs_device *dev = bioc->stripes[i].dev; 1780 1781 /* Missing device */ 1782 if (!dev->bdev) 1783 continue; 1784 1785 ret = btrfs_get_dev_zone(dev, physical, zone); 1786 /* Failing device */ 1787 if (ret == -EIO || ret == -EOPNOTSUPP) 1788 continue; 1789 break; 1790 } 1791 memalloc_nofs_restore(nofs_flag); 1792 out_put_bioc: 1793 btrfs_put_bioc(bioc); 1794 return ret; 1795 } 1796 1797 /* 1798 * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by 1799 * filling zeros between @physical_pos to a write pointer of dev-replace 1800 * source device. 
1801 */ 1802 int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, 1803 u64 physical_start, u64 physical_pos) 1804 { 1805 struct btrfs_fs_info *fs_info = tgt_dev->fs_info; 1806 struct blk_zone zone; 1807 u64 length; 1808 u64 wp; 1809 int ret; 1810 1811 if (!btrfs_dev_is_sequential(tgt_dev, physical_pos)) 1812 return 0; 1813 1814 ret = read_zone_info(fs_info, logical, &zone); 1815 if (ret) 1816 return ret; 1817 1818 wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT); 1819 1820 if (physical_pos == wp) 1821 return 0; 1822 1823 if (physical_pos > wp) 1824 return -EUCLEAN; 1825 1826 length = wp - physical_pos; 1827 return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length); 1828 } 1829 1830 struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, 1831 u64 logical, u64 length) 1832 { 1833 struct btrfs_device *device; 1834 struct extent_map *em; 1835 struct map_lookup *map; 1836 1837 em = btrfs_get_chunk_map(fs_info, logical, length); 1838 if (IS_ERR(em)) 1839 return ERR_CAST(em); 1840 1841 map = em->map_lookup; 1842 /* We only support single profile for now */ 1843 device = map->stripes[0].dev; 1844 1845 free_extent_map(em); 1846 1847 return device; 1848 } 1849 1850 /** 1851 * Activate block group and underlying device zones 1852 * 1853 * @block_group: the block group to activate 1854 * 1855 * Return: true on success, false otherwise 1856 */ 1857 bool btrfs_zone_activate(struct btrfs_block_group *block_group) 1858 { 1859 struct btrfs_fs_info *fs_info = block_group->fs_info; 1860 struct btrfs_space_info *space_info = block_group->space_info; 1861 struct map_lookup *map; 1862 struct btrfs_device *device; 1863 u64 physical; 1864 bool ret; 1865 int i; 1866 1867 if (!btrfs_is_zoned(block_group->fs_info)) 1868 return true; 1869 1870 map = block_group->physical_map; 1871 1872 spin_lock(&space_info->lock); 1873 spin_lock(&block_group->lock); 1874 if (block_group->zone_is_active) { 1875 ret = true; 1876 goto out_unlock; 1877 } 1878 1879 /* No space left */ 1880 if (btrfs_zoned_bg_is_full(block_group)) { 1881 ret = false; 1882 goto out_unlock; 1883 } 1884 1885 for (i = 0; i < map->num_stripes; i++) { 1886 device = map->stripes[i].dev; 1887 physical = map->stripes[i].physical; 1888 1889 if (device->zone_info->max_active_zones == 0) 1890 continue; 1891 1892 if (!btrfs_dev_set_active_zone(device, physical)) { 1893 /* Cannot activate the zone */ 1894 ret = false; 1895 goto out_unlock; 1896 } 1897 } 1898 1899 /* Successfully activated all the zones */ 1900 block_group->zone_is_active = 1; 1901 space_info->active_total_bytes += block_group->length; 1902 spin_unlock(&block_group->lock); 1903 btrfs_try_granting_tickets(fs_info, space_info); 1904 spin_unlock(&space_info->lock); 1905 1906 /* For the active block group list */ 1907 btrfs_get_block_group(block_group); 1908 1909 spin_lock(&fs_info->zone_active_bgs_lock); 1910 list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); 1911 spin_unlock(&fs_info->zone_active_bgs_lock); 1912 1913 return true; 1914 1915 out_unlock: 1916 spin_unlock(&block_group->lock); 1917 spin_unlock(&space_info->lock); 1918 return ret; 1919 } 1920 1921 static void wait_eb_writebacks(struct btrfs_block_group *block_group) 1922 { 1923 struct btrfs_fs_info *fs_info = block_group->fs_info; 1924 const u64 end = block_group->start + block_group->length; 1925 struct radix_tree_iter iter; 1926 struct extent_buffer *eb; 1927 void __rcu **slot; 1928 1929 rcu_read_lock(); 1930 radix_tree_for_each_slot(slot, 
&fs_info->buffer_radix, &iter, 1931 block_group->start >> fs_info->sectorsize_bits) { 1932 eb = radix_tree_deref_slot(slot); 1933 if (!eb) 1934 continue; 1935 if (radix_tree_deref_retry(eb)) { 1936 slot = radix_tree_iter_retry(&iter); 1937 continue; 1938 } 1939 1940 if (eb->start < block_group->start) 1941 continue; 1942 if (eb->start >= end) 1943 break; 1944 1945 slot = radix_tree_iter_resume(slot, &iter); 1946 rcu_read_unlock(); 1947 wait_on_extent_buffer_writeback(eb); 1948 rcu_read_lock(); 1949 } 1950 rcu_read_unlock(); 1951 } 1952 1953 static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) 1954 { 1955 struct btrfs_fs_info *fs_info = block_group->fs_info; 1956 struct map_lookup *map; 1957 const bool is_metadata = (block_group->flags & 1958 (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); 1959 int ret = 0; 1960 int i; 1961 1962 spin_lock(&block_group->lock); 1963 if (!block_group->zone_is_active) { 1964 spin_unlock(&block_group->lock); 1965 return 0; 1966 } 1967 1968 /* Check if we have unwritten allocated space */ 1969 if (is_metadata && 1970 block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) { 1971 spin_unlock(&block_group->lock); 1972 return -EAGAIN; 1973 } 1974 1975 /* 1976 * If we are sure that the block group is full (= no more room left for 1977 * new allocation) and the IO for the last usable block is completed, we 1978 * don't need to wait for the other IOs. This holds because we ensure 1979 * the sequential IO submissions using the ZONE_APPEND command for data 1980 * and block_group->meta_write_pointer for metadata. 1981 */ 1982 if (!fully_written) { 1983 spin_unlock(&block_group->lock); 1984 1985 ret = btrfs_inc_block_group_ro(block_group, false); 1986 if (ret) 1987 return ret; 1988 1989 /* Ensure all writes in this block group finish */ 1990 btrfs_wait_block_group_reservations(block_group); 1991 /* No need to wait for NOCOW writers. Zoned mode does not allow that */ 1992 btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, 1993 block_group->length); 1994 /* Wait for extent buffers to be written. */ 1995 if (is_metadata) 1996 wait_eb_writebacks(block_group); 1997 1998 spin_lock(&block_group->lock); 1999 2000 /* 2001 * Bail out if someone already deactivated the block group, or 2002 * allocated space is left in the block group. 
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct map_lookup *map;
	const bool is_metadata = (block_group->flags &
			(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
	int ret = 0;
	int i;

	spin_lock(&block_group->lock);
	if (!block_group->zone_is_active) {
		spin_unlock(&block_group->lock);
		return 0;
	}

	/* Check if we have unwritten allocated space */
	if (is_metadata &&
	    block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
		spin_unlock(&block_group->lock);
		return -EAGAIN;
	}

	/*
	 * If we are sure that the block group is full (= no more room left for
	 * new allocation) and the IO for the last usable block is completed, we
	 * don't need to wait for the other IOs. This holds because we ensure
	 * the sequential IO submissions using the ZONE_APPEND command for data
	 * and block_group->meta_write_pointer for metadata.
	 */
	if (!fully_written) {
		spin_unlock(&block_group->lock);

		ret = btrfs_inc_block_group_ro(block_group, false);
		if (ret)
			return ret;

		/* Ensure all writes in this block group finish */
		btrfs_wait_block_group_reservations(block_group);
		/* No need to wait for NOCOW writers. Zoned mode does not allow that */
		btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
					 block_group->length);
		/* Wait for extent buffers to be written. */
		if (is_metadata)
			wait_eb_writebacks(block_group);

		spin_lock(&block_group->lock);

		/*
		 * Bail out if someone already deactivated the block group, or
		 * allocated space is left in the block group.
		 */
		if (!block_group->zone_is_active) {
			spin_unlock(&block_group->lock);
			btrfs_dec_block_group_ro(block_group);
			return 0;
		}

		if (block_group->reserved) {
			spin_unlock(&block_group->lock);
			btrfs_dec_block_group_ro(block_group);
			return -EAGAIN;
		}
	}

	block_group->zone_is_active = 0;
	block_group->alloc_offset = block_group->zone_capacity;
	block_group->free_space_ctl->free_space = 0;
	btrfs_clear_treelog_bg(block_group);
	btrfs_clear_data_reloc_bg(block_group);
	spin_unlock(&block_group->lock);

	map = block_group->physical_map;
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		const u64 physical = map->stripes[i].physical;

		if (device->zone_info->max_active_zones == 0)
			continue;

		ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
				       physical >> SECTOR_SHIFT,
				       device->zone_info->zone_size >> SECTOR_SHIFT,
				       GFP_NOFS);

		if (ret)
			return ret;

		btrfs_dev_clear_active_zone(device, physical);
	}

	if (!fully_written)
		btrfs_dec_block_group_ro(block_group);

	spin_lock(&fs_info->zone_active_bgs_lock);
	ASSERT(!list_empty(&block_group->active_bg_list));
	list_del_init(&block_group->active_bg_list);
	spin_unlock(&fs_info->zone_active_bgs_lock);

	/* For active_bg_list */
	btrfs_put_block_group(block_group);

	clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);

	return 0;
}

int btrfs_zone_finish(struct btrfs_block_group *block_group)
{
	if (!btrfs_is_zoned(block_group->fs_info))
		return 0;

	return do_zone_finish(block_group, false);
}

bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
{
	struct btrfs_fs_info *fs_info = fs_devices->fs_info;
	struct btrfs_device *device;
	bool ret = false;

	if (!btrfs_is_zoned(fs_info))
		return true;

	/* Check if there is a device with active zones left */
	mutex_lock(&fs_info->chunk_mutex);
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		struct btrfs_zoned_device_info *zinfo = device->zone_info;

		if (!device->bdev)
			continue;

		if (!zinfo->max_active_zones ||
		    atomic_read(&zinfo->active_zones_left)) {
			ret = true;
			break;
		}
	}
	mutex_unlock(&fs_info->chunk_mutex);

	if (!ret)
		set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);

	return ret;
}
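
/*
 * Called on completion of a write at @logical/@length. If the space left in
 * the block group's zone after this write is too small to hold even one more
 * minimal allocation unit (sectorsize for data, nodesize for metadata), the
 * block group is fully written and its zones are finished right away to
 * release the devices' active zone resources.
 */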
void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
{
	struct btrfs_block_group *block_group;
	u64 min_alloc_bytes;

	if (!btrfs_is_zoned(fs_info))
		return;

	block_group = btrfs_lookup_block_group(fs_info, logical);
	ASSERT(block_group);

	/* No MIXED_BG on zoned btrfs. */
	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
		min_alloc_bytes = fs_info->sectorsize;
	else
		min_alloc_bytes = fs_info->nodesize;

	/* Bail out if we can allocate more data from this block group. */
	if (logical + length + min_alloc_bytes <=
	    block_group->start + block_group->zone_capacity)
		goto out;

	do_zone_finish(block_group, true);

out:
	btrfs_put_block_group(block_group);
}

static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
{
	struct btrfs_block_group *bg =
		container_of(work, struct btrfs_block_group, zone_finish_work);

	wait_on_extent_buffer_writeback(bg->last_eb);
	free_extent_buffer(bg->last_eb);
	btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
	btrfs_put_block_group(bg);
}

void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
				   struct extent_buffer *eb)
{
	if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
		return;

	if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
		btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
			  bg->start);
		return;
	}

	/* For the work */
	btrfs_get_block_group(bg);
	atomic_inc(&eb->refs);
	bg->last_eb = eb;
	INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
	queue_work(system_unbound_wq, &bg->zone_finish_work);
}

void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	spin_lock(&fs_info->relocation_bg_lock);
	if (fs_info->data_reloc_bg == bg->start)
		fs_info->data_reloc_bg = 0;
	spin_unlock(&fs_info->relocation_bg_lock);
}

void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;

	if (!btrfs_is_zoned(fs_info))
		return;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (device->zone_info) {
			vfree(device->zone_info->zone_cache);
			device->zone_info->zone_cache = NULL;
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);
}

bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 used = 0;
	u64 total = 0;
	u64 factor;

	ASSERT(btrfs_is_zoned(fs_info));

	if (fs_info->bg_reclaim_threshold == 0)
		return false;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (!device->bdev)
			continue;

		total += device->disk_total_bytes;
		used += device->bytes_used;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	factor = div64_u64(used * 100, total);
	return factor >= fs_info->bg_reclaim_threshold;
}

void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
				       u64 length)
{
	struct btrfs_block_group *block_group;

	if (!btrfs_is_zoned(fs_info))
		return;

	block_group = btrfs_lookup_block_group(fs_info, logical);
	/* It should be called on a previous data relocation block group. */
	ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));

	spin_lock(&block_group->lock);
	if (!block_group->zoned_data_reloc_ongoing)
		goto out;

	/* All relocation extents are written. */
	if (block_group->start + block_group->alloc_offset == logical + length) {
		/* Now, release this block group for further allocations. */
		block_group->zoned_data_reloc_ongoing = 0;
	}

out:
	spin_unlock(&block_group->lock);
	btrfs_put_block_group(block_group);
}
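
/*
 * Pick the active block group with the least remaining zone space (skipping
 * SYSTEM block groups and ones with outstanding reservations) and finish its
 * zones, freeing up active zone resources on the underlying devices.
 *
 * Returns 1 if a block group was finished, 0 if no candidate was found, or a
 * negative errno on failure.
 */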
int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group;
	struct btrfs_block_group *min_bg = NULL;
	u64 min_avail = U64_MAX;
	int ret;

	spin_lock(&fs_info->zone_active_bgs_lock);
	list_for_each_entry(block_group, &fs_info->zone_active_bgs,
			    active_bg_list) {
		u64 avail;

		spin_lock(&block_group->lock);
		if (block_group->reserved ||
		    (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) {
			spin_unlock(&block_group->lock);
			continue;
		}

		avail = block_group->zone_capacity - block_group->alloc_offset;
		if (min_avail > avail) {
			if (min_bg)
				btrfs_put_block_group(min_bg);
			min_bg = block_group;
			min_avail = avail;
			btrfs_get_block_group(min_bg);
		}
		spin_unlock(&block_group->lock);
	}
	spin_unlock(&fs_info->zone_active_bgs_lock);

	if (!min_bg)
		return 0;

	ret = btrfs_zone_finish(min_bg);
	btrfs_put_block_group(min_bg);

	return ret < 0 ? ret : 1;
}

int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				bool do_finish)
{
	struct btrfs_block_group *bg;
	int index;

	if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
		return 0;

	/* No more block groups to activate */
	if (space_info->active_total_bytes == space_info->total_bytes)
		return 0;

	for (;;) {
		int ret;
		bool need_finish = false;

		down_read(&space_info->groups_sem);
		for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
			list_for_each_entry(bg, &space_info->block_groups[index],
					    list) {
				if (!spin_trylock(&bg->lock))
					continue;
				if (btrfs_zoned_bg_is_full(bg) || bg->zone_is_active) {
					spin_unlock(&bg->lock);
					continue;
				}
				spin_unlock(&bg->lock);

				if (btrfs_zone_activate(bg)) {
					up_read(&space_info->groups_sem);
					return 1;
				}

				need_finish = true;
			}
		}
		up_read(&space_info->groups_sem);

		if (!do_finish || !need_finish)
			break;

		ret = btrfs_zone_finish_one_bg(fs_info);
		if (ret == 0)
			break;
		if (ret < 0)
			return ret;
	}

	return 0;
}
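
/*
 * Illustrative caller sketch (an assumption for documentation purposes, not a
 * caller defined in this file): the metadata reservation path is expected to
 * use btrfs_zoned_activate_one_bg() roughly like this, optionally finishing
 * the fullest active block group when no activation is possible. The "retry"
 * label is hypothetical.
 *
 *	ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true);
 *	if (ret < 0)
 *		return ret;	// activating or finishing a zone failed
 *	if (ret == 1)
 *		goto retry;	// a block group was activated, retry reservation
 *	// ret == 0: nothing left to activate for this space_info
 */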