// SPDX-License-Identifier: GPL-2.0
/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 * Copyright (c) 2024, Western Digital Corporation or its affiliates.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/mempool.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-mq-debugfs.h"

#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
	ZONE_COND_NAME(NOT_WP),
	ZONE_COND_NAME(EMPTY),
	ZONE_COND_NAME(IMP_OPEN),
	ZONE_COND_NAME(EXP_OPEN),
	ZONE_COND_NAME(CLOSED),
	ZONE_COND_NAME(READONLY),
	ZONE_COND_NAME(FULL),
	ZONE_COND_NAME(OFFLINE),
	ZONE_COND_NAME(ACTIVE),
};
#undef ZONE_COND_NAME

/*
 * Per-zone write plug.
 * @node: hlist_node structure for managing the plug using a hash table.
 * @bio_list: The list of BIOs that are currently plugged.
 * @bio_work: Work struct to handle issuing of plugged BIOs
 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
 * @disk: The gendisk the plug belongs to.
 * @lock: Spinlock to atomically manipulate the plug.
 * @ref: Zone write plug reference counter. A zone write plug reference is
 *	always at least 1 when the plug is hashed in the disk plug hash table.
 *	The reference is incremented whenever a new BIO needing plugging is
 *	submitted and when a function needs to manipulate a plug. The
 *	reference count is decremented whenever a plugged BIO completes and
 *	when a function that referenced the plug returns. The initial
 *	reference is dropped whenever the zone of the zone write plug is reset,
 *	finished and when the zone becomes full (last write BIO to the zone
 *	completes).
 * @flags: Flags indicating the plug state.
 * @zone_no: The number of the zone the plug is managing.
 * @wp_offset: The zone write pointer location relative to the start of the zone
 *	as a number of 512B sectors.
 * @cond: Condition of the zone
 */
struct blk_zone_wplug {
	struct hlist_node	node;
	struct bio_list		bio_list;
	struct work_struct	bio_work;
	struct rcu_head		rcu_head;
	struct gendisk		*disk;
	spinlock_t		lock;
	refcount_t		ref;
	unsigned int		flags;
	unsigned int		zone_no;
	unsigned int		wp_offset;
	enum blk_zone_cond	cond;
};

static inline bool disk_need_zone_resources(struct gendisk *disk)
{
	/*
	 * All request-based zoned devices need zone resources so that the
	 * block layer can automatically handle write BIO plugging. BIO-based
	 * device drivers (e.g. DM devices) are normally responsible for
	 * handling zone write ordering and do not need zone resources, unless
	 * the driver requires zone append emulation.
	 */
	return queue_is_mq(disk->queue) ||
		queue_emulates_zone_append(disk->queue);
}

static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
{
	return 1U << disk->zone_wplugs_hash_bits;
}

/*
 * Zone write plug flags bits:
 * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
 *   that is, that write BIOs are being throttled due to a write BIO already
 *   being executed or the zone write plug bio list is not empty.
 * - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
 *   write pointer offset and need to update it.
 * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
 *   from the disk hash table and that the initial reference to the zone
 *   write plug set when the plug was first added to the hash table has been
 *   dropped. This flag is set when a zone is reset, finished or becomes full,
 *   to prevent new references to the zone write plug from being taken for
 *   newly incoming BIOs. A zone write plug flagged with this flag will be
 *   freed once all remaining references from BIOs or functions are dropped.
 */
#define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)
#define BLK_ZONE_WPLUG_NEED_WP_UPDATE	(1U << 1)
#define BLK_ZONE_WPLUG_UNHASHED		(1U << 2)

/**
 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
 * @zone_cond: BLK_ZONE_COND_XXX.
 *
 * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX
 * into string format. Useful for debugging and tracing zone conditions. For
 * invalid BLK_ZONE_COND_XXX it returns string "UNKNOWN".
 */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
{
	static const char *zone_cond_str = "UNKNOWN";

	if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
		zone_cond_str = zone_cond_name[zone_cond];

	return zone_cond_str;
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);

static void blk_zone_set_cond(u8 *zones_cond, unsigned int zno,
			      enum blk_zone_cond cond)
{
	if (!zones_cond)
		return;

	switch (cond) {
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
	case BLK_ZONE_COND_CLOSED:
		zones_cond[zno] = BLK_ZONE_COND_ACTIVE;
		return;
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_FULL:
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
	default:
		zones_cond[zno] = cond;
		return;
	}
}

static void disk_zone_set_cond(struct gendisk *disk, sector_t sector,
			       enum blk_zone_cond cond)
{
	u8 *zones_cond;

	rcu_read_lock();
	zones_cond = rcu_dereference(disk->zones_cond);
	if (zones_cond) {
		unsigned int zno = disk_zone_no(disk, sector);

		/*
		 * The condition of conventional, read-only and offline zones
		 * never changes, so do nothing if the target zone is in one of
		 * these conditions.
		 */
		switch (zones_cond[zno]) {
		case BLK_ZONE_COND_NOT_WP:
		case BLK_ZONE_COND_READONLY:
		case BLK_ZONE_COND_OFFLINE:
			break;
		default:
			blk_zone_set_cond(zones_cond, zno, cond);
			break;
		}
	}
	rcu_read_unlock();
}

/**
 * bdev_zone_is_seq - check if a sector belongs to a sequential write zone
 * @bdev: block device to check
 * @sector: sector number
 *
 * Check if @sector on @bdev is contained in a sequential write required zone.
 */
bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
{
	struct gendisk *disk = bdev->bd_disk;
	unsigned int zno = disk_zone_no(disk, sector);
	bool is_seq = false;
	u8 *zones_cond;

	if (!bdev_is_zoned(bdev))
		return false;

	rcu_read_lock();
	zones_cond = rcu_dereference(disk->zones_cond);
	if (zones_cond && zno < disk->nr_zones)
		is_seq = zones_cond[zno] != BLK_ZONE_COND_NOT_WP;
	rcu_read_unlock();

	return is_seq;
}
EXPORT_SYMBOL_GPL(bdev_zone_is_seq);

/*
 * Zone report arguments for a block device driver's report_zones operation.
 * @cb: report_zones_cb callback for each reported zone.
 * @data: Private data passed to report_zones_cb.
 * @report_active: Report the implicit open, explicit open and closed zone
 *	conditions as BLK_ZONE_COND_ACTIVE.
 */
struct blk_report_zones_args {
	report_zones_cb	cb;
	void		*data;
	bool		report_active;
};

static int blkdev_do_report_zones(struct block_device *bdev, sector_t sector,
				  unsigned int nr_zones,
				  struct blk_report_zones_args *args)
{
	struct gendisk *disk = bdev->bd_disk;

	if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
		return -EOPNOTSUPP;

	if (!nr_zones || sector >= get_capacity(disk))
		return 0;

	return disk->fops->report_zones(disk, sector, nr_zones, args);
}

/**
 * blkdev_report_zones - Get zones information
 * @bdev: Target block device
 * @sector: Sector from which to report zones
 * @nr_zones: Maximum number of zones to report
 * @cb: Callback function called for each reported zone
 * @data: Private data for the callback
 *
 * Description:
 * Get zone information starting from the zone containing @sector for at most
 * @nr_zones, and call @cb for each zone reported by the device.
 * To report all zones in a device starting from @sector, the BLK_ALL_ZONES
 * constant can be passed to @nr_zones.
 * Returns the number of zones reported by the device, or a negative errno
 * value in case of failure.
 *
 * Note: The caller must use memalloc_noXX_save/restore() calls to control
 * memory allocations done within this function.
 */
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
			unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct blk_report_zones_args args = {
		.cb = cb,
		.data = data,
	};

	return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);

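/*
 * Example (illustrative sketch, not part of the build): a hypothetical caller
 * could use blkdev_report_zones() to count the sequential write required
 * zones of a device. The callback and variable names below are assumptions
 * made for this sketch.
 *
 *	static int count_seq_zones_cb(struct blk_zone *zone, unsigned int idx,
 *				      void *data)
 *	{
 *		if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
 *			(*(unsigned int *)data)++;
 *		return 0;
 *	}
 *
 *	unsigned int nr_seq_zones = 0;
 *	int ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
 *				      count_seq_zones_cb, &nr_seq_zones);
 */
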
static int blkdev_zone_reset_all(struct block_device *bdev)
{
	struct bio bio;

	bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
	trace_blkdev_zone_mgmt(&bio, 0);
	return submit_bio_wait(&bio);
}

/**
 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
 * @bdev: Target block device
 * @op: Operation to be performed on the zones
 * @sector: Start sector of the first zone to operate on
 * @nr_sectors: Number of sectors, should be at least the length of one zone and
 *	must be zone size aligned.
 *
 * Description:
 * Perform the specified operation on the range of zones specified by
 * @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 * is valid, but the specified range should not contain conventional zones.
 * The operation to execute on each zone can be a zone reset, open, close
 * or finish request.
 */
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
		     sector_t sector, sector_t nr_sectors)
{
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	sector_t capacity = bdev_nr_sectors(bdev);
	sector_t end_sector = sector + nr_sectors;
	struct bio *bio = NULL;
	int ret = 0;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	if (bdev_read_only(bdev))
		return -EPERM;

	if (!op_is_zone_mgmt(op))
		return -EOPNOTSUPP;

	if (end_sector <= sector || end_sector > capacity)
		/* Out of range */
		return -EINVAL;

	/* Check alignment (handle eventual smaller last zone) */
	if (!bdev_is_zone_start(bdev, sector))
		return -EINVAL;

	if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity)
		return -EINVAL;

	/*
	 * In the case of a zone reset operation over all zones, use
	 * REQ_OP_ZONE_RESET_ALL.
	 */
	if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity)
		return blkdev_zone_reset_all(bdev);

	while (sector < end_sector) {
		bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
		bio->bi_iter.bi_sector = sector;
		sector += zone_sectors;

		/* This may take a while, so be nice to others */
		cond_resched();
	}

	trace_blkdev_zone_mgmt(bio, nr_sectors);
	ret = submit_bio_wait(bio);
	bio_put(bio);

	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);

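/*
 * Example (illustrative sketch, not part of the build): a hypothetical caller
 * resetting the single zone that contains @sector. bdev, sector and ret are
 * assumed to be defined by the caller.
 *
 *	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
 *			       bdev_zone_start(bdev, sector),
 *			       bdev_zone_sectors(bdev));
 */
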
struct zone_report_args {
	struct blk_zone __user *zones;
};

static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
				    void *data)
{
	struct zone_report_args *args = data;

	if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
		return -EFAULT;
	return 0;
}

/*
 * Mask of valid input flags for BLKREPORTZONEV2 ioctl.
 */
#define BLK_ZONE_REPV2_INPUT_FLAGS	BLK_ZONE_REP_CACHED

/*
 * BLKREPORTZONE and BLKREPORTZONEV2 ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
			      unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct zone_report_args args;
	struct blk_zone_report rep;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
		return -EFAULT;

	if (!rep.nr_zones)
		return -EINVAL;

	args.zones = argp + sizeof(struct blk_zone_report);

	switch (cmd) {
	case BLKREPORTZONE:
		ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
					  blkdev_copy_zone_to_user, &args);
		break;
	case BLKREPORTZONEV2:
		if (rep.flags & ~BLK_ZONE_REPV2_INPUT_FLAGS)
			return -EINVAL;
		ret = blkdev_report_zones_cached(bdev, rep.sector, rep.nr_zones,
						 blkdev_copy_zone_to_user, &args);
		break;
	default:
		return -EINVAL;
	}

	if (ret < 0)
		return ret;

	rep.nr_zones = ret;
	rep.flags = BLK_ZONE_REP_CAPACITY;
	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
		return -EFAULT;
	return 0;
}

static int blkdev_truncate_zone_range(struct block_device *bdev,
		blk_mode_t mode, const struct blk_zone_range *zrange)
{
	loff_t start, end;

	if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
	    zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
		/* Out of range */
		return -EINVAL;

	start = zrange->sector << SECTOR_SHIFT;
	end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;

	return truncate_bdev_range(bdev, mode, start, end);
}

/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct blk_zone_range zrange;
	enum req_op op;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (!(mode & BLK_OPEN_WRITE))
		return -EBADF;

	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
		return -EFAULT;

	switch (cmd) {
	case BLKRESETZONE:
		op = REQ_OP_ZONE_RESET;

		/* Invalidate the page cache, including dirty pages. */
		inode_lock(bdev->bd_mapping->host);
		filemap_invalidate_lock(bdev->bd_mapping);
		ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
		if (ret)
			goto fail;
		break;
	case BLKOPENZONE:
		op = REQ_OP_ZONE_OPEN;
		break;
	case BLKCLOSEZONE:
		op = REQ_OP_ZONE_CLOSE;
		break;
	case BLKFINISHZONE:
		op = REQ_OP_ZONE_FINISH;
		break;
	default:
		return -ENOTTY;
	}

	ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);

fail:
	if (cmd == BLKRESETZONE) {
		filemap_invalidate_unlock(bdev->bd_mapping);
		inode_unlock(bdev->bd_mapping->host);
	}

	return ret;
}

static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
{
	return zone->start + zone->len >= get_capacity(disk);
}

static bool disk_zone_is_full(struct gendisk *disk,
			      unsigned int zno, unsigned int offset_in_zone)
{
	if (zno < disk->nr_zones - 1)
		return offset_in_zone >= disk->zone_capacity;
	return offset_in_zone >= disk->last_zone_capacity;
}

static bool disk_zone_wplug_is_full(struct gendisk *disk,
				    struct blk_zone_wplug *zwplug)
{
	return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset);
}

static bool disk_insert_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	struct blk_zone_wplug *zwplg;
	unsigned long flags;
	u8 *zones_cond;
	unsigned int idx =
		hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);

	/*
	 * Add the new zone write plug to the hash table, but carefully as we
	 * are racing with other submission contexts, so we may already have a
	 * zone write plug for the same zone.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
		if (zwplg->zone_no == zwplug->zone_no) {
			spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
			return false;
		}
	}

	/*
	 * Set the zone condition: if we do not yet have a zones_cond array
	 * attached to the disk, then this is a zone write plug insert from the
	 * first call to blk_revalidate_disk_zones(), in which case the zone is
	 * necessarily in the active condition.
	 */
	zones_cond = rcu_dereference_check(disk->zones_cond,
				lockdep_is_held(&disk->zone_wplugs_lock));
	if (zones_cond)
		zwplug->cond = zones_cond[zwplug->zone_no];
	else
		zwplug->cond = BLK_ZONE_COND_ACTIVE;

	hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
	atomic_inc(&disk->nr_zone_wplugs);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

	return true;
}

static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk,
							  sector_t sector)
{
	unsigned int zno = disk_zone_no(disk, sector);
	unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
	struct blk_zone_wplug *zwplug;

	rcu_read_lock();

	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
		if (zwplug->zone_no == zno &&
		    refcount_inc_not_zero(&zwplug->ref)) {
			rcu_read_unlock();
			return zwplug;
		}
	}

	rcu_read_unlock();

	return NULL;
}

static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
							  sector_t sector)
{
	if (!atomic_read(&disk->nr_zone_wplugs))
		return NULL;

	return disk_get_hashed_zone_wplug(disk, sector);
}

static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
{
	struct blk_zone_wplug *zwplug =
		container_of(rcu_head, struct blk_zone_wplug, rcu_head);

	mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
}

static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
	if (refcount_dec_and_test(&zwplug->ref)) {
		WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
		WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
		WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));

		call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
	}
}

static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
						 struct blk_zone_wplug *zwplug)
{
	lockdep_assert_held(&zwplug->lock);

	/* If the zone write plug was already removed, we are done. */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
		return false;

	/* If the zone write plug is still plugged, it cannot be removed. */
	if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
		return false;

	/*
	 * Completions of BIOs with blk_zone_write_plug_bio_endio() may
	 * happen after handling a request completion with
	 * blk_zone_write_plug_finish_request() (e.g. with split BIOs
	 * that are chained). In such case, disk_zone_wplug_unplug_bio()
	 * should not attempt to remove the zone write plug until all BIO
	 * completions are seen. Check by looking at the zone write plug
	 * reference count, which is 2 when the plug is unused (one reference
	 * taken when the plug was allocated and another reference taken by the
	 * caller context).
	 */
	if (refcount_read(&zwplug->ref) > 2)
		return false;

	/* We can remove zone write plugs for zones that are empty or full. */
	return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug);
}

static void disk_remove_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	/* If the zone write plug was already removed, we have nothing to do. */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
		return;

	/*
	 * Mark the zone write plug as unhashed and drop the extra reference we
	 * took when the plug was inserted in the hash table. Also update the
	 * disk zone condition array with the current condition of the zone
	 * write plug.
	 */
	zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	blk_zone_set_cond(rcu_dereference_check(disk->zones_cond,
				lockdep_is_held(&disk->zone_wplugs_lock)),
			  zwplug->zone_no, zwplug->cond);
	hlist_del_init_rcu(&zwplug->node);
	atomic_dec(&disk->nr_zone_wplugs);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
	disk_put_zone_wplug(zwplug);
}

static void blk_zone_wplug_bio_work(struct work_struct *work);

/*
 * Get a reference on the write plug for the zone containing @sector.
 * If the plug does not exist, it is allocated and hashed.
 * Return a pointer to the zone write plug with the plug spinlock held.
 */
static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
					sector_t sector, gfp_t gfp_mask,
					unsigned long *flags)
{
	unsigned int zno = disk_zone_no(disk, sector);
	struct blk_zone_wplug *zwplug;

again:
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		/*
		 * Check that a BIO completion or a zone reset or finish
		 * operation has not already removed the zone write plug from
		 * the hash table and dropped its reference count. In such case,
		 * we need to get a new plug so start over from the beginning.
		 */
		spin_lock_irqsave(&zwplug->lock, *flags);
		if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
			spin_unlock_irqrestore(&zwplug->lock, *flags);
			disk_put_zone_wplug(zwplug);
			goto again;
		}
		return zwplug;
	}

	/*
	 * Allocate and initialize a zone write plug with an extra reference
	 * so that it is not freed when the zone write plug becomes idle without
	 * the zone being full.
	 */
	zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
	if (!zwplug)
		return NULL;

	INIT_HLIST_NODE(&zwplug->node);
	refcount_set(&zwplug->ref, 2);
	spin_lock_init(&zwplug->lock);
	zwplug->flags = 0;
	zwplug->zone_no = zno;
	zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
	bio_list_init(&zwplug->bio_list);
	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
	zwplug->disk = disk;

	spin_lock_irqsave(&zwplug->lock, *flags);

	/*
	 * Insert the new zone write plug in the hash table. This can fail only
	 * if another context already inserted a plug. Retry from the beginning
	 * in such case.
	 */
	if (!disk_insert_zone_wplug(disk, zwplug)) {
		spin_unlock_irqrestore(&zwplug->lock, *flags);
		mempool_free(zwplug, disk->zone_wplugs_pool);
		goto again;
	}

	return zwplug;
}

static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
					       struct bio *bio)
{
	struct request_queue *q = zwplug->disk->queue;

	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
	bio_io_error(bio);
	disk_put_zone_wplug(zwplug);
	/* Drop the reference taken by disk_zone_wplug_add_bio(). */
	blk_queue_exit(q);
}

/*
 * Abort (fail) all plugged BIOs of a zone write plug.
 */
static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
	struct bio *bio;

	if (bio_list_empty(&zwplug->bio_list))
		return;

	pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n",
			    zwplug->disk->disk_name, zwplug->zone_no);
	while ((bio = bio_list_pop(&zwplug->bio_list)))
		blk_zone_wplug_bio_io_error(zwplug, bio);
}

/*
 * Update a zone write plug condition based on the write pointer offset.
 */
static void disk_zone_wplug_update_cond(struct gendisk *disk,
					struct blk_zone_wplug *zwplug)
{
	lockdep_assert_held(&zwplug->lock);

	if (disk_zone_wplug_is_full(disk, zwplug))
		zwplug->cond = BLK_ZONE_COND_FULL;
	else if (!zwplug->wp_offset)
		zwplug->cond = BLK_ZONE_COND_EMPTY;
	else
		zwplug->cond = BLK_ZONE_COND_ACTIVE;
}

/*
 * Set a zone write plug write pointer offset to the specified value.
 * This aborts all plugged BIOs, which is fine as this function is called for
 * a zone reset operation, a zone finish operation or if the zone needs a wp
 * update from a report zone after a write error.
 */
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug,
					  unsigned int wp_offset)
{
	lockdep_assert_held(&zwplug->lock);

	/* Update the zone write pointer and abort all plugged BIOs. */
	zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE;
	zwplug->wp_offset = wp_offset;
	disk_zone_wplug_update_cond(disk, zwplug);

	disk_zone_wplug_abort(zwplug);

	/*
	 * The zone write plug now has no BIO plugged: remove it from the
	 * hash table so that it cannot be seen. The plug will be freed
	 * when the last reference is dropped.
	 */
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);
}

static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
{
	switch (zone->cond) {
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
	case BLK_ZONE_COND_CLOSED:
	case BLK_ZONE_COND_ACTIVE:
		return zone->wp - zone->start;
	case BLK_ZONE_COND_EMPTY:
		return 0;
	case BLK_ZONE_COND_FULL:
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
	default:
		/*
		 * Conventional, full, offline and read-only zones do not have
		 * a valid write pointer.
		 */
		return UINT_MAX;
	}
}

static unsigned int disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
						   struct blk_zone *zone)
{
	struct blk_zone_wplug *zwplug;
	unsigned int wp_offset = blk_zone_wp_offset(zone);

	zwplug = disk_get_zone_wplug(disk, zone->start);
	if (zwplug) {
		unsigned long flags;

		spin_lock_irqsave(&zwplug->lock, flags);
		if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
			disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
	}

	return wp_offset;
}

/**
 * disk_report_zone - Report one zone
 * @disk: Target disk
 * @zone: The zone to report
 * @idx: The index of the zone in the overall zone report
 * @args: report zones callback and data
 *
 * Description:
 * Helper function for block device drivers to report one zone of a zone
 * report initiated with blkdev_report_zones(). The zone being reported is
 * specified by @zone and used to update, if necessary, the zone write plug
 * information for the zone. If @args specifies a user callback function,
 * this callback is executed.
 */
int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
		     unsigned int idx, struct blk_report_zones_args *args)
{
	if (args && args->report_active) {
		/*
		 * If we come here, then this is a zone report executed as a
		 * fallback for a cached report. So collapse the implicit open,
		 * explicit open and closed conditions into the active zone
		 * condition.
		 */
		switch (zone->cond) {
		case BLK_ZONE_COND_IMP_OPEN:
		case BLK_ZONE_COND_EXP_OPEN:
		case BLK_ZONE_COND_CLOSED:
			zone->cond = BLK_ZONE_COND_ACTIVE;
			break;
		default:
			break;
		}
	}

	if (disk->zone_wplugs_hash)
		disk_zone_wplug_sync_wp_offset(disk, zone);

	if (args && args->cb)
		return args->cb(zone, idx, args->data);

	return 0;
}
EXPORT_SYMBOL_GPL(disk_report_zone);

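/*
 * Example (illustrative sketch, not part of the build): a zoned block device
 * driver's ->report_zones() method would typically fill a struct blk_zone for
 * each zone found on the device and pass it to disk_report_zone(). The
 * variable names below are assumptions made for this sketch.
 *
 *	for (i = 0; i < nr_zones && sector < dev_capacity; i++) {
 *		// Fill "zone" from the device zone information at "sector".
 *		ret = disk_report_zone(disk, &zone, i, args);
 *		if (ret)
 *			return ret;
 *		sector += zone.len;
 *	}
 *	return i;
 */
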
static int blkdev_report_zone_cb(struct blk_zone *zone, unsigned int idx,
				 void *data)
{
	memcpy(data, zone, sizeof(struct blk_zone));
	return 0;
}

static int blkdev_report_zone_fallback(struct block_device *bdev,
				       sector_t sector, struct blk_zone *zone)
{
	struct blk_report_zones_args args = {
		.cb = blkdev_report_zone_cb,
		.data = zone,
		.report_active = true,
	};
	int error;

	error = blkdev_do_report_zones(bdev, sector, 1, &args);
	if (error < 0)
		return error;
	if (error == 0)
		return -EIO;
	return 0;
}

/*
 * For devices that natively support zone append operations, we do not use zone
 * write plugging for zone append writes, which makes the zone condition
 * tracking invalid once zone append was used. In that case fall back to a
 * regular report zones to get correct information.
 */
static inline bool blkdev_has_cached_report_zones(struct block_device *bdev)
{
	return disk_need_zone_resources(bdev->bd_disk) &&
		(bdev_emulates_zone_append(bdev) ||
		 !test_bit(GD_ZONE_APPEND_USED, &bdev->bd_disk->state));
}

/**
 * blkdev_get_zone_info - Get a single zone information from cached data
 * @bdev: Target block device
 * @sector: Sector contained by the target zone
 * @zone: zone structure to return the zone information
 *
 * Description:
 * Get the zone information for the zone containing @sector using the zone
 * write plug of the target zone, if one exists, or the disk zone condition
 * array otherwise. The zone condition may be reported as being
 * the BLK_ZONE_COND_ACTIVE condition for a zone that is in the implicit
 * open, explicit open or closed condition.
 *
 * Returns 0 on success and a negative error code on failure.
 */
int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
			 struct blk_zone *zone)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	struct blk_zone_wplug *zwplug;
	unsigned long flags;
	u8 *zones_cond;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	if (sector >= get_capacity(disk))
		return -EINVAL;

	memset(zone, 0, sizeof(*zone));
	sector = bdev_zone_start(bdev, sector);

	if (!blkdev_has_cached_report_zones(bdev))
		return blkdev_report_zone_fallback(bdev, sector, zone);

	rcu_read_lock();
	zones_cond = rcu_dereference(disk->zones_cond);
	if (!disk->zone_wplugs_hash || !zones_cond) {
		rcu_read_unlock();
		return blkdev_report_zone_fallback(bdev, sector, zone);
	}
	zone->cond = zones_cond[disk_zone_no(disk, sector)];
	rcu_read_unlock();

	zone->start = sector;
	zone->len = zone_sectors;

	/*
	 * If this is a conventional zone, we do not have a zone write plug and
	 * can report the zone immediately.
	 */
	if (zone->cond == BLK_ZONE_COND_NOT_WP) {
		zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
		zone->capacity = zone_sectors;
		zone->wp = ULLONG_MAX;
		return 0;
	}

	/*
	 * This is a sequential write required zone. If the zone is read-only
	 * or offline, only set the zone write pointer to an invalid value and
	 * report the zone.
	 */
	zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
	if (disk_zone_is_last(disk, zone))
		zone->capacity = disk->last_zone_capacity;
	else
		zone->capacity = disk->zone_capacity;

	if (zone->cond == BLK_ZONE_COND_READONLY ||
	    zone->cond == BLK_ZONE_COND_OFFLINE) {
		zone->wp = ULLONG_MAX;
		return 0;
	}

	/*
	 * If the zone does not have a zone write plug, it is either full or
	 * empty, as we otherwise would have a zone write plug for it. In this
	 * case, set the write pointer accordingly and report the zone.
	 * Otherwise, if we have a zone write plug, use it.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (!zwplug) {
		if (zone->cond == BLK_ZONE_COND_FULL)
			zone->wp = ULLONG_MAX;
		else
			zone->wp = sector;
		return 0;
	}

	spin_lock_irqsave(&zwplug->lock, flags);
	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
		return blkdev_report_zone_fallback(bdev, sector, zone);
	}
	zone->cond = zwplug->cond;
	zone->wp = sector + zwplug->wp_offset;
	spin_unlock_irqrestore(&zwplug->lock, flags);

	disk_put_zone_wplug(zwplug);

	return 0;
}
EXPORT_SYMBOL_GPL(blkdev_get_zone_info);

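/*
 * Example (illustrative sketch, not part of the build): a hypothetical caller
 * checking whether the zone containing @sector is full, using the cached zone
 * information. bdev and sector are assumed to be defined by the caller.
 *
 *	struct blk_zone zone;
 *	int ret = blkdev_get_zone_info(bdev, sector, &zone);
 *
 *	if (!ret && zone.cond == BLK_ZONE_COND_FULL)
 *		pr_info("Zone at sector %llu is full\n", zone.start);
 */
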
/**
 * blkdev_report_zones_cached - Get cached zones information
 * @bdev: Target block device
 * @sector: Sector from which to report zones
 * @nr_zones: Maximum number of zones to report
 * @cb: Callback function called for each reported zone
 * @data: Private data for the callback function
 *
 * Description:
 * Similar to blkdev_report_zones() but instead of calling into the low level
 * device driver to get the zone report from the device, use
 * blkdev_get_zone_info() to generate the report from the disk zone write
 * plugs and zones condition array. Since calling this function without a
 * callback does not make sense, @cb must be specified.
 */
int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector,
			       unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t capacity = get_capacity(disk);
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	unsigned int idx = 0;
	struct blk_zone zone;
	int ret;

	if (!cb || !bdev_is_zoned(bdev) ||
	    WARN_ON_ONCE(!disk->fops->report_zones))
		return -EOPNOTSUPP;

	if (!nr_zones || sector >= capacity)
		return 0;

	if (!blkdev_has_cached_report_zones(bdev)) {
		struct blk_report_zones_args args = {
			.cb = cb,
			.data = data,
			.report_active = true,
		};

		return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
	}

	for (sector = bdev_zone_start(bdev, sector);
	     sector < capacity && idx < nr_zones;
	     sector += zone_sectors, idx++) {
		ret = blkdev_get_zone_info(bdev, sector, &zone);
		if (ret)
			return ret;

		ret = cb(&zone, idx, data);
		if (ret)
			return ret;
	}

	return idx;
}
EXPORT_SYMBOL_GPL(blkdev_report_zones_cached);

static void blk_zone_reset_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;

	/*
	 * If we have a zone write plug, set its write pointer offset to 0.
	 * This will abort all BIOs plugged for the target zone. It is fine as
	 * resetting zones while writes are still in-flight will result in the
	 * writes failing anyway.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		unsigned long flags;

		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
	} else {
		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
	}
}

static void blk_zone_reset_all_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t capacity = get_capacity(disk);
	struct blk_zone_wplug *zwplug;
	unsigned long flags;
	sector_t sector;
	unsigned int i;

	if (atomic_read(&disk->nr_zone_wplugs)) {
		/* Update the condition of all zone write plugs. */
		rcu_read_lock();
		for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
			hlist_for_each_entry_rcu(zwplug,
						 &disk->zone_wplugs_hash[i],
						 node) {
				spin_lock_irqsave(&zwplug->lock, flags);
				disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
				spin_unlock_irqrestore(&zwplug->lock, flags);
			}
		}
		rcu_read_unlock();
	}

	/* Update the cached zone conditions. */
	for (sector = 0; sector < capacity;
	     sector += bdev_zone_sectors(bio->bi_bdev))
		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
	clear_bit(GD_ZONE_APPEND_USED, &disk->state);
}

static void blk_zone_finish_bio_endio(struct bio *bio)
{
	struct block_device *bdev = bio->bi_bdev;
	struct gendisk *disk = bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;

	/*
	 * If we have a zone write plug, set its write pointer offset to the
	 * zone size. This will abort all BIOs plugged for the target zone. It
	 * is fine as finishing zones while writes are still in-flight will
	 * result in the writes failing anyway.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		unsigned long flags;

		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_set_wp_offset(disk, zwplug,
					      bdev_zone_sectors(bdev));
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
	} else {
		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_FULL);
	}
}

void blk_zone_mgmt_bio_endio(struct bio *bio)
{
	/* If the BIO failed, we have nothing to do. */
	if (bio->bi_status != BLK_STS_OK)
		return;

	switch (bio_op(bio)) {
	case REQ_OP_ZONE_RESET:
		blk_zone_reset_bio_endio(bio);
		return;
	case REQ_OP_ZONE_RESET_ALL:
		blk_zone_reset_all_bio_endio(bio);
		return;
	case REQ_OP_ZONE_FINISH:
		blk_zone_finish_bio_endio(bio);
		return;
	default:
		return;
	}
}

static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
					      struct blk_zone_wplug *zwplug)
{
	lockdep_assert_held(&zwplug->lock);

	/*
	 * Take a reference on the zone write plug and schedule the submission
	 * of the next plugged BIO. blk_zone_wplug_bio_work() will release the
	 * reference we take here.
	 */
	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
	refcount_inc(&zwplug->ref);
	queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
}

static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
					   struct blk_zone_wplug *zwplug,
					   struct bio *bio, unsigned int nr_segs)
{
	/*
	 * Grab an extra reference on the BIO request queue usage counter.
	 * This reference will be reused to submit a request for the BIO for
	 * blk-mq devices and dropped when the BIO is failed and after
	 * it is issued in the case of BIO-based devices.
	 */
	percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);

	/*
	 * The BIO is being plugged and thus will have to wait for the on-going
	 * write and for all other writes already plugged. So polling makes
	 * no sense.
	 */
	bio_clear_polled(bio);

	/*
	 * Reuse the poll cookie field to store the number of segments when
	 * split to the hardware limits.
	 */
	bio->__bi_nr_segments = nr_segs;

	/*
	 * We always receive BIOs after they are split and ready to be issued.
	 * The block layer passes the parts of a split BIO in order, and the
	 * user must also issue writes sequentially. So simply add the new BIO
	 * at the tail of the list to preserve the sequential write order.
	 */
	bio_list_add(&zwplug->bio_list, bio);
	trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no,
				      bio->bi_iter.bi_sector, bio_sectors(bio));
}

/*
 * Called from bio_attempt_back_merge() when a BIO was merged with a request.
 */
void blk_zone_write_plug_bio_merged(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/*
	 * If the BIO was already plugged, then we were called through
	 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
	 * For this case, we already hold a reference on the zone write plug for
	 * the BIO and blk_zone_write_plug_init_request() will handle the
	 * zone write pointer offset update.
	 */
	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
		return;

	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * Get a reference on the zone write plug of the target zone and advance
	 * the zone write pointer offset. Given that this is a merge, we already
	 * have at least one request and one BIO referencing the zone write
	 * plug. So this should not fail.
	 */
	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);
	zwplug->wp_offset += bio_sectors(bio);
	disk_zone_wplug_update_cond(disk, zwplug);
	spin_unlock_irqrestore(&zwplug->lock, flags);
}

/*
 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
 * already went through zone write plugging (either a new BIO or one that was
 * unplugged).
 */
void blk_zone_write_plug_init_request(struct request *req)
{
	sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
	struct request_queue *q = req->q;
	struct gendisk *disk = q->disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, blk_rq_pos(req));
	unsigned long flags;
	struct bio *bio;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/*
	 * Indicate that completion of this request needs to be handled with
	 * blk_zone_write_plug_finish_request(), which will drop the reference
	 * on the zone write plug we took above on entry to this function.
	 */
	req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;

	if (blk_queue_nomerges(q))
		return;

	/*
	 * Walk through the list of plugged BIOs to check if they can be merged
	 * into the back of the request.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);
	while (!disk_zone_wplug_is_full(disk, zwplug)) {
		bio = bio_list_peek(&zwplug->bio_list);
		if (!bio)
			break;

		if (bio->bi_iter.bi_sector != req_back_sector ||
		    !blk_rq_merge_ok(req, bio))
			break;

		WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
			     !bio->__bi_nr_segments);

		bio_list_pop(&zwplug->bio_list);
		if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
		    BIO_MERGE_OK) {
			bio_list_add_head(&zwplug->bio_list, bio);
			break;
		}

		/* Drop the reference taken by disk_zone_wplug_add_bio(). */
		blk_queue_exit(q);
		zwplug->wp_offset += bio_sectors(bio);
		disk_zone_wplug_update_cond(disk, zwplug);

		req_back_sector += bio_sectors(bio);
	}
	spin_unlock_irqrestore(&zwplug->lock, flags);
}

/*
 * Check and prepare a BIO for submission by incrementing the write pointer
 * offset of its zone write plug and changing zone append operations into
 * regular write when zone append emulation is needed.
 */
static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
				       struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;

	lockdep_assert_held(&zwplug->lock);

	/*
	 * If we lost track of the zone write pointer due to a write error,
	 * the user must either execute a report zones, reset the zone or
	 * finish the zone to recover a reliable write pointer position.
	 * Fail BIOs if the user did not do that as we cannot handle emulated
	 * zone append otherwise.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
		return false;

	/*
	 * Check that the user is not attempting to write to a full zone.
	 * We know such BIO will fail, and that would potentially overflow our
	 * write pointer offset beyond the end of the zone.
	 */
	if (disk_zone_wplug_is_full(disk, zwplug))
		return false;

	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		/*
		 * Use a regular write starting at the current write pointer.
		 * Similarly to native zone append operations, do not allow
		 * merging.
		 */
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
		bio->bi_iter.bi_sector += zwplug->wp_offset;

		/*
		 * Remember that this BIO is in fact a zone append operation
		 * so that we can restore its operation code on completion.
		 */
		bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
	} else {
		/*
		 * Check for non-sequential writes early as we know that BIOs
		 * with a start sector not aligned to the zone write pointer
		 * will fail.
		 */
		if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
			return false;
	}

	/* Advance the zone write pointer offset. */
	zwplug->wp_offset += bio_sectors(bio);
	disk_zone_wplug_update_cond(disk, zwplug);

	return true;
}

static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;
	gfp_t gfp_mask = GFP_NOIO;
	unsigned long flags;

	/*
	 * BIOs must be fully contained within a zone so that we use the
	 * correct zone write plug for the entire BIO. For blk-mq devices,
	 * the block layer should already have done any splitting required to
	 * ensure this and this BIO should thus not be straddling zone
	 * boundaries. For BIO-based devices, it is the responsibility of the
	 * driver to split the bio before submitting it.
	 */
	if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
		bio_io_error(bio);
		return true;
	}

	/* Conventional zones do not need write plugging. */
	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
		/* Zone append to conventional zones is not allowed. */
		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
			bio_io_error(bio);
			return true;
		}
		return false;
	}

	if (bio->bi_opf & REQ_NOWAIT)
		gfp_mask = GFP_NOWAIT;

	zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
	if (!zwplug) {
		if (bio->bi_opf & REQ_NOWAIT)
			bio_wouldblock_error(bio);
		else
			bio_io_error(bio);
		return true;
	}

	/* Indicate that this BIO is being handled using zone write plugging. */
	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * Add REQ_NOWAIT BIOs to the plug list to ensure that we will not see a
	 * BLK_STS_AGAIN failure if we let the caller submit the BIO.
	 */
	if (bio->bi_opf & REQ_NOWAIT) {
		bio->bi_opf &= ~REQ_NOWAIT;
		goto queue_bio;
	}

	/* If the zone is already plugged, add the BIO to the BIO plug list. */
	if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
		goto queue_bio;

	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		bio_io_error(bio);
		return true;
	}

	/* Otherwise, plug and let the caller submit the BIO. */
	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return false;

queue_bio:
	disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs);

	if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) {
		zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
		disk_zone_wplug_schedule_bio_work(disk, zwplug);
	}

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return true;
}

static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	if (!test_bit(GD_ZONE_APPEND_USED, &disk->state))
		set_bit(GD_ZONE_APPEND_USED, &disk->state);

	/*
	 * We have native support for zone append operations, so we are not
	 * going to handle @bio through plugging. However, we may already have a
	 * zone write plug for the target zone if that zone was previously
	 * partially written using regular writes. In such case, we risk leaving
	 * the plug in the disk hash table if the zone is fully written using
	 * zone append operations. Avoid this by removing the zone write plug.
	 */
	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	if (likely(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * We are about to remove the zone write plug. But if the user
	 * (mistakenly) has issued regular writes together with native zone
	 * append, we must abort the writes as otherwise the plugged BIOs would
	 * not be executed by the plug BIO work as disk_get_zone_wplug() will
	 * return NULL after the plug is removed. Aborting the plugged write
	 * BIOs is consistent with the fact that these writes will most likely
	 * fail anyway as there are no ordering guarantees between zone append
	 * operations and regular write operations.
	 */
	if (!bio_list_empty(&zwplug->bio_list)) {
		pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n",
				    disk->disk_name, zwplug->zone_no);
		disk_zone_wplug_abort(zwplug);
	}
	disk_remove_zone_wplug(disk, zwplug);
	spin_unlock_irqrestore(&zwplug->lock, flags);

	disk_put_zone_wplug(zwplug);
}

static bool blk_zone_wplug_handle_zone_mgmt(struct bio *bio)
{
	if (bio_op(bio) != REQ_OP_ZONE_RESET_ALL &&
	    !bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
		/*
		 * Zone reset and zone finish operations do not apply to
		 * conventional zones.
		 */
		bio_io_error(bio);
		return true;
	}

	/*
	 * No-wait zone management BIOs do not make much sense as the callers
	 * issue these as blocking operations in most cases. To avoid issues
	 * with the BIO execution potentially failing with BLK_STS_AGAIN, warn
	 * about REQ_NOWAIT being set and ignore that flag.
	 */
	if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT))
		bio->bi_opf &= ~REQ_NOWAIT;

	return false;
}

/**
 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
 * @bio: The BIO being submitted
 * @nr_segs: The number of physical segments of @bio
 *
 * Handle write, write zeroes and zone append operations requiring emulation
 * using zone write plugging.
 *
 * Return true whenever @bio execution needs to be delayed through the zone
 * write plug. Otherwise, return false to let the submission path process
 * @bio normally.
 */
bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
{
	struct block_device *bdev = bio->bi_bdev;

	if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash))
		return false;

	/*
	 * Regular writes and write zeroes need to be handled through the
	 * target zone write plug. This includes writes with REQ_FUA |
	 * REQ_PREFLUSH which may need to go through the flush machinery
	 * depending on the target device capabilities. Plugging such writes
	 * is fine as the flush machinery operates at the request level, below
	 * the plug, and completion of the flush sequence will go through the
	 * regular BIO completion, which will handle zone write plugging.
	 * Zone append operations for devices that requested emulation must
	 * also be plugged so that these BIOs can be changed into regular
	 * write BIOs.
	 * Zone reset, reset all and finish commands need special treatment
	 * to correctly track the write pointer offset of zones. These commands
	 * are not plugged as we do not need serialization with write
	 * operations. It is the responsibility of the user to not issue reset
	 * and finish commands when write operations are in flight.
	 */
	switch (bio_op(bio)) {
	case REQ_OP_ZONE_APPEND:
		if (!bdev_emulates_zone_append(bdev)) {
			blk_zone_wplug_handle_native_zone_append(bio);
			return false;
		}
		fallthrough;
	case REQ_OP_WRITE:
	case REQ_OP_WRITE_ZEROES:
		return blk_zone_wplug_handle_write(bio, nr_segs);
	case REQ_OP_ZONE_RESET:
	case REQ_OP_ZONE_FINISH:
	case REQ_OP_ZONE_RESET_ALL:
		return blk_zone_wplug_handle_zone_mgmt(bio);
	default:
		return false;
	}

	return false;
}
EXPORT_SYMBOL_GPL(blk_zone_plug_bio);

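/*
 * Example (illustrative sketch, not part of the build): a BIO-based driver
 * relying on the block layer for zone append emulation or write ordering
 * would typically call this helper from its submit path before remapping a
 * BIO, along these lines (the surrounding driver code is assumed):
 *
 *	if (blk_zone_plug_bio(bio, 0))
 *		return;		// The BIO was plugged and will be resubmitted
 *				// later by the zone write plug BIO work.
 *	// Otherwise, remap and submit the BIO as usual.
 */
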
static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
				       struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	spin_lock_irqsave(&zwplug->lock, flags);

	/* Schedule submission of the next plugged BIO if we have one. */
	if (!bio_list_empty(&zwplug->bio_list)) {
		disk_zone_wplug_schedule_bio_work(disk, zwplug);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		return;
	}

	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

	/*
	 * If the zone is full (it was fully written or finished) or empty
	 * (it was reset), remove its zone write plug from the hash table.
	 */
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);

	spin_unlock_irqrestore(&zwplug->lock, flags);
}

void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio)
{
	/*
	 * For zone append requests, the request sector indicates the location
	 * at which the BIO data was written. Return this value to the BIO
	 * issuer through the BIO iter sector.
	 * For plugged zone writes, which include emulated zone append, we need
	 * the original BIO sector so that blk_zone_write_plug_bio_endio() can
	 * look up the zone write plug.
	 */
	bio->bi_iter.bi_sector = rq->__sector;
	trace_blk_zone_append_update_request_bio(rq);
}

void blk_zone_write_plug_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	unsigned long flags;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/* Make sure we do not see this BIO again by clearing the plug flag. */
	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If this is a regular write emulating a zone append operation,
	 * restore the original operation code.
	 */
	if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_ZONE_APPEND;
		bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND);
	}

	/*
	 * If the BIO failed, abort all plugged BIOs and mark the plug as
	 * needing a write pointer update.
	 */
	if (bio->bi_status != BLK_STS_OK) {
		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_abort(zwplug);
		zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE;
		spin_unlock_irqrestore(&zwplug->lock, flags);
	}

	/* Drop the reference we took when the BIO was issued. */
	disk_put_zone_wplug(zwplug);

	/*
	 * For BIO-based devices, blk_zone_write_plug_finish_request()
	 * is not called. So we need to schedule execution of the next
	 * plugged BIO here.
	 */
	if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
		disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}

void blk_zone_write_plug_finish_request(struct request *req)
{
	struct gendisk *disk = req->q->disk;
	struct blk_zone_wplug *zwplug;

	zwplug = disk_get_zone_wplug(disk, req->__sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;

	/*
	 * Drop the reference we took when the request was initialized in
	 * blk_zone_write_plug_init_request().
	 */
	disk_put_zone_wplug(zwplug);

	disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}

static void blk_zone_wplug_bio_work(struct work_struct *work)
{
	struct blk_zone_wplug *zwplug =
		container_of(work, struct blk_zone_wplug, bio_work);
	struct block_device *bdev;
	unsigned long flags;
	struct bio *bio;
	bool prepared;

	/*
	 * Submit the next plugged BIO. If we do not have any, clear
	 * the plugged flag.
	 */
again:
	spin_lock_irqsave(&zwplug->lock, flags);
	bio = bio_list_pop(&zwplug->bio_list);
	if (!bio) {
		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
		spin_unlock_irqrestore(&zwplug->lock, flags);
		goto put_zwplug;
	}

	trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no,
				 bio->bi_iter.bi_sector, bio_sectors(bio));

	prepared = blk_zone_wplug_prepare_bio(zwplug, bio);
	spin_unlock_irqrestore(&zwplug->lock, flags);

	if (!prepared) {
		blk_zone_wplug_bio_io_error(zwplug, bio);
		goto again;
	}

	bdev = bio->bi_bdev;

	/*
	 * blk-mq devices will reuse the extra reference on the request queue
	 * usage counter we took when the BIO was plugged, but the submission
	 * path for BIO-based devices will not do that. So drop this extra
	 * reference here.
	 */
	if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
		bdev->bd_disk->fops->submit_bio(bio);
		blk_queue_exit(bdev->bd_disk->queue);
	} else {
		blk_mq_submit_bio(bio);
	}

put_zwplug:
	/* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
	disk_put_zone_wplug(zwplug);
}

void disk_init_zone_resources(struct gendisk *disk)
{
	spin_lock_init(&disk->zone_wplugs_lock);
}

/*
 * For the size of a disk zone write plug hash table, use the size of the
 * zone write plug mempool, which is the maximum of the disk open zones and
 * active zones limits. But do not exceed 4KB (512 hlist head entries), that
 * is, 9 bits. For a disk that has no limits, mempool size defaults to 128.
 */
#define BLK_ZONE_WPLUG_MAX_HASH_BITS		9
#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE	128

static int disk_alloc_zone_resources(struct gendisk *disk,
				     unsigned int pool_size)
{
	unsigned int i;

	atomic_set(&disk->nr_zone_wplugs, 0);
	disk->zone_wplugs_hash_bits =
		min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);

	disk->zone_wplugs_hash =
		kcalloc(disk_zone_wplugs_hash_size(disk),
			sizeof(struct hlist_head), GFP_KERNEL);
	if (!disk->zone_wplugs_hash)
		return -ENOMEM;

	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
		INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);

	disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
						sizeof(struct blk_zone_wplug));
	if (!disk->zone_wplugs_pool)
		goto free_hash;

	disk->zone_wplugs_wq =
		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
				pool_size, disk->disk_name);
	if (!disk->zone_wplugs_wq)
		goto destroy_pool;

	return 0;

destroy_pool:
	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;
free_hash:
	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
	return -ENOMEM;
}

static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
{
	struct blk_zone_wplug *zwplug;
	unsigned int i;

	if (!disk->zone_wplugs_hash)
		return;

	/* Free all the zone write plugs we have. */

static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
{
	struct blk_zone_wplug *zwplug;
	unsigned int i;

	if (!disk->zone_wplugs_hash)
		return;

	/* Free all the zone write plugs we have. */
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
		while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
			zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
					     struct blk_zone_wplug, node);
			refcount_inc(&zwplug->ref);
			disk_remove_zone_wplug(disk, zwplug);
			disk_put_zone_wplug(zwplug);
		}
	}

	WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs));
	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;

	/*
	 * Wait for the zone write plugs to be RCU-freed before destroying the
	 * mempool.
	 */
	rcu_barrier();
	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;
}

static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond)
{
	unsigned long flags;

	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond,
				lockdep_is_held(&disk->zone_wplugs_lock));
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

	kfree_rcu_mightsleep(zones_cond);
}

void disk_free_zone_resources(struct gendisk *disk)
{
	if (disk->zone_wplugs_wq) {
		destroy_workqueue(disk->zone_wplugs_wq);
		disk->zone_wplugs_wq = NULL;
	}

	disk_destroy_zone_wplugs_hash_table(disk);

	disk_set_zones_cond_array(disk, NULL);
	disk->zone_capacity = 0;
	disk->last_zone_capacity = 0;
	disk->nr_zones = 0;
}

struct blk_revalidate_zone_args {
	struct gendisk	*disk;
	u8		*zones_cond;
	unsigned int	nr_zones;
	unsigned int	nr_conv_zones;
	unsigned int	zone_capacity;
	unsigned int	last_zone_capacity;
	sector_t	sector;
};

static int disk_revalidate_zone_resources(struct gendisk *disk,
					  struct blk_revalidate_zone_args *args)
{
	struct queue_limits *lim = &disk->queue->limits;
	unsigned int pool_size;

	args->disk = disk;
	args->nr_zones =
		DIV_ROUND_UP_ULL(get_capacity(disk), lim->chunk_sectors);

	/* Cached zone conditions: 1 byte per zone */
	args->zones_cond = kzalloc(args->nr_zones, GFP_NOIO);
	if (!args->zones_cond)
		return -ENOMEM;

	if (!disk_need_zone_resources(disk))
		return 0;

	/*
	 * If the device has no limit on the maximum number of open and active
	 * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
	 */
	pool_size = max(lim->max_open_zones, lim->max_active_zones);
	if (!pool_size)
		pool_size =
			min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones);

	if (!disk->zone_wplugs_hash)
		return disk_alloc_zone_resources(disk, pool_size);

	return 0;
}
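
/*
 * Illustrative sizing for the allocations above (hypothetical geometry): with
 * a 4 TiB capacity and 256 MiB zones, chunk_sectors = 524288 and nr_zones =
 * DIV_ROUND_UP_ULL(capacity, chunk_sectors) = 16384, so the cached zone
 * condition array (1 byte per zone) occupies 16 KiB. The zone write plug hash
 * table and mempool are only allocated when the disk needs zone resources,
 * that is, for request-based devices or when zone append emulation is used.
 */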

/*
 * Update the disk zone resources information and device queue limits.
 * The disk queue is frozen when this is executed.
 */
static int disk_update_zone_resources(struct gendisk *disk,
				      struct blk_revalidate_zone_args *args)
{
	struct request_queue *q = disk->queue;
	unsigned int nr_seq_zones;
	unsigned int pool_size, memflags;
	struct queue_limits lim;
	int ret = 0;

	lim = queue_limits_start_update(q);

	memflags = blk_mq_freeze_queue(q);

	disk->nr_zones = args->nr_zones;
	if (args->nr_conv_zones >= disk->nr_zones) {
		pr_warn("%s: Invalid number of conventional zones %u / %u\n",
			disk->disk_name, args->nr_conv_zones, disk->nr_zones);
		ret = -ENODEV;
		goto unfreeze;
	}

	disk->zone_capacity = args->zone_capacity;
	disk->last_zone_capacity = args->last_zone_capacity;
	disk_set_zones_cond_array(disk, args->zones_cond);

	/*
	 * Some devices can advertise zone resource limits that are larger than
	 * the number of sequential zones of the zoned block device, e.g. a
	 * small ZNS namespace. In such a case, assume that the zoned device
	 * has no zone resource limits.
	 */
	nr_seq_zones = disk->nr_zones - args->nr_conv_zones;
	if (lim.max_open_zones >= nr_seq_zones)
		lim.max_open_zones = 0;
	if (lim.max_active_zones >= nr_seq_zones)
		lim.max_active_zones = 0;

	if (!disk->zone_wplugs_pool)
		goto commit;

	/*
	 * If the device has no limit on the maximum number of open and active
	 * zones, set its max open zone limit to the mempool size to indicate
	 * to the user that there is a potential performance impact due to
	 * dynamic zone write plug allocation when simultaneously writing to
	 * more zones than the size of the mempool.
	 */
	pool_size = max(lim.max_open_zones, lim.max_active_zones);
	if (!pool_size)
		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);

	mempool_resize(disk->zone_wplugs_pool, pool_size);

	if (!lim.max_open_zones && !lim.max_active_zones) {
		if (pool_size < nr_seq_zones)
			lim.max_open_zones = pool_size;
		else
			lim.max_open_zones = 0;
	}

commit:
	ret = queue_limits_commit_update(q, &lim);

unfreeze:
	if (ret)
		disk_free_zone_resources(disk);

	blk_mq_unfreeze_queue(q, memflags);

	return ret;
}
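
/*
 * Worked example of the limit adjustment above (illustrative numbers): a disk
 * with 1000 sequential zones and no open/active zone limits gets pool_size =
 * min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, 1000) = 128, the mempool is resized
 * to 128 plugs and max_open_zones is reported as 128 to hint that writing to
 * more than 128 zones simultaneously may fall back to dynamic zone write plug
 * allocation. Conversely, a device advertising max_open_zones = 64 but only
 * 8 sequential zones (e.g. a tiny ZNS namespace) has its limits cleared to 0.
 */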

static int blk_revalidate_zone_cond(struct blk_zone *zone, unsigned int idx,
				    struct blk_revalidate_zone_args *args)
{
	enum blk_zone_cond cond = zone->cond;

	/* Check that the zone condition is consistent with the zone type. */
	switch (cond) {
	case BLK_ZONE_COND_NOT_WP:
		if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
			goto invalid_condition;
		break;
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
	case BLK_ZONE_COND_CLOSED:
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_FULL:
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
		if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
			goto invalid_condition;
		break;
	default:
		pr_warn("%s: Invalid zone condition 0x%X\n",
			args->disk->disk_name, cond);
		return -ENODEV;
	}

	blk_zone_set_cond(args->zones_cond, idx, cond);

	return 0;

invalid_condition:
	pr_warn("%s: Invalid zone condition 0x%x for type 0x%x\n",
		args->disk->disk_name, cond, zone->type);

	return -ENODEV;
}

static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
				    struct blk_revalidate_zone_args *args)
{
	struct gendisk *disk = args->disk;

	if (zone->capacity != zone->len) {
		pr_warn("%s: Invalid conventional zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	if (disk_zone_is_last(disk, zone))
		args->last_zone_capacity = zone->capacity;

	args->nr_conv_zones++;

	return 0;
}

static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
				   struct blk_revalidate_zone_args *args)
{
	struct gendisk *disk = args->disk;
	struct blk_zone_wplug *zwplug;
	unsigned int wp_offset;
	unsigned long flags;

	/*
	 * Remember the capacity of the first sequential zone and check
	 * that it is constant for all zones, ignoring the last zone as it can
	 * be smaller.
	 */
	if (!args->zone_capacity)
		args->zone_capacity = zone->capacity;
	if (disk_zone_is_last(disk, zone)) {
		args->last_zone_capacity = zone->capacity;
	} else if (zone->capacity != args->zone_capacity) {
		pr_warn("%s: Invalid variable zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	/*
	 * If the device needs zone append emulation, we need to track the
	 * write pointer of all zones that are not empty nor full. So make sure
	 * we have a zone write plug for such zones if the device has a zone
	 * write plug hash table.
	 */
	if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash)
		return 0;

	wp_offset = disk_zone_wplug_sync_wp_offset(disk, zone);
	if (!wp_offset || wp_offset >= zone->capacity)
		return 0;

	zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags);
	if (!zwplug)
		return -ENOMEM;
	spin_unlock_irqrestore(&zwplug->lock, flags);
	disk_put_zone_wplug(zwplug);

	return 0;
}
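
/*
 * Illustration of the capacity checks above (hypothetical geometry): a ZNS
 * style device may report zones with len = 524288 sectors (256 MiB) but
 * capacity = 393216 sectors (192 MiB). All sequential zones must then report
 * the same 192 MiB capacity, except possibly the last zone, whose capacity is
 * tracked separately in last_zone_capacity. Any zone whose write pointer sits
 * between the zone start and its capacity (partially written) gets a zone
 * write plug pre-allocated here when zone append emulation is in use.
 */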

/*
 * Helper function to check the validity of zones of a zoned block device.
 */
static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
				  void *data)
{
	struct blk_revalidate_zone_args *args = data;
	struct gendisk *disk = args->disk;
	sector_t zone_sectors = disk->queue->limits.chunk_sectors;
	int ret;

	/* Check for bad zones and holes in the zone report */
	if (zone->start != args->sector) {
		pr_warn("%s: Zone gap at sectors %llu..%llu\n",
			disk->disk_name, args->sector, zone->start);
		return -ENODEV;
	}

	if (zone->start >= get_capacity(disk) || !zone->len) {
		pr_warn("%s: Invalid zone start %llu, length %llu\n",
			disk->disk_name, zone->start, zone->len);
		return -ENODEV;
	}

	/*
	 * All zones must have the same size, with the exception of a possibly
	 * smaller last zone.
	 */
	if (!disk_zone_is_last(disk, zone)) {
		if (zone->len != zone_sectors) {
			pr_warn("%s: Invalid zoned device with non constant zone size\n",
				disk->disk_name);
			return -ENODEV;
		}
	} else if (zone->len > zone_sectors) {
		pr_warn("%s: Invalid zoned device with larger last zone size\n",
			disk->disk_name);
		return -ENODEV;
	}

	if (!zone->capacity || zone->capacity > zone->len) {
		pr_warn("%s: Invalid zone capacity\n",
			disk->disk_name);
		return -ENODEV;
	}

	/* Check zone condition */
	ret = blk_revalidate_zone_cond(zone, idx, args);
	if (ret)
		return ret;

	/* Check zone type */
	switch (zone->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
		ret = blk_revalidate_conv_zone(zone, idx, args);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
		ret = blk_revalidate_seq_zone(zone, idx, args);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
	default:
		pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
			disk->disk_name, (int)zone->type, zone->start);
		ret = -ENODEV;
	}

	if (!ret)
		args->sector += zone->len;

	return ret;
}

/**
 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
 * @disk: Target disk
 *
 * Helper function for low-level device drivers to check, (re)allocate and
 * initialize resources used for managing zoned disks. This function should
 * normally be called by blk-mq based drivers when a zoned gendisk is probed
 * and when the zone configuration of the gendisk changes (e.g. after a
 * format). Before calling this function, the device driver must already have
 * set the device zone size (chunk_sectors limit) and the max zone append
 * limit. BIO based drivers can also use this function as long as the device
 * queue can be safely frozen.
 */
int blk_revalidate_disk_zones(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	sector_t zone_sectors = q->limits.chunk_sectors;
	sector_t capacity = get_capacity(disk);
	struct blk_revalidate_zone_args args = { };
	unsigned int memflags, noio_flag;
	struct blk_report_zones_args rep_args = {
		.cb = blk_revalidate_zone_cb,
		.data = &args,
	};
	int ret = -ENOMEM;

	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
		return -EIO;

	if (!capacity)
		return -ENODEV;

	/*
	 * Check that the device driver indicated a valid zone size and that
	 * the max zone append limit is set.
	 */
	if (!zone_sectors || !is_power_of_2(zone_sectors)) {
		pr_warn("%s: Invalid non power of two zone size (%llu)\n",
			disk->disk_name, zone_sectors);
		return -ENODEV;
	}

	/*
	 * Ensure that all memory allocations in this context are done as if
	 * GFP_NOIO was specified.
	 */
	noio_flag = memalloc_noio_save();
	ret = disk_revalidate_zone_resources(disk, &args);
	if (ret) {
		memalloc_noio_restore(noio_flag);
		return ret;
	}

	ret = disk->fops->report_zones(disk, 0, UINT_MAX, &rep_args);
	if (!ret) {
		pr_warn("%s: No zones reported\n", disk->disk_name);
		ret = -ENODEV;
	}
	memalloc_noio_restore(noio_flag);

	/*
	 * If zones were reported, make sure that the entire disk capacity
	 * has been checked.
	 */
	if (ret > 0 && args.sector != capacity) {
		pr_warn("%s: Missing zones from sector %llu\n",
			disk->disk_name, args.sector);
		ret = -ENODEV;
	}

	if (ret > 0)
		return disk_update_zone_resources(disk, &args);

	pr_warn("%s: failed to revalidate zones\n", disk->disk_name);

	memflags = blk_mq_freeze_queue(q);
	disk_free_zone_resources(disk);
	blk_mq_unfreeze_queue(q, memflags);

	return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
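
/*
 * Usage sketch (illustrative only, not part of this file): a hypothetical
 * blk-mq driver would set up its zoned queue limits first and then call
 * blk_revalidate_disk_zones() once the gendisk is ready, and again whenever
 * the device signals a zone configuration change. The example_dev structure
 * and helper below are assumptions standing in for driver-specific code.
 *
 *	static int example_drv_revalidate(struct example_dev *dev)
 *	{
 *		// The zone size (chunk_sectors) and the max zone append
 *		// limit must already be set in the queue limits here.
 *		int ret = blk_revalidate_disk_zones(dev->disk);
 *
 *		if (ret)
 *			// Zone resources were released by the block layer;
 *			// fail probe or mark the device unusable.
 *			return ret;
 *
 *		return 0;
 *	}
 */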

/**
 * blk_zone_issue_zeroout - zero-fill a block range in a zone
 * @bdev: blockdev to write
 * @sector: start sector
 * @nr_sects: number of sectors to write
 * @gfp_mask: memory allocation flags (for bio_alloc)
 *
 * Description:
 *  Zero-fill a block range in a zone (@sector must be equal to the zone write
 *  pointer), handling potential errors due to the (initially unknown) lack of
 *  hardware offload (See blkdev_issue_zeroout()).
 */
int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
			   sector_t nr_sects, gfp_t gfp_mask)
{
	struct gendisk *disk = bdev->bd_disk;
	int ret;

	if (WARN_ON_ONCE(!bdev_is_zoned(bdev)))
		return -EIO;

	ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
				   BLKDEV_ZERO_NOFALLBACK);
	if (ret != -EOPNOTSUPP)
		return ret;

	/*
	 * The failed call to blkdev_issue_zeroout() advanced the zone write
	 * pointer. Undo this using a zone report to update the zone write
	 * pointer to the correct current value.
	 */
	ret = disk->fops->report_zones(disk, sector, 1, NULL);
	if (ret != 1)
		return ret < 0 ? ret : -EIO;

	/*
	 * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a
	 * regular write with zero-pages.
	 */
	return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0);
}
EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout);
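
/*
 * Usage sketch (illustrative only): a hypothetical zoned filesystem that must
 * pad the written part of a zone up to a block boundary could call this
 * helper with @sector equal to the current zone write pointer. The variable
 * names below are assumptions.
 *
 *	// Zero-fill pad_sectors sectors starting at the zone write pointer.
 *	ret = blk_zone_issue_zeroout(bdev, zone_wp_sector, pad_sectors,
 *				     GFP_NOFS);
 *	if (ret)
 *		return ret;
 */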

#ifdef CONFIG_BLK_DEBUG_FS
static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
				  struct seq_file *m)
{
	unsigned int zwp_wp_offset, zwp_flags;
	unsigned int zwp_zone_no, zwp_ref;
	unsigned int zwp_bio_list_size;
	enum blk_zone_cond zwp_cond;
	unsigned long flags;

	spin_lock_irqsave(&zwplug->lock, flags);
	zwp_zone_no = zwplug->zone_no;
	zwp_flags = zwplug->flags;
	zwp_ref = refcount_read(&zwplug->ref);
	zwp_cond = zwplug->cond;
	zwp_wp_offset = zwplug->wp_offset;
	zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
	spin_unlock_irqrestore(&zwplug->lock, flags);

	seq_printf(m,
		   "Zone no: %u, flags: 0x%x, ref: %u, cond: %s, wp ofst: %u, pending BIO: %u\n",
		   zwp_zone_no, zwp_flags, zwp_ref, blk_zone_cond_str(zwp_cond),
		   zwp_wp_offset, zwp_bio_list_size);
}

int queue_zone_wplugs_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct gendisk *disk = q->disk;
	struct blk_zone_wplug *zwplug;
	unsigned int i;

	if (!disk->zone_wplugs_hash)
		return 0;

	rcu_read_lock();
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
		hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i],
					 node)
			queue_zone_wplug_show(zwplug, m);
	rcu_read_unlock();

	return 0;
}

#endif