1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Zoned block device handling 4 * 5 * Copyright (c) 2015, Hannes Reinecke 6 * Copyright (c) 2015, SUSE Linux GmbH 7 * 8 * Copyright (c) 2016, Damien Le Moal 9 * Copyright (c) 2016, Western Digital 10 * Copyright (c) 2024, Western Digital Corporation or its affiliates. 11 */ 12 13 #include <linux/kernel.h> 14 #include <linux/blkdev.h> 15 #include <linux/blk-mq.h> 16 #include <linux/spinlock.h> 17 #include <linux/refcount.h> 18 #include <linux/mempool.h> 19 #include <linux/kthread.h> 20 #include <linux/freezer.h> 21 22 #include <trace/events/block.h> 23 24 #include "blk.h" 25 #include "blk-mq-sched.h" 26 #include "blk-mq-debugfs.h" 27 28 #define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name 29 static const char *const zone_cond_name[] = { 30 ZONE_COND_NAME(NOT_WP), 31 ZONE_COND_NAME(EMPTY), 32 ZONE_COND_NAME(IMP_OPEN), 33 ZONE_COND_NAME(EXP_OPEN), 34 ZONE_COND_NAME(CLOSED), 35 ZONE_COND_NAME(READONLY), 36 ZONE_COND_NAME(FULL), 37 ZONE_COND_NAME(OFFLINE), 38 ZONE_COND_NAME(ACTIVE), 39 }; 40 #undef ZONE_COND_NAME 41 42 /* 43 * Per-zone write plug. 44 * @node: hlist_node structure for managing the plug using a hash table. 45 * @entry: list_head structure for listing the plug in the disk list of active 46 * zone write plugs. 47 * @bio_list: The list of BIOs that are currently plugged. 48 * @bio_work: Work struct to handle issuing of plugged BIOs 49 * @rcu_head: RCU head to free zone write plugs with an RCU grace period. 50 * @disk: The gendisk the plug belongs to. 51 * @lock: Spinlock to atomically manipulate the plug. 52 * @ref: Zone write plug reference counter. A zone write plug reference is 53 * always at least 1 when the plug is hashed in the disk plug hash table. 54 * The reference is incremented whenever a new BIO needing plugging is 55 * submitted and when a function needs to manipulate a plug. The 56 * reference count is decremented whenever a plugged BIO completes and 57 * when a function that referenced the plug returns. The initial 58 * reference is dropped whenever the zone of the zone write plug is reset, 59 * finished and when the zone becomes full (last write BIO to the zone 60 * completes). 61 * @flags: Flags indicating the plug state. 62 * @zone_no: The number of the zone the plug is managing. 63 * @wp_offset: The zone write pointer location relative to the start of the zone 64 * as a number of 512B sectors. 65 * @cond: Condition of the zone 66 */ 67 struct blk_zone_wplug { 68 struct hlist_node node; 69 struct list_head entry; 70 struct bio_list bio_list; 71 struct work_struct bio_work; 72 struct rcu_head rcu_head; 73 struct gendisk *disk; 74 spinlock_t lock; 75 refcount_t ref; 76 unsigned int flags; 77 unsigned int zone_no; 78 unsigned int wp_offset; 79 enum blk_zone_cond cond; 80 }; 81 82 static inline bool disk_need_zone_resources(struct gendisk *disk) 83 { 84 /* 85 * All request-based zoned devices need zone resources so that the 86 * block layer can automatically handle write BIO plugging. BIO-based 87 * device drivers (e.g. DM devices) are normally responsible for 88 * handling zone write ordering and do not need zone resources, unless 89 * the driver requires zone append emulation. 90 */ 91 return queue_is_mq(disk->queue) || 92 queue_emulates_zone_append(disk->queue); 93 } 94 95 static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) 96 { 97 return 1U << disk->zone_wplugs_hash_bits; 98 } 99 100 /* 101 * Zone write plug flags bits: 102 * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged, 103 * that is, that write BIOs are being throttled due to a write BIO already 104 * being executed or the zone write plug bio list is not empty. 105 * - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone 106 * write pointer offset and need to update it. 107 * - BLK_ZONE_WPLUG_DEAD: Indicates that the zone write plug will be 108 * removed from the disk hash table of zone write plugs when the last 109 * reference on the zone write plug is dropped. If set, this flag also 110 * indicates that the initial extra reference on the zone write plug was 111 * dropped, meaning that the reference count indicates the current number of 112 * active users (code context or BIOs and requests in flight). This flag is 113 * set when a zone is reset, finished or becomes full. 114 */ 115 #define BLK_ZONE_WPLUG_PLUGGED (1U << 0) 116 #define BLK_ZONE_WPLUG_NEED_WP_UPDATE (1U << 1) 117 #define BLK_ZONE_WPLUG_DEAD (1U << 2) 118 119 /** 120 * blk_zone_cond_str - Return a zone condition name string 121 * @zone_cond: a zone condition BLK_ZONE_COND_name 122 * 123 * Convert a BLK_ZONE_COND_name zone condition into the string "name". Useful 124 * for the debugging and tracing zone conditions. For an invalid zone 125 * conditions, the string "UNKNOWN" is returned. 126 */ 127 const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) 128 { 129 static const char *zone_cond_str = "UNKNOWN"; 130 131 if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond]) 132 zone_cond_str = zone_cond_name[zone_cond]; 133 134 return zone_cond_str; 135 } 136 EXPORT_SYMBOL_GPL(blk_zone_cond_str); 137 138 static void blk_zone_set_cond(u8 *zones_cond, unsigned int zno, 139 enum blk_zone_cond cond) 140 { 141 if (!zones_cond) 142 return; 143 144 switch (cond) { 145 case BLK_ZONE_COND_IMP_OPEN: 146 case BLK_ZONE_COND_EXP_OPEN: 147 case BLK_ZONE_COND_CLOSED: 148 zones_cond[zno] = BLK_ZONE_COND_ACTIVE; 149 return; 150 case BLK_ZONE_COND_NOT_WP: 151 case BLK_ZONE_COND_EMPTY: 152 case BLK_ZONE_COND_FULL: 153 case BLK_ZONE_COND_OFFLINE: 154 case BLK_ZONE_COND_READONLY: 155 default: 156 zones_cond[zno] = cond; 157 return; 158 } 159 } 160 161 static void disk_zone_set_cond(struct gendisk *disk, sector_t sector, 162 enum blk_zone_cond cond) 163 { 164 u8 *zones_cond; 165 166 rcu_read_lock(); 167 zones_cond = rcu_dereference(disk->zones_cond); 168 if (zones_cond) { 169 unsigned int zno = disk_zone_no(disk, sector); 170 171 /* 172 * The condition of a conventional, readonly and offline zones 173 * never changes, so do nothing if the target zone is in one of 174 * these conditions. 175 */ 176 switch (zones_cond[zno]) { 177 case BLK_ZONE_COND_NOT_WP: 178 case BLK_ZONE_COND_READONLY: 179 case BLK_ZONE_COND_OFFLINE: 180 break; 181 default: 182 blk_zone_set_cond(zones_cond, zno, cond); 183 break; 184 } 185 } 186 rcu_read_unlock(); 187 } 188 189 /** 190 * bdev_zone_is_seq - check if a sector belongs to a sequential write zone 191 * @bdev: block device to check 192 * @sector: sector number 193 * 194 * Check if @sector on @bdev is contained in a sequential write required zone. 195 */ 196 bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) 197 { 198 struct gendisk *disk = bdev->bd_disk; 199 unsigned int zno = disk_zone_no(disk, sector); 200 bool is_seq = false; 201 u8 *zones_cond; 202 203 if (!bdev_is_zoned(bdev)) 204 return false; 205 206 rcu_read_lock(); 207 zones_cond = rcu_dereference(disk->zones_cond); 208 if (zones_cond && zno < disk->nr_zones) 209 is_seq = zones_cond[zno] != BLK_ZONE_COND_NOT_WP; 210 rcu_read_unlock(); 211 212 return is_seq; 213 } 214 EXPORT_SYMBOL_GPL(bdev_zone_is_seq); 215 216 /* 217 * Zone report arguments for block device drivers report_zones operation. 218 * @cb: report_zones_cb callback for each reported zone. 219 * @data: Private data passed to report_zones_cb. 220 */ 221 struct blk_report_zones_args { 222 report_zones_cb cb; 223 void *data; 224 bool report_active; 225 }; 226 227 static int blkdev_do_report_zones(struct block_device *bdev, sector_t sector, 228 unsigned int nr_zones, 229 struct blk_report_zones_args *args) 230 { 231 struct gendisk *disk = bdev->bd_disk; 232 233 if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) 234 return -EOPNOTSUPP; 235 236 if (!nr_zones || sector >= get_capacity(disk)) 237 return 0; 238 239 return disk->fops->report_zones(disk, sector, nr_zones, args); 240 } 241 242 /** 243 * blkdev_report_zones - Get zones information 244 * @bdev: Target block device 245 * @sector: Sector from which to report zones 246 * @nr_zones: Maximum number of zones to report 247 * @cb: Callback function called for each reported zone 248 * @data: Private data for the callback 249 * 250 * Description: 251 * Get zone information starting from the zone containing @sector for at most 252 * @nr_zones, and call @cb for each zone reported by the device. 253 * To report all zones in a device starting from @sector, the BLK_ALL_ZONES 254 * constant can be passed to @nr_zones. 255 * Returns the number of zones reported by the device, or a negative errno 256 * value in case of failure. 257 * 258 * Note: The caller must use memalloc_noXX_save/restore() calls to control 259 * memory allocations done within this function. 260 */ 261 int blkdev_report_zones(struct block_device *bdev, sector_t sector, 262 unsigned int nr_zones, report_zones_cb cb, void *data) 263 { 264 struct blk_report_zones_args args = { 265 .cb = cb, 266 .data = data, 267 }; 268 269 return blkdev_do_report_zones(bdev, sector, nr_zones, &args); 270 } 271 EXPORT_SYMBOL_GPL(blkdev_report_zones); 272 273 static int blkdev_zone_reset_all(struct block_device *bdev) 274 { 275 struct bio bio; 276 277 bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC); 278 trace_blkdev_zone_mgmt(&bio, 0); 279 return submit_bio_wait(&bio); 280 } 281 282 /** 283 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones 284 * @bdev: Target block device 285 * @op: Operation to be performed on the zones 286 * @sector: Start sector of the first zone to operate on 287 * @nr_sectors: Number of sectors, should be at least the length of one zone and 288 * must be zone size aligned. 289 * 290 * Description: 291 * Perform the specified operation on the range of zones specified by 292 * @sector..@sector+@nr_sectors. Specifying the entire disk sector range 293 * is valid, but the specified range should not contain conventional zones. 294 * The operation to execute on each zone can be a zone reset, open, close 295 * or finish request. 296 */ 297 int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, 298 sector_t sector, sector_t nr_sectors) 299 { 300 sector_t zone_sectors = bdev_zone_sectors(bdev); 301 sector_t capacity = bdev_nr_sectors(bdev); 302 sector_t end_sector = sector + nr_sectors; 303 struct bio *bio = NULL; 304 int ret = 0; 305 306 if (!bdev_is_zoned(bdev)) 307 return -EOPNOTSUPP; 308 309 if (bdev_read_only(bdev)) 310 return -EPERM; 311 312 if (!op_is_zone_mgmt(op)) 313 return -EOPNOTSUPP; 314 315 if (end_sector <= sector || end_sector > capacity) 316 /* Out of range */ 317 return -EINVAL; 318 319 /* Check alignment (handle eventual smaller last zone) */ 320 if (!bdev_is_zone_start(bdev, sector)) 321 return -EINVAL; 322 323 if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity) 324 return -EINVAL; 325 326 /* 327 * In the case of a zone reset operation over all zones, use 328 * REQ_OP_ZONE_RESET_ALL. 329 */ 330 if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) 331 return blkdev_zone_reset_all(bdev); 332 333 while (sector < end_sector) { 334 bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL); 335 bio->bi_iter.bi_sector = sector; 336 sector += zone_sectors; 337 338 /* This may take a while, so be nice to others */ 339 cond_resched(); 340 } 341 342 trace_blkdev_zone_mgmt(bio, nr_sectors); 343 ret = submit_bio_wait(bio); 344 bio_put(bio); 345 346 return ret; 347 } 348 EXPORT_SYMBOL_GPL(blkdev_zone_mgmt); 349 350 struct zone_report_args { 351 struct blk_zone __user *zones; 352 }; 353 354 static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx, 355 void *data) 356 { 357 struct zone_report_args *args = data; 358 359 if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone))) 360 return -EFAULT; 361 return 0; 362 } 363 364 /* 365 * Mask of valid input flags for BLKREPORTZONEV2 ioctl. 366 */ 367 #define BLK_ZONE_REPV2_INPUT_FLAGS BLK_ZONE_REP_CACHED 368 369 /* 370 * BLKREPORTZONE and BLKREPORTZONEV2 ioctl processing. 371 * Called from blkdev_ioctl. 372 */ 373 int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, 374 unsigned long arg) 375 { 376 void __user *argp = (void __user *)arg; 377 struct zone_report_args args; 378 struct blk_zone_report rep; 379 int ret; 380 381 if (!argp) 382 return -EINVAL; 383 384 if (!bdev_is_zoned(bdev)) 385 return -ENOTTY; 386 387 if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) 388 return -EFAULT; 389 390 if (!rep.nr_zones) 391 return -EINVAL; 392 393 args.zones = argp + sizeof(struct blk_zone_report); 394 395 switch (cmd) { 396 case BLKREPORTZONE: 397 ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones, 398 blkdev_copy_zone_to_user, &args); 399 break; 400 case BLKREPORTZONEV2: 401 if (rep.flags & ~BLK_ZONE_REPV2_INPUT_FLAGS) 402 return -EINVAL; 403 ret = blkdev_report_zones_cached(bdev, rep.sector, rep.nr_zones, 404 blkdev_copy_zone_to_user, &args); 405 break; 406 default: 407 return -EINVAL; 408 } 409 410 if (ret < 0) 411 return ret; 412 413 rep.nr_zones = ret; 414 rep.flags = BLK_ZONE_REP_CAPACITY; 415 if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) 416 return -EFAULT; 417 return 0; 418 } 419 420 static int blkdev_truncate_zone_range(struct block_device *bdev, 421 blk_mode_t mode, const struct blk_zone_range *zrange) 422 { 423 loff_t start, end; 424 425 if (zrange->sector + zrange->nr_sectors <= zrange->sector || 426 zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk)) 427 /* Out of range */ 428 return -EINVAL; 429 430 start = zrange->sector << SECTOR_SHIFT; 431 end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1; 432 433 return truncate_bdev_range(bdev, mode, start, end); 434 } 435 436 /* 437 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing. 438 * Called from blkdev_ioctl. 439 */ 440 int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, 441 unsigned int cmd, unsigned long arg) 442 { 443 void __user *argp = (void __user *)arg; 444 struct blk_zone_range zrange; 445 enum req_op op; 446 int ret; 447 448 if (!argp) 449 return -EINVAL; 450 451 if (!bdev_is_zoned(bdev)) 452 return -ENOTTY; 453 454 if (!(mode & BLK_OPEN_WRITE)) 455 return -EBADF; 456 457 if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) 458 return -EFAULT; 459 460 switch (cmd) { 461 case BLKRESETZONE: 462 op = REQ_OP_ZONE_RESET; 463 464 /* Invalidate the page cache, including dirty pages. */ 465 inode_lock(bdev->bd_mapping->host); 466 filemap_invalidate_lock(bdev->bd_mapping); 467 ret = blkdev_truncate_zone_range(bdev, mode, &zrange); 468 if (ret) 469 goto fail; 470 break; 471 case BLKOPENZONE: 472 op = REQ_OP_ZONE_OPEN; 473 break; 474 case BLKCLOSEZONE: 475 op = REQ_OP_ZONE_CLOSE; 476 break; 477 case BLKFINISHZONE: 478 op = REQ_OP_ZONE_FINISH; 479 break; 480 default: 481 return -ENOTTY; 482 } 483 484 ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors); 485 486 fail: 487 if (cmd == BLKRESETZONE) { 488 filemap_invalidate_unlock(bdev->bd_mapping); 489 inode_unlock(bdev->bd_mapping->host); 490 } 491 492 return ret; 493 } 494 495 static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone) 496 { 497 return zone->start + zone->len >= get_capacity(disk); 498 } 499 500 static bool disk_zone_wplug_is_full(struct gendisk *disk, 501 struct blk_zone_wplug *zwplug) 502 { 503 if (zwplug->zone_no < disk->nr_zones - 1) 504 return zwplug->wp_offset >= disk->zone_capacity; 505 return zwplug->wp_offset >= disk->last_zone_capacity; 506 } 507 508 static bool disk_insert_zone_wplug(struct gendisk *disk, 509 struct blk_zone_wplug *zwplug) 510 { 511 struct blk_zone_wplug *zwplg; 512 unsigned long flags; 513 u8 *zones_cond; 514 unsigned int idx = 515 hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits); 516 517 /* 518 * Add the new zone write plug to the hash table, but carefully as we 519 * are racing with other submission context, so we may already have a 520 * zone write plug for the same zone. 521 */ 522 spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); 523 hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) { 524 if (zwplg->zone_no == zwplug->zone_no) { 525 spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, 526 flags); 527 return false; 528 } 529 } 530 531 /* 532 * Set the zone condition: if we do not yet have a zones_cond array 533 * attached to the disk, then this is a zone write plug insert from the 534 * first call to blk_revalidate_disk_zones(), in which case the zone is 535 * necessarilly in the active condition. 536 */ 537 zones_cond = rcu_dereference_check(disk->zones_cond, 538 lockdep_is_held(&disk->zone_wplugs_hash_lock)); 539 if (zones_cond) 540 zwplug->cond = zones_cond[zwplug->zone_no]; 541 else 542 zwplug->cond = BLK_ZONE_COND_ACTIVE; 543 544 hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); 545 atomic_inc(&disk->nr_zone_wplugs); 546 spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); 547 548 return true; 549 } 550 551 static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk, 552 sector_t sector) 553 { 554 unsigned int zno = disk_zone_no(disk, sector); 555 unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits); 556 struct blk_zone_wplug *zwplug; 557 558 rcu_read_lock(); 559 560 hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) { 561 if (zwplug->zone_no == zno && 562 refcount_inc_not_zero(&zwplug->ref)) { 563 rcu_read_unlock(); 564 return zwplug; 565 } 566 } 567 568 rcu_read_unlock(); 569 570 return NULL; 571 } 572 573 static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, 574 sector_t sector) 575 { 576 if (!atomic_read(&disk->nr_zone_wplugs)) 577 return NULL; 578 579 return disk_get_hashed_zone_wplug(disk, sector); 580 } 581 582 static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head) 583 { 584 struct blk_zone_wplug *zwplug = 585 container_of(rcu_head, struct blk_zone_wplug, rcu_head); 586 587 mempool_free(zwplug, zwplug->disk->zone_wplugs_pool); 588 } 589 590 static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug) 591 { 592 struct gendisk *disk = zwplug->disk; 593 unsigned long flags; 594 595 WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)); 596 WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); 597 WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); 598 599 spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); 600 blk_zone_set_cond(rcu_dereference_check(disk->zones_cond, 601 lockdep_is_held(&disk->zone_wplugs_hash_lock)), 602 zwplug->zone_no, zwplug->cond); 603 hlist_del_init_rcu(&zwplug->node); 604 atomic_dec(&disk->nr_zone_wplugs); 605 spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); 606 607 call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); 608 } 609 610 static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) 611 { 612 if (refcount_dec_and_test(&zwplug->ref)) 613 disk_free_zone_wplug(zwplug); 614 } 615 616 /* 617 * Flag the zone write plug as dead and drop the initial reference we got when 618 * the zone write plug was added to the hash table. The zone write plug will be 619 * unhashed when its last reference is dropped. 620 */ 621 static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug) 622 { 623 lockdep_assert_held(&zwplug->lock); 624 625 if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)) { 626 zwplug->flags |= BLK_ZONE_WPLUG_DEAD; 627 disk_put_zone_wplug(zwplug); 628 } 629 } 630 631 static bool disk_zone_wplug_submit_bio(struct gendisk *disk, 632 struct blk_zone_wplug *zwplug); 633 634 static void blk_zone_wplug_bio_work(struct work_struct *work) 635 { 636 struct blk_zone_wplug *zwplug = 637 container_of(work, struct blk_zone_wplug, bio_work); 638 639 disk_zone_wplug_submit_bio(zwplug->disk, zwplug); 640 641 /* Drop the reference we took in disk_zone_wplug_schedule_work(). */ 642 disk_put_zone_wplug(zwplug); 643 } 644 645 /* 646 * Get a zone write plug for the zone containing @sector. 647 * If the plug does not exist, it is allocated and inserted in the disk hash 648 * table. 649 */ 650 static struct blk_zone_wplug *disk_get_or_alloc_zone_wplug(struct gendisk *disk, 651 sector_t sector, gfp_t gfp_mask) 652 { 653 unsigned int zno = disk_zone_no(disk, sector); 654 struct blk_zone_wplug *zwplug; 655 656 again: 657 zwplug = disk_get_zone_wplug(disk, sector); 658 if (zwplug) 659 return zwplug; 660 661 /* 662 * Allocate and initialize a zone write plug with an extra reference 663 * so that it is not freed when the zone write plug becomes idle without 664 * the zone being full. 665 */ 666 zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask); 667 if (!zwplug) 668 return NULL; 669 670 INIT_HLIST_NODE(&zwplug->node); 671 refcount_set(&zwplug->ref, 2); 672 spin_lock_init(&zwplug->lock); 673 zwplug->flags = 0; 674 zwplug->zone_no = zno; 675 zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector); 676 bio_list_init(&zwplug->bio_list); 677 INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); 678 INIT_LIST_HEAD(&zwplug->entry); 679 zwplug->disk = disk; 680 681 /* 682 * Insert the new zone write plug in the hash table. This can fail only 683 * if another context already inserted a plug. Retry from the beginning 684 * in such case. 685 */ 686 if (!disk_insert_zone_wplug(disk, zwplug)) { 687 mempool_free(zwplug, disk->zone_wplugs_pool); 688 goto again; 689 } 690 691 return zwplug; 692 } 693 694 static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug, 695 struct bio *bio) 696 { 697 struct request_queue *q = zwplug->disk->queue; 698 699 bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); 700 bio_io_error(bio); 701 disk_put_zone_wplug(zwplug); 702 /* Drop the reference taken by disk_zone_wplug_add_bio(). */ 703 blk_queue_exit(q); 704 } 705 706 /* 707 * Abort (fail) all plugged BIOs of a zone write plug. 708 */ 709 static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) 710 { 711 struct gendisk *disk = zwplug->disk; 712 struct bio *bio; 713 714 lockdep_assert_held(&zwplug->lock); 715 716 if (bio_list_empty(&zwplug->bio_list)) 717 return; 718 719 pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n", 720 zwplug->disk->disk_name, zwplug->zone_no); 721 while ((bio = bio_list_pop(&zwplug->bio_list))) 722 blk_zone_wplug_bio_io_error(zwplug, bio); 723 724 zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; 725 726 /* 727 * If we are using the per disk zone write plugs worker thread, remove 728 * the zone write plug from the work list and drop the reference we 729 * took when the zone write plug was added to that list. 730 */ 731 if (blk_queue_zoned_qd1_writes(disk->queue)) { 732 spin_lock(&disk->zone_wplugs_list_lock); 733 if (!list_empty(&zwplug->entry)) { 734 list_del_init(&zwplug->entry); 735 disk_put_zone_wplug(zwplug); 736 } 737 spin_unlock(&disk->zone_wplugs_list_lock); 738 } 739 } 740 741 /* 742 * Update a zone write plug condition based on the write pointer offset. 743 */ 744 static void disk_zone_wplug_update_cond(struct gendisk *disk, 745 struct blk_zone_wplug *zwplug) 746 { 747 lockdep_assert_held(&zwplug->lock); 748 749 if (disk_zone_wplug_is_full(disk, zwplug)) 750 zwplug->cond = BLK_ZONE_COND_FULL; 751 else if (!zwplug->wp_offset) 752 zwplug->cond = BLK_ZONE_COND_EMPTY; 753 else 754 zwplug->cond = BLK_ZONE_COND_ACTIVE; 755 } 756 757 /* 758 * Set a zone write plug write pointer offset to the specified value. 759 * This aborts all plugged BIOs, which is fine as this function is called for 760 * a zone reset operation, a zone finish operation or if the zone needs a wp 761 * update from a report zone after a write error. 762 */ 763 static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, 764 struct blk_zone_wplug *zwplug, 765 unsigned int wp_offset) 766 { 767 lockdep_assert_held(&zwplug->lock); 768 769 /* Update the zone write pointer and abort all plugged BIOs. */ 770 zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE; 771 zwplug->wp_offset = wp_offset; 772 disk_zone_wplug_update_cond(disk, zwplug); 773 774 disk_zone_wplug_abort(zwplug); 775 if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug)) 776 disk_mark_zone_wplug_dead(zwplug); 777 } 778 779 static unsigned int blk_zone_wp_offset(struct blk_zone *zone) 780 { 781 switch (zone->cond) { 782 case BLK_ZONE_COND_IMP_OPEN: 783 case BLK_ZONE_COND_EXP_OPEN: 784 case BLK_ZONE_COND_CLOSED: 785 case BLK_ZONE_COND_ACTIVE: 786 return zone->wp - zone->start; 787 case BLK_ZONE_COND_EMPTY: 788 return 0; 789 case BLK_ZONE_COND_FULL: 790 case BLK_ZONE_COND_NOT_WP: 791 case BLK_ZONE_COND_OFFLINE: 792 case BLK_ZONE_COND_READONLY: 793 default: 794 /* 795 * Conventional, full, offline and read-only zones do not have 796 * a valid write pointer. 797 */ 798 return UINT_MAX; 799 } 800 } 801 802 static unsigned int disk_zone_wplug_sync_wp_offset(struct gendisk *disk, 803 struct blk_zone *zone) 804 { 805 struct blk_zone_wplug *zwplug; 806 unsigned int wp_offset = blk_zone_wp_offset(zone); 807 808 zwplug = disk_get_zone_wplug(disk, zone->start); 809 if (zwplug) { 810 unsigned long flags; 811 812 spin_lock_irqsave(&zwplug->lock, flags); 813 if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) 814 disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset); 815 spin_unlock_irqrestore(&zwplug->lock, flags); 816 disk_put_zone_wplug(zwplug); 817 } 818 819 return wp_offset; 820 } 821 822 /** 823 * disk_report_zone - Report one zone 824 * @disk: Target disk 825 * @zone: The zone to report 826 * @idx: The index of the zone in the overall zone report 827 * @args: report zones callback and data 828 * 829 * Description: 830 * Helper function for block device drivers to report one zone of a zone 831 * report initiated with blkdev_report_zones(). The zone being reported is 832 * specified by @zone and used to update, if necessary, the zone write plug 833 * information for the zone. If @args specifies a user callback function, 834 * this callback is executed. 835 */ 836 int disk_report_zone(struct gendisk *disk, struct blk_zone *zone, 837 unsigned int idx, struct blk_report_zones_args *args) 838 { 839 if (args && args->report_active) { 840 /* 841 * If we come here, then this is a report zones as a fallback 842 * for a cached report. So collapse the implicit open, explicit 843 * open and closed conditions into the active zone condition. 844 */ 845 switch (zone->cond) { 846 case BLK_ZONE_COND_IMP_OPEN: 847 case BLK_ZONE_COND_EXP_OPEN: 848 case BLK_ZONE_COND_CLOSED: 849 zone->cond = BLK_ZONE_COND_ACTIVE; 850 break; 851 default: 852 break; 853 } 854 } 855 856 if (disk->zone_wplugs_hash) 857 disk_zone_wplug_sync_wp_offset(disk, zone); 858 859 if (args && args->cb) 860 return args->cb(zone, idx, args->data); 861 862 return 0; 863 } 864 EXPORT_SYMBOL_GPL(disk_report_zone); 865 866 static int blkdev_report_zone_cb(struct blk_zone *zone, unsigned int idx, 867 void *data) 868 { 869 memcpy(data, zone, sizeof(struct blk_zone)); 870 return 0; 871 } 872 873 static int blkdev_report_zone_fallback(struct block_device *bdev, 874 sector_t sector, struct blk_zone *zone) 875 { 876 struct blk_report_zones_args args = { 877 .cb = blkdev_report_zone_cb, 878 .data = zone, 879 .report_active = true, 880 }; 881 int error; 882 883 error = blkdev_do_report_zones(bdev, sector, 1, &args); 884 if (error < 0) 885 return error; 886 if (error == 0) 887 return -EIO; 888 return 0; 889 } 890 891 /* 892 * For devices that natively support zone append operations, we do not use zone 893 * write plugging for zone append writes, which makes the zone condition 894 * tracking invalid once zone append was used. In that case fall back to a 895 * regular report zones to get correct information. 896 */ 897 static inline bool blkdev_has_cached_report_zones(struct block_device *bdev) 898 { 899 return disk_need_zone_resources(bdev->bd_disk) && 900 (bdev_emulates_zone_append(bdev) || 901 !test_bit(GD_ZONE_APPEND_USED, &bdev->bd_disk->state)); 902 } 903 904 /** 905 * blkdev_get_zone_info - Get a single zone information from cached data 906 * @bdev: Target block device 907 * @sector: Sector contained by the target zone 908 * @zone: zone structure to return the zone information 909 * 910 * Description: 911 * Get the zone information for the zone containing @sector using the zone 912 * write plug of the target zone, if one exist, or the disk zone condition 913 * array otherwise. The zone condition may be reported as being 914 * the BLK_ZONE_COND_ACTIVE condition for a zone that is in the implicit 915 * open, explicit open or closed condition. 916 * 917 * Returns 0 on success and a negative error code on failure. 918 */ 919 int blkdev_get_zone_info(struct block_device *bdev, sector_t sector, 920 struct blk_zone *zone) 921 { 922 struct gendisk *disk = bdev->bd_disk; 923 sector_t zone_sectors = bdev_zone_sectors(bdev); 924 struct blk_zone_wplug *zwplug; 925 unsigned long flags; 926 u8 *zones_cond; 927 928 if (!bdev_is_zoned(bdev)) 929 return -EOPNOTSUPP; 930 931 if (sector >= get_capacity(disk)) 932 return -EINVAL; 933 934 memset(zone, 0, sizeof(*zone)); 935 sector = bdev_zone_start(bdev, sector); 936 937 if (!blkdev_has_cached_report_zones(bdev)) 938 return blkdev_report_zone_fallback(bdev, sector, zone); 939 940 rcu_read_lock(); 941 zones_cond = rcu_dereference(disk->zones_cond); 942 if (!disk->zone_wplugs_hash || !zones_cond) { 943 rcu_read_unlock(); 944 return blkdev_report_zone_fallback(bdev, sector, zone); 945 } 946 zone->cond = zones_cond[disk_zone_no(disk, sector)]; 947 rcu_read_unlock(); 948 949 zone->start = sector; 950 zone->len = zone_sectors; 951 952 /* 953 * If this is a conventional zone, we do not have a zone write plug and 954 * can report the zone immediately. 955 */ 956 if (zone->cond == BLK_ZONE_COND_NOT_WP) { 957 zone->type = BLK_ZONE_TYPE_CONVENTIONAL; 958 zone->capacity = zone_sectors; 959 zone->wp = ULLONG_MAX; 960 return 0; 961 } 962 963 /* 964 * This is a sequential write required zone. If the zone is read-only or 965 * offline, only set the zone write pointer to an invalid value and 966 * report the zone. 967 */ 968 zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; 969 if (disk_zone_is_last(disk, zone)) 970 zone->capacity = disk->last_zone_capacity; 971 else 972 zone->capacity = disk->zone_capacity; 973 974 if (zone->cond == BLK_ZONE_COND_READONLY || 975 zone->cond == BLK_ZONE_COND_OFFLINE) { 976 zone->wp = ULLONG_MAX; 977 return 0; 978 } 979 980 /* 981 * If the zone does not have a zone write plug, it is either full or 982 * empty, as we otherwise would have a zone write plug for it. In this 983 * case, set the write pointer accordingly and report the zone. 984 * Otherwise, if we have a zone write plug, use it. 985 */ 986 zwplug = disk_get_zone_wplug(disk, sector); 987 if (!zwplug) { 988 if (zone->cond == BLK_ZONE_COND_FULL) 989 zone->wp = ULLONG_MAX; 990 else 991 zone->wp = sector; 992 return 0; 993 } 994 995 spin_lock_irqsave(&zwplug->lock, flags); 996 if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) { 997 spin_unlock_irqrestore(&zwplug->lock, flags); 998 disk_put_zone_wplug(zwplug); 999 return blkdev_report_zone_fallback(bdev, sector, zone); 1000 } 1001 zone->cond = zwplug->cond; 1002 zone->wp = sector + zwplug->wp_offset; 1003 spin_unlock_irqrestore(&zwplug->lock, flags); 1004 1005 disk_put_zone_wplug(zwplug); 1006 1007 return 0; 1008 } 1009 EXPORT_SYMBOL_GPL(blkdev_get_zone_info); 1010 1011 /** 1012 * blkdev_report_zones_cached - Get cached zones information 1013 * @bdev: Target block device 1014 * @sector: Sector from which to report zones 1015 * @nr_zones: Maximum number of zones to report 1016 * @cb: Callback function called for each reported zone 1017 * @data: Private data for the callback function 1018 * 1019 * Description: 1020 * Similar to blkdev_report_zones() but instead of calling into the low level 1021 * device driver to get the zone report from the device, use 1022 * blkdev_get_zone_info() to generate the report from the disk zone write 1023 * plugs and zones condition array. Since calling this function without a 1024 * callback does not make sense, @cb must be specified. 1025 */ 1026 int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector, 1027 unsigned int nr_zones, report_zones_cb cb, void *data) 1028 { 1029 struct gendisk *disk = bdev->bd_disk; 1030 sector_t capacity = get_capacity(disk); 1031 sector_t zone_sectors = bdev_zone_sectors(bdev); 1032 unsigned int idx = 0; 1033 struct blk_zone zone; 1034 int ret; 1035 1036 if (!cb || !bdev_is_zoned(bdev) || 1037 WARN_ON_ONCE(!disk->fops->report_zones)) 1038 return -EOPNOTSUPP; 1039 1040 if (!nr_zones || sector >= capacity) 1041 return 0; 1042 1043 if (!blkdev_has_cached_report_zones(bdev)) { 1044 struct blk_report_zones_args args = { 1045 .cb = cb, 1046 .data = data, 1047 .report_active = true, 1048 }; 1049 1050 return blkdev_do_report_zones(bdev, sector, nr_zones, &args); 1051 } 1052 1053 for (sector = bdev_zone_start(bdev, sector); 1054 sector < capacity && idx < nr_zones; 1055 sector += zone_sectors, idx++) { 1056 ret = blkdev_get_zone_info(bdev, sector, &zone); 1057 if (ret) 1058 return ret; 1059 1060 ret = cb(&zone, idx, data); 1061 if (ret) 1062 return ret; 1063 } 1064 1065 return idx; 1066 } 1067 EXPORT_SYMBOL_GPL(blkdev_report_zones_cached); 1068 1069 static void blk_zone_reset_bio_endio(struct bio *bio) 1070 { 1071 struct gendisk *disk = bio->bi_bdev->bd_disk; 1072 sector_t sector = bio->bi_iter.bi_sector; 1073 struct blk_zone_wplug *zwplug; 1074 1075 /* 1076 * If we have a zone write plug, set its write pointer offset to 0. 1077 * This will abort all BIOs plugged for the target zone. It is fine as 1078 * resetting zones while writes are still in-flight will result in the 1079 * writes failing anyway. 1080 */ 1081 zwplug = disk_get_zone_wplug(disk, sector); 1082 if (zwplug) { 1083 unsigned long flags; 1084 1085 spin_lock_irqsave(&zwplug->lock, flags); 1086 disk_zone_wplug_set_wp_offset(disk, zwplug, 0); 1087 spin_unlock_irqrestore(&zwplug->lock, flags); 1088 disk_put_zone_wplug(zwplug); 1089 } else { 1090 disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY); 1091 } 1092 } 1093 1094 static void blk_zone_reset_all_bio_endio(struct bio *bio) 1095 { 1096 struct gendisk *disk = bio->bi_bdev->bd_disk; 1097 sector_t capacity = get_capacity(disk); 1098 struct blk_zone_wplug *zwplug; 1099 unsigned long flags; 1100 sector_t sector; 1101 unsigned int i; 1102 1103 if (atomic_read(&disk->nr_zone_wplugs)) { 1104 /* Update the condition of all zone write plugs. */ 1105 rcu_read_lock(); 1106 for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { 1107 hlist_for_each_entry_rcu(zwplug, 1108 &disk->zone_wplugs_hash[i], 1109 node) { 1110 spin_lock_irqsave(&zwplug->lock, flags); 1111 disk_zone_wplug_set_wp_offset(disk, zwplug, 0); 1112 spin_unlock_irqrestore(&zwplug->lock, flags); 1113 } 1114 } 1115 rcu_read_unlock(); 1116 } 1117 1118 /* Update the cached zone conditions. */ 1119 for (sector = 0; sector < capacity; 1120 sector += bdev_zone_sectors(bio->bi_bdev)) 1121 disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY); 1122 clear_bit(GD_ZONE_APPEND_USED, &disk->state); 1123 } 1124 1125 static void blk_zone_finish_bio_endio(struct bio *bio) 1126 { 1127 struct block_device *bdev = bio->bi_bdev; 1128 struct gendisk *disk = bdev->bd_disk; 1129 sector_t sector = bio->bi_iter.bi_sector; 1130 struct blk_zone_wplug *zwplug; 1131 1132 /* 1133 * If we have a zone write plug, set its write pointer offset to the 1134 * zone size. This will abort all BIOs plugged for the target zone. It 1135 * is fine as resetting zones while writes are still in-flight will 1136 * result in the writes failing anyway. 1137 */ 1138 zwplug = disk_get_zone_wplug(disk, sector); 1139 if (zwplug) { 1140 unsigned long flags; 1141 1142 spin_lock_irqsave(&zwplug->lock, flags); 1143 disk_zone_wplug_set_wp_offset(disk, zwplug, 1144 bdev_zone_sectors(bdev)); 1145 spin_unlock_irqrestore(&zwplug->lock, flags); 1146 disk_put_zone_wplug(zwplug); 1147 } else { 1148 disk_zone_set_cond(disk, sector, BLK_ZONE_COND_FULL); 1149 } 1150 } 1151 1152 void blk_zone_mgmt_bio_endio(struct bio *bio) 1153 { 1154 /* If the BIO failed, we have nothing to do. */ 1155 if (bio->bi_status != BLK_STS_OK) 1156 return; 1157 1158 switch (bio_op(bio)) { 1159 case REQ_OP_ZONE_RESET: 1160 blk_zone_reset_bio_endio(bio); 1161 return; 1162 case REQ_OP_ZONE_RESET_ALL: 1163 blk_zone_reset_all_bio_endio(bio); 1164 return; 1165 case REQ_OP_ZONE_FINISH: 1166 blk_zone_finish_bio_endio(bio); 1167 return; 1168 default: 1169 return; 1170 } 1171 } 1172 1173 static void disk_zone_wplug_schedule_work(struct gendisk *disk, 1174 struct blk_zone_wplug *zwplug) 1175 { 1176 lockdep_assert_held(&zwplug->lock); 1177 1178 /* 1179 * Schedule the submission of the next plugged BIO. Taking a reference 1180 * to the zone write plug is required as the bio_work belongs to the 1181 * plug, and thus we must ensure that the write plug does not go away 1182 * while the work is being scheduled but has not run yet. 1183 * blk_zone_wplug_bio_work() will release the reference we take here, 1184 * and we also drop this reference if the work is already scheduled. 1185 */ 1186 WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); 1187 WARN_ON_ONCE(blk_queue_zoned_qd1_writes(disk->queue)); 1188 refcount_inc(&zwplug->ref); 1189 if (!queue_work(disk->zone_wplugs_wq, &zwplug->bio_work)) 1190 disk_put_zone_wplug(zwplug); 1191 } 1192 1193 static inline void disk_zone_wplug_add_bio(struct gendisk *disk, 1194 struct blk_zone_wplug *zwplug, 1195 struct bio *bio, unsigned int nr_segs) 1196 { 1197 /* 1198 * Grab an extra reference on the BIO request queue usage counter. 1199 * This reference will be reused to submit a request for the BIO for 1200 * blk-mq devices and dropped when the BIO is failed and after 1201 * it is issued in the case of BIO-based devices. 1202 */ 1203 percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter); 1204 1205 /* 1206 * The BIO is being plugged and thus will have to wait for the on-going 1207 * write and for all other writes already plugged. So polling makes 1208 * no sense. 1209 */ 1210 bio_clear_polled(bio); 1211 1212 /* 1213 * Reuse the poll cookie field to store the number of segments when 1214 * split to the hardware limits. 1215 */ 1216 bio->__bi_nr_segments = nr_segs; 1217 1218 /* 1219 * We always receive BIOs after they are split and ready to be issued. 1220 * The block layer passes the parts of a split BIO in order, and the 1221 * user must also issue write sequentially. So simply add the new BIO 1222 * at the tail of the list to preserve the sequential write order. 1223 */ 1224 bio_list_add(&zwplug->bio_list, bio); 1225 trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no, 1226 bio->bi_iter.bi_sector, bio_sectors(bio)); 1227 1228 /* 1229 * If we are using the disk zone write plugs worker instead of the per 1230 * zone write plug BIO work, add the zone write plug to the work list 1231 * if it is not already there. Make sure to also get an extra reference 1232 * on the zone write plug so that it does not go away until it is 1233 * removed from the work list. 1234 */ 1235 if (blk_queue_zoned_qd1_writes(disk->queue)) { 1236 spin_lock(&disk->zone_wplugs_list_lock); 1237 if (list_empty(&zwplug->entry)) { 1238 list_add_tail(&zwplug->entry, &disk->zone_wplugs_list); 1239 refcount_inc(&zwplug->ref); 1240 } 1241 spin_unlock(&disk->zone_wplugs_list_lock); 1242 } 1243 } 1244 1245 /* 1246 * Called from bio_attempt_back_merge() when a BIO was merged with a request. 1247 */ 1248 void blk_zone_write_plug_bio_merged(struct bio *bio) 1249 { 1250 struct gendisk *disk = bio->bi_bdev->bd_disk; 1251 struct blk_zone_wplug *zwplug; 1252 unsigned long flags; 1253 1254 /* 1255 * If the BIO was already plugged, then we were called through 1256 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge(). 1257 * For this case, we already hold a reference on the zone write plug for 1258 * the BIO and blk_zone_write_plug_init_request() will handle the 1259 * zone write pointer offset update. 1260 */ 1261 if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) 1262 return; 1263 1264 bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); 1265 1266 /* 1267 * Get a reference on the zone write plug of the target zone and advance 1268 * the zone write pointer offset. Given that this is a merge, we already 1269 * have at least one request and one BIO referencing the zone write 1270 * plug. So this should not fail. 1271 */ 1272 zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); 1273 if (WARN_ON_ONCE(!zwplug)) 1274 return; 1275 1276 spin_lock_irqsave(&zwplug->lock, flags); 1277 zwplug->wp_offset += bio_sectors(bio); 1278 disk_zone_wplug_update_cond(disk, zwplug); 1279 spin_unlock_irqrestore(&zwplug->lock, flags); 1280 } 1281 1282 /* 1283 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that 1284 * already went through zone write plugging (either a new BIO or one that was 1285 * unplugged). 1286 */ 1287 void blk_zone_write_plug_init_request(struct request *req) 1288 { 1289 sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req); 1290 struct request_queue *q = req->q; 1291 struct gendisk *disk = q->disk; 1292 struct blk_zone_wplug *zwplug = 1293 disk_get_zone_wplug(disk, blk_rq_pos(req)); 1294 unsigned long flags; 1295 struct bio *bio; 1296 1297 if (WARN_ON_ONCE(!zwplug)) 1298 return; 1299 1300 /* 1301 * Indicate that completion of this request needs to be handled with 1302 * blk_zone_write_plug_finish_request(), which will drop the reference 1303 * on the zone write plug we took above on entry to this function. 1304 */ 1305 req->rq_flags |= RQF_ZONE_WRITE_PLUGGING; 1306 1307 if (blk_queue_nomerges(q)) 1308 return; 1309 1310 /* 1311 * Walk through the list of plugged BIOs to check if they can be merged 1312 * into the back of the request. 1313 */ 1314 spin_lock_irqsave(&zwplug->lock, flags); 1315 while (!disk_zone_wplug_is_full(disk, zwplug)) { 1316 bio = bio_list_peek(&zwplug->bio_list); 1317 if (!bio) 1318 break; 1319 1320 if (bio->bi_iter.bi_sector != req_back_sector || 1321 !blk_rq_merge_ok(req, bio)) 1322 break; 1323 1324 WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES && 1325 !bio->__bi_nr_segments); 1326 1327 bio_list_pop(&zwplug->bio_list); 1328 if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) != 1329 BIO_MERGE_OK) { 1330 bio_list_add_head(&zwplug->bio_list, bio); 1331 break; 1332 } 1333 1334 /* Drop the reference taken by disk_zone_wplug_add_bio(). */ 1335 blk_queue_exit(q); 1336 zwplug->wp_offset += bio_sectors(bio); 1337 disk_zone_wplug_update_cond(disk, zwplug); 1338 1339 req_back_sector += bio_sectors(bio); 1340 } 1341 spin_unlock_irqrestore(&zwplug->lock, flags); 1342 } 1343 1344 /* 1345 * Check and prepare a BIO for submission by incrementing the write pointer 1346 * offset of its zone write plug and changing zone append operations into 1347 * regular write when zone append emulation is needed. 1348 */ 1349 static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, 1350 struct bio *bio) 1351 { 1352 struct gendisk *disk = bio->bi_bdev->bd_disk; 1353 1354 lockdep_assert_held(&zwplug->lock); 1355 1356 /* 1357 * If we lost track of the zone write pointer due to a write error, 1358 * the user must either execute a report zones, reset the zone or finish 1359 * the to recover a reliable write pointer position. Fail BIOs if the 1360 * user did not do that as we cannot handle emulated zone append 1361 * otherwise. 1362 */ 1363 if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) 1364 return false; 1365 1366 /* 1367 * Check that the user is not attempting to write to a full zone. 1368 * We know such BIO will fail, and that would potentially overflow our 1369 * write pointer offset beyond the end of the zone. 1370 */ 1371 if (disk_zone_wplug_is_full(disk, zwplug)) 1372 return false; 1373 1374 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 1375 /* 1376 * Use a regular write starting at the current write pointer. 1377 * Similarly to native zone append operations, do not allow 1378 * merging. 1379 */ 1380 bio->bi_opf &= ~REQ_OP_MASK; 1381 bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE; 1382 bio->bi_iter.bi_sector += zwplug->wp_offset; 1383 1384 /* 1385 * Remember that this BIO is in fact a zone append operation 1386 * so that we can restore its operation code on completion. 1387 */ 1388 bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND); 1389 } else { 1390 /* 1391 * Check for non-sequential writes early as we know that BIOs 1392 * with a start sector not unaligned to the zone write pointer 1393 * will fail. 1394 */ 1395 if (bio_offset_from_zone_start(bio) != zwplug->wp_offset) 1396 return false; 1397 } 1398 1399 /* Advance the zone write pointer offset. */ 1400 zwplug->wp_offset += bio_sectors(bio); 1401 disk_zone_wplug_update_cond(disk, zwplug); 1402 1403 return true; 1404 } 1405 1406 static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) 1407 { 1408 struct gendisk *disk = bio->bi_bdev->bd_disk; 1409 sector_t sector = bio->bi_iter.bi_sector; 1410 struct blk_zone_wplug *zwplug; 1411 gfp_t gfp_mask = GFP_NOIO; 1412 unsigned long flags; 1413 1414 /* 1415 * BIOs must be fully contained within a zone so that we use the correct 1416 * zone write plug for the entire BIO. For blk-mq devices, the block 1417 * layer should already have done any splitting required to ensure this 1418 * and this BIO should thus not be straddling zone boundaries. For 1419 * BIO-based devices, it is the responsibility of the driver to split 1420 * the bio before submitting it. 1421 */ 1422 if (WARN_ON_ONCE(bio_straddles_zones(bio))) { 1423 bio_io_error(bio); 1424 return true; 1425 } 1426 1427 /* Conventional zones do not need write plugging. */ 1428 if (!bdev_zone_is_seq(bio->bi_bdev, sector)) { 1429 /* Zone append to conventional zones is not allowed. */ 1430 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 1431 bio_io_error(bio); 1432 return true; 1433 } 1434 return false; 1435 } 1436 1437 if (bio->bi_opf & REQ_NOWAIT) 1438 gfp_mask = GFP_NOWAIT; 1439 1440 zwplug = disk_get_or_alloc_zone_wplug(disk, sector, gfp_mask); 1441 if (!zwplug) { 1442 if (bio->bi_opf & REQ_NOWAIT) 1443 bio_wouldblock_error(bio); 1444 else 1445 bio_io_error(bio); 1446 return true; 1447 } 1448 1449 spin_lock_irqsave(&zwplug->lock, flags); 1450 1451 /* 1452 * If we got a zone write plug marked as dead, then the user is issuing 1453 * writes to a full zone, or without synchronizing with zone reset or 1454 * zone finish operations. In such case, fail the BIO to signal this 1455 * invalid usage. 1456 */ 1457 if (zwplug->flags & BLK_ZONE_WPLUG_DEAD) { 1458 spin_unlock_irqrestore(&zwplug->lock, flags); 1459 disk_put_zone_wplug(zwplug); 1460 bio_io_error(bio); 1461 return true; 1462 } 1463 1464 /* Indicate that this BIO is being handled using zone write plugging. */ 1465 bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); 1466 1467 /* 1468 * Add REQ_NOWAIT BIOs to the plug list to ensure that we will not see a 1469 * BLK_STS_AGAIN failure if we let the caller submit the BIO. 1470 */ 1471 if (bio->bi_opf & REQ_NOWAIT) { 1472 bio->bi_opf &= ~REQ_NOWAIT; 1473 goto queue_bio; 1474 } 1475 1476 /* 1477 * For rotational devices, we will use the gendisk zone write plugs 1478 * work instead of the per zone write plug BIO work, so queue the BIO. 1479 */ 1480 if (blk_queue_zoned_qd1_writes(disk->queue)) 1481 goto queue_bio; 1482 1483 /* If the zone is already plugged, add the BIO to the BIO plug list. */ 1484 if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) 1485 goto queue_bio; 1486 1487 if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { 1488 spin_unlock_irqrestore(&zwplug->lock, flags); 1489 bio_io_error(bio); 1490 return true; 1491 } 1492 1493 /* Otherwise, plug and let the caller submit the BIO. */ 1494 zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; 1495 1496 spin_unlock_irqrestore(&zwplug->lock, flags); 1497 1498 return false; 1499 1500 queue_bio: 1501 disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs); 1502 1503 if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) { 1504 zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; 1505 if (blk_queue_zoned_qd1_writes(disk->queue)) 1506 wake_up_process(disk->zone_wplugs_worker); 1507 else 1508 disk_zone_wplug_schedule_work(disk, zwplug); 1509 } 1510 1511 spin_unlock_irqrestore(&zwplug->lock, flags); 1512 1513 return true; 1514 } 1515 1516 static void blk_zone_wplug_handle_native_zone_append(struct bio *bio) 1517 { 1518 struct gendisk *disk = bio->bi_bdev->bd_disk; 1519 struct blk_zone_wplug *zwplug; 1520 unsigned long flags; 1521 1522 if (!test_bit(GD_ZONE_APPEND_USED, &disk->state)) 1523 set_bit(GD_ZONE_APPEND_USED, &disk->state); 1524 1525 /* 1526 * We have native support for zone append operations, so we are not 1527 * going to handle @bio through plugging. However, we may already have a 1528 * zone write plug for the target zone if that zone was previously 1529 * partially written using regular writes. In such case, we risk leaving 1530 * the plug in the disk hash table if the zone is fully written using 1531 * zone append operations. Avoid this by removing the zone write plug. 1532 */ 1533 zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); 1534 if (likely(!zwplug)) 1535 return; 1536 1537 spin_lock_irqsave(&zwplug->lock, flags); 1538 1539 /* 1540 * We are about to remove the zone write plug. But if the user 1541 * (mistakenly) has issued regular writes together with native zone 1542 * append, we must aborts the writes as otherwise the plugged BIOs would 1543 * not be executed by the plug BIO work as disk_get_zone_wplug() will 1544 * return NULL after the plug is removed. Aborting the plugged write 1545 * BIOs is consistent with the fact that these writes will most likely 1546 * fail anyway as there is no ordering guarantees between zone append 1547 * operations and regular write operations. 1548 */ 1549 if (!bio_list_empty(&zwplug->bio_list)) { 1550 pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n", 1551 disk->disk_name, zwplug->zone_no); 1552 disk_zone_wplug_abort(zwplug); 1553 } 1554 disk_mark_zone_wplug_dead(zwplug); 1555 spin_unlock_irqrestore(&zwplug->lock, flags); 1556 1557 disk_put_zone_wplug(zwplug); 1558 } 1559 1560 static bool blk_zone_wplug_handle_zone_mgmt(struct bio *bio) 1561 { 1562 if (bio_op(bio) != REQ_OP_ZONE_RESET_ALL && 1563 !bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) { 1564 /* 1565 * Zone reset and zone finish operations do not apply to 1566 * conventional zones. 1567 */ 1568 bio_io_error(bio); 1569 return true; 1570 } 1571 1572 /* 1573 * No-wait zone management BIOs do not make much sense as the callers 1574 * issue these as blocking operations in most cases. To avoid issues 1575 * with the BIO execution potentially failing with BLK_STS_AGAIN, warn 1576 * about REQ_NOWAIT being set and ignore that flag. 1577 */ 1578 if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) 1579 bio->bi_opf &= ~REQ_NOWAIT; 1580 1581 return false; 1582 } 1583 1584 /** 1585 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging 1586 * @bio: The BIO being submitted 1587 * @nr_segs: The number of physical segments of @bio 1588 * 1589 * Handle write, write zeroes and zone append operations requiring emulation 1590 * using zone write plugging. 1591 * 1592 * Return true whenever @bio execution needs to be delayed through the zone 1593 * write plug. Otherwise, return false to let the submission path process 1594 * @bio normally. 1595 */ 1596 bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) 1597 { 1598 struct block_device *bdev = bio->bi_bdev; 1599 1600 if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash)) 1601 return false; 1602 1603 /* 1604 * Regular writes and write zeroes need to be handled through the target 1605 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH 1606 * which may need to go through the flush machinery depending on the 1607 * target device capabilities. Plugging such writes is fine as the flush 1608 * machinery operates at the request level, below the plug, and 1609 * completion of the flush sequence will go through the regular BIO 1610 * completion, which will handle zone write plugging. 1611 * Zone append operations for devices that requested emulation must 1612 * also be plugged so that these BIOs can be changed into regular 1613 * write BIOs. 1614 * Zone reset, reset all and finish commands need special treatment 1615 * to correctly track the write pointer offset of zones. These commands 1616 * are not plugged as we do not need serialization with write 1617 * operations. It is the responsibility of the user to not issue reset 1618 * and finish commands when write operations are in flight. 1619 */ 1620 switch (bio_op(bio)) { 1621 case REQ_OP_ZONE_APPEND: 1622 if (!bdev_emulates_zone_append(bdev)) { 1623 blk_zone_wplug_handle_native_zone_append(bio); 1624 return false; 1625 } 1626 fallthrough; 1627 case REQ_OP_WRITE: 1628 case REQ_OP_WRITE_ZEROES: 1629 return blk_zone_wplug_handle_write(bio, nr_segs); 1630 case REQ_OP_ZONE_RESET: 1631 case REQ_OP_ZONE_FINISH: 1632 case REQ_OP_ZONE_RESET_ALL: 1633 return blk_zone_wplug_handle_zone_mgmt(bio); 1634 default: 1635 return false; 1636 } 1637 1638 return false; 1639 } 1640 EXPORT_SYMBOL_GPL(blk_zone_plug_bio); 1641 1642 static void disk_zone_wplug_unplug_bio(struct gendisk *disk, 1643 struct blk_zone_wplug *zwplug) 1644 { 1645 unsigned long flags; 1646 1647 spin_lock_irqsave(&zwplug->lock, flags); 1648 1649 /* 1650 * For rotational devices, signal the BIO completion to the zone write 1651 * plug work. Otherwise, schedule submission of the next plugged BIO 1652 * if we have one. 1653 */ 1654 if (bio_list_empty(&zwplug->bio_list)) 1655 zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; 1656 1657 if (blk_queue_zoned_qd1_writes(disk->queue)) 1658 complete(&disk->zone_wplugs_worker_bio_done); 1659 else if (!bio_list_empty(&zwplug->bio_list)) 1660 disk_zone_wplug_schedule_work(disk, zwplug); 1661 1662 if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug)) 1663 disk_mark_zone_wplug_dead(zwplug); 1664 1665 spin_unlock_irqrestore(&zwplug->lock, flags); 1666 } 1667 1668 void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio) 1669 { 1670 /* 1671 * For zone append requests, the request sector indicates the location 1672 * at which the BIO data was written. Return this value to the BIO 1673 * issuer through the BIO iter sector. 1674 * For plugged zone writes, which include emulated zone append, we need 1675 * the original BIO sector so that blk_zone_write_plug_bio_endio() can 1676 * lookup the zone write plug. 1677 */ 1678 bio->bi_iter.bi_sector = rq->__sector; 1679 trace_blk_zone_append_update_request_bio(rq); 1680 } 1681 1682 void blk_zone_write_plug_bio_endio(struct bio *bio) 1683 { 1684 struct gendisk *disk = bio->bi_bdev->bd_disk; 1685 struct blk_zone_wplug *zwplug = 1686 disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); 1687 unsigned long flags; 1688 1689 if (WARN_ON_ONCE(!zwplug)) 1690 return; 1691 1692 /* Make sure we do not see this BIO again by clearing the plug flag. */ 1693 bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); 1694 1695 /* 1696 * If this is a regular write emulating a zone append operation, 1697 * restore the original operation code. 1698 */ 1699 if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) { 1700 bio->bi_opf &= ~REQ_OP_MASK; 1701 bio->bi_opf |= REQ_OP_ZONE_APPEND; 1702 bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND); 1703 } 1704 1705 /* 1706 * If the BIO failed, abort all plugged BIOs and mark the plug as 1707 * needing a write pointer update. 1708 */ 1709 if (bio->bi_status != BLK_STS_OK) { 1710 spin_lock_irqsave(&zwplug->lock, flags); 1711 disk_zone_wplug_abort(zwplug); 1712 zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE; 1713 spin_unlock_irqrestore(&zwplug->lock, flags); 1714 } 1715 1716 /* Drop the reference we took when the BIO was issued. */ 1717 disk_put_zone_wplug(zwplug); 1718 1719 /* 1720 * For BIO-based devices, blk_zone_write_plug_finish_request() 1721 * is not called. So we need to schedule execution of the next 1722 * plugged BIO here. 1723 */ 1724 if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) 1725 disk_zone_wplug_unplug_bio(disk, zwplug); 1726 1727 /* Drop the reference we took when entering this function. */ 1728 disk_put_zone_wplug(zwplug); 1729 } 1730 1731 void blk_zone_write_plug_finish_request(struct request *req) 1732 { 1733 struct gendisk *disk = req->q->disk; 1734 struct blk_zone_wplug *zwplug; 1735 1736 zwplug = disk_get_zone_wplug(disk, req->__sector); 1737 if (WARN_ON_ONCE(!zwplug)) 1738 return; 1739 1740 req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING; 1741 1742 /* 1743 * Drop the reference we took when the request was initialized in 1744 * blk_zone_write_plug_init_request(). 1745 */ 1746 disk_put_zone_wplug(zwplug); 1747 1748 disk_zone_wplug_unplug_bio(disk, zwplug); 1749 1750 /* Drop the reference we took when entering this function. */ 1751 disk_put_zone_wplug(zwplug); 1752 } 1753 1754 static bool disk_zone_wplug_submit_bio(struct gendisk *disk, 1755 struct blk_zone_wplug *zwplug) 1756 { 1757 struct block_device *bdev; 1758 unsigned long flags; 1759 struct bio *bio; 1760 bool prepared; 1761 1762 /* 1763 * Submit the next plugged BIO. If we do not have any, clear 1764 * the plugged flag. 1765 */ 1766 again: 1767 spin_lock_irqsave(&zwplug->lock, flags); 1768 bio = bio_list_pop(&zwplug->bio_list); 1769 if (!bio) { 1770 zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; 1771 spin_unlock_irqrestore(&zwplug->lock, flags); 1772 return false; 1773 } 1774 1775 trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no, 1776 bio->bi_iter.bi_sector, bio_sectors(bio)); 1777 1778 prepared = blk_zone_wplug_prepare_bio(zwplug, bio); 1779 spin_unlock_irqrestore(&zwplug->lock, flags); 1780 1781 if (!prepared) { 1782 blk_zone_wplug_bio_io_error(zwplug, bio); 1783 goto again; 1784 } 1785 1786 /* 1787 * blk-mq devices will reuse the extra reference on the request queue 1788 * usage counter we took when the BIO was plugged, but the submission 1789 * path for BIO-based devices will not do that. So drop this extra 1790 * reference here. 1791 */ 1792 if (blk_queue_zoned_qd1_writes(disk->queue)) 1793 reinit_completion(&disk->zone_wplugs_worker_bio_done); 1794 bdev = bio->bi_bdev; 1795 if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) { 1796 bdev->bd_disk->fops->submit_bio(bio); 1797 blk_queue_exit(bdev->bd_disk->queue); 1798 } else { 1799 blk_mq_submit_bio(bio); 1800 } 1801 1802 return true; 1803 } 1804 1805 static struct blk_zone_wplug *disk_get_zone_wplugs_work(struct gendisk *disk) 1806 { 1807 struct blk_zone_wplug *zwplug; 1808 1809 spin_lock_irq(&disk->zone_wplugs_list_lock); 1810 zwplug = list_first_entry_or_null(&disk->zone_wplugs_list, 1811 struct blk_zone_wplug, entry); 1812 if (zwplug) 1813 list_del_init(&zwplug->entry); 1814 spin_unlock_irq(&disk->zone_wplugs_list_lock); 1815 1816 return zwplug; 1817 } 1818 1819 static int disk_zone_wplugs_worker(void *data) 1820 { 1821 struct gendisk *disk = data; 1822 struct blk_zone_wplug *zwplug; 1823 unsigned int noio_flag; 1824 1825 noio_flag = memalloc_noio_save(); 1826 set_user_nice(current, MIN_NICE); 1827 set_freezable(); 1828 1829 for (;;) { 1830 set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); 1831 1832 zwplug = disk_get_zone_wplugs_work(disk); 1833 if (zwplug) { 1834 /* 1835 * Process all BIOs of this zone write plug and then 1836 * drop the reference we took when adding the zone write 1837 * plug to the active list. 1838 */ 1839 set_current_state(TASK_RUNNING); 1840 while (disk_zone_wplug_submit_bio(disk, zwplug)) 1841 blk_wait_io(&disk->zone_wplugs_worker_bio_done); 1842 disk_put_zone_wplug(zwplug); 1843 continue; 1844 } 1845 1846 /* 1847 * Only sleep if nothing sets the state to running. Else check 1848 * for zone write plugs work again as a newly submitted BIO 1849 * might have added a zone write plug to the work list. 1850 */ 1851 if (get_current_state() == TASK_RUNNING) { 1852 try_to_freeze(); 1853 } else { 1854 if (kthread_should_stop()) { 1855 set_current_state(TASK_RUNNING); 1856 break; 1857 } 1858 schedule(); 1859 } 1860 } 1861 1862 WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list)); 1863 memalloc_noio_restore(noio_flag); 1864 1865 return 0; 1866 } 1867 1868 void disk_init_zone_resources(struct gendisk *disk) 1869 { 1870 spin_lock_init(&disk->zone_wplugs_hash_lock); 1871 spin_lock_init(&disk->zone_wplugs_list_lock); 1872 INIT_LIST_HEAD(&disk->zone_wplugs_list); 1873 init_completion(&disk->zone_wplugs_worker_bio_done); 1874 } 1875 1876 /* 1877 * For the size of a disk zone write plug hash table, use the size of the 1878 * zone write plug mempool, which is the maximum of the disk open zones and 1879 * active zones limits. But do not exceed 4KB (512 hlist head entries), that is, 1880 * 9 bits. For a disk that has no limits, mempool size defaults to 128. 1881 */ 1882 #define BLK_ZONE_WPLUG_MAX_HASH_BITS 9 1883 #define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE 128 1884 1885 static int disk_alloc_zone_resources(struct gendisk *disk, 1886 unsigned int pool_size) 1887 { 1888 unsigned int i; 1889 int ret = -ENOMEM; 1890 1891 atomic_set(&disk->nr_zone_wplugs, 0); 1892 disk->zone_wplugs_hash_bits = 1893 min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS); 1894 1895 disk->zone_wplugs_hash = 1896 kzalloc_objs(struct hlist_head, 1897 disk_zone_wplugs_hash_size(disk)); 1898 if (!disk->zone_wplugs_hash) 1899 return -ENOMEM; 1900 1901 for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) 1902 INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]); 1903 1904 disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size, 1905 sizeof(struct blk_zone_wplug)); 1906 if (!disk->zone_wplugs_pool) 1907 goto free_hash; 1908 1909 disk->zone_wplugs_wq = 1910 alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI, 1911 pool_size, disk->disk_name); 1912 if (!disk->zone_wplugs_wq) 1913 goto destroy_pool; 1914 1915 disk->zone_wplugs_worker = 1916 kthread_create(disk_zone_wplugs_worker, disk, 1917 "%s_zwplugs_worker", disk->disk_name); 1918 if (IS_ERR(disk->zone_wplugs_worker)) { 1919 ret = PTR_ERR(disk->zone_wplugs_worker); 1920 disk->zone_wplugs_worker = NULL; 1921 goto destroy_wq; 1922 } 1923 wake_up_process(disk->zone_wplugs_worker); 1924 1925 return 0; 1926 1927 destroy_wq: 1928 destroy_workqueue(disk->zone_wplugs_wq); 1929 disk->zone_wplugs_wq = NULL; 1930 destroy_pool: 1931 mempool_destroy(disk->zone_wplugs_pool); 1932 disk->zone_wplugs_pool = NULL; 1933 free_hash: 1934 kfree(disk->zone_wplugs_hash); 1935 disk->zone_wplugs_hash = NULL; 1936 disk->zone_wplugs_hash_bits = 0; 1937 return ret; 1938 } 1939 1940 static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) 1941 { 1942 struct blk_zone_wplug *zwplug; 1943 unsigned int i; 1944 1945 if (!disk->zone_wplugs_hash) 1946 return; 1947 1948 /* Free all the zone write plugs we have. */ 1949 for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { 1950 while (!hlist_empty(&disk->zone_wplugs_hash[i])) { 1951 zwplug = hlist_entry(disk->zone_wplugs_hash[i].first, 1952 struct blk_zone_wplug, node); 1953 spin_lock_irq(&zwplug->lock); 1954 disk_mark_zone_wplug_dead(zwplug); 1955 spin_unlock_irq(&zwplug->lock); 1956 } 1957 } 1958 1959 WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs)); 1960 kfree(disk->zone_wplugs_hash); 1961 disk->zone_wplugs_hash = NULL; 1962 disk->zone_wplugs_hash_bits = 0; 1963 1964 /* 1965 * Wait for the zone write plugs to be RCU-freed before destroying the 1966 * mempool. 1967 */ 1968 rcu_barrier(); 1969 mempool_destroy(disk->zone_wplugs_pool); 1970 disk->zone_wplugs_pool = NULL; 1971 } 1972 1973 static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond) 1974 { 1975 unsigned long flags; 1976 1977 spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); 1978 zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond, 1979 lockdep_is_held(&disk->zone_wplugs_hash_lock)); 1980 spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); 1981 1982 kfree_rcu_mightsleep(zones_cond); 1983 } 1984 1985 void disk_free_zone_resources(struct gendisk *disk) 1986 { 1987 if (disk->zone_wplugs_worker) 1988 kthread_stop(disk->zone_wplugs_worker); 1989 WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list)); 1990 1991 if (disk->zone_wplugs_wq) { 1992 destroy_workqueue(disk->zone_wplugs_wq); 1993 disk->zone_wplugs_wq = NULL; 1994 } 1995 1996 disk_destroy_zone_wplugs_hash_table(disk); 1997 1998 disk_set_zones_cond_array(disk, NULL); 1999 disk->zone_capacity = 0; 2000 disk->last_zone_capacity = 0; 2001 disk->nr_zones = 0; 2002 } 2003 2004 struct blk_revalidate_zone_args { 2005 struct gendisk *disk; 2006 u8 *zones_cond; 2007 unsigned int nr_zones; 2008 unsigned int nr_conv_zones; 2009 unsigned int zone_capacity; 2010 unsigned int last_zone_capacity; 2011 sector_t sector; 2012 }; 2013 2014 static int disk_revalidate_zone_resources(struct gendisk *disk, 2015 struct blk_revalidate_zone_args *args) 2016 { 2017 struct queue_limits *lim = &disk->queue->limits; 2018 unsigned int pool_size; 2019 2020 args->disk = disk; 2021 args->nr_zones = 2022 DIV_ROUND_UP_ULL(get_capacity(disk), lim->chunk_sectors); 2023 2024 /* Cached zone conditions: 1 byte per zone */ 2025 args->zones_cond = kzalloc(args->nr_zones, GFP_NOIO); 2026 if (!args->zones_cond) 2027 return -ENOMEM; 2028 2029 if (!disk_need_zone_resources(disk)) 2030 return 0; 2031 2032 /* 2033 * If the device has no limit on the maximum number of open and active 2034 * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE. 2035 */ 2036 pool_size = max(lim->max_open_zones, lim->max_active_zones); 2037 if (!pool_size) 2038 pool_size = 2039 min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones); 2040 2041 if (!disk->zone_wplugs_hash) 2042 return disk_alloc_zone_resources(disk, pool_size); 2043 2044 return 0; 2045 } 2046 2047 /* 2048 * Update the disk zone resources information and device queue limits. 2049 * The disk queue is frozen when this is executed. 2050 */ 2051 static int disk_update_zone_resources(struct gendisk *disk, 2052 struct blk_revalidate_zone_args *args) 2053 { 2054 struct request_queue *q = disk->queue; 2055 unsigned int nr_seq_zones; 2056 unsigned int pool_size, memflags; 2057 struct queue_limits lim; 2058 int ret = 0; 2059 2060 lim = queue_limits_start_update(q); 2061 2062 memflags = blk_mq_freeze_queue(q); 2063 2064 disk->nr_zones = args->nr_zones; 2065 if (args->nr_conv_zones >= disk->nr_zones) { 2066 queue_limits_cancel_update(q); 2067 pr_warn("%s: Invalid number of conventional zones %u / %u\n", 2068 disk->disk_name, args->nr_conv_zones, disk->nr_zones); 2069 ret = -ENODEV; 2070 goto unfreeze; 2071 } 2072 2073 disk->zone_capacity = args->zone_capacity; 2074 disk->last_zone_capacity = args->last_zone_capacity; 2075 disk_set_zones_cond_array(disk, args->zones_cond); 2076 2077 /* 2078 * Some devices can advertise zone resource limits that are larger than 2079 * the number of sequential zones of the zoned block device, e.g. a 2080 * small ZNS namespace. For such case, assume that the zoned device has 2081 * no zone resource limits. 2082 */ 2083 nr_seq_zones = disk->nr_zones - args->nr_conv_zones; 2084 if (lim.max_open_zones >= nr_seq_zones) 2085 lim.max_open_zones = 0; 2086 if (lim.max_active_zones >= nr_seq_zones) 2087 lim.max_active_zones = 0; 2088 2089 if (!disk->zone_wplugs_pool) 2090 goto commit; 2091 2092 /* 2093 * If the device has no limit on the maximum number of open and active 2094 * zones, set its max open zone limit to the mempool size to indicate 2095 * to the user that there is a potential performance impact due to 2096 * dynamic zone write plug allocation when simultaneously writing to 2097 * more zones than the size of the mempool. 2098 */ 2099 pool_size = max(lim.max_open_zones, lim.max_active_zones); 2100 if (!pool_size) 2101 pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones); 2102 2103 mempool_resize(disk->zone_wplugs_pool, pool_size); 2104 2105 if (!lim.max_open_zones && !lim.max_active_zones) { 2106 if (pool_size < nr_seq_zones) 2107 lim.max_open_zones = pool_size; 2108 else 2109 lim.max_open_zones = 0; 2110 } 2111 2112 commit: 2113 ret = queue_limits_commit_update(q, &lim); 2114 2115 unfreeze: 2116 if (ret) 2117 disk_free_zone_resources(disk); 2118 2119 blk_mq_unfreeze_queue(q, memflags); 2120 2121 return ret; 2122 } 2123 2124 static int blk_revalidate_zone_cond(struct blk_zone *zone, unsigned int idx, 2125 struct blk_revalidate_zone_args *args) 2126 { 2127 enum blk_zone_cond cond = zone->cond; 2128 2129 /* Check that the zone condition is consistent with the zone type. */ 2130 switch (cond) { 2131 case BLK_ZONE_COND_NOT_WP: 2132 if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) 2133 goto invalid_condition; 2134 break; 2135 case BLK_ZONE_COND_IMP_OPEN: 2136 case BLK_ZONE_COND_EXP_OPEN: 2137 case BLK_ZONE_COND_CLOSED: 2138 case BLK_ZONE_COND_EMPTY: 2139 case BLK_ZONE_COND_FULL: 2140 case BLK_ZONE_COND_OFFLINE: 2141 case BLK_ZONE_COND_READONLY: 2142 if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) 2143 goto invalid_condition; 2144 break; 2145 default: 2146 pr_warn("%s: Invalid zone condition 0x%X\n", 2147 args->disk->disk_name, cond); 2148 return -ENODEV; 2149 } 2150 2151 blk_zone_set_cond(args->zones_cond, idx, cond); 2152 2153 return 0; 2154 2155 invalid_condition: 2156 pr_warn("%s: Invalid zone condition 0x%x for type 0x%x\n", 2157 args->disk->disk_name, cond, zone->type); 2158 2159 return -ENODEV; 2160 } 2161 2162 static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, 2163 struct blk_revalidate_zone_args *args) 2164 { 2165 struct gendisk *disk = args->disk; 2166 2167 if (zone->capacity != zone->len) { 2168 pr_warn("%s: Invalid conventional zone capacity\n", 2169 disk->disk_name); 2170 return -ENODEV; 2171 } 2172 2173 if (disk_zone_is_last(disk, zone)) 2174 args->last_zone_capacity = zone->capacity; 2175 2176 args->nr_conv_zones++; 2177 2178 return 0; 2179 } 2180 2181 static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, 2182 struct blk_revalidate_zone_args *args) 2183 { 2184 struct gendisk *disk = args->disk; 2185 struct blk_zone_wplug *zwplug; 2186 unsigned int wp_offset; 2187 2188 /* 2189 * Remember the capacity of the first sequential zone and check 2190 * if it is constant for all zones, ignoring the last zone as it can be 2191 * smaller. 2192 */ 2193 if (!args->zone_capacity) 2194 args->zone_capacity = zone->capacity; 2195 if (disk_zone_is_last(disk, zone)) { 2196 args->last_zone_capacity = zone->capacity; 2197 } else if (zone->capacity != args->zone_capacity) { 2198 pr_warn("%s: Invalid variable zone capacity\n", 2199 disk->disk_name); 2200 return -ENODEV; 2201 } 2202 2203 /* 2204 * If the device needs zone append emulation, we need to track the 2205 * write pointer of all zones that are not empty nor full. So make sure 2206 * we have a zone write plug for such zone if the device has a zone 2207 * write plug hash table. 2208 */ 2209 if (!disk->zone_wplugs_hash) 2210 return 0; 2211 2212 wp_offset = disk_zone_wplug_sync_wp_offset(disk, zone); 2213 if (!wp_offset || wp_offset >= zone->capacity) 2214 return 0; 2215 2216 zwplug = disk_get_or_alloc_zone_wplug(disk, zone->wp, GFP_NOIO); 2217 if (!zwplug) 2218 return -ENOMEM; 2219 disk_put_zone_wplug(zwplug); 2220 2221 return 0; 2222 } 2223 2224 /* 2225 * Helper function to check the validity of zones of a zoned block device. 2226 */ 2227 static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, 2228 void *data) 2229 { 2230 struct blk_revalidate_zone_args *args = data; 2231 struct gendisk *disk = args->disk; 2232 sector_t zone_sectors = disk->queue->limits.chunk_sectors; 2233 int ret; 2234 2235 /* Check for bad zones and holes in the zone report */ 2236 if (zone->start != args->sector) { 2237 pr_warn("%s: Zone gap at sectors %llu..%llu\n", 2238 disk->disk_name, args->sector, zone->start); 2239 return -ENODEV; 2240 } 2241 2242 if (zone->start >= get_capacity(disk) || !zone->len) { 2243 pr_warn("%s: Invalid zone start %llu, length %llu\n", 2244 disk->disk_name, zone->start, zone->len); 2245 return -ENODEV; 2246 } 2247 2248 /* 2249 * All zones must have the same size, with the exception on an eventual 2250 * smaller last zone. 2251 */ 2252 if (!disk_zone_is_last(disk, zone)) { 2253 if (zone->len != zone_sectors) { 2254 pr_warn("%s: Invalid zoned device with non constant zone size\n", 2255 disk->disk_name); 2256 return -ENODEV; 2257 } 2258 } else if (zone->len > zone_sectors) { 2259 pr_warn("%s: Invalid zoned device with larger last zone size\n", 2260 disk->disk_name); 2261 return -ENODEV; 2262 } 2263 2264 if (!zone->capacity || zone->capacity > zone->len) { 2265 pr_warn("%s: Invalid zone capacity\n", 2266 disk->disk_name); 2267 return -ENODEV; 2268 } 2269 2270 /* Check zone condition */ 2271 ret = blk_revalidate_zone_cond(zone, idx, args); 2272 if (ret) 2273 return ret; 2274 2275 /* Check zone type */ 2276 switch (zone->type) { 2277 case BLK_ZONE_TYPE_CONVENTIONAL: 2278 ret = blk_revalidate_conv_zone(zone, idx, args); 2279 break; 2280 case BLK_ZONE_TYPE_SEQWRITE_REQ: 2281 ret = blk_revalidate_seq_zone(zone, idx, args); 2282 break; 2283 case BLK_ZONE_TYPE_SEQWRITE_PREF: 2284 default: 2285 pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", 2286 disk->disk_name, (int)zone->type, zone->start); 2287 ret = -ENODEV; 2288 } 2289 2290 if (!ret) 2291 args->sector += zone->len; 2292 2293 return ret; 2294 } 2295 2296 /** 2297 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs 2298 * @disk: Target disk 2299 * 2300 * Helper function for low-level device drivers to check, (re) allocate and 2301 * initialize resources used for managing zoned disks. This function should 2302 * normally be called by blk-mq based drivers when a zoned gendisk is probed 2303 * and when the zone configuration of the gendisk changes (e.g. after a format). 2304 * Before calling this function, the device driver must already have set the 2305 * device zone size (chunk_sector limit) and the max zone append limit. 2306 * BIO based drivers can also use this function as long as the device queue 2307 * can be safely frozen. 2308 */ 2309 int blk_revalidate_disk_zones(struct gendisk *disk) 2310 { 2311 struct request_queue *q = disk->queue; 2312 sector_t zone_sectors = q->limits.chunk_sectors; 2313 sector_t capacity = get_capacity(disk); 2314 struct blk_revalidate_zone_args args = { }; 2315 unsigned int memflags, noio_flag; 2316 struct blk_report_zones_args rep_args = { 2317 .cb = blk_revalidate_zone_cb, 2318 .data = &args, 2319 }; 2320 int ret = -ENOMEM; 2321 2322 if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) 2323 return -EIO; 2324 2325 if (!capacity) 2326 return -ENODEV; 2327 2328 /* 2329 * Checks that the device driver indicated a valid zone size and that 2330 * the max zone append limit is set. 2331 */ 2332 if (!zone_sectors || !is_power_of_2(zone_sectors)) { 2333 pr_warn("%s: Invalid non power of two zone size (%llu)\n", 2334 disk->disk_name, zone_sectors); 2335 return -ENODEV; 2336 } 2337 2338 /* 2339 * Ensure that all memory allocations in this context are done as if 2340 * GFP_NOIO was specified. 2341 */ 2342 noio_flag = memalloc_noio_save(); 2343 ret = disk_revalidate_zone_resources(disk, &args); 2344 if (ret) { 2345 memalloc_noio_restore(noio_flag); 2346 return ret; 2347 } 2348 2349 ret = disk->fops->report_zones(disk, 0, UINT_MAX, &rep_args); 2350 if (!ret) { 2351 pr_warn("%s: No zones reported\n", disk->disk_name); 2352 ret = -ENODEV; 2353 } 2354 memalloc_noio_restore(noio_flag); 2355 2356 /* 2357 * If zones where reported, make sure that the entire disk capacity 2358 * has been checked. 2359 */ 2360 if (ret > 0 && args.sector != capacity) { 2361 pr_warn("%s: Missing zones from sector %llu\n", 2362 disk->disk_name, args.sector); 2363 ret = -ENODEV; 2364 } 2365 2366 if (ret > 0) 2367 return disk_update_zone_resources(disk, &args); 2368 2369 pr_warn("%s: failed to revalidate zones\n", disk->disk_name); 2370 2371 memflags = blk_mq_freeze_queue(q); 2372 disk_free_zone_resources(disk); 2373 blk_mq_unfreeze_queue(q, memflags); 2374 2375 return ret; 2376 } 2377 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); 2378 2379 /** 2380 * blk_zone_issue_zeroout - zero-fill a block range in a zone 2381 * @bdev: blockdev to write 2382 * @sector: start sector 2383 * @nr_sects: number of sectors to write 2384 * @gfp_mask: memory allocation flags (for bio_alloc) 2385 * 2386 * Description: 2387 * Zero-fill a block range in a zone (@sector must be equal to the zone write 2388 * pointer), handling potential errors due to the (initially unknown) lack of 2389 * hardware offload (See blkdev_issue_zeroout()). 2390 */ 2391 int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, 2392 sector_t nr_sects, gfp_t gfp_mask) 2393 { 2394 struct gendisk *disk = bdev->bd_disk; 2395 int ret; 2396 2397 if (WARN_ON_ONCE(!bdev_is_zoned(bdev))) 2398 return -EIO; 2399 2400 ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 2401 BLKDEV_ZERO_NOFALLBACK); 2402 if (ret != -EOPNOTSUPP) 2403 return ret; 2404 2405 /* 2406 * The failed call to blkdev_issue_zeroout() advanced the zone write 2407 * pointer. Undo this using a report zone to update the zone write 2408 * pointer to the correct current value. 2409 */ 2410 ret = disk->fops->report_zones(disk, sector, 1, NULL); 2411 if (ret != 1) 2412 return ret < 0 ? ret : -EIO; 2413 2414 /* 2415 * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a 2416 * regular write with zero-pages. 2417 */ 2418 return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0); 2419 } 2420 EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout); 2421 2422 #ifdef CONFIG_BLK_DEBUG_FS 2423 static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug, 2424 struct seq_file *m) 2425 { 2426 unsigned int zwp_wp_offset, zwp_flags; 2427 unsigned int zwp_zone_no, zwp_ref; 2428 unsigned int zwp_bio_list_size; 2429 enum blk_zone_cond zwp_cond; 2430 unsigned long flags; 2431 2432 spin_lock_irqsave(&zwplug->lock, flags); 2433 zwp_zone_no = zwplug->zone_no; 2434 zwp_flags = zwplug->flags; 2435 zwp_ref = refcount_read(&zwplug->ref); 2436 zwp_cond = zwplug->cond; 2437 zwp_wp_offset = zwplug->wp_offset; 2438 zwp_bio_list_size = bio_list_size(&zwplug->bio_list); 2439 spin_unlock_irqrestore(&zwplug->lock, flags); 2440 2441 seq_printf(m, 2442 "Zone no: %u, flags: 0x%x, ref: %u, cond: %s, wp ofst: %u, pending BIO: %u\n", 2443 zwp_zone_no, zwp_flags, zwp_ref, blk_zone_cond_str(zwp_cond), 2444 zwp_wp_offset, zwp_bio_list_size); 2445 } 2446 2447 int queue_zone_wplugs_show(void *data, struct seq_file *m) 2448 { 2449 struct request_queue *q = data; 2450 struct gendisk *disk = q->disk; 2451 struct blk_zone_wplug *zwplug; 2452 unsigned int i; 2453 2454 if (!disk->zone_wplugs_hash) 2455 return 0; 2456 2457 rcu_read_lock(); 2458 for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) 2459 hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i], 2460 node) 2461 queue_zone_wplug_show(zwplug, m); 2462 rcu_read_unlock(); 2463 2464 return 0; 2465 } 2466 2467 #endif 2468