// SPDX-License-Identifier: GPL-2.0
/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 * Copyright (c) 2024, Western Digital Corporation or its affiliates.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/mempool.h>
#include <linux/kthread.h>
#include <linux/freezer.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-mq-debugfs.h"

/*
 * Table of zone condition names indexed by BLK_ZONE_COND_* value. Each entry
 * is the stringified condition suffix (e.g. "EMPTY"). Indexes that have no
 * designated initializer below are left NULL.
 */
#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
	ZONE_COND_NAME(NOT_WP),
	ZONE_COND_NAME(EMPTY),
	ZONE_COND_NAME(IMP_OPEN),
	ZONE_COND_NAME(EXP_OPEN),
	ZONE_COND_NAME(CLOSED),
	ZONE_COND_NAME(READONLY),
	ZONE_COND_NAME(FULL),
	ZONE_COND_NAME(OFFLINE),
	ZONE_COND_NAME(ACTIVE),
};
#undef ZONE_COND_NAME

/*
 * Per-zone write plug.
 * @node: hlist_node structure for managing the plug using a hash table.
 * @entry: list_head structure for listing the plug in the disk list of active
 *         zone write plugs.
 * @bio_list: The list of BIOs that are currently plugged.
 * @bio_work: Work struct to handle issuing of plugged BIOs
 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
 * @disk: The gendisk the plug belongs to.
 * @lock: Spinlock to atomically manipulate the plug.
 * @ref: Zone write plug reference counter. A zone write plug reference is
 *       always at least 1 when the plug is hashed in the disk plug hash table.
 *       The reference is incremented whenever a new BIO needing plugging is
 *       submitted and when a function needs to manipulate a plug. The
 *       reference count is decremented whenever a plugged BIO completes and
 *       when a function that referenced the plug returns. The initial
 *       reference is dropped whenever the zone of the zone write plug is reset,
 *       finished and when the zone becomes full (last write BIO to the zone
 *       completes).
 * @flags: Flags indicating the plug state.
 * @zone_no: The number of the zone the plug is managing.
 * @wp_offset: The zone write pointer location relative to the start of the zone
 *             as a number of 512B sectors.
 * @cond: Condition of the zone
 */
struct blk_zone_wplug {
	struct hlist_node	node;
	struct list_head	entry;
	struct bio_list		bio_list;
	struct work_struct	bio_work;
	struct rcu_head		rcu_head;
	struct gendisk		*disk;
	spinlock_t		lock;
	refcount_t		ref;
	unsigned int		flags;
	unsigned int		zone_no;
	unsigned int		wp_offset;
	enum blk_zone_cond	cond;
};

/*
 * Return true if @disk needs zone resources: either its queue is a blk-mq
 * queue or its queue emulates zone append.
 */
static inline bool disk_need_zone_resources(struct gendisk *disk)
{
	/*
	 * All request-based zoned devices need zone resources so that the
	 * block layer can automatically handle write BIO plugging. BIO-based
	 * device drivers (e.g. DM devices) are normally responsible for
	 * handling zone write ordering and do not need zone resources, unless
	 * the driver requires zone append emulation.
	 */
	return queue_is_mq(disk->queue) ||
		queue_emulates_zone_append(disk->queue);
}

/*
 * Return the number of buckets of the disk zone write plug hash table, that
 * is, 2 to the power of disk->zone_wplugs_hash_bits.
 */
static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
{
	return 1U << disk->zone_wplugs_hash_bits;
}

/*
 * Zone write plug flags bits:
 *  - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
 *    that is, that write BIOs are being throttled due to a write BIO already
 *    being executed or the zone write plug bio list is not empty.
 *  - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
 *    write pointer offset and need to update it.
107 * - BLK_ZONE_WPLUG_DEAD: Indicates that the zone write plug will be 108 * removed from the disk hash table of zone write plugs when the last 109 * reference on the zone write plug is dropped. If set, this flag also 110 * indicates that the initial extra reference on the zone write plug was 111 * dropped, meaning that the reference count indicates the current number of 112 * active users (code context or BIOs and requests in flight). This flag is 113 * set when a zone is reset, finished or becomes full. 114 */ 115 #define BLK_ZONE_WPLUG_PLUGGED (1U << 0) 116 #define BLK_ZONE_WPLUG_NEED_WP_UPDATE (1U << 1) 117 #define BLK_ZONE_WPLUG_DEAD (1U << 2) 118 119 /** 120 * blk_zone_cond_str - Return a zone condition name string 121 * @zone_cond: a zone condition BLK_ZONE_COND_name 122 * 123 * Convert a BLK_ZONE_COND_name zone condition into the string "name". Useful 124 * for the debugging and tracing zone conditions. For an invalid zone 125 * conditions, the string "UNKNOWN" is returned. 126 */ 127 const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) 128 { 129 static const char *zone_cond_str = "UNKNOWN"; 130 131 if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond]) 132 zone_cond_str = zone_cond_name[zone_cond]; 133 134 return zone_cond_str; 135 } 136 EXPORT_SYMBOL_GPL(blk_zone_cond_str); 137 138 static void blk_zone_set_cond(u8 *zones_cond, unsigned int zno, 139 enum blk_zone_cond cond) 140 { 141 if (!zones_cond) 142 return; 143 144 switch (cond) { 145 case BLK_ZONE_COND_IMP_OPEN: 146 case BLK_ZONE_COND_EXP_OPEN: 147 case BLK_ZONE_COND_CLOSED: 148 zones_cond[zno] = BLK_ZONE_COND_ACTIVE; 149 return; 150 case BLK_ZONE_COND_NOT_WP: 151 case BLK_ZONE_COND_EMPTY: 152 case BLK_ZONE_COND_FULL: 153 case BLK_ZONE_COND_OFFLINE: 154 case BLK_ZONE_COND_READONLY: 155 default: 156 zones_cond[zno] = cond; 157 return; 158 } 159 } 160 161 static void disk_zone_set_cond(struct gendisk *disk, sector_t sector, 162 enum blk_zone_cond cond) 163 { 164 u8 
*zones_cond; 165 166 rcu_read_lock(); 167 zones_cond = rcu_dereference(disk->zones_cond); 168 if (zones_cond) { 169 unsigned int zno = disk_zone_no(disk, sector); 170 171 /* 172 * The condition of a conventional, readonly and offline zones 173 * never changes, so do nothing if the target zone is in one of 174 * these conditions. 175 */ 176 switch (zones_cond[zno]) { 177 case BLK_ZONE_COND_NOT_WP: 178 case BLK_ZONE_COND_READONLY: 179 case BLK_ZONE_COND_OFFLINE: 180 break; 181 default: 182 blk_zone_set_cond(zones_cond, zno, cond); 183 break; 184 } 185 } 186 rcu_read_unlock(); 187 } 188 189 /** 190 * bdev_zone_is_seq - check if a sector belongs to a sequential write zone 191 * @bdev: block device to check 192 * @sector: sector number 193 * 194 * Check if @sector on @bdev is contained in a sequential write required zone. 195 */ 196 bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) 197 { 198 struct gendisk *disk = bdev->bd_disk; 199 unsigned int zno = disk_zone_no(disk, sector); 200 bool is_seq = false; 201 u8 *zones_cond; 202 203 if (!bdev_is_zoned(bdev)) 204 return false; 205 206 rcu_read_lock(); 207 zones_cond = rcu_dereference(disk->zones_cond); 208 if (zones_cond && zno < disk->nr_zones) 209 is_seq = zones_cond[zno] != BLK_ZONE_COND_NOT_WP; 210 rcu_read_unlock(); 211 212 return is_seq; 213 } 214 EXPORT_SYMBOL_GPL(bdev_zone_is_seq); 215 216 /* 217 * Zone report arguments for block device drivers report_zones operation. 218 * @cb: report_zones_cb callback for each reported zone. 219 * @data: Private data passed to report_zones_cb. 
220 */ 221 struct blk_report_zones_args { 222 report_zones_cb cb; 223 void *data; 224 bool report_active; 225 }; 226 227 static int blkdev_do_report_zones(struct block_device *bdev, sector_t sector, 228 unsigned int nr_zones, 229 struct blk_report_zones_args *args) 230 { 231 struct gendisk *disk = bdev->bd_disk; 232 233 if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) 234 return -EOPNOTSUPP; 235 236 if (!nr_zones || sector >= get_capacity(disk)) 237 return 0; 238 239 return disk->fops->report_zones(disk, sector, nr_zones, args); 240 } 241 242 /** 243 * blkdev_report_zones - Get zones information 244 * @bdev: Target block device 245 * @sector: Sector from which to report zones 246 * @nr_zones: Maximum number of zones to report 247 * @cb: Callback function called for each reported zone 248 * @data: Private data for the callback 249 * 250 * Description: 251 * Get zone information starting from the zone containing @sector for at most 252 * @nr_zones, and call @cb for each zone reported by the device. 253 * To report all zones in a device starting from @sector, the BLK_ALL_ZONES 254 * constant can be passed to @nr_zones. 255 * Returns the number of zones reported by the device, or a negative errno 256 * value in case of failure. 257 * 258 * Note: The caller must use memalloc_noXX_save/restore() calls to control 259 * memory allocations done within this function. 
260 */ 261 int blkdev_report_zones(struct block_device *bdev, sector_t sector, 262 unsigned int nr_zones, report_zones_cb cb, void *data) 263 { 264 struct blk_report_zones_args args = { 265 .cb = cb, 266 .data = data, 267 }; 268 269 return blkdev_do_report_zones(bdev, sector, nr_zones, &args); 270 } 271 EXPORT_SYMBOL_GPL(blkdev_report_zones); 272 273 static int blkdev_zone_reset_all(struct block_device *bdev) 274 { 275 struct bio bio; 276 277 bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC); 278 trace_blkdev_zone_mgmt(&bio, 0); 279 return submit_bio_wait(&bio); 280 } 281 282 /** 283 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones 284 * @bdev: Target block device 285 * @op: Operation to be performed on the zones 286 * @sector: Start sector of the first zone to operate on 287 * @nr_sectors: Number of sectors, should be at least the length of one zone and 288 * must be zone size aligned. 289 * 290 * Description: 291 * Perform the specified operation on the range of zones specified by 292 * @sector..@sector+@nr_sectors. Specifying the entire disk sector range 293 * is valid, but the specified range should not contain conventional zones. 294 * The operation to execute on each zone can be a zone reset, open, close 295 * or finish request. 
296 */ 297 int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, 298 sector_t sector, sector_t nr_sectors) 299 { 300 sector_t zone_sectors = bdev_zone_sectors(bdev); 301 sector_t capacity = bdev_nr_sectors(bdev); 302 sector_t end_sector = sector + nr_sectors; 303 struct bio *bio = NULL; 304 int ret = 0; 305 306 if (!bdev_is_zoned(bdev)) 307 return -EOPNOTSUPP; 308 309 if (bdev_read_only(bdev)) 310 return -EPERM; 311 312 if (!op_is_zone_mgmt(op)) 313 return -EOPNOTSUPP; 314 315 if (end_sector <= sector || end_sector > capacity) 316 /* Out of range */ 317 return -EINVAL; 318 319 /* Check alignment (handle eventual smaller last zone) */ 320 if (!bdev_is_zone_start(bdev, sector)) 321 return -EINVAL; 322 323 if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity) 324 return -EINVAL; 325 326 /* 327 * In the case of a zone reset operation over all zones, use 328 * REQ_OP_ZONE_RESET_ALL. 329 */ 330 if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) 331 return blkdev_zone_reset_all(bdev); 332 333 while (sector < end_sector) { 334 bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL); 335 bio->bi_iter.bi_sector = sector; 336 sector += zone_sectors; 337 338 /* This may take a while, so be nice to others */ 339 cond_resched(); 340 } 341 342 trace_blkdev_zone_mgmt(bio, nr_sectors); 343 ret = submit_bio_wait(bio); 344 bio_put(bio); 345 346 return ret; 347 } 348 EXPORT_SYMBOL_GPL(blkdev_zone_mgmt); 349 350 struct zone_report_args { 351 struct blk_zone __user *zones; 352 }; 353 354 static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx, 355 void *data) 356 { 357 struct zone_report_args *args = data; 358 359 if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone))) 360 return -EFAULT; 361 return 0; 362 } 363 364 /* 365 * Mask of valid input flags for BLKREPORTZONEV2 ioctl. 
 */
#define BLK_ZONE_REPV2_INPUT_FLAGS	BLK_ZONE_REP_CACHED

/*
 * BLKREPORTZONE and BLKREPORTZONEV2 ioctl processing.
 * Called from blkdev_ioctl.
 *
 * The user buffer starts with a struct blk_zone_report header, immediately
 * followed by the array of struct blk_zone to fill. On success, the header is
 * written back with the number of zones reported and the
 * BLK_ZONE_REP_CAPACITY flag set.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
		unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct zone_report_args args;
	struct blk_zone_report rep;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
		return -EFAULT;

	if (!rep.nr_zones)
		return -EINVAL;

	/* The zone array directly follows the report header. */
	args.zones = argp + sizeof(struct blk_zone_report);

	switch (cmd) {
	case BLKREPORTZONE:
		ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
					  blkdev_copy_zone_to_user, &args);
		break;
	case BLKREPORTZONEV2:
		/* Reject any input flag that is not known. */
		if (rep.flags & ~BLK_ZONE_REPV2_INPUT_FLAGS)
			return -EINVAL;
		ret = blkdev_report_zones_cached(bdev, rep.sector, rep.nr_zones,
					 blkdev_copy_zone_to_user, &args);
		break;
	default:
		return -EINVAL;
	}

	if (ret < 0)
		return ret;

	rep.nr_zones = ret;
	rep.flags = BLK_ZONE_REP_CAPACITY;
	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
		return -EFAULT;
	return 0;
}

/*
 * BLKRESETZONE processing: with the inode lock and the mapping invalidate lock
 * of the block device held, validate the sector range @zrange, call
 * truncate_bdev_range() on the corresponding byte range and then reset the
 * zones of the range. Returns -EINVAL if the range is empty, wraps or extends
 * beyond the disk capacity.
 */
static int blkdev_reset_zone(struct block_device *bdev, blk_mode_t mode,
			     struct blk_zone_range *zrange)
{
	loff_t start, end;
	int ret = -EINVAL;

	inode_lock(bdev->bd_mapping->host);
	filemap_invalidate_lock(bdev->bd_mapping);
	if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
	    zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
		/* Out of range */
		goto out_unlock;

	/* Byte range (inclusive end) covered by the zones to reset. */
	start = zrange->sector << SECTOR_SHIFT;
	end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;

	ret = truncate_bdev_range(bdev, mode, start, end);
	if (ret)
		goto out_unlock;

	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zrange->sector,
			       zrange->nr_sectors);
out_unlock:
	filemap_invalidate_unlock(bdev->bd_mapping);
	inode_unlock(bdev->bd_mapping->host);
	return ret;
}

/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 *
 * The block device must be zoned (-ENOTTY otherwise) and must have been opened
 * for writing (-EBADF otherwise).
 */
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct blk_zone_range zrange;
	enum req_op op;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (!(mode & BLK_OPEN_WRITE))
		return -EBADF;

	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
		return -EFAULT;

	switch (cmd) {
	case BLKRESETZONE:
		/* Zone reset has its own path, see blkdev_reset_zone(). */
		return blkdev_reset_zone(bdev, mode, &zrange);
	case BLKOPENZONE:
		op = REQ_OP_ZONE_OPEN;
		break;
	case BLKCLOSEZONE:
		op = REQ_OP_ZONE_CLOSE;
		break;
	case BLKFINISHZONE:
		op = REQ_OP_ZONE_FINISH;
		break;
	default:
		return -ENOTTY;
	}

	return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
}

/* Return true if @zone ends at or beyond the capacity of @disk. */
static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
{
	return zone->start + zone->len >= get_capacity(disk);
}

/*
 * Return true if the write pointer offset of @zwplug reached the capacity of
 * its zone. The last zone of the disk is checked against
 * disk->last_zone_capacity and all other zones against disk->zone_capacity.
 */
static bool disk_zone_wplug_is_full(struct gendisk *disk,
				    struct blk_zone_wplug *zwplug)
{
	if (zwplug->zone_no < disk->nr_zones - 1)
		return zwplug->wp_offset >= disk->zone_capacity;
	return zwplug->wp_offset >= disk->last_zone_capacity;
}

/*
 * Insert @zwplug in the zone write plug hash table of @disk and initialize
 * its zone condition. Returns false, without inserting anything, if the hash
 * table already has a zone write plug for the same zone number.
 */
static bool disk_insert_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	struct blk_zone_wplug *zwplg;
	unsigned long flags;
	u8 *zones_cond;
	unsigned int idx =
		hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);

	/*
	 * Add the new zone write plug to the hash table, but carefully as we
	 * are racing with other submission context, so we may already have a
	 * zone write plug for the same zone.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
	hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
		if (zwplg->zone_no == zwplug->zone_no) {
			spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock,
					       flags);
			return false;
		}
	}

	/*
	 * Set the zone condition: if we do not yet have a zones_cond array
	 * attached to the disk, then this is a zone write plug insert from the
	 * first call to blk_revalidate_disk_zones(), in which case the zone is
	 * necessarily in the active condition.
	 */
	zones_cond = rcu_dereference_check(disk->zones_cond,
				lockdep_is_held(&disk->zone_wplugs_hash_lock));
	if (zones_cond)
		zwplug->cond = zones_cond[zwplug->zone_no];
	else
		zwplug->cond = BLK_ZONE_COND_ACTIVE;

	hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
	atomic_inc(&disk->nr_zone_wplugs);
	spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);

	return true;
}

/*
 * Look up, under the RCU read lock, the zone write plug of the zone containing
 * @sector in the hash table of @disk. On success, the zone write plug is
 * returned with its reference count incremented. A zone write plug with a
 * reference count already at zero is skipped. Returns NULL if no zone write
 * plug is found.
 */
static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk,
							 sector_t sector)
{
	unsigned int zno = disk_zone_no(disk, sector);
	unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
	struct blk_zone_wplug *zwplug;

	rcu_read_lock();

	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
		if (zwplug->zone_no == zno &&
		    refcount_inc_not_zero(&zwplug->ref)) {
			rcu_read_unlock();
			return zwplug;
		}
	}

	rcu_read_unlock();

	return NULL;
}

/*
 * Same as disk_get_hashed_zone_wplug(), but skip the hash table lookup if the
 * disk currently has no zone write plug at all.
 */
static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
							 sector_t sector)
{
	if (!atomic_read(&disk->nr_zone_wplugs))
		return NULL;

	return disk_get_hashed_zone_wplug(disk, sector);
}

/* RCU callback returning a zone write plug to the mempool of its disk. */
static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
{
	struct blk_zone_wplug *zwplug =
		container_of(rcu_head, struct blk_zone_wplug, rcu_head);

	mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
}

/*
 * Unhash and free a zone write plug. The plug is expected to be flagged as
 * dead, to not be plugged and to have no plugged BIO. The condition of the
 * zone is saved in the disk zone condition array, if there is one, before the
 * plug is removed from the hash table. The plug memory is released with
 * call_rcu().
 */
static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug)
{
	struct gendisk *disk = zwplug->disk;
	unsigned long flags;

	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_DEAD));
	WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
	WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));

	spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
	blk_zone_set_cond(rcu_dereference_check(disk->zones_cond,
				lockdep_is_held(&disk->zone_wplugs_hash_lock)),
			  zwplug->zone_no, zwplug->cond);
	hlist_del_init_rcu(&zwplug->node);
	atomic_dec(&disk->nr_zone_wplugs);
	spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);

	call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
}

/*
 * Drop a reference on a zone write plug and free the plug if this was the
 * last reference.
 */
static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
	if (refcount_dec_and_test(&zwplug->ref))
		disk_free_zone_wplug(zwplug);
}

/*
 * Flag the zone write plug as dead and drop the initial reference we got when
 * the zone write plug was added to the hash table. The zone write plug will be
 * unhashed when its last reference is dropped.
 */
static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug)
{
	lockdep_assert_held(&zwplug->lock);

	/* The initial reference must be dropped only once. */
	if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)) {
		zwplug->flags |= BLK_ZONE_WPLUG_DEAD;
		disk_put_zone_wplug(zwplug);
	}
}

/*
 * Check if a zone write plug is flagged as dead. Returns false if the plug is
 * not flagged as dead or if it was restored to a live plug (see below), and
 * true otherwise.
 */
static inline bool disk_check_zone_wplug_dead(struct blk_zone_wplug *zwplug)
{
	if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD))
		return false;

	/*
	 * If a new write is received right after a zone reset completes and
	 * while the disk_zone_wplugs_worker() thread has not yet released the
	 * reference on the zone write plug after processing the last write to
	 * the zone, then the new write BIO will see the zone write plug marked
	 * as dead. This case is however a false positive and a perfectly valid
	 * pattern. In such case, restore the zone write plug to a live one.
	 */
	if (!zwplug->wp_offset && bio_list_empty(&zwplug->bio_list)) {
		zwplug->flags &= ~BLK_ZONE_WPLUG_DEAD;
		/* Take again the initial reference dropped when marked dead. */
		refcount_inc(&zwplug->ref);
		return false;
	}

	return true;
}

static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
				       struct blk_zone_wplug *zwplug);

/*
 * Work function of a zone write plug (bio_work): call
 * disk_zone_wplug_submit_bio() for the plug and drop the plug reference held
 * for the work.
 */
static void blk_zone_wplug_bio_work(struct work_struct *work)
{
	struct blk_zone_wplug *zwplug =
		container_of(work, struct blk_zone_wplug, bio_work);

	disk_zone_wplug_submit_bio(zwplug->disk, zwplug);

	/* Drop the reference we took in disk_zone_wplug_schedule_work(). */
	disk_put_zone_wplug(zwplug);
}

/*
 * Get a zone write plug for the zone containing @sector.
 * If the plug does not exist, it is allocated and inserted in the disk hash
 * table.
 * The zone write plug is returned with a reference held for the caller.
 * Returns NULL if the allocation of a new zone write plug fails.
 */
static struct blk_zone_wplug *disk_get_or_alloc_zone_wplug(struct gendisk *disk,
					sector_t sector, gfp_t gfp_mask)
{
	unsigned int zno = disk_zone_no(disk, sector);
	struct blk_zone_wplug *zwplug;

again:
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug)
		return zwplug;

	/*
	 * Allocate and initialize a zone write plug with an extra reference
	 * so that it is not freed when the zone write plug becomes idle without
	 * the zone being full.
	 */
	zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
	if (!zwplug)
		return NULL;

	INIT_HLIST_NODE(&zwplug->node);
	/* One reference for the caller and one initial (hashed) reference. */
	refcount_set(&zwplug->ref, 2);
	spin_lock_init(&zwplug->lock);
	zwplug->flags = 0;
	zwplug->zone_no = zno;
	zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
	bio_list_init(&zwplug->bio_list);
	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
	INIT_LIST_HEAD(&zwplug->entry);
	zwplug->disk = disk;

	/*
	 * Insert the new zone write plug in the hash table. This can fail only
	 * if another context already inserted a plug. Retry from the beginning
	 * in such case.
	 */
	if (!disk_insert_zone_wplug(disk, zwplug)) {
		mempool_free(zwplug, disk->zone_wplugs_pool);
		goto again;
	}

	return zwplug;
}

/*
 * Fail a BIO that was plugged in a zone write plug: clear the BIO zone write
 * plugging flag, complete the BIO with an I/O error, and drop the zone write
 * plug reference and the queue reference held for the BIO.
 */
static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
					       struct bio *bio)
{
	/* Get the queue first: the plug may be freed by the put below. */
	struct request_queue *q = zwplug->disk->queue;

	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
	bio_io_error(bio);
	disk_put_zone_wplug(zwplug);
	/* Drop the reference taken by disk_zone_wplug_add_bio(). */
	blk_queue_exit(q);
}

/*
 * Abort (fail) all plugged BIOs of a zone write plug.
 * Must be called with the zone write plug lock held. Nothing is done if the
 * zone write plug has no plugged BIO.
 */
static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
	struct gendisk *disk = zwplug->disk;
	struct bio *bio;

	lockdep_assert_held(&zwplug->lock);

	if (bio_list_empty(&zwplug->bio_list))
		return;

	pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n",
			    zwplug->disk->disk_name, zwplug->zone_no);
	while ((bio = bio_list_pop(&zwplug->bio_list)))
		blk_zone_wplug_bio_io_error(zwplug, bio);

	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

	/*
	 * If we are using the per disk zone write plugs worker thread, remove
	 * the zone write plug from the work list and drop the reference we
	 * took when the zone write plug was added to that list.
	 */
	if (blk_queue_zoned_qd1_writes(disk->queue)) {
		spin_lock(&disk->zone_wplugs_list_lock);
		if (!list_empty(&zwplug->entry)) {
			list_del_init(&zwplug->entry);
			disk_put_zone_wplug(zwplug);
		}
		spin_unlock(&disk->zone_wplugs_list_lock);
	}
}

/*
 * Update a zone write plug condition based on the write pointer offset.
 * The condition is set to full if the write pointer offset reached the zone
 * capacity, to empty if the write pointer offset is 0, and to active
 * otherwise. Must be called with the zone write plug lock held.
 */
static void disk_zone_wplug_update_cond(struct gendisk *disk,
					struct blk_zone_wplug *zwplug)
{
	lockdep_assert_held(&zwplug->lock);

	if (disk_zone_wplug_is_full(disk, zwplug))
		zwplug->cond = BLK_ZONE_COND_FULL;
	else if (!zwplug->wp_offset)
		zwplug->cond = BLK_ZONE_COND_EMPTY;
	else
		zwplug->cond = BLK_ZONE_COND_ACTIVE;
}

/*
 * Set a zone write plug write pointer offset to the specified value.
 * This aborts all plugged BIOs, which is fine as this function is called for
 * a zone reset operation, a zone finish operation or if the zone needs a wp
 * update from a report zone after a write error.
 * The zone write plug is marked as dead if the new write pointer offset makes
 * the zone empty or full. Must be called with the zone write plug lock held.
 */
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug,
					  unsigned int wp_offset)
{
	lockdep_assert_held(&zwplug->lock);

	/* Update the zone write pointer and abort all plugged BIOs. */
	zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE;
	zwplug->wp_offset = wp_offset;
	disk_zone_wplug_update_cond(disk, zwplug);

	disk_zone_wplug_abort(zwplug);
	if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
		disk_mark_zone_wplug_dead(zwplug);
}

/*
 * Return the write pointer offset of @zone relative to the zone start, or
 * UINT_MAX if the condition of the zone implies that the zone has no valid
 * write pointer.
 */
static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
{
	switch (zone->cond) {
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
	case BLK_ZONE_COND_CLOSED:
	case BLK_ZONE_COND_ACTIVE:
		return zone->wp - zone->start;
	case BLK_ZONE_COND_EMPTY:
		return 0;
	case BLK_ZONE_COND_FULL:
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
	default:
		/*
		 * Conventional, full, offline and read-only zones do not have
		 * a valid write pointer.
		 */
		return UINT_MAX;
	}
}

/*
 * Synchronize the write pointer offset of the zone write plug of @zone, if
 * there is one, with the write pointer offset of the reported zone. The zone
 * write plug is updated only if it is flagged as needing a write pointer
 * update. Returns the write pointer offset computed from @zone.
 */
static unsigned int disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
						   struct blk_zone *zone)
{
	struct blk_zone_wplug *zwplug;
	unsigned int wp_offset = blk_zone_wp_offset(zone);

	zwplug = disk_get_zone_wplug(disk, zone->start);
	if (zwplug) {
		unsigned long flags;

		spin_lock_irqsave(&zwplug->lock, flags);
		if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
			disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
	}

	return wp_offset;
}

/**
 * disk_report_zone - Report one zone
 * @disk:	Target disk
 * @zone:	The zone to report
 * @idx:	The index of the zone in the overall zone report
 * @args:	report zones callback and data
 *
 * Description:
 *    Helper function for block device drivers to report one zone of a zone
 *    report initiated with blkdev_report_zones(). The zone being reported is
 *    specified by @zone and used to update, if necessary, the zone write plug
 *    information for the zone. If @args specifies a user callback function,
 *    this callback is executed.
 *    Returns the value returned by the user callback function, or 0 if @args
 *    does not specify one.
 */
int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
		     unsigned int idx, struct blk_report_zones_args *args)
{
	if (args && args->report_active) {
		/*
		 * If we come here, then this is a report zones as a fallback
		 * for a cached report. So collapse the implicit open, explicit
		 * open and closed conditions into the active zone condition.
		 */
		switch (zone->cond) {
		case BLK_ZONE_COND_IMP_OPEN:
		case BLK_ZONE_COND_EXP_OPEN:
		case BLK_ZONE_COND_CLOSED:
			zone->cond = BLK_ZONE_COND_ACTIVE;
			break;
		default:
			break;
		}
	}

	/* Only disks with a zone write plug hash table have plugs to sync. */
	if (disk->zone_wplugs_hash)
		disk_zone_wplug_sync_wp_offset(disk, zone);

	if (args && args->cb)
		return args->cb(zone, idx, args->data);

	return 0;
}
EXPORT_SYMBOL_GPL(disk_report_zone);

/*
 * report_zones_cb callback copying the reported zone to the struct blk_zone
 * pointed to by @data.
 */
static int blkdev_report_zone_cb(struct blk_zone *zone, unsigned int idx,
				 void *data)
{
	memcpy(data, zone, sizeof(struct blk_zone));
	return 0;
}

/*
 * Get the information of the single zone containing @sector by calling the
 * report_zones operation of the device, with the open and closed conditions
 * collapsed into the active condition. Returns 0 on success, -EIO if no zone
 * was reported and a negative error code if the report failed.
 */
static int blkdev_report_zone_fallback(struct block_device *bdev,
				       sector_t sector, struct blk_zone *zone)
{
	struct blk_report_zones_args args = {
		.cb = blkdev_report_zone_cb,
		.data = zone,
		.report_active = true,
	};
	int error;

	error = blkdev_do_report_zones(bdev, sector, 1, &args);
	if (error < 0)
		return error;
	if (error == 0)
		return -EIO;
	return 0;
}

/*
 * For devices that natively support zone append operations, we do not use zone
 * write plugging for zone append writes, which makes the zone condition
 * tracking invalid once zone append was used. In that case fall back to a
 * regular report zones to get correct information.
 */
static inline bool blkdev_has_cached_report_zones(struct block_device *bdev)
{
	return disk_need_zone_resources(bdev->bd_disk) &&
		(bdev_emulates_zone_append(bdev) ||
		 !test_bit(GD_ZONE_APPEND_USED, &bdev->bd_disk->state));
}

/**
 * blkdev_get_zone_info - Get a single zone information from cached data
 * @bdev:   Target block device
 * @sector: Sector contained by the target zone
 * @zone:   zone structure to return the zone information
 *
 * Description:
 *    Get the zone information for the zone containing @sector using the zone
 *    write plug of the target zone, if one exists, or the disk zone condition
 *    array otherwise. The zone condition may be reported as being
 *    the BLK_ZONE_COND_ACTIVE condition for a zone that is in the implicit
 *    open, explicit open or closed condition.
 *    If cached zone information cannot be used, the zone information is
 *    obtained with a regular zone report of the device.
 *
 *    Returns 0 on success and a negative error code on failure.
 */
int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
			 struct blk_zone *zone)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	struct blk_zone_wplug *zwplug;
	unsigned long flags;
	u8 *zones_cond;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	if (sector >= get_capacity(disk))
		return -EINVAL;

	memset(zone, 0, sizeof(*zone));
	/* From here on, @sector is the start sector of the target zone. */
	sector = bdev_zone_start(bdev, sector);

	if (!blkdev_has_cached_report_zones(bdev))
		return blkdev_report_zone_fallback(bdev, sector, zone);

	rcu_read_lock();
	zones_cond = rcu_dereference(disk->zones_cond);
	if (!disk->zone_wplugs_hash || !zones_cond) {
		rcu_read_unlock();
		return blkdev_report_zone_fallback(bdev, sector, zone);
	}
	zone->cond = zones_cond[disk_zone_no(disk, sector)];
	rcu_read_unlock();

	zone->start = sector;
	zone->len = zone_sectors;

	/*
	 * If this is a conventional zone, we do not have a zone write plug and
	 * can report the zone immediately.
	 */
	if (zone->cond == BLK_ZONE_COND_NOT_WP) {
		zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
		zone->capacity = zone_sectors;
		zone->wp = ULLONG_MAX;
		return 0;
	}

	/*
	 * This is a sequential write required zone. If the zone is read-only or
	 * offline, only set the zone write pointer to an invalid value and
	 * report the zone.
	 */
	zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
	if (disk_zone_is_last(disk, zone))
		zone->capacity = disk->last_zone_capacity;
	else
		zone->capacity = disk->zone_capacity;

	if (zone->cond == BLK_ZONE_COND_READONLY ||
	    zone->cond == BLK_ZONE_COND_OFFLINE) {
		zone->wp = ULLONG_MAX;
		return 0;
	}

	/*
	 * If the zone does not have a zone write plug, it is either full or
	 * empty, as we otherwise would have a zone write plug for it. In this
	 * case, set the write pointer accordingly and report the zone.
	 * Otherwise, if we have a zone write plug, use it.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (!zwplug) {
		if (zone->cond == BLK_ZONE_COND_FULL)
			zone->wp = ULLONG_MAX;
		else
			zone->wp = sector;
		return 0;
	}

	spin_lock_irqsave(&zwplug->lock, flags);
	/* The plug write pointer offset is not reliable: ask the device. */
	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
		return blkdev_report_zone_fallback(bdev, sector, zone);
	}
	zone->cond = zwplug->cond;
	zone->wp = sector + zwplug->wp_offset;
	spin_unlock_irqrestore(&zwplug->lock, flags);

	disk_put_zone_wplug(zwplug);

	return 0;
}
EXPORT_SYMBOL_GPL(blkdev_get_zone_info);

/**
 * blkdev_report_zones_cached - Get cached zones information
 * @bdev:	Target block device
 * @sector:	Sector from which to report zones
 * @nr_zones:	Maximum number of zones to report
 * @cb:		Callback function called for each reported zone
 * @data:	Private data for
the callback function
 *
 * Description:
 *    Similar to blkdev_report_zones() but instead of calling into the low level
 *    device driver to get the zone report from the device, use
 *    blkdev_get_zone_info() to generate the report from the disk zone write
 *    plugs and zones condition array. Since calling this function without a
 *    callback does not make sense, @cb must be specified.
 *
 *    Returns -EOPNOTSUPP if @cb is NULL, if @bdev is not zoned or if the
 *    disk has no report_zones operation, and 0 if @nr_zones is 0 or @sector
 *    is at or beyond the disk capacity. If cached zone information cannot
 *    be used, the result of blkdev_do_report_zones() is returned. Otherwise,
 *    returns the number of zones reported, or the first non-zero value
 *    returned by blkdev_get_zone_info() or @cb.
 */
int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector,
			unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t capacity = get_capacity(disk);
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	unsigned int idx = 0;
	struct blk_zone zone;
	int ret;

	if (!cb || !bdev_is_zoned(bdev) ||
	    WARN_ON_ONCE(!disk->fops->report_zones))
		return -EOPNOTSUPP;

	if (!nr_zones || sector >= capacity)
		return 0;

	if (!blkdev_has_cached_report_zones(bdev)) {
		struct blk_report_zones_args args = {
			.cb = cb,
			.data = data,
			.report_active = true,
		};

		return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
	}

	/*
	 * Walk the zones one by one, starting from the zone containing
	 * @sector, and stop at the first error or non-zero callback result.
	 */
	for (sector = bdev_zone_start(bdev, sector);
	     sector < capacity && idx < nr_zones;
	     sector += zone_sectors, idx++) {
		ret = blkdev_get_zone_info(bdev, sector, &zone);
		if (ret)
			return ret;

		ret = cb(&zone, idx, data);
		if (ret)
			return ret;
	}

	return idx;
}
EXPORT_SYMBOL_GPL(blkdev_report_zones_cached);

/*
 * Handle the completion of a zone reset BIO: reset the write pointer offset
 * of the zone write plug of the target zone if there is one, or set the
 * cached condition of the zone to empty otherwise.
 */
static void blk_zone_reset_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;

	/*
	 * If we have a zone write plug, set its write pointer offset to 0.
	 * This will abort all BIOs plugged for the target zone.
It is fine as
	 * resetting zones while writes are still in-flight will result in the
	 * writes failing anyway.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		unsigned long flags;

		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
	} else {
		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
	}
}

/*
 * Handle the completion of a zone reset all BIO: set the write pointer
 * offset of all zone write plugs of the disk to 0, set the cached condition
 * of all zones to empty and clear the GD_ZONE_APPEND_USED disk state bit.
 */
static void blk_zone_reset_all_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t capacity = get_capacity(disk);
	struct blk_zone_wplug *zwplug;
	unsigned long flags;
	sector_t sector;
	unsigned int i;

	/* Only walk the hash table if the disk has zone write plugs. */
	if (atomic_read(&disk->nr_zone_wplugs)) {
		/* Update the condition of all zone write plugs. */
		rcu_read_lock();
		for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
			hlist_for_each_entry_rcu(zwplug,
						 &disk->zone_wplugs_hash[i],
						 node) {
				spin_lock_irqsave(&zwplug->lock, flags);
				disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
				spin_unlock_irqrestore(&zwplug->lock, flags);
			}
		}
		rcu_read_unlock();
	}

	/* Update the cached zone conditions. */
	for (sector = 0; sector < capacity;
	     sector += bdev_zone_sectors(bio->bi_bdev))
		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
	clear_bit(GD_ZONE_APPEND_USED, &disk->state);
}

/*
 * Handle the completion of a zone finish BIO: set the write pointer offset
 * of the zone write plug of the target zone to the zone size if there is a
 * plug, or set the cached condition of the zone to full otherwise.
 */
static void blk_zone_finish_bio_endio(struct bio *bio)
{
	struct block_device *bdev = bio->bi_bdev;
	struct gendisk *disk = bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;

	/*
	 * If we have a zone write plug, set its write pointer offset to the
	 * zone size. This will abort all BIOs plugged for the target zone.
It
	 * is fine as finishing zones while writes are still in-flight will
	 * result in the writes failing anyway.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		unsigned long flags;

		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_set_wp_offset(disk, zwplug,
					      bdev_zone_sectors(bdev));
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
	} else {
		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_FULL);
	}
}

/*
 * Update the cached zone information on completion of a zone reset, zone
 * reset all or zone finish BIO. Failed BIOs and other operations are ignored.
 */
void blk_zone_mgmt_bio_endio(struct bio *bio)
{
	/* If the BIO failed, we have nothing to do. */
	if (bio->bi_status != BLK_STS_OK)
		return;

	switch (bio_op(bio)) {
	case REQ_OP_ZONE_RESET:
		blk_zone_reset_bio_endio(bio);
		return;
	case REQ_OP_ZONE_RESET_ALL:
		blk_zone_reset_all_bio_endio(bio);
		return;
	case REQ_OP_ZONE_FINISH:
		blk_zone_finish_bio_endio(bio);
		return;
	default:
		return;
	}
}

/*
 * Queue the BIO work of a zone write plug on the disk zone write plugs
 * workqueue. The zone write plug lock must be held by the caller.
 */
static void disk_zone_wplug_schedule_work(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug)
{
	lockdep_assert_held(&zwplug->lock);

	/*
	 * Schedule the submission of the next plugged BIO. Taking a reference
	 * to the zone write plug is required as the bio_work belongs to the
	 * plug, and thus we must ensure that the write plug does not go away
	 * while the work is being scheduled but has not run yet.
	 * blk_zone_wplug_bio_work() will release the reference we take here,
	 * and we also drop this reference if the work is already scheduled.
*/
	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
	WARN_ON_ONCE(blk_queue_zoned_qd1_writes(disk->queue));
	refcount_inc(&zwplug->ref);
	if (!queue_work(disk->zone_wplugs_wq, &zwplug->bio_work))
		disk_put_zone_wplug(zwplug);
}

/*
 * Add @bio to the tail of the list of plugged BIOs of @zwplug. @nr_segs is
 * saved in the BIO so that it can be used when the BIO is later issued or
 * merged.
 */
static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
				struct blk_zone_wplug *zwplug,
				struct bio *bio, unsigned int nr_segs)
{
	/*
	 * Grab an extra reference on the BIO request queue usage counter.
	 * This reference will be reused to submit a request for the BIO for
	 * blk-mq devices and dropped when the BIO is failed and after
	 * it is issued in the case of BIO-based devices.
	 */
	percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);

	/*
	 * The BIO is being plugged and thus will have to wait for the on-going
	 * write and for all other writes already plugged. So polling makes
	 * no sense.
	 */
	bio_clear_polled(bio);

	/*
	 * Reuse the poll cookie field to store the number of segments when
	 * split to the hardware limits.
	 */
	bio->__bi_nr_segments = nr_segs;

	/*
	 * We always receive BIOs after they are split and ready to be issued.
	 * The block layer passes the parts of a split BIO in order, and the
	 * user must also issue writes sequentially. So simply add the new BIO
	 * at the tail of the list to preserve the sequential write order.
	 */
	bio_list_add(&zwplug->bio_list, bio);
	trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no,
				      bio->bi_iter.bi_sector, bio_sectors(bio));

	/*
	 * If we are using the disk zone write plugs worker instead of the per
	 * zone write plug BIO work, add the zone write plug to the work list
	 * if it is not already there. Make sure to also get an extra reference
	 * on the zone write plug so that it does not go away until it is
	 * removed from the work list.
*/
	if (blk_queue_zoned_qd1_writes(disk->queue)) {
		spin_lock(&disk->zone_wplugs_list_lock);
		/* An empty entry means the plug is not on the work list yet. */
		if (list_empty(&zwplug->entry)) {
			list_add_tail(&zwplug->entry, &disk->zone_wplugs_list);
			refcount_inc(&zwplug->ref);
		}
		spin_unlock(&disk->zone_wplugs_list_lock);
	}
}

/*
 * Called from bio_attempt_back_merge() when a BIO was merged with a request.
 */
void blk_zone_write_plug_bio_merged(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/*
	 * If the BIO was already plugged, then we were called through
	 * blk_zone_write_plug_init_request() -> bio_attempt_back_merge().
	 * For this case, we already hold a reference on the zone write plug for
	 * the BIO and blk_zone_write_plug_init_request() will handle the
	 * zone write pointer offset update.
	 */
	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
		return;

	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * Get a reference on the zone write plug of the target zone and advance
	 * the zone write pointer offset. Given that this is a merge, we already
	 * have at least one request and one BIO referencing the zone write
	 * plug. So this should not fail. Note that the reference obtained here
	 * is not released by this function.
	 */
	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);
	zwplug->wp_offset += bio_sectors(bio);
	disk_zone_wplug_update_cond(disk, zwplug);
	spin_unlock_irqrestore(&zwplug->lock, flags);
}

/*
 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
 * already went through zone write plugging (either a new BIO or one that was
 * unplugged).
*/
void blk_zone_write_plug_init_request(struct request *req)
{
	/* Sector following the end of the request: next mergeable sector. */
	sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
	struct request_queue *q = req->q;
	struct gendisk *disk = q->disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, blk_rq_pos(req));
	unsigned long flags;
	struct bio *bio;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/*
	 * Indicate that completion of this request needs to be handled with
	 * blk_zone_write_plug_finish_request(), which will drop the reference
	 * on the zone write plug we took above on entry to this function.
	 */
	req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;

	if (blk_queue_nomerges(q))
		return;

	/*
	 * Walk through the list of plugged BIOs to check if they can be merged
	 * into the back of the request.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);
	while (!disk_zone_wplug_is_full(disk, zwplug)) {
		bio = bio_list_peek(&zwplug->bio_list);
		if (!bio)
			break;

		/* Only a BIO starting right after the request can be merged. */
		if (bio->bi_iter.bi_sector != req_back_sector ||
		    !blk_rq_merge_ok(req, bio))
			break;

		WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
			     !bio->__bi_nr_segments);

		/*
		 * Remove the BIO from the list to attempt the merge and put it
		 * back at the head of the list if the merge fails.
		 */
		bio_list_pop(&zwplug->bio_list);
		if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
		    BIO_MERGE_OK) {
			bio_list_add_head(&zwplug->bio_list, bio);
			break;
		}

		/* Drop the reference taken by disk_zone_wplug_add_bio(). */
		blk_queue_exit(q);
		zwplug->wp_offset += bio_sectors(bio);
		disk_zone_wplug_update_cond(disk, zwplug);

		req_back_sector += bio_sectors(bio);
	}
	spin_unlock_irqrestore(&zwplug->lock, flags);
}

/*
 * Check and prepare a BIO for submission by incrementing the write pointer
 * offset of its zone write plug and changing zone append operations into
 * regular write when zone append emulation is needed.
*/
static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
				       struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;

	lockdep_assert_held(&zwplug->lock);

	/*
	 * If we lost track of the zone write pointer due to a write error,
	 * the user must either execute a report zones, reset the zone or finish
	 * the zone to recover a reliable write pointer position. Fail BIOs if
	 * the user did not do that as we cannot handle emulated zone append
	 * otherwise.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
		return false;

	/*
	 * Check that the user is not attempting to write to a full zone.
	 * We know such BIO will fail, and that would potentially overflow our
	 * write pointer offset beyond the end of the zone.
	 */
	if (disk_zone_wplug_is_full(disk, zwplug))
		return false;

	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		/*
		 * Use a regular write starting at the current write pointer.
		 * Similarly to native zone append operations, do not allow
		 * merging.
		 */
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
		bio->bi_iter.bi_sector += zwplug->wp_offset;

		/*
		 * Remember that this BIO is in fact a zone append operation
		 * so that we can restore its operation code on completion.
		 */
		bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
	} else {
		/*
		 * Check for non-sequential writes early as we know that BIOs
		 * with a start sector not aligned to the zone write pointer
		 * will fail.
		 */
		if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
			return false;
	}

	/* Advance the zone write pointer offset.
*/
	zwplug->wp_offset += bio_sectors(bio);
	disk_zone_wplug_update_cond(disk, zwplug);

	return true;
}

/*
 * Handle a write BIO using the zone write plug of its target zone.
 * Returns true if @bio was failed or added to the list of plugged BIOs, in
 * which case the caller must not submit it. Returns false if the caller must
 * submit @bio.
 */
static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;
	gfp_t gfp_mask = GFP_NOIO;
	unsigned long flags;

	/*
	 * BIOs must be fully contained within a zone so that we use the correct
	 * zone write plug for the entire BIO. For blk-mq devices, the block
	 * layer should already have done any splitting required to ensure this
	 * and this BIO should thus not be straddling zone boundaries. For
	 * BIO-based devices, it is the responsibility of the driver to split
	 * the bio before submitting it.
	 */
	if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
		bio_io_error(bio);
		return true;
	}

	/* Conventional zones do not need write plugging. */
	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
		/* Zone append to conventional zones is not allowed. */
		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
			bio_io_error(bio);
			return true;
		}
		return false;
	}

	/* Use a GFP_NOWAIT zone write plug allocation for REQ_NOWAIT BIOs. */
	if (bio->bi_opf & REQ_NOWAIT)
		gfp_mask = GFP_NOWAIT;

	zwplug = disk_get_or_alloc_zone_wplug(disk, sector, gfp_mask);
	if (!zwplug) {
		if (bio->bi_opf & REQ_NOWAIT)
			bio_wouldblock_error(bio);
		else
			bio_io_error(bio);
		return true;
	}

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * Check if we got a zone write plug marked as dead. If yes, then the
	 * user is likely issuing writes to a full zone, or without
	 * synchronizing with zone reset or zone finish operations. In such
	 * case, fail the BIO to signal this invalid usage.
	 */
	if (disk_check_zone_wplug_dead(zwplug)) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
		bio_io_error(bio);
		return true;
	}

	/* Indicate that this BIO is being handled using zone write plugging. */
	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * Add REQ_NOWAIT BIOs to the plug list to ensure that we will not see a
	 * BLK_STS_AGAIN failure if we let the caller submit the BIO.
	 */
	if (bio->bi_opf & REQ_NOWAIT) {
		bio->bi_opf &= ~REQ_NOWAIT;
		goto queue_bio;
	}

	/*
	 * For rotational devices, we will use the gendisk zone write plugs
	 * work instead of the per zone write plug BIO work, so queue the BIO.
	 */
	if (blk_queue_zoned_qd1_writes(disk->queue))
		goto queue_bio;

	/* If the zone is already plugged, add the BIO to the BIO plug list. */
	if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
		goto queue_bio;

	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		bio_io_error(bio);
		return true;
	}

	/* Otherwise, plug and let the caller submit the BIO.
*/
	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return false;

queue_bio:
	disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs);

	/*
	 * If the zone was not already plugged, plug it and start the
	 * submission of plugged BIOs, using either the disk zone write plugs
	 * worker or the zone write plug BIO work.
	 */
	if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) {
		zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
		if (blk_queue_zoned_qd1_writes(disk->queue))
			wake_up_process(disk->zone_wplugs_worker);
		else
			disk_zone_wplug_schedule_work(disk, zwplug);
	}

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return true;
}

/*
 * Handle a zone append BIO for a device that does not need zone append
 * emulation: record that zone append is being used and mark as dead the zone
 * write plug of the target zone if there is one.
 */
static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/* Test first so that the bit is only written when it is not set. */
	if (!test_bit(GD_ZONE_APPEND_USED, &disk->state))
		set_bit(GD_ZONE_APPEND_USED, &disk->state);

	/*
	 * We have native support for zone append operations, so we are not
	 * going to handle @bio through plugging. However, we may already have a
	 * zone write plug for the target zone if that zone was previously
	 * partially written using regular writes. In such case, we risk leaving
	 * the plug in the disk hash table if the zone is fully written using
	 * zone append operations. Avoid this by removing the zone write plug.
	 */
	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	if (likely(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * We are about to remove the zone write plug. But if the user
	 * (mistakenly) has issued regular writes together with native zone
	 * append, we must abort the writes as otherwise the plugged BIOs would
	 * not be executed by the plug BIO work as disk_get_zone_wplug() will
	 * return NULL after the plug is removed.
Aborting the plugged write
	 * BIOs is consistent with the fact that these writes will most likely
	 * fail anyway as there are no ordering guarantees between zone append
	 * operations and regular write operations.
	 */
	if (!bio_list_empty(&zwplug->bio_list)) {
		pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n",
				    disk->disk_name, zwplug->zone_no);
		disk_zone_wplug_abort(zwplug);
	}
	disk_mark_zone_wplug_dead(zwplug);
	spin_unlock_irqrestore(&zwplug->lock, flags);

	disk_put_zone_wplug(zwplug);
}

/*
 * Check a zone reset, zone reset all or zone finish BIO. Returns true if
 * @bio was failed and false if the caller can continue processing @bio.
 */
static bool blk_zone_wplug_handle_zone_mgmt(struct bio *bio)
{
	if (bio_op(bio) != REQ_OP_ZONE_RESET_ALL &&
	    !bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
		/*
		 * Zone reset and zone finish operations do not apply to
		 * conventional zones.
		 */
		bio_io_error(bio);
		return true;
	}

	/*
	 * No-wait zone management BIOs do not make much sense as the callers
	 * issue these as blocking operations in most cases. To avoid issues
	 * with the BIO execution potentially failing with BLK_STS_AGAIN, warn
	 * about REQ_NOWAIT being set and ignore that flag.
	 */
	if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT))
		bio->bi_opf &= ~REQ_NOWAIT;

	return false;
}

/**
 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
 * @bio: The BIO being submitted
 * @nr_segs: The number of physical segments of @bio
 *
 * Handle write, write zeroes and zone append operations requiring emulation
 * using zone write plugging.
 *
 * Return true whenever @bio execution needs to be delayed through the zone
 * write plug. Otherwise, return false to let the submission path process
 * @bio normally.
*/
bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
{
	struct block_device *bdev = bio->bi_bdev;

	/*
	 * Warn and let the caller process the BIO normally if the disk has no
	 * zone write plug hash table.
	 */
	if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash))
		return false;

	/*
	 * Regular writes and write zeroes need to be handled through the target
	 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
	 * which may need to go through the flush machinery depending on the
	 * target device capabilities. Plugging such writes is fine as the flush
	 * machinery operates at the request level, below the plug, and
	 * completion of the flush sequence will go through the regular BIO
	 * completion, which will handle zone write plugging.
	 * Zone append operations for devices that requested emulation must
	 * also be plugged so that these BIOs can be changed into regular
	 * write BIOs.
	 * Zone reset, reset all and finish commands need special treatment
	 * to correctly track the write pointer offset of zones. These commands
	 * are not plugged as we do not need serialization with write
	 * operations. It is the responsibility of the user to not issue reset
	 * and finish commands when write operations are in flight.
*/
	switch (bio_op(bio)) {
	case REQ_OP_ZONE_APPEND:
		if (!bdev_emulates_zone_append(bdev)) {
			blk_zone_wplug_handle_native_zone_append(bio);
			return false;
		}
		/* Emulated zone append is handled as a write. */
		fallthrough;
	case REQ_OP_WRITE:
	case REQ_OP_WRITE_ZEROES:
		return blk_zone_wplug_handle_write(bio, nr_segs);
	case REQ_OP_ZONE_RESET:
	case REQ_OP_ZONE_FINISH:
	case REQ_OP_ZONE_RESET_ALL:
		return blk_zone_wplug_handle_zone_mgmt(bio);
	default:
		return false;
	}

	/* Not reached: all the cases of the switch above return. */
	return false;
}
EXPORT_SYMBOL_GPL(blk_zone_plug_bio);

/*
 * Unplug a zone write plug after the completion of a write: clear the plugged
 * flag if there are no more plugged BIOs, trigger the processing of the next
 * plugged BIO and mark the zone write plug as dead if the zone is empty or
 * full.
 */
static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
				       struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * For rotational devices, signal the BIO completion to the zone write
	 * plug work. Otherwise, schedule submission of the next plugged BIO
	 * if we have one.
	 */
	if (bio_list_empty(&zwplug->bio_list))
		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

	if (blk_queue_zoned_qd1_writes(disk->queue))
		complete(&disk->zone_wplugs_worker_bio_done);
	else if (!bio_list_empty(&zwplug->bio_list))
		disk_zone_wplug_schedule_work(disk, zwplug);

	/* A write pointer offset of 0 means that the zone is empty. */
	if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
		disk_mark_zone_wplug_dead(zwplug);

	spin_unlock_irqrestore(&zwplug->lock, flags);
}

void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio)
{
	/*
	 * For zone append requests, the request sector indicates the location
	 * at which the BIO data was written. Return this value to the BIO
	 * issuer through the BIO iter sector.
	 * For plugged zone writes, which include emulated zone append, we need
	 * the original BIO sector so that blk_zone_write_plug_bio_endio() can
	 * lookup the zone write plug.
*/
	bio->bi_iter.bi_sector = rq->__sector;
	trace_blk_zone_append_update_request_bio(rq);
}

/*
 * Handle the completion of a BIO that was handled using zone write plugging.
 */
void blk_zone_write_plug_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	unsigned long flags;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/* Make sure we do not see this BIO again by clearing the plug flag. */
	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If this is a regular write emulating a zone append operation,
	 * restore the original operation code.
	 */
	if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_ZONE_APPEND;
		bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND);
	}

	/*
	 * If the BIO failed, abort all plugged BIOs and mark the plug as
	 * needing a write pointer update.
	 */
	if (bio->bi_status != BLK_STS_OK) {
		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_abort(zwplug);
		zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE;
		spin_unlock_irqrestore(&zwplug->lock, flags);
	}

	/* Drop the reference we took when the BIO was issued. */
	disk_put_zone_wplug(zwplug);

	/*
	 * For BIO-based devices, blk_zone_write_plug_finish_request()
	 * is not called. So we need to schedule execution of the next
	 * plugged BIO here.
	 */
	if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
		disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function.
*/
	disk_put_zone_wplug(zwplug);
}

/*
 * Handle the completion of a request flagged with RQF_ZONE_WRITE_PLUGGING:
 * clear the flag, drop the zone write plug reference taken for the request
 * and unplug the zone write plug.
 */
void blk_zone_write_plug_finish_request(struct request *req)
{
	struct gendisk *disk = req->q->disk;
	struct blk_zone_wplug *zwplug;

	zwplug = disk_get_zone_wplug(disk, req->__sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;

	/*
	 * Drop the reference we took when the request was initialized in
	 * blk_zone_write_plug_init_request().
	 */
	disk_put_zone_wplug(zwplug);

	disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}

/*
 * Submit the next plugged BIO of a zone write plug. BIOs that cannot be
 * prepared for submission are failed and the next plugged BIO is tried.
 * Returns true if a BIO was submitted and false if there are no more plugged
 * BIOs, in which case the plugged flag of the zone write plug is cleared.
 */
static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
				       struct blk_zone_wplug *zwplug)
{
	struct block_device *bdev;
	unsigned long flags;
	struct bio *bio;
	bool prepared;

	/*
	 * Submit the next plugged BIO. If we do not have any, clear
	 * the plugged flag.
	 */
again:
	spin_lock_irqsave(&zwplug->lock, flags);
	bio = bio_list_pop(&zwplug->bio_list);
	if (!bio) {
		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
		spin_unlock_irqrestore(&zwplug->lock, flags);
		return false;
	}

	trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no,
				 bio->bi_iter.bi_sector, bio_sectors(bio));

	prepared = blk_zone_wplug_prepare_bio(zwplug, bio);
	spin_unlock_irqrestore(&zwplug->lock, flags);

	/* Fail the BIO outside of the plug lock and try the next one. */
	if (!prepared) {
		blk_zone_wplug_bio_io_error(zwplug, bio);
		goto again;
	}

	/*
	 * blk-mq devices will reuse the extra reference on the request queue
	 * usage counter we took when the BIO was plugged, but the submission
	 * path for BIO-based devices will not do that. So drop this extra
	 * reference here.
*/
	/*
	 * When the disk zone write plugs worker is used, re-initialize the
	 * BIO completion it waits on before submitting the BIO.
	 */
	if (blk_queue_zoned_qd1_writes(disk->queue))
		reinit_completion(&disk->zone_wplugs_worker_bio_done);
	bdev = bio->bi_bdev;
	if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
		bdev->bd_disk->fops->submit_bio(bio);
		blk_queue_exit(bdev->bd_disk->queue);
	} else {
		blk_mq_submit_bio(bio);
	}

	return true;
}

/*
 * Remove and return the first zone write plug of the disk work list, or
 * return NULL if the list is empty.
 */
static struct blk_zone_wplug *disk_get_zone_wplugs_work(struct gendisk *disk)
{
	struct blk_zone_wplug *zwplug;

	spin_lock_irq(&disk->zone_wplugs_list_lock);
	zwplug = list_first_entry_or_null(&disk->zone_wplugs_list,
					  struct blk_zone_wplug, entry);
	if (zwplug)
		list_del_init(&zwplug->entry);
	spin_unlock_irq(&disk->zone_wplugs_list_lock);

	return zwplug;
}

/*
 * Disk zone write plugs worker thread: process the zone write plugs of the
 * disk work list one at a time, submitting the plugged BIOs of each zone
 * write plug one by one and waiting for the completion of each submitted BIO
 * before submitting the next one.
 */
static int disk_zone_wplugs_worker(void *data)
{
	struct gendisk *disk = data;
	struct blk_zone_wplug *zwplug;
	unsigned int noio_flag;

	noio_flag = memalloc_noio_save();
	set_user_nice(current, MIN_NICE);
	set_freezable();

	for (;;) {
		/*
		 * Set the task state before checking for work so that a wake
		 * up issued after the check sets the state back to running.
		 */
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);

		zwplug = disk_get_zone_wplugs_work(disk);
		if (zwplug) {
			/*
			 * Process all BIOs of this zone write plug and then
			 * drop the reference we took when adding the zone write
			 * plug to the active list.
			 */
			set_current_state(TASK_RUNNING);
			while (disk_zone_wplug_submit_bio(disk, zwplug))
				blk_wait_io(&disk->zone_wplugs_worker_bio_done);
			disk_put_zone_wplug(zwplug);
			continue;
		}

		/*
		 * Only sleep if nothing sets the state to running. Else check
		 * for zone write plugs work again as a newly submitted BIO
		 * might have added a zone write plug to the work list.
*/
		if (get_current_state() == TASK_RUNNING) {
			try_to_freeze();
		} else {
			if (kthread_should_stop()) {
				set_current_state(TASK_RUNNING);
				break;
			}
			schedule();
		}
	}

	WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));
	memalloc_noio_restore(noio_flag);

	return 0;
}

/*
 * Initialize the locks, the work list and the worker BIO completion used for
 * managing the zone write plugs of a disk.
 */
void disk_init_zone_resources(struct gendisk *disk)
{
	spin_lock_init(&disk->zone_wplugs_hash_lock);
	spin_lock_init(&disk->zone_wplugs_list_lock);
	INIT_LIST_HEAD(&disk->zone_wplugs_list);
	init_completion(&disk->zone_wplugs_worker_bio_done);
}

/*
 * For the size of a disk zone write plug hash table, use the size of the
 * zone write plug mempool, which is the maximum of the disk open zones and
 * active zones limits. But do not exceed 4KB (512 hlist head entries), that is,
 * 9 bits. For a disk that has no limits, mempool size defaults to 128.
 */
#define BLK_ZONE_WPLUG_MAX_HASH_BITS		9
#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE	128

/*
 * Allocate the zone write plug hash table, mempool, workqueue and worker
 * thread of a disk. Returns 0 on success and a negative error code on
 * failure, in which case everything that was allocated is released.
 */
static int disk_alloc_zone_resources(struct gendisk *disk,
				     unsigned int pool_size)
{
	unsigned int i;
	int ret = -ENOMEM;

	atomic_set(&disk->nr_zone_wplugs, 0);
	disk->zone_wplugs_hash_bits =
		min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);

	disk->zone_wplugs_hash =
		kzalloc_objs(struct hlist_head,
			     disk_zone_wplugs_hash_size(disk));
	if (!disk->zone_wplugs_hash)
		return -ENOMEM;

	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
		INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);

	disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
						sizeof(struct blk_zone_wplug));
	if (!disk->zone_wplugs_pool)
		goto free_hash;

	disk->zone_wplugs_wq =
		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
				pool_size, disk->disk_name);
	if (!disk->zone_wplugs_wq)
		goto destroy_pool;

disk->zone_wplugs_worker =
		kthread_create(disk_zone_wplugs_worker, disk,
			       "%s_zwplugs_worker", disk->disk_name);
	if (IS_ERR(disk->zone_wplugs_worker)) {
		ret = PTR_ERR(disk->zone_wplugs_worker);
		disk->zone_wplugs_worker = NULL;
		goto destroy_wq;
	}
	/* The thread is created stopped: wake it up to start it. */
	wake_up_process(disk->zone_wplugs_worker);

	return 0;

	/* Error unwinding: release resources in reverse allocation order. */
destroy_wq:
	destroy_workqueue(disk->zone_wplugs_wq);
	disk->zone_wplugs_wq = NULL;
destroy_pool:
	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;
free_hash:
	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
	return ret;
}

/*
 * Mark all the zone write plugs of a disk as dead and free the disk zone
 * write plug hash table and mempool. Nothing is done if the disk has no hash
 * table.
 */
static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
{
	struct blk_zone_wplug *zwplug;
	unsigned int i;

	if (!disk->zone_wplugs_hash)
		return;

	/*
	 * Free all the zone write plugs we have.
	 * NOTE(review): this loop only terminates if
	 * disk_mark_zone_wplug_dead() removes the plug from its hash list;
	 * presumably it does - confirm against its definition.
	 */
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
		while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
			zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
					     struct blk_zone_wplug, node);
			spin_lock_irq(&zwplug->lock);
			disk_mark_zone_wplug_dead(zwplug);
			spin_unlock_irq(&zwplug->lock);
		}
	}

	WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs));
	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;

	/*
	 * Wait for the zone write plugs to be RCU-freed before destroying the
	 * mempool.
1984 */ 1985 rcu_barrier(); 1986 mempool_destroy(disk->zone_wplugs_pool); 1987 disk->zone_wplugs_pool = NULL; 1988 } 1989 1990 static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond) 1991 { 1992 unsigned long flags; 1993 1994 spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); 1995 zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond, 1996 lockdep_is_held(&disk->zone_wplugs_hash_lock)); 1997 spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); 1998 1999 kfree_rcu_mightsleep(zones_cond); 2000 } 2001 2002 void disk_free_zone_resources(struct gendisk *disk) 2003 { 2004 if (disk->zone_wplugs_worker) 2005 kthread_stop(disk->zone_wplugs_worker); 2006 WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list)); 2007 2008 if (disk->zone_wplugs_wq) { 2009 destroy_workqueue(disk->zone_wplugs_wq); 2010 disk->zone_wplugs_wq = NULL; 2011 } 2012 2013 disk_destroy_zone_wplugs_hash_table(disk); 2014 2015 disk_set_zones_cond_array(disk, NULL); 2016 disk->zone_capacity = 0; 2017 disk->last_zone_capacity = 0; 2018 disk->nr_zones = 0; 2019 } 2020 2021 struct blk_revalidate_zone_args { 2022 struct gendisk *disk; 2023 u8 *zones_cond; 2024 unsigned int nr_zones; 2025 unsigned int nr_conv_zones; 2026 unsigned int zone_capacity; 2027 unsigned int last_zone_capacity; 2028 sector_t sector; 2029 }; 2030 2031 static int disk_revalidate_zone_resources(struct gendisk *disk, 2032 struct blk_revalidate_zone_args *args) 2033 { 2034 struct queue_limits *lim = &disk->queue->limits; 2035 unsigned int pool_size; 2036 int ret = 0; 2037 2038 args->disk = disk; 2039 args->nr_zones = 2040 DIV_ROUND_UP_ULL(get_capacity(disk), lim->chunk_sectors); 2041 2042 /* Cached zone conditions: 1 byte per zone */ 2043 args->zones_cond = kzalloc(args->nr_zones, GFP_NOIO); 2044 if (!args->zones_cond) 2045 return -ENOMEM; 2046 2047 if (!disk_need_zone_resources(disk)) 2048 return 0; 2049 2050 /* 2051 * If the device has no limit on the maximum number of open and active 2052 * zones, use 
BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE. 2053 */ 2054 pool_size = max(lim->max_open_zones, lim->max_active_zones); 2055 if (!pool_size) 2056 pool_size = 2057 min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones); 2058 2059 if (!disk->zone_wplugs_hash) { 2060 ret = disk_alloc_zone_resources(disk, pool_size); 2061 if (ret) 2062 kfree(args->zones_cond); 2063 } 2064 2065 return ret; 2066 } 2067 2068 /* 2069 * Update the disk zone resources information and device queue limits. 2070 * The disk queue is frozen when this is executed. 2071 */ 2072 static int disk_update_zone_resources(struct gendisk *disk, 2073 struct blk_revalidate_zone_args *args) 2074 { 2075 struct request_queue *q = disk->queue; 2076 unsigned int nr_seq_zones; 2077 unsigned int pool_size, memflags; 2078 struct queue_limits lim; 2079 int ret = 0; 2080 2081 lim = queue_limits_start_update(q); 2082 2083 memflags = blk_mq_freeze_queue(q); 2084 2085 disk->nr_zones = args->nr_zones; 2086 if (args->nr_conv_zones >= disk->nr_zones) { 2087 queue_limits_cancel_update(q); 2088 pr_warn("%s: Invalid number of conventional zones %u / %u\n", 2089 disk->disk_name, args->nr_conv_zones, disk->nr_zones); 2090 ret = -ENODEV; 2091 goto unfreeze; 2092 } 2093 2094 disk->zone_capacity = args->zone_capacity; 2095 disk->last_zone_capacity = args->last_zone_capacity; 2096 disk_set_zones_cond_array(disk, args->zones_cond); 2097 args->zones_cond = NULL; 2098 2099 /* 2100 * Some devices can advertise zone resource limits that are larger than 2101 * the number of sequential zones of the zoned block device, e.g. a 2102 * small ZNS namespace. For such case, assume that the zoned device has 2103 * no zone resource limits. 
2104 */ 2105 nr_seq_zones = disk->nr_zones - args->nr_conv_zones; 2106 if (lim.max_open_zones >= nr_seq_zones) 2107 lim.max_open_zones = 0; 2108 if (lim.max_active_zones >= nr_seq_zones) 2109 lim.max_active_zones = 0; 2110 2111 if (!disk->zone_wplugs_pool) 2112 goto commit; 2113 2114 /* 2115 * If the device has no limit on the maximum number of open and active 2116 * zones, set its max open zone limit to the mempool size to indicate 2117 * to the user that there is a potential performance impact due to 2118 * dynamic zone write plug allocation when simultaneously writing to 2119 * more zones than the size of the mempool. 2120 */ 2121 pool_size = max(lim.max_open_zones, lim.max_active_zones); 2122 if (!pool_size) 2123 pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones); 2124 2125 mempool_resize(disk->zone_wplugs_pool, pool_size); 2126 2127 if (!lim.max_open_zones && !lim.max_active_zones) { 2128 if (pool_size < nr_seq_zones) 2129 lim.max_open_zones = pool_size; 2130 else 2131 lim.max_open_zones = 0; 2132 } 2133 2134 commit: 2135 ret = queue_limits_commit_update(q, &lim); 2136 2137 unfreeze: 2138 if (ret) 2139 disk_free_zone_resources(disk); 2140 2141 blk_mq_unfreeze_queue(q, memflags); 2142 2143 return ret; 2144 } 2145 2146 static int blk_revalidate_zone_cond(struct blk_zone *zone, unsigned int idx, 2147 struct blk_revalidate_zone_args *args) 2148 { 2149 enum blk_zone_cond cond = zone->cond; 2150 2151 /* Check that the zone condition is consistent with the zone type. 
*/ 2152 switch (cond) { 2153 case BLK_ZONE_COND_NOT_WP: 2154 if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) 2155 goto invalid_condition; 2156 break; 2157 case BLK_ZONE_COND_IMP_OPEN: 2158 case BLK_ZONE_COND_EXP_OPEN: 2159 case BLK_ZONE_COND_CLOSED: 2160 case BLK_ZONE_COND_EMPTY: 2161 case BLK_ZONE_COND_FULL: 2162 case BLK_ZONE_COND_OFFLINE: 2163 case BLK_ZONE_COND_READONLY: 2164 if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) 2165 goto invalid_condition; 2166 break; 2167 default: 2168 pr_warn("%s: Invalid zone condition 0x%X\n", 2169 args->disk->disk_name, cond); 2170 return -ENODEV; 2171 } 2172 2173 blk_zone_set_cond(args->zones_cond, idx, cond); 2174 2175 return 0; 2176 2177 invalid_condition: 2178 pr_warn("%s: Invalid zone condition 0x%x for type 0x%x\n", 2179 args->disk->disk_name, cond, zone->type); 2180 2181 return -ENODEV; 2182 } 2183 2184 static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, 2185 struct blk_revalidate_zone_args *args) 2186 { 2187 struct gendisk *disk = args->disk; 2188 2189 if (zone->capacity != zone->len) { 2190 pr_warn("%s: Invalid conventional zone capacity\n", 2191 disk->disk_name); 2192 return -ENODEV; 2193 } 2194 2195 if (disk_zone_is_last(disk, zone)) 2196 args->last_zone_capacity = zone->capacity; 2197 2198 args->nr_conv_zones++; 2199 2200 return 0; 2201 } 2202 2203 static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, 2204 struct blk_revalidate_zone_args *args) 2205 { 2206 struct gendisk *disk = args->disk; 2207 struct blk_zone_wplug *zwplug; 2208 unsigned int wp_offset; 2209 2210 /* 2211 * Remember the capacity of the first sequential zone and check 2212 * if it is constant for all zones, ignoring the last zone as it can be 2213 * smaller. 
2214 */ 2215 if (!args->zone_capacity) 2216 args->zone_capacity = zone->capacity; 2217 if (disk_zone_is_last(disk, zone)) { 2218 args->last_zone_capacity = zone->capacity; 2219 } else if (zone->capacity != args->zone_capacity) { 2220 pr_warn("%s: Invalid variable zone capacity\n", 2221 disk->disk_name); 2222 return -ENODEV; 2223 } 2224 2225 /* 2226 * If the device needs zone append emulation, we need to track the 2227 * write pointer of all zones that are not empty nor full. So make sure 2228 * we have a zone write plug for such zone if the device has a zone 2229 * write plug hash table. 2230 */ 2231 if (!disk->zone_wplugs_hash) 2232 return 0; 2233 2234 wp_offset = disk_zone_wplug_sync_wp_offset(disk, zone); 2235 if (!wp_offset || wp_offset >= zone->capacity) 2236 return 0; 2237 2238 zwplug = disk_get_or_alloc_zone_wplug(disk, zone->wp, GFP_NOIO); 2239 if (!zwplug) 2240 return -ENOMEM; 2241 disk_put_zone_wplug(zwplug); 2242 2243 return 0; 2244 } 2245 2246 /* 2247 * Helper function to check the validity of zones of a zoned block device. 2248 */ 2249 static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, 2250 void *data) 2251 { 2252 struct blk_revalidate_zone_args *args = data; 2253 struct gendisk *disk = args->disk; 2254 sector_t zone_sectors = disk->queue->limits.chunk_sectors; 2255 int ret; 2256 2257 /* Check for bad zones and holes in the zone report */ 2258 if (zone->start != args->sector) { 2259 pr_warn("%s: Zone gap at sectors %llu..%llu\n", 2260 disk->disk_name, args->sector, zone->start); 2261 return -ENODEV; 2262 } 2263 2264 if (zone->start >= get_capacity(disk) || !zone->len) { 2265 pr_warn("%s: Invalid zone start %llu, length %llu\n", 2266 disk->disk_name, zone->start, zone->len); 2267 return -ENODEV; 2268 } 2269 2270 /* 2271 * All zones must have the same size, with the exception on an eventual 2272 * smaller last zone. 
2273 */ 2274 if (!disk_zone_is_last(disk, zone)) { 2275 if (zone->len != zone_sectors) { 2276 pr_warn("%s: Invalid zoned device with non constant zone size\n", 2277 disk->disk_name); 2278 return -ENODEV; 2279 } 2280 } else if (zone->len > zone_sectors) { 2281 pr_warn("%s: Invalid zoned device with larger last zone size\n", 2282 disk->disk_name); 2283 return -ENODEV; 2284 } 2285 2286 if (!zone->capacity || zone->capacity > zone->len) { 2287 pr_warn("%s: Invalid zone capacity\n", 2288 disk->disk_name); 2289 return -ENODEV; 2290 } 2291 2292 /* Check zone condition */ 2293 ret = blk_revalidate_zone_cond(zone, idx, args); 2294 if (ret) 2295 return ret; 2296 2297 /* Check zone type */ 2298 switch (zone->type) { 2299 case BLK_ZONE_TYPE_CONVENTIONAL: 2300 ret = blk_revalidate_conv_zone(zone, idx, args); 2301 break; 2302 case BLK_ZONE_TYPE_SEQWRITE_REQ: 2303 ret = blk_revalidate_seq_zone(zone, idx, args); 2304 break; 2305 case BLK_ZONE_TYPE_SEQWRITE_PREF: 2306 default: 2307 pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", 2308 disk->disk_name, (int)zone->type, zone->start); 2309 ret = -ENODEV; 2310 } 2311 2312 if (!ret) 2313 args->sector += zone->len; 2314 2315 return ret; 2316 } 2317 2318 /** 2319 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs 2320 * @disk: Target disk 2321 * 2322 * Helper function for low-level device drivers to check, (re) allocate and 2323 * initialize resources used for managing zoned disks. This function should 2324 * normally be called by blk-mq based drivers when a zoned gendisk is probed 2325 * and when the zone configuration of the gendisk changes (e.g. after a format). 2326 * Before calling this function, the device driver must already have set the 2327 * device zone size (chunk_sector limit) and the max zone append limit. 2328 * BIO based drivers can also use this function as long as the device queue 2329 * can be safely frozen. 
2330 */ 2331 int blk_revalidate_disk_zones(struct gendisk *disk) 2332 { 2333 struct request_queue *q = disk->queue; 2334 sector_t zone_sectors = q->limits.chunk_sectors; 2335 sector_t capacity = get_capacity(disk); 2336 struct blk_revalidate_zone_args args = { }; 2337 unsigned int memflags, noio_flag; 2338 struct blk_report_zones_args rep_args = { 2339 .cb = blk_revalidate_zone_cb, 2340 .data = &args, 2341 }; 2342 int ret = -ENOMEM; 2343 2344 if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) 2345 return -EIO; 2346 2347 if (!capacity) 2348 return -ENODEV; 2349 2350 /* 2351 * Checks that the device driver indicated a valid zone size and that 2352 * the max zone append limit is set. 2353 */ 2354 if (!zone_sectors || !is_power_of_2(zone_sectors)) { 2355 pr_warn("%s: Invalid non power of two zone size (%llu)\n", 2356 disk->disk_name, zone_sectors); 2357 return -ENODEV; 2358 } 2359 2360 /* 2361 * Ensure that all memory allocations in this context are done as if 2362 * GFP_NOIO was specified. 2363 */ 2364 noio_flag = memalloc_noio_save(); 2365 ret = disk_revalidate_zone_resources(disk, &args); 2366 if (ret) { 2367 memalloc_noio_restore(noio_flag); 2368 return ret; 2369 } 2370 2371 ret = disk->fops->report_zones(disk, 0, UINT_MAX, &rep_args); 2372 if (!ret) { 2373 pr_warn("%s: No zones reported\n", disk->disk_name); 2374 ret = -ENODEV; 2375 } 2376 memalloc_noio_restore(noio_flag); 2377 2378 if (ret <= 0) 2379 goto free_resources; 2380 2381 /* 2382 * If zones where reported, make sure that the entire disk capacity 2383 * has been checked. 
2384 */ 2385 if (args.sector != capacity) { 2386 pr_warn("%s: Missing zones from sector %llu\n", 2387 disk->disk_name, args.sector); 2388 ret = -ENODEV; 2389 goto free_resources; 2390 } 2391 2392 ret = disk_update_zone_resources(disk, &args); 2393 if (ret) 2394 goto free_resources; 2395 2396 return 0; 2397 2398 free_resources: 2399 pr_warn("%s: failed to revalidate zones\n", disk->disk_name); 2400 2401 kfree(args.zones_cond); 2402 memflags = blk_mq_freeze_queue(q); 2403 disk_free_zone_resources(disk); 2404 blk_mq_unfreeze_queue(q, memflags); 2405 2406 return ret; 2407 } 2408 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); 2409 2410 /** 2411 * blk_zone_issue_zeroout - zero-fill a block range in a zone 2412 * @bdev: blockdev to write 2413 * @sector: start sector 2414 * @nr_sects: number of sectors to write 2415 * @gfp_mask: memory allocation flags (for bio_alloc) 2416 * 2417 * Description: 2418 * Zero-fill a block range in a zone (@sector must be equal to the zone write 2419 * pointer), handling potential errors due to the (initially unknown) lack of 2420 * hardware offload (See blkdev_issue_zeroout()). 2421 */ 2422 int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, 2423 sector_t nr_sects, gfp_t gfp_mask) 2424 { 2425 struct gendisk *disk = bdev->bd_disk; 2426 int ret; 2427 2428 if (WARN_ON_ONCE(!bdev_is_zoned(bdev))) 2429 return -EIO; 2430 2431 ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 2432 BLKDEV_ZERO_NOFALLBACK); 2433 if (ret != -EOPNOTSUPP) 2434 return ret; 2435 2436 /* 2437 * The failed call to blkdev_issue_zeroout() advanced the zone write 2438 * pointer. Undo this using a report zone to update the zone write 2439 * pointer to the correct current value. 2440 */ 2441 ret = disk->fops->report_zones(disk, sector, 1, NULL); 2442 if (ret != 1) 2443 return ret < 0 ? ret : -EIO; 2444 2445 /* 2446 * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a 2447 * regular write with zero-pages. 
2448 */ 2449 return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0); 2450 } 2451 EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout); 2452 2453 #ifdef CONFIG_BLK_DEBUG_FS 2454 static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug, 2455 struct seq_file *m) 2456 { 2457 unsigned int zwp_wp_offset, zwp_flags; 2458 unsigned int zwp_zone_no, zwp_ref; 2459 unsigned int zwp_bio_list_size; 2460 enum blk_zone_cond zwp_cond; 2461 unsigned long flags; 2462 2463 spin_lock_irqsave(&zwplug->lock, flags); 2464 zwp_zone_no = zwplug->zone_no; 2465 zwp_flags = zwplug->flags; 2466 zwp_ref = refcount_read(&zwplug->ref); 2467 zwp_cond = zwplug->cond; 2468 zwp_wp_offset = zwplug->wp_offset; 2469 zwp_bio_list_size = bio_list_size(&zwplug->bio_list); 2470 spin_unlock_irqrestore(&zwplug->lock, flags); 2471 2472 seq_printf(m, 2473 "Zone no: %u, flags: 0x%x, ref: %u, cond: %s, wp ofst: %u, pending BIO: %u\n", 2474 zwp_zone_no, zwp_flags, zwp_ref, blk_zone_cond_str(zwp_cond), 2475 zwp_wp_offset, zwp_bio_list_size); 2476 } 2477 2478 int queue_zone_wplugs_show(void *data, struct seq_file *m) 2479 { 2480 struct request_queue *q = data; 2481 struct gendisk *disk = q->disk; 2482 struct blk_zone_wplug *zwplug; 2483 unsigned int i; 2484 2485 if (!disk->zone_wplugs_hash) 2486 return 0; 2487 2488 rcu_read_lock(); 2489 for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) 2490 hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i], 2491 node) 2492 queue_zone_wplug_show(zwplug, m); 2493 rcu_read_unlock(); 2494 2495 return 0; 2496 } 2497 2498 #endif 2499