// SPDX-License-Identifier: GPL-2.0
/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 * Copyright (c) 2024, Western Digital Corporation or its affiliates.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/mempool.h>
#include <linux/kthread.h>
#include <linux/freezer.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-mq-debugfs.h"

/* Map each BLK_ZONE_COND_<name> value to the string "<name>" for debugging. */
#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
        ZONE_COND_NAME(NOT_WP),
        ZONE_COND_NAME(EMPTY),
        ZONE_COND_NAME(IMP_OPEN),
        ZONE_COND_NAME(EXP_OPEN),
        ZONE_COND_NAME(CLOSED),
        ZONE_COND_NAME(READONLY),
        ZONE_COND_NAME(FULL),
        ZONE_COND_NAME(OFFLINE),
        ZONE_COND_NAME(ACTIVE),
};
#undef ZONE_COND_NAME

/*
 * Per-zone write plug.
 * @node: hlist_node structure for managing the plug using a hash table.
 * @entry: list_head structure for listing the plug in the disk list of active
 *         zone write plugs.
 * @bio_list: The list of BIOs that are currently plugged.
 * @bio_work: Work struct to handle issuing of plugged BIOs.
 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
 * @disk: The gendisk the plug belongs to.
 * @lock: Spinlock to atomically manipulate the plug.
 * @ref: Zone write plug reference counter. A zone write plug reference is
 *       always at least 1 when the plug is hashed in the disk plug hash table.
 *       The reference is incremented whenever a new BIO needing plugging is
 *       submitted and when a function needs to manipulate a plug. The
 *       reference count is decremented whenever a plugged BIO completes and
 *       when a function that referenced the plug returns. The initial
 *       reference is dropped whenever the zone of the zone write plug is
 *       reset, finished and when the zone becomes full (last write BIO to the
 *       zone completes).
 * @flags: Flags indicating the plug state (BLK_ZONE_WPLUG_* bits below).
 * @zone_no: The number of the zone the plug is managing.
 * @wp_offset: The zone write pointer location relative to the start of the
 *             zone as a number of 512B sectors.
 * @cond: Condition of the zone.
 */
struct blk_zone_wplug {
        struct hlist_node node;
        struct list_head entry;
        struct bio_list bio_list;
        struct work_struct bio_work;
        struct rcu_head rcu_head;
        struct gendisk *disk;
        spinlock_t lock;
        refcount_t ref;
        unsigned int flags;
        unsigned int zone_no;
        unsigned int wp_offset;
        enum blk_zone_cond cond;
};

/* Return true if @disk must allocate zone write plug resources. */
static inline bool disk_need_zone_resources(struct gendisk *disk)
{
        /*
         * All request-based zoned devices need zone resources so that the
         * block layer can automatically handle write BIO plugging. BIO-based
         * device drivers (e.g. DM devices) are normally responsible for
         * handling zone write ordering and do not need zone resources, unless
         * the driver requires zone append emulation.
         */
        return queue_is_mq(disk->queue) ||
                queue_emulates_zone_append(disk->queue);
}

/* Number of buckets in the disk zone write plug hash table. */
static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
{
        return 1U << disk->zone_wplugs_hash_bits;
}

/*
 * Zone write plug flags bits:
 * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
 *   that is, that write BIOs are being throttled due to a write BIO already
 *   being executed or the zone write plug bio list is not empty.
 * - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
 *   write pointer offset and need to update it.
107 * - BLK_ZONE_WPLUG_DEAD: Indicates that the zone write plug will be 108 * removed from the disk hash table of zone write plugs when the last 109 * reference on the zone write plug is dropped. If set, this flag also 110 * indicates that the initial extra reference on the zone write plug was 111 * dropped, meaning that the reference count indicates the current number of 112 * active users (code context or BIOs and requests in flight). This flag is 113 * set when a zone is reset, finished or becomes full. 114 */ 115 #define BLK_ZONE_WPLUG_PLUGGED (1U << 0) 116 #define BLK_ZONE_WPLUG_NEED_WP_UPDATE (1U << 1) 117 #define BLK_ZONE_WPLUG_DEAD (1U << 2) 118 119 /** 120 * blk_zone_cond_str - Return a zone condition name string 121 * @zone_cond: a zone condition BLK_ZONE_COND_name 122 * 123 * Convert a BLK_ZONE_COND_name zone condition into the string "name". Useful 124 * for the debugging and tracing zone conditions. For an invalid zone 125 * conditions, the string "UNKNOWN" is returned. 126 */ 127 const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) 128 { 129 static const char *zone_cond_str = "UNKNOWN"; 130 131 if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond]) 132 zone_cond_str = zone_cond_name[zone_cond]; 133 134 return zone_cond_str; 135 } 136 EXPORT_SYMBOL_GPL(blk_zone_cond_str); 137 138 static void blk_zone_set_cond(u8 *zones_cond, unsigned int zno, 139 enum blk_zone_cond cond) 140 { 141 if (!zones_cond) 142 return; 143 144 switch (cond) { 145 case BLK_ZONE_COND_IMP_OPEN: 146 case BLK_ZONE_COND_EXP_OPEN: 147 case BLK_ZONE_COND_CLOSED: 148 zones_cond[zno] = BLK_ZONE_COND_ACTIVE; 149 return; 150 case BLK_ZONE_COND_NOT_WP: 151 case BLK_ZONE_COND_EMPTY: 152 case BLK_ZONE_COND_FULL: 153 case BLK_ZONE_COND_OFFLINE: 154 case BLK_ZONE_COND_READONLY: 155 default: 156 zones_cond[zno] = cond; 157 return; 158 } 159 } 160 161 static void disk_zone_set_cond(struct gendisk *disk, sector_t sector, 162 enum blk_zone_cond cond) 163 { 164 u8 
*zones_cond; 165 166 rcu_read_lock(); 167 zones_cond = rcu_dereference(disk->zones_cond); 168 if (zones_cond) { 169 unsigned int zno = disk_zone_no(disk, sector); 170 171 /* 172 * The condition of a conventional, readonly and offline zones 173 * never changes, so do nothing if the target zone is in one of 174 * these conditions. 175 */ 176 switch (zones_cond[zno]) { 177 case BLK_ZONE_COND_NOT_WP: 178 case BLK_ZONE_COND_READONLY: 179 case BLK_ZONE_COND_OFFLINE: 180 break; 181 default: 182 blk_zone_set_cond(zones_cond, zno, cond); 183 break; 184 } 185 } 186 rcu_read_unlock(); 187 } 188 189 /** 190 * bdev_zone_is_seq - check if a sector belongs to a sequential write zone 191 * @bdev: block device to check 192 * @sector: sector number 193 * 194 * Check if @sector on @bdev is contained in a sequential write required zone. 195 */ 196 bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector) 197 { 198 struct gendisk *disk = bdev->bd_disk; 199 unsigned int zno = disk_zone_no(disk, sector); 200 bool is_seq = false; 201 u8 *zones_cond; 202 203 if (!bdev_is_zoned(bdev)) 204 return false; 205 206 rcu_read_lock(); 207 zones_cond = rcu_dereference(disk->zones_cond); 208 if (zones_cond && zno < disk->nr_zones) 209 is_seq = zones_cond[zno] != BLK_ZONE_COND_NOT_WP; 210 rcu_read_unlock(); 211 212 return is_seq; 213 } 214 EXPORT_SYMBOL_GPL(bdev_zone_is_seq); 215 216 /* 217 * Zone report arguments for block device drivers report_zones operation. 218 * @cb: report_zones_cb callback for each reported zone. 219 * @data: Private data passed to report_zones_cb. 
220 */ 221 struct blk_report_zones_args { 222 report_zones_cb cb; 223 void *data; 224 bool report_active; 225 }; 226 227 static int blkdev_do_report_zones(struct block_device *bdev, sector_t sector, 228 unsigned int nr_zones, 229 struct blk_report_zones_args *args) 230 { 231 struct gendisk *disk = bdev->bd_disk; 232 233 if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) 234 return -EOPNOTSUPP; 235 236 if (!nr_zones || sector >= get_capacity(disk)) 237 return 0; 238 239 return disk->fops->report_zones(disk, sector, nr_zones, args); 240 } 241 242 /** 243 * blkdev_report_zones - Get zones information 244 * @bdev: Target block device 245 * @sector: Sector from which to report zones 246 * @nr_zones: Maximum number of zones to report 247 * @cb: Callback function called for each reported zone 248 * @data: Private data for the callback 249 * 250 * Description: 251 * Get zone information starting from the zone containing @sector for at most 252 * @nr_zones, and call @cb for each zone reported by the device. 253 * To report all zones in a device starting from @sector, the BLK_ALL_ZONES 254 * constant can be passed to @nr_zones. 255 * Returns the number of zones reported by the device, or a negative errno 256 * value in case of failure. 257 * 258 * Note: The caller must use memalloc_noXX_save/restore() calls to control 259 * memory allocations done within this function. 
260 */ 261 int blkdev_report_zones(struct block_device *bdev, sector_t sector, 262 unsigned int nr_zones, report_zones_cb cb, void *data) 263 { 264 struct blk_report_zones_args args = { 265 .cb = cb, 266 .data = data, 267 }; 268 269 return blkdev_do_report_zones(bdev, sector, nr_zones, &args); 270 } 271 EXPORT_SYMBOL_GPL(blkdev_report_zones); 272 273 static int blkdev_zone_reset_all(struct block_device *bdev) 274 { 275 struct bio bio; 276 277 bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC); 278 trace_blkdev_zone_mgmt(&bio, 0); 279 return submit_bio_wait(&bio); 280 } 281 282 /** 283 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones 284 * @bdev: Target block device 285 * @op: Operation to be performed on the zones 286 * @sector: Start sector of the first zone to operate on 287 * @nr_sectors: Number of sectors, should be at least the length of one zone and 288 * must be zone size aligned. 289 * 290 * Description: 291 * Perform the specified operation on the range of zones specified by 292 * @sector..@sector+@nr_sectors. Specifying the entire disk sector range 293 * is valid, but the specified range should not contain conventional zones. 294 * The operation to execute on each zone can be a zone reset, open, close 295 * or finish request. 
 */
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
                     sector_t sector, sector_t nr_sectors)
{
        sector_t zone_sectors = bdev_zone_sectors(bdev);
        sector_t capacity = bdev_nr_sectors(bdev);
        sector_t end_sector = sector + nr_sectors;
        struct bio *bio = NULL;
        int ret = 0;

        if (!bdev_is_zoned(bdev))
                return -EOPNOTSUPP;

        if (bdev_read_only(bdev))
                return -EPERM;

        if (!op_is_zone_mgmt(op))
                return -EOPNOTSUPP;

        if (end_sector <= sector || end_sector > capacity)
                /* Out of range */
                return -EINVAL;

        /* Check alignment (handle eventual smaller last zone) */
        if (!bdev_is_zone_start(bdev, sector))
                return -EINVAL;

        if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity)
                return -EINVAL;

        /*
         * In the case of a zone reset operation over all zones, use
         * REQ_OP_ZONE_RESET_ALL.
         */
        if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity)
                return blkdev_zone_reset_all(bdev);

        /*
         * Build one zero-length BIO per zone, chained together, and wait for
         * the whole chain once after the loop.
         */
        while (sector < end_sector) {
                bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
                bio->bi_iter.bi_sector = sector;
                sector += zone_sectors;

                /* This may take a while, so be nice to others */
                cond_resched();
        }

        trace_blkdev_zone_mgmt(bio, nr_sectors);
        ret = submit_bio_wait(bio);
        bio_put(bio);

        return ret;
}
EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);

/* Destination of a BLKREPORTZONE ioctl zone report: a user-space array. */
struct zone_report_args {
        struct blk_zone __user *zones;
};

/* report_zones_cb copying one reported zone to the user-space array. */
static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
                                    void *data)
{
        struct zone_report_args *args = data;

        if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
                return -EFAULT;
        return 0;
}

/*
 * Mask of valid input flags for BLKREPORTZONEV2 ioctl.
 */
#define BLK_ZONE_REPV2_INPUT_FLAGS BLK_ZONE_REP_CACHED

/*
 * BLKREPORTZONE and BLKREPORTZONEV2 ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
                              unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        struct zone_report_args args;
        struct blk_zone_report rep;
        int ret;

        if (!argp)
                return -EINVAL;

        if (!bdev_is_zoned(bdev))
                return -ENOTTY;

        if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
                return -EFAULT;

        if (!rep.nr_zones)
                return -EINVAL;

        /* The zone array immediately follows the report header in memory. */
        args.zones = argp + sizeof(struct blk_zone_report);

        switch (cmd) {
        case BLKREPORTZONE:
                ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
                                          blkdev_copy_zone_to_user, &args);
                break;
        case BLKREPORTZONEV2:
                if (rep.flags & ~BLK_ZONE_REPV2_INPUT_FLAGS)
                        return -EINVAL;
                ret = blkdev_report_zones_cached(bdev, rep.sector, rep.nr_zones,
                                                 blkdev_copy_zone_to_user,
                                                 &args);
                break;
        default:
                return -EINVAL;
        }

        if (ret < 0)
                return ret;

        /* On success, ret is the number of zones reported. */
        rep.nr_zones = ret;
        rep.flags = BLK_ZONE_REP_CAPACITY;
        if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
                return -EFAULT;
        return 0;
}

/*
 * BLKRESETZONE processing: invalidate the page cache over the reset range
 * before issuing the zone reset, under the inode and invalidate locks so that
 * no new pages can be instantiated for the range while it is being reset.
 */
static int blkdev_reset_zone(struct block_device *bdev, blk_mode_t mode,
                             struct blk_zone_range *zrange)
{
        loff_t start, end;
        int ret = -EINVAL;

        inode_lock(bdev->bd_mapping->host);
        filemap_invalidate_lock(bdev->bd_mapping);
        if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
            zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
                /* Out of range */
                goto out_unlock;

        start = zrange->sector << SECTOR_SHIFT;
        end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;

        /* Invalidate the page cache, including dirty pages. */
        ret = truncate_bdev_range(bdev, mode, start, end);
        if (ret)
                goto out_unlock;

        ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zrange->sector,
                               zrange->nr_sectors);

out_unlock:
        filemap_invalidate_unlock(bdev->bd_mapping);
        inode_unlock(bdev->bd_mapping->host);
        return ret;
}

/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
                           unsigned int cmd, unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        struct blk_zone_range zrange;
        enum req_op op;

        if (!argp)
                return -EINVAL;

        if (!bdev_is_zoned(bdev))
                return -ENOTTY;

        if (!(mode & BLK_OPEN_WRITE))
                return -EBADF;

        if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
                return -EFAULT;

        switch (cmd) {
        case BLKRESETZONE:
                /* Reset also needs page cache invalidation - special cased. */
                return blkdev_reset_zone(bdev, mode, &zrange);
        case BLKOPENZONE:
                op = REQ_OP_ZONE_OPEN;
                break;
        case BLKCLOSEZONE:
                op = REQ_OP_ZONE_CLOSE;
                break;
        case BLKFINISHZONE:
                op = REQ_OP_ZONE_FINISH;
                break;
        default:
                return -ENOTTY;
        }

        return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
}

/* Return true if @zone is the last zone of @disk (may be a smaller zone). */
static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
{
        return zone->start + zone->len >= get_capacity(disk);
}

/*
 * Return true if the plug write pointer reached the zone capacity. The last
 * zone of the disk may have a different (smaller) capacity.
 */
static bool disk_zone_wplug_is_full(struct gendisk *disk,
                                    struct blk_zone_wplug *zwplug)
{
        if (zwplug->zone_no < disk->nr_zones - 1)
                return zwplug->wp_offset >= disk->zone_capacity;
        return zwplug->wp_offset >= disk->last_zone_capacity;
}

/*
 * Insert a new zone write plug in the disk hash table. Returns false if a
 * plug for the same zone already exists (lost the insertion race).
 */
static bool disk_insert_zone_wplug(struct gendisk *disk,
                                   struct blk_zone_wplug *zwplug)
{
        struct blk_zone_wplug *zwplg;
        unsigned long flags;
        u8 *zones_cond;
        unsigned int idx =
                hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);

        /*
         * Add the new zone write plug to the hash table, but carefully as we
         * are
racing with other submission context, so we may already have a
         * zone write plug for the same zone.
         */
        spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
        hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
                if (zwplg->zone_no == zwplug->zone_no) {
                        spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock,
                                               flags);
                        return false;
                }
        }

        /*
         * Set the zone condition: if we do not yet have a zones_cond array
         * attached to the disk, then this is a zone write plug insert from the
         * first call to blk_revalidate_disk_zones(), in which case the zone is
         * necessarily in the active condition.
         */
        zones_cond = rcu_dereference_check(disk->zones_cond,
                        lockdep_is_held(&disk->zone_wplugs_hash_lock));
        if (zones_cond)
                zwplug->cond = zones_cond[zwplug->zone_no];
        else
                zwplug->cond = BLK_ZONE_COND_ACTIVE;

        hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
        atomic_inc(&disk->nr_zone_wplugs);
        spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);

        return true;
}

/*
 * RCU-walk the hash bucket of the zone containing @sector and return its zone
 * write plug with an elevated reference count, or NULL if there is none (or if
 * the plug found is already being freed, i.e. its refcount dropped to 0).
 */
static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk,
                                                         sector_t sector)
{
        unsigned int zno = disk_zone_no(disk, sector);
        unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
        struct blk_zone_wplug *zwplug;

        rcu_read_lock();

        hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
                if (zwplug->zone_no == zno &&
                    refcount_inc_not_zero(&zwplug->ref)) {
                        rcu_read_unlock();
                        return zwplug;
                }
        }

        rcu_read_unlock();

        return NULL;
}

/*
 * Fast path: avoid the hash walk entirely when the disk has no zone write
 * plugs at all.
 */
static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
                                                         sector_t sector)
{
        if (!atomic_read(&disk->nr_zone_wplugs))
                return NULL;

        return disk_get_hashed_zone_wplug(disk, sector);
}

/* RCU callback: return the plug memory to the disk mempool. */
static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
{
        struct blk_zone_wplug *zwplug =
                container_of(rcu_head, struct blk_zone_wplug, rcu_head);

        mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
}

/*
 * Unhash a dead zone write plug, publish its final condition to the disk
 * zones_cond array, and free it after an RCU grace period (lookups may still
 * be walking the hash bucket).
 */
static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug)
{
        struct gendisk *disk = zwplug->disk;
        unsigned long flags;

        /* A freed plug must be dead, unplugged, with no plugged BIOs left. */
        WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_DEAD));
        WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
        WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));

        spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
        blk_zone_set_cond(rcu_dereference_check(disk->zones_cond,
                        lockdep_is_held(&disk->zone_wplugs_hash_lock)),
                          zwplug->zone_no, zwplug->cond);
        hlist_del_init_rcu(&zwplug->node);
        atomic_dec(&disk->nr_zone_wplugs);
        spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);

        call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
}

/* Drop a plug reference, freeing the plug when the last reference goes. */
static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
        if (refcount_dec_and_test(&zwplug->ref))
                disk_free_zone_wplug(zwplug);
}

/*
 * Flag the zone write plug as dead and drop the initial reference we got when
 * the zone write plug was added to the hash table. The zone write plug will be
 * unhashed when its last reference is dropped.
 */
static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug)
{
        lockdep_assert_held(&zwplug->lock);

        if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)) {
                zwplug->flags |= BLK_ZONE_WPLUG_DEAD;
                disk_put_zone_wplug(zwplug);
        }
}

static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
                                       struct blk_zone_wplug *zwplug);

/* Work function issuing the next plugged BIO of a zone write plug. */
static void blk_zone_wplug_bio_work(struct work_struct *work)
{
        struct blk_zone_wplug *zwplug =
                container_of(work, struct blk_zone_wplug, bio_work);

        disk_zone_wplug_submit_bio(zwplug->disk, zwplug);

        /* Drop the reference we took in disk_zone_wplug_schedule_work.
         */
        disk_put_zone_wplug(zwplug);
}

/*
 * Get a zone write plug for the zone containing @sector.
 * If the plug does not exist, it is allocated and inserted in the disk hash
 * table.
 */
static struct blk_zone_wplug *disk_get_or_alloc_zone_wplug(struct gendisk *disk,
                                                           sector_t sector,
                                                           gfp_t gfp_mask)
{
        unsigned int zno = disk_zone_no(disk, sector);
        struct blk_zone_wplug *zwplug;

again:
        zwplug = disk_get_zone_wplug(disk, sector);
        if (zwplug)
                return zwplug;

        /*
         * Allocate and initialize a zone write plug with an extra reference
         * so that it is not freed when the zone write plug becomes idle without
         * the zone being full.
         */
        zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
        if (!zwplug)
                return NULL;

        INIT_HLIST_NODE(&zwplug->node);
        /* ref = 2: the hash table reference plus the caller's reference. */
        refcount_set(&zwplug->ref, 2);
        spin_lock_init(&zwplug->lock);
        zwplug->flags = 0;
        zwplug->zone_no = zno;
        zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
        bio_list_init(&zwplug->bio_list);
        INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
        INIT_LIST_HEAD(&zwplug->entry);
        zwplug->disk = disk;

        /*
         * Insert the new zone write plug in the hash table. This can fail only
         * if another context already inserted a plug. Retry from the beginning
         * in such case.
         */
        if (!disk_insert_zone_wplug(disk, zwplug)) {
                mempool_free(zwplug, disk->zone_wplugs_pool);
                goto again;
        }

        return zwplug;
}

/*
 * Fail a plugged BIO: clear its plugging flag, complete it with an error and
 * drop both the plug reference and the queue usage reference the BIO held.
 */
static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
                                               struct bio *bio)
{
        struct request_queue *q = zwplug->disk->queue;

        bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
        bio_io_error(bio);
        disk_put_zone_wplug(zwplug);
        /* Drop the reference taken by disk_zone_wplug_add_bio(). */
        blk_queue_exit(q);
}

/*
 * Abort (fail) all plugged BIOs of a zone write plug.
 */
static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
        struct gendisk *disk = zwplug->disk;
        struct bio *bio;

        lockdep_assert_held(&zwplug->lock);

        if (bio_list_empty(&zwplug->bio_list))
                return;

        pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n",
                            zwplug->disk->disk_name, zwplug->zone_no);
        while ((bio = bio_list_pop(&zwplug->bio_list)))
                blk_zone_wplug_bio_io_error(zwplug, bio);

        /* The BIO list is now empty: the plug is no longer plugged. */
        zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

        /*
         * If we are using the per disk zone write plugs worker thread, remove
         * the zone write plug from the work list and drop the reference we
         * took when the zone write plug was added to that list.
         */
        if (blk_queue_zoned_qd1_writes(disk->queue)) {
                spin_lock(&disk->zone_wplugs_list_lock);
                if (!list_empty(&zwplug->entry)) {
                        list_del_init(&zwplug->entry);
                        disk_put_zone_wplug(zwplug);
                }
                spin_unlock(&disk->zone_wplugs_list_lock);
        }
}

/*
 * Update a zone write plug condition based on the write pointer offset.
 */
static void disk_zone_wplug_update_cond(struct gendisk *disk,
                                        struct blk_zone_wplug *zwplug)
{
        lockdep_assert_held(&zwplug->lock);

        /* wp at capacity -> FULL, wp at zone start -> EMPTY, else ACTIVE. */
        if (disk_zone_wplug_is_full(disk, zwplug))
                zwplug->cond = BLK_ZONE_COND_FULL;
        else if (!zwplug->wp_offset)
                zwplug->cond = BLK_ZONE_COND_EMPTY;
        else
                zwplug->cond = BLK_ZONE_COND_ACTIVE;
}

/*
 * Set a zone write plug write pointer offset to the specified value.
 * This aborts all plugged BIOs, which is fine as this function is called for
 * a zone reset operation, a zone finish operation or if the zone needs a wp
 * update from a report zone after a write error.
 */
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
                                          struct blk_zone_wplug *zwplug,
                                          unsigned int wp_offset)
{
        lockdep_assert_held(&zwplug->lock);

        /* Update the zone write pointer and abort all plugged BIOs. */
        zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE;
        zwplug->wp_offset = wp_offset;
        disk_zone_wplug_update_cond(disk, zwplug);

        disk_zone_wplug_abort(zwplug);
        /*
         * An empty or full zone no longer needs its write plug: drop the
         * initial hash table reference so the plug goes away when idle.
         */
        if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
                disk_mark_zone_wplug_dead(zwplug);
}

/*
 * Return the write pointer offset of @zone relative to the zone start, as a
 * number of sectors, or UINT_MAX for zone conditions without a valid wp.
 */
static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
{
        switch (zone->cond) {
        case BLK_ZONE_COND_IMP_OPEN:
        case BLK_ZONE_COND_EXP_OPEN:
        case BLK_ZONE_COND_CLOSED:
        case BLK_ZONE_COND_ACTIVE:
                return zone->wp - zone->start;
        case BLK_ZONE_COND_EMPTY:
                return 0;
        case BLK_ZONE_COND_FULL:
        case BLK_ZONE_COND_NOT_WP:
        case BLK_ZONE_COND_OFFLINE:
        case BLK_ZONE_COND_READONLY:
        default:
                /*
                 * Conventional, full, offline and read-only zones do not have
                 * a valid write pointer.
                 */
                return UINT_MAX;
        }
}

/*
 * Resynchronize a zone write plug write pointer offset from the device
 * reported zone information, if the plug was marked as needing an update
 * after a write error. Returns the wp offset derived from @zone.
 */
static unsigned int disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
                                                   struct blk_zone *zone)
{
        struct blk_zone_wplug *zwplug;
        unsigned int wp_offset = blk_zone_wp_offset(zone);

        zwplug = disk_get_zone_wplug(disk, zone->start);
        if (zwplug) {
                unsigned long flags;

                spin_lock_irqsave(&zwplug->lock, flags);
                if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
                        disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
                spin_unlock_irqrestore(&zwplug->lock, flags);
                disk_put_zone_wplug(zwplug);
        }

        return wp_offset;
}

/**
 * disk_report_zone - Report one zone
 * @disk: Target disk
 * @zone: The zone to report
 * @idx: The index of the zone in the overall zone report
 * @args: report zones callback and data
 *
 * Description:
 *    Helper function for block device drivers to report one zone of a zone
 *    report initiated with blkdev_report_zones(). The zone being reported is
 *    specified by @zone and used to update, if necessary, the zone write plug
 *    information for the zone. If @args specifies a user callback function,
 *    this callback is executed.
 */
int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
                     unsigned int idx, struct blk_report_zones_args *args)
{
        if (args && args->report_active) {
                /*
                 * If we come here, then this is a report zones as a fallback
                 * for a cached report. So collapse the implicit open, explicit
                 * open and closed conditions into the active zone condition.
                 */
                switch (zone->cond) {
                case BLK_ZONE_COND_IMP_OPEN:
                case BLK_ZONE_COND_EXP_OPEN:
                case BLK_ZONE_COND_CLOSED:
                        zone->cond = BLK_ZONE_COND_ACTIVE;
                        break;
                default:
                        break;
                }
        }

        /* Use the fresh device data to fix up a stale write plug wp. */
        if (disk->zone_wplugs_hash)
                disk_zone_wplug_sync_wp_offset(disk, zone);

        if (args && args->cb)
                return args->cb(zone, idx, args->data);

        return 0;
}
EXPORT_SYMBOL_GPL(disk_report_zone);

/* report_zones_cb storing a single reported zone into @data. */
static int blkdev_report_zone_cb(struct blk_zone *zone, unsigned int idx,
                                 void *data)
{
        memcpy(data, zone, sizeof(struct blk_zone));
        return 0;
}

/*
 * Get a single zone information directly from the device, used when the
 * cached information cannot be trusted. A report that returns no zone at all
 * for an in-range sector is treated as an I/O error.
 */
static int blkdev_report_zone_fallback(struct block_device *bdev,
                                       sector_t sector, struct blk_zone *zone)
{
        struct blk_report_zones_args args = {
                .cb = blkdev_report_zone_cb,
                .data = zone,
                .report_active = true,
        };
        int error;

        error = blkdev_do_report_zones(bdev, sector, 1, &args);
        if (error < 0)
                return error;
        if (error == 0)
                return -EIO;
        return 0;
}

/*
 * For devices that natively support zone append operations, we do not use zone
 * write plugging for zone append writes, which makes the zone condition
 * tracking invalid once zone append was used. In that case fall back to a
 * regular report zones to get correct information.
 */
static inline bool blkdev_has_cached_report_zones(struct block_device *bdev)
{
        return disk_need_zone_resources(bdev->bd_disk) &&
                (bdev_emulates_zone_append(bdev) ||
                 !test_bit(GD_ZONE_APPEND_USED, &bdev->bd_disk->state));
}

/**
 * blkdev_get_zone_info - Get a single zone information from cached data
 * @bdev: Target block device
 * @sector: Sector contained by the target zone
 * @zone: zone structure to return the zone information
 *
 * Description:
 *    Get the zone information for the zone containing @sector using the zone
 *    write plug of the target zone, if one exist, or the disk zone condition
 *    array otherwise. The zone condition may be reported as being
 *    the BLK_ZONE_COND_ACTIVE condition for a zone that is in the implicit
 *    open, explicit open or closed condition.
 *
 * Returns 0 on success and a negative error code on failure.
 */
int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
                         struct blk_zone *zone)
{
        struct gendisk *disk = bdev->bd_disk;
        sector_t zone_sectors = bdev_zone_sectors(bdev);
        struct blk_zone_wplug *zwplug;
        unsigned long flags;
        u8 *zones_cond;

        if (!bdev_is_zoned(bdev))
                return -EOPNOTSUPP;

        if (sector >= get_capacity(disk))
                return -EINVAL;

        memset(zone, 0, sizeof(*zone));
        /* Round down to the start sector of the target zone. */
        sector = bdev_zone_start(bdev, sector);

        /* Cached data is not trustworthy: ask the device instead. */
        if (!blkdev_has_cached_report_zones(bdev))
                return blkdev_report_zone_fallback(bdev, sector, zone);

        rcu_read_lock();
        zones_cond = rcu_dereference(disk->zones_cond);
        if (!disk->zone_wplugs_hash || !zones_cond) {
                rcu_read_unlock();
                return blkdev_report_zone_fallback(bdev, sector, zone);
        }
        zone->cond = zones_cond[disk_zone_no(disk, sector)];
        rcu_read_unlock();

        zone->start = sector;
        zone->len = zone_sectors;

        /*
         * If this is a conventional zone, we do not have a zone write plug and
         * can report the zone
         * immediately.
         */
        if (zone->cond == BLK_ZONE_COND_NOT_WP) {
                zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
                zone->capacity = zone_sectors;
                zone->wp = ULLONG_MAX;
                return 0;
        }

        /*
         * This is a sequential write required zone. If the zone is read-only
         * or offline, only set the zone write pointer to an invalid value and
         * report the zone.
         */
        zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
        if (disk_zone_is_last(disk, zone))
                zone->capacity = disk->last_zone_capacity;
        else
                zone->capacity = disk->zone_capacity;

        if (zone->cond == BLK_ZONE_COND_READONLY ||
            zone->cond == BLK_ZONE_COND_OFFLINE) {
                zone->wp = ULLONG_MAX;
                return 0;
        }

        /*
         * If the zone does not have a zone write plug, it is either full or
         * empty, as we otherwise would have a zone write plug for it. In this
         * case, set the write pointer accordingly and report the zone.
         * Otherwise, if we have a zone write plug, use it.
         */
        zwplug = disk_get_zone_wplug(disk, sector);
        if (!zwplug) {
                if (zone->cond == BLK_ZONE_COND_FULL)
                        zone->wp = ULLONG_MAX;
                else
                        zone->wp = sector;
                return 0;
        }

        spin_lock_irqsave(&zwplug->lock, flags);
        /* A plug with a stale wp cannot be trusted: ask the device. */
        if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) {
                spin_unlock_irqrestore(&zwplug->lock, flags);
                disk_put_zone_wplug(zwplug);
                return blkdev_report_zone_fallback(bdev, sector, zone);
        }
        zone->cond = zwplug->cond;
        zone->wp = sector + zwplug->wp_offset;
        spin_unlock_irqrestore(&zwplug->lock, flags);

        disk_put_zone_wplug(zwplug);

        return 0;
}
EXPORT_SYMBOL_GPL(blkdev_get_zone_info);

/**
 * blkdev_report_zones_cached - Get cached zones information
 * @bdev: Target block device
 * @sector: Sector from which to report zones
 * @nr_zones: Maximum number of zones to report
 * @cb: Callback function called for each reported zone
 * @data: Private data for the callback function
 *
 * Description:
 *    Similar to blkdev_report_zones() but instead of calling into the low
 *    level device driver to get the zone report from the device, use
 *    blkdev_get_zone_info() to generate the report from the disk zone write
 *    plugs and zones condition array. Since calling this function without a
 *    callback does not make sense, @cb must be specified.
 */
int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector,
                               unsigned int nr_zones, report_zones_cb cb,
                               void *data)
{
        struct gendisk *disk = bdev->bd_disk;
        sector_t capacity = get_capacity(disk);
        sector_t zone_sectors = bdev_zone_sectors(bdev);
        unsigned int idx = 0;
        struct blk_zone zone;
        int ret;

        if (!cb || !bdev_is_zoned(bdev) ||
            WARN_ON_ONCE(!disk->fops->report_zones))
                return -EOPNOTSUPP;

        if (!nr_zones || sector >= capacity)
                return 0;

        /* Cached data not usable: do a regular device zone report. */
        if (!blkdev_has_cached_report_zones(bdev)) {
                struct blk_report_zones_args args = {
                        .cb = cb,
                        .data = data,
                        .report_active = true,
                };

                return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
        }

        /* Walk the requested zone range, reporting one zone at a time. */
        for (sector = bdev_zone_start(bdev, sector);
             sector < capacity && idx < nr_zones;
             sector += zone_sectors, idx++) {
                ret = blkdev_get_zone_info(bdev, sector, &zone);
                if (ret)
                        return ret;

                ret = cb(&zone, idx, data);
                if (ret)
                        return ret;
        }

        return idx;
}
EXPORT_SYMBOL_GPL(blkdev_report_zones_cached);

/* Completion handling for a successful REQ_OP_ZONE_RESET BIO. */
static void blk_zone_reset_bio_endio(struct bio *bio)
{
        struct gendisk *disk = bio->bi_bdev->bd_disk;
        sector_t sector = bio->bi_iter.bi_sector;
        struct blk_zone_wplug *zwplug;

        /*
         * If we have a zone write plug, set its write pointer offset to 0.
         * This will abort all BIOs plugged for the target zone.
It is fine as 1073 * resetting zones while writes are still in-flight will result in the 1074 * writes failing anyway. 1075 */ 1076 zwplug = disk_get_zone_wplug(disk, sector); 1077 if (zwplug) { 1078 unsigned long flags; 1079 1080 spin_lock_irqsave(&zwplug->lock, flags); 1081 disk_zone_wplug_set_wp_offset(disk, zwplug, 0); 1082 spin_unlock_irqrestore(&zwplug->lock, flags); 1083 disk_put_zone_wplug(zwplug); 1084 } else { 1085 disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY); 1086 } 1087 } 1088 1089 static void blk_zone_reset_all_bio_endio(struct bio *bio) 1090 { 1091 struct gendisk *disk = bio->bi_bdev->bd_disk; 1092 sector_t capacity = get_capacity(disk); 1093 struct blk_zone_wplug *zwplug; 1094 unsigned long flags; 1095 sector_t sector; 1096 unsigned int i; 1097 1098 if (atomic_read(&disk->nr_zone_wplugs)) { 1099 /* Update the condition of all zone write plugs. */ 1100 rcu_read_lock(); 1101 for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { 1102 hlist_for_each_entry_rcu(zwplug, 1103 &disk->zone_wplugs_hash[i], 1104 node) { 1105 spin_lock_irqsave(&zwplug->lock, flags); 1106 disk_zone_wplug_set_wp_offset(disk, zwplug, 0); 1107 spin_unlock_irqrestore(&zwplug->lock, flags); 1108 } 1109 } 1110 rcu_read_unlock(); 1111 } 1112 1113 /* Update the cached zone conditions. */ 1114 for (sector = 0; sector < capacity; 1115 sector += bdev_zone_sectors(bio->bi_bdev)) 1116 disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY); 1117 clear_bit(GD_ZONE_APPEND_USED, &disk->state); 1118 } 1119 1120 static void blk_zone_finish_bio_endio(struct bio *bio) 1121 { 1122 struct block_device *bdev = bio->bi_bdev; 1123 struct gendisk *disk = bdev->bd_disk; 1124 sector_t sector = bio->bi_iter.bi_sector; 1125 struct blk_zone_wplug *zwplug; 1126 1127 /* 1128 * If we have a zone write plug, set its write pointer offset to the 1129 * zone size. This will abort all BIOs plugged for the target zone. 
It 1130 * is fine as resetting zones while writes are still in-flight will 1131 * result in the writes failing anyway. 1132 */ 1133 zwplug = disk_get_zone_wplug(disk, sector); 1134 if (zwplug) { 1135 unsigned long flags; 1136 1137 spin_lock_irqsave(&zwplug->lock, flags); 1138 disk_zone_wplug_set_wp_offset(disk, zwplug, 1139 bdev_zone_sectors(bdev)); 1140 spin_unlock_irqrestore(&zwplug->lock, flags); 1141 disk_put_zone_wplug(zwplug); 1142 } else { 1143 disk_zone_set_cond(disk, sector, BLK_ZONE_COND_FULL); 1144 } 1145 } 1146 1147 void blk_zone_mgmt_bio_endio(struct bio *bio) 1148 { 1149 /* If the BIO failed, we have nothing to do. */ 1150 if (bio->bi_status != BLK_STS_OK) 1151 return; 1152 1153 switch (bio_op(bio)) { 1154 case REQ_OP_ZONE_RESET: 1155 blk_zone_reset_bio_endio(bio); 1156 return; 1157 case REQ_OP_ZONE_RESET_ALL: 1158 blk_zone_reset_all_bio_endio(bio); 1159 return; 1160 case REQ_OP_ZONE_FINISH: 1161 blk_zone_finish_bio_endio(bio); 1162 return; 1163 default: 1164 return; 1165 } 1166 } 1167 1168 static void disk_zone_wplug_schedule_work(struct gendisk *disk, 1169 struct blk_zone_wplug *zwplug) 1170 { 1171 lockdep_assert_held(&zwplug->lock); 1172 1173 /* 1174 * Schedule the submission of the next plugged BIO. Taking a reference 1175 * to the zone write plug is required as the bio_work belongs to the 1176 * plug, and thus we must ensure that the write plug does not go away 1177 * while the work is being scheduled but has not run yet. 1178 * blk_zone_wplug_bio_work() will release the reference we take here, 1179 * and we also drop this reference if the work is already scheduled. 
1180 */ 1181 WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); 1182 WARN_ON_ONCE(blk_queue_zoned_qd1_writes(disk->queue)); 1183 refcount_inc(&zwplug->ref); 1184 if (!queue_work(disk->zone_wplugs_wq, &zwplug->bio_work)) 1185 disk_put_zone_wplug(zwplug); 1186 } 1187 1188 static inline void disk_zone_wplug_add_bio(struct gendisk *disk, 1189 struct blk_zone_wplug *zwplug, 1190 struct bio *bio, unsigned int nr_segs) 1191 { 1192 /* 1193 * Grab an extra reference on the BIO request queue usage counter. 1194 * This reference will be reused to submit a request for the BIO for 1195 * blk-mq devices and dropped when the BIO is failed and after 1196 * it is issued in the case of BIO-based devices. 1197 */ 1198 percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter); 1199 1200 /* 1201 * The BIO is being plugged and thus will have to wait for the on-going 1202 * write and for all other writes already plugged. So polling makes 1203 * no sense. 1204 */ 1205 bio_clear_polled(bio); 1206 1207 /* 1208 * Reuse the poll cookie field to store the number of segments when 1209 * split to the hardware limits. 1210 */ 1211 bio->__bi_nr_segments = nr_segs; 1212 1213 /* 1214 * We always receive BIOs after they are split and ready to be issued. 1215 * The block layer passes the parts of a split BIO in order, and the 1216 * user must also issue write sequentially. So simply add the new BIO 1217 * at the tail of the list to preserve the sequential write order. 1218 */ 1219 bio_list_add(&zwplug->bio_list, bio); 1220 trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no, 1221 bio->bi_iter.bi_sector, bio_sectors(bio)); 1222 1223 /* 1224 * If we are using the disk zone write plugs worker instead of the per 1225 * zone write plug BIO work, add the zone write plug to the work list 1226 * if it is not already there. Make sure to also get an extra reference 1227 * on the zone write plug so that it does not go away until it is 1228 * removed from the work list. 
1229 */ 1230 if (blk_queue_zoned_qd1_writes(disk->queue)) { 1231 spin_lock(&disk->zone_wplugs_list_lock); 1232 if (list_empty(&zwplug->entry)) { 1233 list_add_tail(&zwplug->entry, &disk->zone_wplugs_list); 1234 refcount_inc(&zwplug->ref); 1235 } 1236 spin_unlock(&disk->zone_wplugs_list_lock); 1237 } 1238 } 1239 1240 /* 1241 * Called from bio_attempt_back_merge() when a BIO was merged with a request. 1242 */ 1243 void blk_zone_write_plug_bio_merged(struct bio *bio) 1244 { 1245 struct gendisk *disk = bio->bi_bdev->bd_disk; 1246 struct blk_zone_wplug *zwplug; 1247 unsigned long flags; 1248 1249 /* 1250 * If the BIO was already plugged, then we were called through 1251 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge(). 1252 * For this case, we already hold a reference on the zone write plug for 1253 * the BIO and blk_zone_write_plug_init_request() will handle the 1254 * zone write pointer offset update. 1255 */ 1256 if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) 1257 return; 1258 1259 bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); 1260 1261 /* 1262 * Get a reference on the zone write plug of the target zone and advance 1263 * the zone write pointer offset. Given that this is a merge, we already 1264 * have at least one request and one BIO referencing the zone write 1265 * plug. So this should not fail. 1266 */ 1267 zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); 1268 if (WARN_ON_ONCE(!zwplug)) 1269 return; 1270 1271 spin_lock_irqsave(&zwplug->lock, flags); 1272 zwplug->wp_offset += bio_sectors(bio); 1273 disk_zone_wplug_update_cond(disk, zwplug); 1274 spin_unlock_irqrestore(&zwplug->lock, flags); 1275 } 1276 1277 /* 1278 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that 1279 * already went through zone write plugging (either a new BIO or one that was 1280 * unplugged). 
1281 */ 1282 void blk_zone_write_plug_init_request(struct request *req) 1283 { 1284 sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req); 1285 struct request_queue *q = req->q; 1286 struct gendisk *disk = q->disk; 1287 struct blk_zone_wplug *zwplug = 1288 disk_get_zone_wplug(disk, blk_rq_pos(req)); 1289 unsigned long flags; 1290 struct bio *bio; 1291 1292 if (WARN_ON_ONCE(!zwplug)) 1293 return; 1294 1295 /* 1296 * Indicate that completion of this request needs to be handled with 1297 * blk_zone_write_plug_finish_request(), which will drop the reference 1298 * on the zone write plug we took above on entry to this function. 1299 */ 1300 req->rq_flags |= RQF_ZONE_WRITE_PLUGGING; 1301 1302 if (blk_queue_nomerges(q)) 1303 return; 1304 1305 /* 1306 * Walk through the list of plugged BIOs to check if they can be merged 1307 * into the back of the request. 1308 */ 1309 spin_lock_irqsave(&zwplug->lock, flags); 1310 while (!disk_zone_wplug_is_full(disk, zwplug)) { 1311 bio = bio_list_peek(&zwplug->bio_list); 1312 if (!bio) 1313 break; 1314 1315 if (bio->bi_iter.bi_sector != req_back_sector || 1316 !blk_rq_merge_ok(req, bio)) 1317 break; 1318 1319 WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES && 1320 !bio->__bi_nr_segments); 1321 1322 bio_list_pop(&zwplug->bio_list); 1323 if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) != 1324 BIO_MERGE_OK) { 1325 bio_list_add_head(&zwplug->bio_list, bio); 1326 break; 1327 } 1328 1329 /* Drop the reference taken by disk_zone_wplug_add_bio(). */ 1330 blk_queue_exit(q); 1331 zwplug->wp_offset += bio_sectors(bio); 1332 disk_zone_wplug_update_cond(disk, zwplug); 1333 1334 req_back_sector += bio_sectors(bio); 1335 } 1336 spin_unlock_irqrestore(&zwplug->lock, flags); 1337 } 1338 1339 /* 1340 * Check and prepare a BIO for submission by incrementing the write pointer 1341 * offset of its zone write plug and changing zone append operations into 1342 * regular write when zone append emulation is needed. 
1343 */ 1344 static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, 1345 struct bio *bio) 1346 { 1347 struct gendisk *disk = bio->bi_bdev->bd_disk; 1348 1349 lockdep_assert_held(&zwplug->lock); 1350 1351 /* 1352 * If we lost track of the zone write pointer due to a write error, 1353 * the user must either execute a report zones, reset the zone or finish 1354 * the to recover a reliable write pointer position. Fail BIOs if the 1355 * user did not do that as we cannot handle emulated zone append 1356 * otherwise. 1357 */ 1358 if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) 1359 return false; 1360 1361 /* 1362 * Check that the user is not attempting to write to a full zone. 1363 * We know such BIO will fail, and that would potentially overflow our 1364 * write pointer offset beyond the end of the zone. 1365 */ 1366 if (disk_zone_wplug_is_full(disk, zwplug)) 1367 return false; 1368 1369 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 1370 /* 1371 * Use a regular write starting at the current write pointer. 1372 * Similarly to native zone append operations, do not allow 1373 * merging. 1374 */ 1375 bio->bi_opf &= ~REQ_OP_MASK; 1376 bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE; 1377 bio->bi_iter.bi_sector += zwplug->wp_offset; 1378 1379 /* 1380 * Remember that this BIO is in fact a zone append operation 1381 * so that we can restore its operation code on completion. 1382 */ 1383 bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND); 1384 } else { 1385 /* 1386 * Check for non-sequential writes early as we know that BIOs 1387 * with a start sector not unaligned to the zone write pointer 1388 * will fail. 1389 */ 1390 if (bio_offset_from_zone_start(bio) != zwplug->wp_offset) 1391 return false; 1392 } 1393 1394 /* Advance the zone write pointer offset. 
*/ 1395 zwplug->wp_offset += bio_sectors(bio); 1396 disk_zone_wplug_update_cond(disk, zwplug); 1397 1398 return true; 1399 } 1400 1401 static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) 1402 { 1403 struct gendisk *disk = bio->bi_bdev->bd_disk; 1404 sector_t sector = bio->bi_iter.bi_sector; 1405 struct blk_zone_wplug *zwplug; 1406 gfp_t gfp_mask = GFP_NOIO; 1407 unsigned long flags; 1408 1409 /* 1410 * BIOs must be fully contained within a zone so that we use the correct 1411 * zone write plug for the entire BIO. For blk-mq devices, the block 1412 * layer should already have done any splitting required to ensure this 1413 * and this BIO should thus not be straddling zone boundaries. For 1414 * BIO-based devices, it is the responsibility of the driver to split 1415 * the bio before submitting it. 1416 */ 1417 if (WARN_ON_ONCE(bio_straddles_zones(bio))) { 1418 bio_io_error(bio); 1419 return true; 1420 } 1421 1422 /* Conventional zones do not need write plugging. */ 1423 if (!bdev_zone_is_seq(bio->bi_bdev, sector)) { 1424 /* Zone append to conventional zones is not allowed. */ 1425 if (bio_op(bio) == REQ_OP_ZONE_APPEND) { 1426 bio_io_error(bio); 1427 return true; 1428 } 1429 return false; 1430 } 1431 1432 if (bio->bi_opf & REQ_NOWAIT) 1433 gfp_mask = GFP_NOWAIT; 1434 1435 zwplug = disk_get_or_alloc_zone_wplug(disk, sector, gfp_mask); 1436 if (!zwplug) { 1437 if (bio->bi_opf & REQ_NOWAIT) 1438 bio_wouldblock_error(bio); 1439 else 1440 bio_io_error(bio); 1441 return true; 1442 } 1443 1444 spin_lock_irqsave(&zwplug->lock, flags); 1445 1446 /* 1447 * If we got a zone write plug marked as dead, then the user is issuing 1448 * writes to a full zone, or without synchronizing with zone reset or 1449 * zone finish operations. In such case, fail the BIO to signal this 1450 * invalid usage. 
1451 */ 1452 if (zwplug->flags & BLK_ZONE_WPLUG_DEAD) { 1453 spin_unlock_irqrestore(&zwplug->lock, flags); 1454 disk_put_zone_wplug(zwplug); 1455 bio_io_error(bio); 1456 return true; 1457 } 1458 1459 /* Indicate that this BIO is being handled using zone write plugging. */ 1460 bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); 1461 1462 /* 1463 * Add REQ_NOWAIT BIOs to the plug list to ensure that we will not see a 1464 * BLK_STS_AGAIN failure if we let the caller submit the BIO. 1465 */ 1466 if (bio->bi_opf & REQ_NOWAIT) { 1467 bio->bi_opf &= ~REQ_NOWAIT; 1468 goto queue_bio; 1469 } 1470 1471 /* 1472 * For rotational devices, we will use the gendisk zone write plugs 1473 * work instead of the per zone write plug BIO work, so queue the BIO. 1474 */ 1475 if (blk_queue_zoned_qd1_writes(disk->queue)) 1476 goto queue_bio; 1477 1478 /* If the zone is already plugged, add the BIO to the BIO plug list. */ 1479 if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) 1480 goto queue_bio; 1481 1482 if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { 1483 spin_unlock_irqrestore(&zwplug->lock, flags); 1484 bio_io_error(bio); 1485 return true; 1486 } 1487 1488 /* Otherwise, plug and let the caller submit the BIO. 
*/ 1489 zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; 1490 1491 spin_unlock_irqrestore(&zwplug->lock, flags); 1492 1493 return false; 1494 1495 queue_bio: 1496 disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs); 1497 1498 if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) { 1499 zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; 1500 if (blk_queue_zoned_qd1_writes(disk->queue)) 1501 wake_up_process(disk->zone_wplugs_worker); 1502 else 1503 disk_zone_wplug_schedule_work(disk, zwplug); 1504 } 1505 1506 spin_unlock_irqrestore(&zwplug->lock, flags); 1507 1508 return true; 1509 } 1510 1511 static void blk_zone_wplug_handle_native_zone_append(struct bio *bio) 1512 { 1513 struct gendisk *disk = bio->bi_bdev->bd_disk; 1514 struct blk_zone_wplug *zwplug; 1515 unsigned long flags; 1516 1517 if (!test_bit(GD_ZONE_APPEND_USED, &disk->state)) 1518 set_bit(GD_ZONE_APPEND_USED, &disk->state); 1519 1520 /* 1521 * We have native support for zone append operations, so we are not 1522 * going to handle @bio through plugging. However, we may already have a 1523 * zone write plug for the target zone if that zone was previously 1524 * partially written using regular writes. In such case, we risk leaving 1525 * the plug in the disk hash table if the zone is fully written using 1526 * zone append operations. Avoid this by removing the zone write plug. 1527 */ 1528 zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); 1529 if (likely(!zwplug)) 1530 return; 1531 1532 spin_lock_irqsave(&zwplug->lock, flags); 1533 1534 /* 1535 * We are about to remove the zone write plug. But if the user 1536 * (mistakenly) has issued regular writes together with native zone 1537 * append, we must aborts the writes as otherwise the plugged BIOs would 1538 * not be executed by the plug BIO work as disk_get_zone_wplug() will 1539 * return NULL after the plug is removed. 
Aborting the plugged write 1540 * BIOs is consistent with the fact that these writes will most likely 1541 * fail anyway as there is no ordering guarantees between zone append 1542 * operations and regular write operations. 1543 */ 1544 if (!bio_list_empty(&zwplug->bio_list)) { 1545 pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n", 1546 disk->disk_name, zwplug->zone_no); 1547 disk_zone_wplug_abort(zwplug); 1548 } 1549 disk_mark_zone_wplug_dead(zwplug); 1550 spin_unlock_irqrestore(&zwplug->lock, flags); 1551 1552 disk_put_zone_wplug(zwplug); 1553 } 1554 1555 static bool blk_zone_wplug_handle_zone_mgmt(struct bio *bio) 1556 { 1557 if (bio_op(bio) != REQ_OP_ZONE_RESET_ALL && 1558 !bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) { 1559 /* 1560 * Zone reset and zone finish operations do not apply to 1561 * conventional zones. 1562 */ 1563 bio_io_error(bio); 1564 return true; 1565 } 1566 1567 /* 1568 * No-wait zone management BIOs do not make much sense as the callers 1569 * issue these as blocking operations in most cases. To avoid issues 1570 * with the BIO execution potentially failing with BLK_STS_AGAIN, warn 1571 * about REQ_NOWAIT being set and ignore that flag. 1572 */ 1573 if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) 1574 bio->bi_opf &= ~REQ_NOWAIT; 1575 1576 return false; 1577 } 1578 1579 /** 1580 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging 1581 * @bio: The BIO being submitted 1582 * @nr_segs: The number of physical segments of @bio 1583 * 1584 * Handle write, write zeroes and zone append operations requiring emulation 1585 * using zone write plugging. 1586 * 1587 * Return true whenever @bio execution needs to be delayed through the zone 1588 * write plug. Otherwise, return false to let the submission path process 1589 * @bio normally. 
1590 */ 1591 bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) 1592 { 1593 struct block_device *bdev = bio->bi_bdev; 1594 1595 if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash)) 1596 return false; 1597 1598 /* 1599 * Regular writes and write zeroes need to be handled through the target 1600 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH 1601 * which may need to go through the flush machinery depending on the 1602 * target device capabilities. Plugging such writes is fine as the flush 1603 * machinery operates at the request level, below the plug, and 1604 * completion of the flush sequence will go through the regular BIO 1605 * completion, which will handle zone write plugging. 1606 * Zone append operations for devices that requested emulation must 1607 * also be plugged so that these BIOs can be changed into regular 1608 * write BIOs. 1609 * Zone reset, reset all and finish commands need special treatment 1610 * to correctly track the write pointer offset of zones. These commands 1611 * are not plugged as we do not need serialization with write 1612 * operations. It is the responsibility of the user to not issue reset 1613 * and finish commands when write operations are in flight. 
1614 */ 1615 switch (bio_op(bio)) { 1616 case REQ_OP_ZONE_APPEND: 1617 if (!bdev_emulates_zone_append(bdev)) { 1618 blk_zone_wplug_handle_native_zone_append(bio); 1619 return false; 1620 } 1621 fallthrough; 1622 case REQ_OP_WRITE: 1623 case REQ_OP_WRITE_ZEROES: 1624 return blk_zone_wplug_handle_write(bio, nr_segs); 1625 case REQ_OP_ZONE_RESET: 1626 case REQ_OP_ZONE_FINISH: 1627 case REQ_OP_ZONE_RESET_ALL: 1628 return blk_zone_wplug_handle_zone_mgmt(bio); 1629 default: 1630 return false; 1631 } 1632 1633 return false; 1634 } 1635 EXPORT_SYMBOL_GPL(blk_zone_plug_bio); 1636 1637 static void disk_zone_wplug_unplug_bio(struct gendisk *disk, 1638 struct blk_zone_wplug *zwplug) 1639 { 1640 unsigned long flags; 1641 1642 spin_lock_irqsave(&zwplug->lock, flags); 1643 1644 /* 1645 * For rotational devices, signal the BIO completion to the zone write 1646 * plug work. Otherwise, schedule submission of the next plugged BIO 1647 * if we have one. 1648 */ 1649 if (bio_list_empty(&zwplug->bio_list)) 1650 zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; 1651 1652 if (blk_queue_zoned_qd1_writes(disk->queue)) 1653 complete(&disk->zone_wplugs_worker_bio_done); 1654 else if (!bio_list_empty(&zwplug->bio_list)) 1655 disk_zone_wplug_schedule_work(disk, zwplug); 1656 1657 if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug)) 1658 disk_mark_zone_wplug_dead(zwplug); 1659 1660 spin_unlock_irqrestore(&zwplug->lock, flags); 1661 } 1662 1663 void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio) 1664 { 1665 /* 1666 * For zone append requests, the request sector indicates the location 1667 * at which the BIO data was written. Return this value to the BIO 1668 * issuer through the BIO iter sector. 1669 * For plugged zone writes, which include emulated zone append, we need 1670 * the original BIO sector so that blk_zone_write_plug_bio_endio() can 1671 * lookup the zone write plug. 
1672 */ 1673 bio->bi_iter.bi_sector = rq->__sector; 1674 trace_blk_zone_append_update_request_bio(rq); 1675 } 1676 1677 void blk_zone_write_plug_bio_endio(struct bio *bio) 1678 { 1679 struct gendisk *disk = bio->bi_bdev->bd_disk; 1680 struct blk_zone_wplug *zwplug = 1681 disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); 1682 unsigned long flags; 1683 1684 if (WARN_ON_ONCE(!zwplug)) 1685 return; 1686 1687 /* Make sure we do not see this BIO again by clearing the plug flag. */ 1688 bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); 1689 1690 /* 1691 * If this is a regular write emulating a zone append operation, 1692 * restore the original operation code. 1693 */ 1694 if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) { 1695 bio->bi_opf &= ~REQ_OP_MASK; 1696 bio->bi_opf |= REQ_OP_ZONE_APPEND; 1697 bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND); 1698 } 1699 1700 /* 1701 * If the BIO failed, abort all plugged BIOs and mark the plug as 1702 * needing a write pointer update. 1703 */ 1704 if (bio->bi_status != BLK_STS_OK) { 1705 spin_lock_irqsave(&zwplug->lock, flags); 1706 disk_zone_wplug_abort(zwplug); 1707 zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE; 1708 spin_unlock_irqrestore(&zwplug->lock, flags); 1709 } 1710 1711 /* Drop the reference we took when the BIO was issued. */ 1712 disk_put_zone_wplug(zwplug); 1713 1714 /* 1715 * For BIO-based devices, blk_zone_write_plug_finish_request() 1716 * is not called. So we need to schedule execution of the next 1717 * plugged BIO here. 1718 */ 1719 if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) 1720 disk_zone_wplug_unplug_bio(disk, zwplug); 1721 1722 /* Drop the reference we took when entering this function. 
*/ 1723 disk_put_zone_wplug(zwplug); 1724 } 1725 1726 void blk_zone_write_plug_finish_request(struct request *req) 1727 { 1728 struct gendisk *disk = req->q->disk; 1729 struct blk_zone_wplug *zwplug; 1730 1731 zwplug = disk_get_zone_wplug(disk, req->__sector); 1732 if (WARN_ON_ONCE(!zwplug)) 1733 return; 1734 1735 req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING; 1736 1737 /* 1738 * Drop the reference we took when the request was initialized in 1739 * blk_zone_write_plug_init_request(). 1740 */ 1741 disk_put_zone_wplug(zwplug); 1742 1743 disk_zone_wplug_unplug_bio(disk, zwplug); 1744 1745 /* Drop the reference we took when entering this function. */ 1746 disk_put_zone_wplug(zwplug); 1747 } 1748 1749 static bool disk_zone_wplug_submit_bio(struct gendisk *disk, 1750 struct blk_zone_wplug *zwplug) 1751 { 1752 struct block_device *bdev; 1753 unsigned long flags; 1754 struct bio *bio; 1755 bool prepared; 1756 1757 /* 1758 * Submit the next plugged BIO. If we do not have any, clear 1759 * the plugged flag. 1760 */ 1761 again: 1762 spin_lock_irqsave(&zwplug->lock, flags); 1763 bio = bio_list_pop(&zwplug->bio_list); 1764 if (!bio) { 1765 zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; 1766 spin_unlock_irqrestore(&zwplug->lock, flags); 1767 return false; 1768 } 1769 1770 trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no, 1771 bio->bi_iter.bi_sector, bio_sectors(bio)); 1772 1773 prepared = blk_zone_wplug_prepare_bio(zwplug, bio); 1774 spin_unlock_irqrestore(&zwplug->lock, flags); 1775 1776 if (!prepared) { 1777 blk_zone_wplug_bio_io_error(zwplug, bio); 1778 goto again; 1779 } 1780 1781 /* 1782 * blk-mq devices will reuse the extra reference on the request queue 1783 * usage counter we took when the BIO was plugged, but the submission 1784 * path for BIO-based devices will not do that. So drop this extra 1785 * reference here. 
1786 */ 1787 if (blk_queue_zoned_qd1_writes(disk->queue)) 1788 reinit_completion(&disk->zone_wplugs_worker_bio_done); 1789 bdev = bio->bi_bdev; 1790 if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) { 1791 bdev->bd_disk->fops->submit_bio(bio); 1792 blk_queue_exit(bdev->bd_disk->queue); 1793 } else { 1794 blk_mq_submit_bio(bio); 1795 } 1796 1797 return true; 1798 } 1799 1800 static struct blk_zone_wplug *disk_get_zone_wplugs_work(struct gendisk *disk) 1801 { 1802 struct blk_zone_wplug *zwplug; 1803 1804 spin_lock_irq(&disk->zone_wplugs_list_lock); 1805 zwplug = list_first_entry_or_null(&disk->zone_wplugs_list, 1806 struct blk_zone_wplug, entry); 1807 if (zwplug) 1808 list_del_init(&zwplug->entry); 1809 spin_unlock_irq(&disk->zone_wplugs_list_lock); 1810 1811 return zwplug; 1812 } 1813 1814 static int disk_zone_wplugs_worker(void *data) 1815 { 1816 struct gendisk *disk = data; 1817 struct blk_zone_wplug *zwplug; 1818 unsigned int noio_flag; 1819 1820 noio_flag = memalloc_noio_save(); 1821 set_user_nice(current, MIN_NICE); 1822 set_freezable(); 1823 1824 for (;;) { 1825 set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); 1826 1827 zwplug = disk_get_zone_wplugs_work(disk); 1828 if (zwplug) { 1829 /* 1830 * Process all BIOs of this zone write plug and then 1831 * drop the reference we took when adding the zone write 1832 * plug to the active list. 1833 */ 1834 set_current_state(TASK_RUNNING); 1835 while (disk_zone_wplug_submit_bio(disk, zwplug)) 1836 blk_wait_io(&disk->zone_wplugs_worker_bio_done); 1837 disk_put_zone_wplug(zwplug); 1838 continue; 1839 } 1840 1841 /* 1842 * Only sleep if nothing sets the state to running. Else check 1843 * for zone write plugs work again as a newly submitted BIO 1844 * might have added a zone write plug to the work list. 
1845 */ 1846 if (get_current_state() == TASK_RUNNING) { 1847 try_to_freeze(); 1848 } else { 1849 if (kthread_should_stop()) { 1850 set_current_state(TASK_RUNNING); 1851 break; 1852 } 1853 schedule(); 1854 } 1855 } 1856 1857 WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list)); 1858 memalloc_noio_restore(noio_flag); 1859 1860 return 0; 1861 } 1862 1863 void disk_init_zone_resources(struct gendisk *disk) 1864 { 1865 spin_lock_init(&disk->zone_wplugs_hash_lock); 1866 spin_lock_init(&disk->zone_wplugs_list_lock); 1867 INIT_LIST_HEAD(&disk->zone_wplugs_list); 1868 init_completion(&disk->zone_wplugs_worker_bio_done); 1869 } 1870 1871 /* 1872 * For the size of a disk zone write plug hash table, use the size of the 1873 * zone write plug mempool, which is the maximum of the disk open zones and 1874 * active zones limits. But do not exceed 4KB (512 hlist head entries), that is, 1875 * 9 bits. For a disk that has no limits, mempool size defaults to 128. 1876 */ 1877 #define BLK_ZONE_WPLUG_MAX_HASH_BITS 9 1878 #define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE 128 1879 1880 static int disk_alloc_zone_resources(struct gendisk *disk, 1881 unsigned int pool_size) 1882 { 1883 unsigned int i; 1884 int ret = -ENOMEM; 1885 1886 atomic_set(&disk->nr_zone_wplugs, 0); 1887 disk->zone_wplugs_hash_bits = 1888 min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS); 1889 1890 disk->zone_wplugs_hash = 1891 kzalloc_objs(struct hlist_head, 1892 disk_zone_wplugs_hash_size(disk)); 1893 if (!disk->zone_wplugs_hash) 1894 return -ENOMEM; 1895 1896 for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) 1897 INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]); 1898 1899 disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size, 1900 sizeof(struct blk_zone_wplug)); 1901 if (!disk->zone_wplugs_pool) 1902 goto free_hash; 1903 1904 disk->zone_wplugs_wq = 1905 alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI, 1906 pool_size, disk->disk_name); 1907 if (!disk->zone_wplugs_wq) 1908 goto destroy_pool; 1909 1910 
disk->zone_wplugs_worker = 1911 kthread_create(disk_zone_wplugs_worker, disk, 1912 "%s_zwplugs_worker", disk->disk_name); 1913 if (IS_ERR(disk->zone_wplugs_worker)) { 1914 ret = PTR_ERR(disk->zone_wplugs_worker); 1915 disk->zone_wplugs_worker = NULL; 1916 goto destroy_wq; 1917 } 1918 wake_up_process(disk->zone_wplugs_worker); 1919 1920 return 0; 1921 1922 destroy_wq: 1923 destroy_workqueue(disk->zone_wplugs_wq); 1924 disk->zone_wplugs_wq = NULL; 1925 destroy_pool: 1926 mempool_destroy(disk->zone_wplugs_pool); 1927 disk->zone_wplugs_pool = NULL; 1928 free_hash: 1929 kfree(disk->zone_wplugs_hash); 1930 disk->zone_wplugs_hash = NULL; 1931 disk->zone_wplugs_hash_bits = 0; 1932 return ret; 1933 } 1934 1935 static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) 1936 { 1937 struct blk_zone_wplug *zwplug; 1938 unsigned int i; 1939 1940 if (!disk->zone_wplugs_hash) 1941 return; 1942 1943 /* Free all the zone write plugs we have. */ 1944 for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { 1945 while (!hlist_empty(&disk->zone_wplugs_hash[i])) { 1946 zwplug = hlist_entry(disk->zone_wplugs_hash[i].first, 1947 struct blk_zone_wplug, node); 1948 spin_lock_irq(&zwplug->lock); 1949 disk_mark_zone_wplug_dead(zwplug); 1950 spin_unlock_irq(&zwplug->lock); 1951 } 1952 } 1953 1954 WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs)); 1955 kfree(disk->zone_wplugs_hash); 1956 disk->zone_wplugs_hash = NULL; 1957 disk->zone_wplugs_hash_bits = 0; 1958 1959 /* 1960 * Wait for the zone write plugs to be RCU-freed before destroying the 1961 * mempool. 
1962 */ 1963 rcu_barrier(); 1964 mempool_destroy(disk->zone_wplugs_pool); 1965 disk->zone_wplugs_pool = NULL; 1966 } 1967 1968 static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond) 1969 { 1970 unsigned long flags; 1971 1972 spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags); 1973 zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond, 1974 lockdep_is_held(&disk->zone_wplugs_hash_lock)); 1975 spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags); 1976 1977 kfree_rcu_mightsleep(zones_cond); 1978 } 1979 1980 void disk_free_zone_resources(struct gendisk *disk) 1981 { 1982 if (disk->zone_wplugs_worker) 1983 kthread_stop(disk->zone_wplugs_worker); 1984 WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list)); 1985 1986 if (disk->zone_wplugs_wq) { 1987 destroy_workqueue(disk->zone_wplugs_wq); 1988 disk->zone_wplugs_wq = NULL; 1989 } 1990 1991 disk_destroy_zone_wplugs_hash_table(disk); 1992 1993 disk_set_zones_cond_array(disk, NULL); 1994 disk->zone_capacity = 0; 1995 disk->last_zone_capacity = 0; 1996 disk->nr_zones = 0; 1997 } 1998 1999 struct blk_revalidate_zone_args { 2000 struct gendisk *disk; 2001 u8 *zones_cond; 2002 unsigned int nr_zones; 2003 unsigned int nr_conv_zones; 2004 unsigned int zone_capacity; 2005 unsigned int last_zone_capacity; 2006 sector_t sector; 2007 }; 2008 2009 static int disk_revalidate_zone_resources(struct gendisk *disk, 2010 struct blk_revalidate_zone_args *args) 2011 { 2012 struct queue_limits *lim = &disk->queue->limits; 2013 unsigned int pool_size; 2014 int ret = 0; 2015 2016 args->disk = disk; 2017 args->nr_zones = 2018 DIV_ROUND_UP_ULL(get_capacity(disk), lim->chunk_sectors); 2019 2020 /* Cached zone conditions: 1 byte per zone */ 2021 args->zones_cond = kzalloc(args->nr_zones, GFP_NOIO); 2022 if (!args->zones_cond) 2023 return -ENOMEM; 2024 2025 if (!disk_need_zone_resources(disk)) 2026 return 0; 2027 2028 /* 2029 * If the device has no limit on the maximum number of open and active 2030 * zones, use 
BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE. 2031 */ 2032 pool_size = max(lim->max_open_zones, lim->max_active_zones); 2033 if (!pool_size) 2034 pool_size = 2035 min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones); 2036 2037 if (!disk->zone_wplugs_hash) { 2038 ret = disk_alloc_zone_resources(disk, pool_size); 2039 if (ret) 2040 kfree(args->zones_cond); 2041 } 2042 2043 return ret; 2044 } 2045 2046 /* 2047 * Update the disk zone resources information and device queue limits. 2048 * The disk queue is frozen when this is executed. 2049 */ 2050 static int disk_update_zone_resources(struct gendisk *disk, 2051 struct blk_revalidate_zone_args *args) 2052 { 2053 struct request_queue *q = disk->queue; 2054 unsigned int nr_seq_zones; 2055 unsigned int pool_size, memflags; 2056 struct queue_limits lim; 2057 int ret = 0; 2058 2059 lim = queue_limits_start_update(q); 2060 2061 memflags = blk_mq_freeze_queue(q); 2062 2063 disk->nr_zones = args->nr_zones; 2064 if (args->nr_conv_zones >= disk->nr_zones) { 2065 queue_limits_cancel_update(q); 2066 pr_warn("%s: Invalid number of conventional zones %u / %u\n", 2067 disk->disk_name, args->nr_conv_zones, disk->nr_zones); 2068 ret = -ENODEV; 2069 goto unfreeze; 2070 } 2071 2072 disk->zone_capacity = args->zone_capacity; 2073 disk->last_zone_capacity = args->last_zone_capacity; 2074 disk_set_zones_cond_array(disk, args->zones_cond); 2075 args->zones_cond = NULL; 2076 2077 /* 2078 * Some devices can advertise zone resource limits that are larger than 2079 * the number of sequential zones of the zoned block device, e.g. a 2080 * small ZNS namespace. For such case, assume that the zoned device has 2081 * no zone resource limits. 
2082 */ 2083 nr_seq_zones = disk->nr_zones - args->nr_conv_zones; 2084 if (lim.max_open_zones >= nr_seq_zones) 2085 lim.max_open_zones = 0; 2086 if (lim.max_active_zones >= nr_seq_zones) 2087 lim.max_active_zones = 0; 2088 2089 if (!disk->zone_wplugs_pool) 2090 goto commit; 2091 2092 /* 2093 * If the device has no limit on the maximum number of open and active 2094 * zones, set its max open zone limit to the mempool size to indicate 2095 * to the user that there is a potential performance impact due to 2096 * dynamic zone write plug allocation when simultaneously writing to 2097 * more zones than the size of the mempool. 2098 */ 2099 pool_size = max(lim.max_open_zones, lim.max_active_zones); 2100 if (!pool_size) 2101 pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones); 2102 2103 mempool_resize(disk->zone_wplugs_pool, pool_size); 2104 2105 if (!lim.max_open_zones && !lim.max_active_zones) { 2106 if (pool_size < nr_seq_zones) 2107 lim.max_open_zones = pool_size; 2108 else 2109 lim.max_open_zones = 0; 2110 } 2111 2112 commit: 2113 ret = queue_limits_commit_update(q, &lim); 2114 2115 unfreeze: 2116 if (ret) 2117 disk_free_zone_resources(disk); 2118 2119 blk_mq_unfreeze_queue(q, memflags); 2120 2121 return ret; 2122 } 2123 2124 static int blk_revalidate_zone_cond(struct blk_zone *zone, unsigned int idx, 2125 struct blk_revalidate_zone_args *args) 2126 { 2127 enum blk_zone_cond cond = zone->cond; 2128 2129 /* Check that the zone condition is consistent with the zone type. 
*/ 2130 switch (cond) { 2131 case BLK_ZONE_COND_NOT_WP: 2132 if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) 2133 goto invalid_condition; 2134 break; 2135 case BLK_ZONE_COND_IMP_OPEN: 2136 case BLK_ZONE_COND_EXP_OPEN: 2137 case BLK_ZONE_COND_CLOSED: 2138 case BLK_ZONE_COND_EMPTY: 2139 case BLK_ZONE_COND_FULL: 2140 case BLK_ZONE_COND_OFFLINE: 2141 case BLK_ZONE_COND_READONLY: 2142 if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) 2143 goto invalid_condition; 2144 break; 2145 default: 2146 pr_warn("%s: Invalid zone condition 0x%X\n", 2147 args->disk->disk_name, cond); 2148 return -ENODEV; 2149 } 2150 2151 blk_zone_set_cond(args->zones_cond, idx, cond); 2152 2153 return 0; 2154 2155 invalid_condition: 2156 pr_warn("%s: Invalid zone condition 0x%x for type 0x%x\n", 2157 args->disk->disk_name, cond, zone->type); 2158 2159 return -ENODEV; 2160 } 2161 2162 static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, 2163 struct blk_revalidate_zone_args *args) 2164 { 2165 struct gendisk *disk = args->disk; 2166 2167 if (zone->capacity != zone->len) { 2168 pr_warn("%s: Invalid conventional zone capacity\n", 2169 disk->disk_name); 2170 return -ENODEV; 2171 } 2172 2173 if (disk_zone_is_last(disk, zone)) 2174 args->last_zone_capacity = zone->capacity; 2175 2176 args->nr_conv_zones++; 2177 2178 return 0; 2179 } 2180 2181 static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, 2182 struct blk_revalidate_zone_args *args) 2183 { 2184 struct gendisk *disk = args->disk; 2185 struct blk_zone_wplug *zwplug; 2186 unsigned int wp_offset; 2187 2188 /* 2189 * Remember the capacity of the first sequential zone and check 2190 * if it is constant for all zones, ignoring the last zone as it can be 2191 * smaller. 
2192 */ 2193 if (!args->zone_capacity) 2194 args->zone_capacity = zone->capacity; 2195 if (disk_zone_is_last(disk, zone)) { 2196 args->last_zone_capacity = zone->capacity; 2197 } else if (zone->capacity != args->zone_capacity) { 2198 pr_warn("%s: Invalid variable zone capacity\n", 2199 disk->disk_name); 2200 return -ENODEV; 2201 } 2202 2203 /* 2204 * If the device needs zone append emulation, we need to track the 2205 * write pointer of all zones that are not empty nor full. So make sure 2206 * we have a zone write plug for such zone if the device has a zone 2207 * write plug hash table. 2208 */ 2209 if (!disk->zone_wplugs_hash) 2210 return 0; 2211 2212 wp_offset = disk_zone_wplug_sync_wp_offset(disk, zone); 2213 if (!wp_offset || wp_offset >= zone->capacity) 2214 return 0; 2215 2216 zwplug = disk_get_or_alloc_zone_wplug(disk, zone->wp, GFP_NOIO); 2217 if (!zwplug) 2218 return -ENOMEM; 2219 disk_put_zone_wplug(zwplug); 2220 2221 return 0; 2222 } 2223 2224 /* 2225 * Helper function to check the validity of zones of a zoned block device. 2226 */ 2227 static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, 2228 void *data) 2229 { 2230 struct blk_revalidate_zone_args *args = data; 2231 struct gendisk *disk = args->disk; 2232 sector_t zone_sectors = disk->queue->limits.chunk_sectors; 2233 int ret; 2234 2235 /* Check for bad zones and holes in the zone report */ 2236 if (zone->start != args->sector) { 2237 pr_warn("%s: Zone gap at sectors %llu..%llu\n", 2238 disk->disk_name, args->sector, zone->start); 2239 return -ENODEV; 2240 } 2241 2242 if (zone->start >= get_capacity(disk) || !zone->len) { 2243 pr_warn("%s: Invalid zone start %llu, length %llu\n", 2244 disk->disk_name, zone->start, zone->len); 2245 return -ENODEV; 2246 } 2247 2248 /* 2249 * All zones must have the same size, with the exception on an eventual 2250 * smaller last zone. 
2251 */ 2252 if (!disk_zone_is_last(disk, zone)) { 2253 if (zone->len != zone_sectors) { 2254 pr_warn("%s: Invalid zoned device with non constant zone size\n", 2255 disk->disk_name); 2256 return -ENODEV; 2257 } 2258 } else if (zone->len > zone_sectors) { 2259 pr_warn("%s: Invalid zoned device with larger last zone size\n", 2260 disk->disk_name); 2261 return -ENODEV; 2262 } 2263 2264 if (!zone->capacity || zone->capacity > zone->len) { 2265 pr_warn("%s: Invalid zone capacity\n", 2266 disk->disk_name); 2267 return -ENODEV; 2268 } 2269 2270 /* Check zone condition */ 2271 ret = blk_revalidate_zone_cond(zone, idx, args); 2272 if (ret) 2273 return ret; 2274 2275 /* Check zone type */ 2276 switch (zone->type) { 2277 case BLK_ZONE_TYPE_CONVENTIONAL: 2278 ret = blk_revalidate_conv_zone(zone, idx, args); 2279 break; 2280 case BLK_ZONE_TYPE_SEQWRITE_REQ: 2281 ret = blk_revalidate_seq_zone(zone, idx, args); 2282 break; 2283 case BLK_ZONE_TYPE_SEQWRITE_PREF: 2284 default: 2285 pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", 2286 disk->disk_name, (int)zone->type, zone->start); 2287 ret = -ENODEV; 2288 } 2289 2290 if (!ret) 2291 args->sector += zone->len; 2292 2293 return ret; 2294 } 2295 2296 /** 2297 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs 2298 * @disk: Target disk 2299 * 2300 * Helper function for low-level device drivers to check, (re) allocate and 2301 * initialize resources used for managing zoned disks. This function should 2302 * normally be called by blk-mq based drivers when a zoned gendisk is probed 2303 * and when the zone configuration of the gendisk changes (e.g. after a format). 2304 * Before calling this function, the device driver must already have set the 2305 * device zone size (chunk_sector limit) and the max zone append limit. 2306 * BIO based drivers can also use this function as long as the device queue 2307 * can be safely frozen. 
2308 */ 2309 int blk_revalidate_disk_zones(struct gendisk *disk) 2310 { 2311 struct request_queue *q = disk->queue; 2312 sector_t zone_sectors = q->limits.chunk_sectors; 2313 sector_t capacity = get_capacity(disk); 2314 struct blk_revalidate_zone_args args = { }; 2315 unsigned int memflags, noio_flag; 2316 struct blk_report_zones_args rep_args = { 2317 .cb = blk_revalidate_zone_cb, 2318 .data = &args, 2319 }; 2320 int ret = -ENOMEM; 2321 2322 if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) 2323 return -EIO; 2324 2325 if (!capacity) 2326 return -ENODEV; 2327 2328 /* 2329 * Checks that the device driver indicated a valid zone size and that 2330 * the max zone append limit is set. 2331 */ 2332 if (!zone_sectors || !is_power_of_2(zone_sectors)) { 2333 pr_warn("%s: Invalid non power of two zone size (%llu)\n", 2334 disk->disk_name, zone_sectors); 2335 return -ENODEV; 2336 } 2337 2338 /* 2339 * Ensure that all memory allocations in this context are done as if 2340 * GFP_NOIO was specified. 2341 */ 2342 noio_flag = memalloc_noio_save(); 2343 ret = disk_revalidate_zone_resources(disk, &args); 2344 if (ret) { 2345 memalloc_noio_restore(noio_flag); 2346 return ret; 2347 } 2348 2349 ret = disk->fops->report_zones(disk, 0, UINT_MAX, &rep_args); 2350 if (!ret) { 2351 pr_warn("%s: No zones reported\n", disk->disk_name); 2352 ret = -ENODEV; 2353 } 2354 memalloc_noio_restore(noio_flag); 2355 2356 if (ret <= 0) 2357 goto free_resources; 2358 2359 /* 2360 * If zones where reported, make sure that the entire disk capacity 2361 * has been checked. 
2362 */ 2363 if (args.sector != capacity) { 2364 pr_warn("%s: Missing zones from sector %llu\n", 2365 disk->disk_name, args.sector); 2366 ret = -ENODEV; 2367 goto free_resources; 2368 } 2369 2370 ret = disk_update_zone_resources(disk, &args); 2371 if (ret) 2372 goto free_resources; 2373 2374 return 0; 2375 2376 free_resources: 2377 pr_warn("%s: failed to revalidate zones\n", disk->disk_name); 2378 2379 kfree(args.zones_cond); 2380 memflags = blk_mq_freeze_queue(q); 2381 disk_free_zone_resources(disk); 2382 blk_mq_unfreeze_queue(q, memflags); 2383 2384 return ret; 2385 } 2386 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); 2387 2388 /** 2389 * blk_zone_issue_zeroout - zero-fill a block range in a zone 2390 * @bdev: blockdev to write 2391 * @sector: start sector 2392 * @nr_sects: number of sectors to write 2393 * @gfp_mask: memory allocation flags (for bio_alloc) 2394 * 2395 * Description: 2396 * Zero-fill a block range in a zone (@sector must be equal to the zone write 2397 * pointer), handling potential errors due to the (initially unknown) lack of 2398 * hardware offload (See blkdev_issue_zeroout()). 2399 */ 2400 int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector, 2401 sector_t nr_sects, gfp_t gfp_mask) 2402 { 2403 struct gendisk *disk = bdev->bd_disk; 2404 int ret; 2405 2406 if (WARN_ON_ONCE(!bdev_is_zoned(bdev))) 2407 return -EIO; 2408 2409 ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 2410 BLKDEV_ZERO_NOFALLBACK); 2411 if (ret != -EOPNOTSUPP) 2412 return ret; 2413 2414 /* 2415 * The failed call to blkdev_issue_zeroout() advanced the zone write 2416 * pointer. Undo this using a report zone to update the zone write 2417 * pointer to the correct current value. 2418 */ 2419 ret = disk->fops->report_zones(disk, sector, 1, NULL); 2420 if (ret != 1) 2421 return ret < 0 ? ret : -EIO; 2422 2423 /* 2424 * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a 2425 * regular write with zero-pages. 
2426 */ 2427 return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0); 2428 } 2429 EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout); 2430 2431 #ifdef CONFIG_BLK_DEBUG_FS 2432 static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug, 2433 struct seq_file *m) 2434 { 2435 unsigned int zwp_wp_offset, zwp_flags; 2436 unsigned int zwp_zone_no, zwp_ref; 2437 unsigned int zwp_bio_list_size; 2438 enum blk_zone_cond zwp_cond; 2439 unsigned long flags; 2440 2441 spin_lock_irqsave(&zwplug->lock, flags); 2442 zwp_zone_no = zwplug->zone_no; 2443 zwp_flags = zwplug->flags; 2444 zwp_ref = refcount_read(&zwplug->ref); 2445 zwp_cond = zwplug->cond; 2446 zwp_wp_offset = zwplug->wp_offset; 2447 zwp_bio_list_size = bio_list_size(&zwplug->bio_list); 2448 spin_unlock_irqrestore(&zwplug->lock, flags); 2449 2450 seq_printf(m, 2451 "Zone no: %u, flags: 0x%x, ref: %u, cond: %s, wp ofst: %u, pending BIO: %u\n", 2452 zwp_zone_no, zwp_flags, zwp_ref, blk_zone_cond_str(zwp_cond), 2453 zwp_wp_offset, zwp_bio_list_size); 2454 } 2455 2456 int queue_zone_wplugs_show(void *data, struct seq_file *m) 2457 { 2458 struct request_queue *q = data; 2459 struct gendisk *disk = q->disk; 2460 struct blk_zone_wplug *zwplug; 2461 unsigned int i; 2462 2463 if (!disk->zone_wplugs_hash) 2464 return 0; 2465 2466 rcu_read_lock(); 2467 for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) 2468 hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i], 2469 node) 2470 queue_zone_wplug_show(zwplug, m); 2471 rcu_read_unlock(); 2472 2473 return 0; 2474 } 2475 2476 #endif 2477