Lines Matching +full:write +full:- +full:to +full:- +full:write
1 // SPDX-License-Identifier: GPL-2.0
16 #include <linux/blk-mq.h>
25 #include "blk-mq-sched.h"
26 #include "blk-mq-debugfs.h"
42 * Per-zone write plug.
44 * @ref: Zone write plug reference counter. A zone write plug reference is
47 * submitted and when a function needs to manipulate a plug. The
50 * reference is dropped whenever the zone of the zone write plug is reset,
51 * finished and when the zone becomes full (last write BIO to the zone
53 * @lock: Spinlock to atomically manipulate the plug.
56 * @wp_offset: The zone write pointer location relative to the start of the zone
59 * @bio_work: Work struct to handle issuing of plugged BIOs
60 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
61 * @disk: The gendisk the plug belongs to.
77 * Zone write plug flags bits:
78 * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
79 * that is, that write BIOs are being throttled due to a write BIO already
80 * being executed or the zone write plug bio list is not empty.
81 * - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
82 * write pointer offset and need to update it.
83 * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
84 * from the disk hash table and that the initial reference to the zone
85 * write plug set when the plug was first added to the hash table has been
87 * to prevent new references to the zone write plug to be taken for
88 * newly incoming BIOs. A zone write plug flagged with this flag will be
96 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
99 * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX
127 struct gendisk *disk = args->disk; in disk_report_zones_cb()
129 if (disk->zone_wplugs_hash) in disk_report_zones_cb()
132 if (!args->user_cb) in disk_report_zones_cb()
135 return args->user_cb(zone, idx, args->user_data); in disk_report_zones_cb()
139 * blkdev_report_zones - Get zones information
141 * @sector: Sector from which to report zones
142 * @nr_zones: Maximum number of zones to report
149 * To report all zones in a device starting from @sector, the BLK_ALL_ZONES
150 * constant can be passed to @nr_zones.
154 * Note: The caller must use memalloc_noXX_save/restore() calls to control
160 struct gendisk *disk = bdev->bd_disk; in blkdev_report_zones()
168 if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) in blkdev_report_zones()
169 return -EOPNOTSUPP; in blkdev_report_zones()
174 return disk->fops->report_zones(disk, sector, nr_zones, in blkdev_report_zones()
188 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
190 * @op: Operation to be performed on the zones
191 * @sector: Start sector of the first zone to operate on
199 * The operation to execute on each zone can be a zone reset, open, close
212 return -EOPNOTSUPP; in blkdev_zone_mgmt()
215 return -EPERM; in blkdev_zone_mgmt()
218 return -EOPNOTSUPP; in blkdev_zone_mgmt()
222 return -EINVAL; in blkdev_zone_mgmt()
226 return -EINVAL; in blkdev_zone_mgmt()
229 return -EINVAL; in blkdev_zone_mgmt()
240 bio->bi_iter.bi_sector = sector; in blkdev_zone_mgmt()
243 /* This may take a while, so be nice to others */ in blkdev_zone_mgmt()
263 if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone))) in blkdev_copy_zone_to_user()
264 return -EFAULT; in blkdev_copy_zone_to_user()
281 return -EINVAL; in blkdev_report_zones_ioctl()
284 return -ENOTTY; in blkdev_report_zones_ioctl()
287 return -EFAULT; in blkdev_report_zones_ioctl()
290 return -EINVAL; in blkdev_report_zones_ioctl()
301 return -EFAULT; in blkdev_report_zones_ioctl()
310 if (zrange->sector + zrange->nr_sectors <= zrange->sector || in blkdev_truncate_zone_range()
311 zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk)) in blkdev_truncate_zone_range()
313 return -EINVAL; in blkdev_truncate_zone_range()
315 start = zrange->sector << SECTOR_SHIFT; in blkdev_truncate_zone_range()
316 end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1; in blkdev_truncate_zone_range()
334 return -EINVAL; in blkdev_zone_mgmt_ioctl()
337 return -ENOTTY; in blkdev_zone_mgmt_ioctl()
340 return -EBADF; in blkdev_zone_mgmt_ioctl()
343 return -EFAULT; in blkdev_zone_mgmt_ioctl()
350 filemap_invalidate_lock(bdev->bd_mapping); in blkdev_zone_mgmt_ioctl()
365 return -ENOTTY; in blkdev_zone_mgmt_ioctl()
372 filemap_invalidate_unlock(bdev->bd_mapping); in blkdev_zone_mgmt_ioctl()
379 return zone->start + zone->len >= get_capacity(disk); in disk_zone_is_last()
385 if (zno < disk->nr_zones - 1) in disk_zone_is_full()
386 return offset_in_zone >= disk->zone_capacity; in disk_zone_is_full()
387 return offset_in_zone >= disk->last_zone_capacity; in disk_zone_is_full()
393 return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset); in disk_zone_wplug_is_full()
402 hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits); in disk_insert_zone_wplug()
405 * Add the new zone write plug to the hash table, but carefully as we in disk_insert_zone_wplug()
407 * zone write plug for the same zone. in disk_insert_zone_wplug()
409 spin_lock_irqsave(&disk->zone_wplugs_lock, flags); in disk_insert_zone_wplug()
410 hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) { in disk_insert_zone_wplug()
411 if (zwplg->zone_no == zwplug->zone_no) { in disk_insert_zone_wplug()
412 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); in disk_insert_zone_wplug()
416 hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); in disk_insert_zone_wplug()
417 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); in disk_insert_zone_wplug()
426 unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits); in disk_get_zone_wplug()
431 hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) { in disk_get_zone_wplug()
432 if (zwplug->zone_no == zno && in disk_get_zone_wplug()
433 refcount_inc_not_zero(&zwplug->ref)) { in disk_get_zone_wplug()
449 mempool_free(zwplug, zwplug->disk->zone_wplugs_pool); in disk_free_zone_wplug_rcu()
454 if (refcount_dec_and_test(&zwplug->ref)) { in disk_put_zone_wplug()
455 WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); in disk_put_zone_wplug()
456 WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); in disk_put_zone_wplug()
457 WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)); in disk_put_zone_wplug()
459 call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); in disk_put_zone_wplug()
466 /* If the zone write plug was already removed, we are done. */ in disk_should_remove_zone_wplug()
467 if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) in disk_should_remove_zone_wplug()
470 /* If the zone write plug is still plugged, it cannot be removed. */ in disk_should_remove_zone_wplug()
471 if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) in disk_should_remove_zone_wplug()
479 * should not attempt to remove the zone write plug until all BIO in disk_should_remove_zone_wplug()
480 * completions are seen. Check by looking at the zone write plug in disk_should_remove_zone_wplug()
485 if (refcount_read(&zwplug->ref) > 2) in disk_should_remove_zone_wplug()
488 /* We can remove zone write plugs for zones that are empty or full. */ in disk_should_remove_zone_wplug()
489 return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug); in disk_should_remove_zone_wplug()
497 /* If the zone write plug was already removed, we have nothing to do. */ in disk_remove_zone_wplug()
498 if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) in disk_remove_zone_wplug()
502 * Mark the zone write plug as unhashed and drop the extra reference we in disk_remove_zone_wplug()
505 zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; in disk_remove_zone_wplug()
506 spin_lock_irqsave(&disk->zone_wplugs_lock, flags); in disk_remove_zone_wplug()
507 hlist_del_init_rcu(&zwplug->node); in disk_remove_zone_wplug()
508 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); in disk_remove_zone_wplug()
515 * Get a reference on the write plug for the zone containing @sector.
517 * Return a pointer to the zone write plug with the plug spinlock held.
531 * operation has not already removed the zone write plug from in disk_get_and_lock_zone_wplug()
533 * we need to get a new plug so start over from the beginning. in disk_get_and_lock_zone_wplug()
535 spin_lock_irqsave(&zwplug->lock, *flags); in disk_get_and_lock_zone_wplug()
536 if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { in disk_get_and_lock_zone_wplug()
537 spin_unlock_irqrestore(&zwplug->lock, *flags); in disk_get_and_lock_zone_wplug()
545 * Allocate and initialize a zone write plug with an extra reference in disk_get_and_lock_zone_wplug()
546 * so that it is not freed when the zone write plug becomes idle without in disk_get_and_lock_zone_wplug()
549 zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask); in disk_get_and_lock_zone_wplug()
553 INIT_HLIST_NODE(&zwplug->node); in disk_get_and_lock_zone_wplug()
554 refcount_set(&zwplug->ref, 2); in disk_get_and_lock_zone_wplug()
555 spin_lock_init(&zwplug->lock); in disk_get_and_lock_zone_wplug()
556 zwplug->flags = 0; in disk_get_and_lock_zone_wplug()
557 zwplug->zone_no = zno; in disk_get_and_lock_zone_wplug()
558 zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector); in disk_get_and_lock_zone_wplug()
559 bio_list_init(&zwplug->bio_list); in disk_get_and_lock_zone_wplug()
560 INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); in disk_get_and_lock_zone_wplug()
561 zwplug->disk = disk; in disk_get_and_lock_zone_wplug()
563 spin_lock_irqsave(&zwplug->lock, *flags); in disk_get_and_lock_zone_wplug()
566 * Insert the new zone write plug in the hash table. This can fail only in disk_get_and_lock_zone_wplug()
571 spin_unlock_irqrestore(&zwplug->lock, *flags); in disk_get_and_lock_zone_wplug()
572 mempool_free(zwplug, disk->zone_wplugs_pool); in disk_get_and_lock_zone_wplug()
582 struct request_queue *q = zwplug->disk->queue; in blk_zone_wplug_bio_io_error()
591 * Abort (fail) all plugged BIOs of a zone write plug.
597 while ((bio = bio_list_pop(&zwplug->bio_list))) in disk_zone_wplug_abort()
602 * Set a zone write plug write pointer offset to the specified value.
605 * update from a report zone after a write error.
611 lockdep_assert_held(&zwplug->lock); in disk_zone_wplug_set_wp_offset()
613 /* Update the zone write pointer and abort all plugged BIOs. */ in disk_zone_wplug_set_wp_offset()
614 zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE; in disk_zone_wplug_set_wp_offset()
615 zwplug->wp_offset = wp_offset; in disk_zone_wplug_set_wp_offset()
619 * The zone write plug now has no BIO plugged: remove it from the in disk_zone_wplug_set_wp_offset()
629 switch (zone->cond) { in blk_zone_wp_offset()
633 return zone->wp - zone->start; in blk_zone_wp_offset()
635 return zone->len; in blk_zone_wp_offset()
643 * Conventional, offline and read-only zones do not have a valid in blk_zone_wp_offset()
644 * write pointer. in blk_zone_wp_offset()
656 zwplug = disk_get_zone_wplug(disk, zone->start); in disk_zone_wplug_sync_wp_offset()
660 spin_lock_irqsave(&zwplug->lock, flags); in disk_zone_wplug_sync_wp_offset()
661 if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) in disk_zone_wplug_sync_wp_offset()
664 spin_unlock_irqrestore(&zwplug->lock, flags); in disk_zone_wplug_sync_wp_offset()
675 return disk->fops->report_zones(disk, sector, 1, in disk_zone_sync_wp_offset()
682 struct gendisk *disk = bio->bi_bdev->bd_disk; in blk_zone_wplug_handle_reset_or_finish()
683 sector_t sector = bio->bi_iter.bi_sector; in blk_zone_wplug_handle_reset_or_finish()
688 if (!bdev_zone_is_seq(bio->bi_bdev, sector)) { in blk_zone_wplug_handle_reset_or_finish()
694 * No-wait reset or finish BIOs do not make much sense as the callers in blk_zone_wplug_handle_reset_or_finish()
695 * issue these as blocking operations in most cases. To avoid issues in blk_zone_wplug_handle_reset_or_finish()
699 if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT)) in blk_zone_wplug_handle_reset_or_finish()
700 bio->bi_opf &= ~REQ_NOWAIT; in blk_zone_wplug_handle_reset_or_finish()
703 * If we have a zone write plug, set its write pointer offset to 0 in blk_zone_wplug_handle_reset_or_finish()
704 * (reset case) or to the zone size (finish case). This will abort all in blk_zone_wplug_handle_reset_or_finish()
706 * finishing zones while writes are still in-flight will result in the in blk_zone_wplug_handle_reset_or_finish()
711 spin_lock_irqsave(&zwplug->lock, flags); in blk_zone_wplug_handle_reset_or_finish()
713 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_handle_reset_or_finish()
722 struct gendisk *disk = bio->bi_bdev->bd_disk; in blk_zone_wplug_handle_reset_all()
728 * Set the write pointer offset of all zone write plugs to 0. This will in blk_zone_wplug_handle_reset_all()
730 * are still in-flight will result in the writes failing anyway. in blk_zone_wplug_handle_reset_all()
733 sector += disk->queue->limits.chunk_sectors) { in blk_zone_wplug_handle_reset_all()
736 spin_lock_irqsave(&zwplug->lock, flags); in blk_zone_wplug_handle_reset_all()
738 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_handle_reset_all()
750 * Take a reference on the zone write plug and schedule the submission in disk_zone_wplug_schedule_bio_work()
754 WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); in disk_zone_wplug_schedule_bio_work()
755 refcount_inc(&zwplug->ref); in disk_zone_wplug_schedule_bio_work()
756 queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); in disk_zone_wplug_schedule_bio_work()
767 * This reference will be reused to submit a request for the BIO for in disk_zone_wplug_add_bio()
768 * blk-mq devices and dropped when the BIO is failed and after in disk_zone_wplug_add_bio()
769 * it is issued in the case of BIO-based devices. in disk_zone_wplug_add_bio()
771 percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter); in disk_zone_wplug_add_bio()
774 * The BIO is being plugged and thus will have to wait for the on-going in disk_zone_wplug_add_bio()
775 * write and for all other writes already plugged. So polling makes in disk_zone_wplug_add_bio()
781 * REQ_NOWAIT BIOs are always handled using the zone write plug BIO in disk_zone_wplug_add_bio()
785 if (bio->bi_opf & REQ_NOWAIT) { in disk_zone_wplug_add_bio()
786 schedule_bio_work = !(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED); in disk_zone_wplug_add_bio()
787 bio->bi_opf &= ~REQ_NOWAIT; in disk_zone_wplug_add_bio()
791 * Reuse the poll cookie field to store the number of segments when in disk_zone_wplug_add_bio()
792 * split to the hardware limits. in disk_zone_wplug_add_bio()
794 bio->__bi_nr_segments = nr_segs; in disk_zone_wplug_add_bio()
797 * We always receive BIOs after they are split and ready to be issued. in disk_zone_wplug_add_bio()
799 * user must also issue writes sequentially. So simply add the new BIO in disk_zone_wplug_add_bio()
800 * at the tail of the list to preserve the sequential write order. in disk_zone_wplug_add_bio()
802 bio_list_add(&zwplug->bio_list, bio); in disk_zone_wplug_add_bio()
804 zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; in disk_zone_wplug_add_bio()
820 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge(). in blk_zone_write_plug_bio_merged()
821 * For this case, we already hold a reference on the zone write plug for in blk_zone_write_plug_bio_merged()
823 * zone write pointer offset update. in blk_zone_write_plug_bio_merged()
831 * Get a reference on the zone write plug of the target zone and advance in blk_zone_write_plug_bio_merged()
832 * the zone write pointer offset. Given that this is a merge, we already in blk_zone_write_plug_bio_merged()
833 * have at least one request and one BIO referencing the zone write in blk_zone_write_plug_bio_merged()
836 zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk, in blk_zone_write_plug_bio_merged()
837 bio->bi_iter.bi_sector); in blk_zone_write_plug_bio_merged()
841 spin_lock_irqsave(&zwplug->lock, flags); in blk_zone_write_plug_bio_merged()
842 zwplug->wp_offset += bio_sectors(bio); in blk_zone_write_plug_bio_merged()
843 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_write_plug_bio_merged()
847 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
848 * already went through zone write plugging (either a new BIO or one that was
854 struct request_queue *q = req->q; in blk_zone_write_plug_init_request()
855 struct gendisk *disk = q->disk; in blk_zone_write_plug_init_request()
865 * Indicate that completion of this request needs to be handled with in blk_zone_write_plug_init_request()
867 * on the zone write plug we took above on entry to this function. in blk_zone_write_plug_init_request()
869 req->rq_flags |= RQF_ZONE_WRITE_PLUGGING; in blk_zone_write_plug_init_request()
875 * Walk through the list of plugged BIOs to check if they can be merged in blk_zone_write_plug_init_request()
878 spin_lock_irqsave(&zwplug->lock, flags); in blk_zone_write_plug_init_request()
880 bio = bio_list_peek(&zwplug->bio_list); in blk_zone_write_plug_init_request()
884 if (bio->bi_iter.bi_sector != req_back_sector || in blk_zone_write_plug_init_request()
889 !bio->__bi_nr_segments); in blk_zone_write_plug_init_request()
891 bio_list_pop(&zwplug->bio_list); in blk_zone_write_plug_init_request()
892 if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) != in blk_zone_write_plug_init_request()
894 bio_list_add_head(&zwplug->bio_list, bio); in blk_zone_write_plug_init_request()
900 * plugging the BIO and advance the write pointer offset. in blk_zone_write_plug_init_request()
903 zwplug->wp_offset += bio_sectors(bio); in blk_zone_write_plug_init_request()
907 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_write_plug_init_request()
911 * Check and prepare a BIO for submission by incrementing the write pointer
912 * offset of its zone write plug and changing zone append operations into
913 * regular write when zone append emulation is needed.
918 struct gendisk *disk = bio->bi_bdev->bd_disk; in blk_zone_wplug_prepare_bio()
921 * If we lost track of the zone write pointer due to a write error, in blk_zone_wplug_prepare_bio()
923 * the zone to recover a reliable write pointer position. Fail BIOs if the in blk_zone_wplug_prepare_bio()
927 if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) in blk_zone_wplug_prepare_bio()
931 * Check that the user is not attempting to write to a full zone. in blk_zone_wplug_prepare_bio()
933 * write pointer offset beyond the end of the zone. in blk_zone_wplug_prepare_bio()
940 * Use a regular write starting at the current write pointer. in blk_zone_wplug_prepare_bio()
941 * Similarly to native zone append operations, do not allow in blk_zone_wplug_prepare_bio()
944 bio->bi_opf &= ~REQ_OP_MASK; in blk_zone_wplug_prepare_bio()
945 bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE; in blk_zone_wplug_prepare_bio()
946 bio->bi_iter.bi_sector += zwplug->wp_offset; in blk_zone_wplug_prepare_bio()
955 * Check for non-sequential writes early as we know that BIOs in blk_zone_wplug_prepare_bio()
956 * with a start sector not aligned to the zone write pointer in blk_zone_wplug_prepare_bio()
959 if (bio_offset_from_zone_start(bio) != zwplug->wp_offset) in blk_zone_wplug_prepare_bio()
963 /* Advance the zone write pointer offset. */ in blk_zone_wplug_prepare_bio()
964 zwplug->wp_offset += bio_sectors(bio); in blk_zone_wplug_prepare_bio()
971 struct gendisk *disk = bio->bi_bdev->bd_disk; in blk_zone_wplug_handle_write()
972 sector_t sector = bio->bi_iter.bi_sector; in blk_zone_wplug_handle_write()
979 * zone write plug for the entire BIO. For blk-mq devices, the block in blk_zone_wplug_handle_write()
980 * layer should already have done any splitting required to ensure this in blk_zone_wplug_handle_write()
982 * BIO-based devices, it is the responsibility of the driver to split in blk_zone_wplug_handle_write()
990 /* Conventional zones do not need write plugging. */ in blk_zone_wplug_handle_write()
991 if (!bdev_zone_is_seq(bio->bi_bdev, sector)) { in blk_zone_wplug_handle_write()
992 /* Zone append to conventional zones is not allowed. */ in blk_zone_wplug_handle_write()
1000 if (bio->bi_opf & REQ_NOWAIT) in blk_zone_wplug_handle_write()
1005 if (bio->bi_opf & REQ_NOWAIT) in blk_zone_wplug_handle_write()
1012 /* Indicate that this BIO is being handled using zone write plugging. */ in blk_zone_wplug_handle_write()
1016 * If the zone is already plugged, add the BIO to the plug BIO list. in blk_zone_wplug_handle_write()
1017 * Do the same for REQ_NOWAIT BIOs to ensure that we will not see a in blk_zone_wplug_handle_write()
1021 if ((zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) || in blk_zone_wplug_handle_write()
1022 (bio->bi_opf & REQ_NOWAIT)) in blk_zone_wplug_handle_write()
1026 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_handle_write()
1031 zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; in blk_zone_wplug_handle_write()
1033 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_handle_write()
1040 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_handle_write()
1046 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
1050 * Handle write, write zeroes and zone append operations requiring emulation
1051 * using zone write plugging.
1053 * Return true whenever @bio execution needs to be delayed through the zone
1054 * write plug. Otherwise, return false to let the submission path process
1059 struct block_device *bdev = bio->bi_bdev; in blk_zone_plug_bio()
1061 if (!bdev->bd_disk->zone_wplugs_hash) in blk_zone_plug_bio()
1073 * We do not need to do anything special for empty flush BIOs, e.g in blk_zone_plug_bio()
1075 * the responsibility of the user to first wait for the completion of in blk_zone_plug_bio()
1076 * write operations for flush to have any effect on the persistence of in blk_zone_plug_bio()
1079 if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) in blk_zone_plug_bio()
1083 * Regular writes and write zeroes need to be handled through the target in blk_zone_plug_bio()
1084 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH in blk_zone_plug_bio()
1085 * which may need to go through the flush machinery depending on the in blk_zone_plug_bio()
1089 * completion, which will handle zone write plugging. in blk_zone_plug_bio()
1092 * write BIOs. in blk_zone_plug_bio()
1094 * to correctly track the write pointer offset of zones. These commands in blk_zone_plug_bio()
1095 * are not plugged as we do not need serialization with write in blk_zone_plug_bio()
1096 * operations. It is the responsibility of the user to not issue reset in blk_zone_plug_bio()
1097 * and finish commands when write operations are in flight. in blk_zone_plug_bio()
1127 spin_lock_irqsave(&zwplug->lock, flags); in disk_zone_wplug_unplug_bio()
1130 if (!bio_list_empty(&zwplug->bio_list)) { in disk_zone_wplug_unplug_bio()
1132 spin_unlock_irqrestore(&zwplug->lock, flags); in disk_zone_wplug_unplug_bio()
1136 zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; in disk_zone_wplug_unplug_bio()
1140 * (it was reset), remove its zone write plug from the hash table. in disk_zone_wplug_unplug_bio()
1145 spin_unlock_irqrestore(&zwplug->lock, flags); in disk_zone_wplug_unplug_bio()
1150 struct gendisk *disk = bio->bi_bdev->bd_disk; in blk_zone_write_plug_bio_endio()
1152 disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); in blk_zone_write_plug_bio_endio()
1162 * If this is a regular write emulating a zone append operation, in blk_zone_write_plug_bio_endio()
1166 bio->bi_opf &= ~REQ_OP_MASK; in blk_zone_write_plug_bio_endio()
1167 bio->bi_opf |= REQ_OP_ZONE_APPEND; in blk_zone_write_plug_bio_endio()
1172 * needing a write pointer update. in blk_zone_write_plug_bio_endio()
1174 if (bio->bi_status != BLK_STS_OK) { in blk_zone_write_plug_bio_endio()
1175 spin_lock_irqsave(&zwplug->lock, flags); in blk_zone_write_plug_bio_endio()
1177 zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE; in blk_zone_write_plug_bio_endio()
1178 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_write_plug_bio_endio()
1185 * For BIO-based devices, blk_zone_write_plug_finish_request() in blk_zone_write_plug_bio_endio()
1186 * is not called. So we need to schedule execution of the next in blk_zone_write_plug_bio_endio()
1189 if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO)) in blk_zone_write_plug_bio_endio()
1198 struct gendisk *disk = req->q->disk; in blk_zone_write_plug_finish_request()
1201 zwplug = disk_get_zone_wplug(disk, req->__sector); in blk_zone_write_plug_finish_request()
1205 req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING; in blk_zone_write_plug_finish_request()
1231 spin_lock_irqsave(&zwplug->lock, flags); in blk_zone_wplug_bio_work()
1234 bio = bio_list_pop(&zwplug->bio_list); in blk_zone_wplug_bio_work()
1236 zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; in blk_zone_wplug_bio_work()
1237 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_bio_work()
1246 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_zone_wplug_bio_work()
1248 bdev = bio->bi_bdev; in blk_zone_wplug_bio_work()
1252 * blk-mq devices will reuse the extra reference on the request queue in blk_zone_wplug_bio_work()
1254 * path for BIO-based devices will not do that. So drop this extra in blk_zone_wplug_bio_work()
1258 blk_queue_exit(bdev->bd_disk->queue); in blk_zone_wplug_bio_work()
1267 return 1U << disk->zone_wplugs_hash_bits; in disk_zone_wplugs_hash_size()
1272 spin_lock_init(&disk->zone_wplugs_lock); in disk_init_zone_resources()
1276 * For the size of a disk zone write plug hash table, use the size of the
1277 * zone write plug mempool, which is the maximum of the disk open zones and
1279 * 9 bits. For a disk that has no limits, mempool size defaults to 128.
1289 disk->zone_wplugs_hash_bits = in disk_alloc_zone_resources()
1292 disk->zone_wplugs_hash = in disk_alloc_zone_resources()
1295 if (!disk->zone_wplugs_hash) in disk_alloc_zone_resources()
1296 return -ENOMEM; in disk_alloc_zone_resources()
1299 INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]); in disk_alloc_zone_resources()
1301 disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size, in disk_alloc_zone_resources()
1303 if (!disk->zone_wplugs_pool) in disk_alloc_zone_resources()
1306 disk->zone_wplugs_wq = in disk_alloc_zone_resources()
1308 pool_size, disk->disk_name); in disk_alloc_zone_resources()
1309 if (!disk->zone_wplugs_wq) in disk_alloc_zone_resources()
1315 mempool_destroy(disk->zone_wplugs_pool); in disk_alloc_zone_resources()
1316 disk->zone_wplugs_pool = NULL; in disk_alloc_zone_resources()
1318 kfree(disk->zone_wplugs_hash); in disk_alloc_zone_resources()
1319 disk->zone_wplugs_hash = NULL; in disk_alloc_zone_resources()
1320 disk->zone_wplugs_hash_bits = 0; in disk_alloc_zone_resources()
1321 return -ENOMEM; in disk_alloc_zone_resources()
1329 if (!disk->zone_wplugs_hash) in disk_destroy_zone_wplugs_hash_table()
1332 /* Free all the zone write plugs we have. */ in disk_destroy_zone_wplugs_hash_table()
1334 while (!hlist_empty(&disk->zone_wplugs_hash[i])) { in disk_destroy_zone_wplugs_hash_table()
1335 zwplug = hlist_entry(disk->zone_wplugs_hash[i].first, in disk_destroy_zone_wplugs_hash_table()
1337 refcount_inc(&zwplug->ref); in disk_destroy_zone_wplugs_hash_table()
1343 kfree(disk->zone_wplugs_hash); in disk_destroy_zone_wplugs_hash_table()
1344 disk->zone_wplugs_hash = NULL; in disk_destroy_zone_wplugs_hash_table()
1345 disk->zone_wplugs_hash_bits = 0; in disk_destroy_zone_wplugs_hash_table()
1354 spin_lock_irqsave(&disk->zone_wplugs_lock, flags); in disk_set_conv_zones_bitmap()
1356 nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones); in disk_set_conv_zones_bitmap()
1357 bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap, in disk_set_conv_zones_bitmap()
1358 lockdep_is_held(&disk->zone_wplugs_lock)); in disk_set_conv_zones_bitmap()
1359 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); in disk_set_conv_zones_bitmap()
1368 if (!disk->zone_wplugs_pool) in disk_free_zone_resources()
1371 if (disk->zone_wplugs_wq) { in disk_free_zone_resources()
1372 destroy_workqueue(disk->zone_wplugs_wq); in disk_free_zone_resources()
1373 disk->zone_wplugs_wq = NULL; in disk_free_zone_resources()
1379 * Wait for the zone write plugs to be RCU-freed before in disk_free_zone_resources()
1384 mempool_destroy(disk->zone_wplugs_pool); in disk_free_zone_resources()
1385 disk->zone_wplugs_pool = NULL; in disk_free_zone_resources()
1388 disk->zone_capacity = 0; in disk_free_zone_resources()
1389 disk->last_zone_capacity = 0; in disk_free_zone_resources()
1390 disk->nr_zones = 0; in disk_free_zone_resources()
1397 * can automatically handle write BIO plugging. BIO-based device drivers in disk_need_zone_resources()
1398 * (e.g. DM devices) are normally responsible for handling zone write in disk_need_zone_resources()
1402 return queue_is_mq(disk->queue) || in disk_need_zone_resources()
1403 queue_emulates_zone_append(disk->queue); in disk_need_zone_resources()
1409 struct queue_limits *lim = &disk->queue->limits; in disk_revalidate_zone_resources()
1419 pool_size = max(lim->max_open_zones, lim->max_active_zones); in disk_revalidate_zone_resources()
1423 if (!disk->zone_wplugs_hash) in disk_revalidate_zone_resources()
1445 struct request_queue *q = disk->queue; in disk_update_zone_resources()
1451 disk->nr_zones = args->nr_zones; in disk_update_zone_resources()
1452 disk->zone_capacity = args->zone_capacity; in disk_update_zone_resources()
1453 disk->last_zone_capacity = args->last_zone_capacity; in disk_update_zone_resources()
1455 disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap); in disk_update_zone_resources()
1456 if (nr_conv_zones >= disk->nr_zones) { in disk_update_zone_resources()
1458 disk->disk_name, nr_conv_zones, disk->nr_zones); in disk_update_zone_resources()
1459 return -ENODEV; in disk_update_zone_resources()
1470 nr_seq_zones = disk->nr_zones - nr_conv_zones; in disk_update_zone_resources()
1476 if (!disk->zone_wplugs_pool) in disk_update_zone_resources()
1481 * zones, set its max open zone limit to the mempool size to indicate in disk_update_zone_resources()
1482 * to the user that there is a potential performance impact due to in disk_update_zone_resources()
1483 * dynamic zone write plug allocation when simultaneously writing to in disk_update_zone_resources()
1490 mempool_resize(disk->zone_wplugs_pool, pool_size); in disk_update_zone_resources()
1510 struct gendisk *disk = args->disk; in blk_revalidate_conv_zone()
1512 if (zone->capacity != zone->len) { in blk_revalidate_conv_zone()
1514 disk->disk_name); in blk_revalidate_conv_zone()
1515 return -ENODEV; in blk_revalidate_conv_zone()
1519 args->last_zone_capacity = zone->capacity; in blk_revalidate_conv_zone()
1524 if (!args->conv_zones_bitmap) { in blk_revalidate_conv_zone()
1525 args->conv_zones_bitmap = in blk_revalidate_conv_zone()
1526 bitmap_zalloc(args->nr_zones, GFP_NOIO); in blk_revalidate_conv_zone()
1527 if (!args->conv_zones_bitmap) in blk_revalidate_conv_zone()
1528 return -ENOMEM; in blk_revalidate_conv_zone()
1531 set_bit(idx, args->conv_zones_bitmap); in blk_revalidate_conv_zone()
1539 struct gendisk *disk = args->disk; in blk_revalidate_seq_zone()
1549 if (!args->zone_capacity) in blk_revalidate_seq_zone()
1550 args->zone_capacity = zone->capacity; in blk_revalidate_seq_zone()
1552 args->last_zone_capacity = zone->capacity; in blk_revalidate_seq_zone()
1553 } else if (zone->capacity != args->zone_capacity) { in blk_revalidate_seq_zone()
1555 disk->disk_name); in blk_revalidate_seq_zone()
1556 return -ENODEV; in blk_revalidate_seq_zone()
1560 * We need to track the write pointer of all zones that are not in blk_revalidate_seq_zone()
1561 * empty nor full. So make sure we have a zone write plug for in blk_revalidate_seq_zone()
1562 * such zone if the device has a zone write plug hash table. in blk_revalidate_seq_zone()
1564 if (!disk->zone_wplugs_hash) in blk_revalidate_seq_zone()
1570 if (!wp_offset || wp_offset >= zone->capacity) in blk_revalidate_seq_zone()
1573 zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags); in blk_revalidate_seq_zone()
1575 return -ENOMEM; in blk_revalidate_seq_zone()
1576 spin_unlock_irqrestore(&zwplug->lock, flags); in blk_revalidate_seq_zone()
1583 * Helper function to check the validity of zones of a zoned block device.
1589 struct gendisk *disk = args->disk; in blk_revalidate_zone_cb()
1590 sector_t zone_sectors = disk->queue->limits.chunk_sectors; in blk_revalidate_zone_cb()
1594 if (zone->start != args->sector) { in blk_revalidate_zone_cb()
1596 disk->disk_name, args->sector, zone->start); in blk_revalidate_zone_cb()
1597 return -ENODEV; in blk_revalidate_zone_cb()
1600 if (zone->start >= get_capacity(disk) || !zone->len) { in blk_revalidate_zone_cb()
1602 disk->disk_name, zone->start, zone->len); in blk_revalidate_zone_cb()
1603 return -ENODEV; in blk_revalidate_zone_cb()
1611 if (zone->len != zone_sectors) { in blk_revalidate_zone_cb()
1613 disk->disk_name); in blk_revalidate_zone_cb()
1614 return -ENODEV; in blk_revalidate_zone_cb()
1616 } else if (zone->len > zone_sectors) { in blk_revalidate_zone_cb()
1618 disk->disk_name); in blk_revalidate_zone_cb()
1619 return -ENODEV; in blk_revalidate_zone_cb()
1622 if (!zone->capacity || zone->capacity > zone->len) { in blk_revalidate_zone_cb()
1624 disk->disk_name); in blk_revalidate_zone_cb()
1625 return -ENODEV; in blk_revalidate_zone_cb()
1629 switch (zone->type) { in blk_revalidate_zone_cb()
1639 disk->disk_name, (int)zone->type, zone->start); in blk_revalidate_zone_cb()
1640 ret = -ENODEV; in blk_revalidate_zone_cb()
1644 args->sector += zone->len; in blk_revalidate_zone_cb()
1650 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
1653 * Helper function for low-level device drivers to check, (re)allocate and
1655 * normally be called by blk-mq based drivers when a zoned gendisk is probed
1664 struct request_queue *q = disk->queue; in blk_revalidate_disk_zones()
1665 sector_t zone_sectors = q->limits.chunk_sectors; in blk_revalidate_disk_zones()
1669 int ret = -ENOMEM; in blk_revalidate_disk_zones()
1672 return -EIO; in blk_revalidate_disk_zones()
1675 return -ENODEV; in blk_revalidate_disk_zones()
1683 disk->disk_name, zone_sectors); in blk_revalidate_disk_zones()
1684 return -ENODEV; in blk_revalidate_disk_zones()
1692 args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); in blk_revalidate_disk_zones()
1700 ret = disk->fops->report_zones(disk, 0, UINT_MAX, in blk_revalidate_disk_zones()
1703 pr_warn("%s: No zones reported\n", disk->disk_name); in blk_revalidate_disk_zones()
1704 ret = -ENODEV; in blk_revalidate_disk_zones()
1714 disk->disk_name, args.sector); in blk_revalidate_disk_zones()
1715 ret = -ENODEV; in blk_revalidate_disk_zones()
1725 pr_warn("%s: failed to revalidate zones\n", disk->disk_name); in blk_revalidate_disk_zones()
1737 * blk_zone_issue_zeroout - zero-fill a block range in a zone
1738 * @bdev: blockdev to write
1740 * @nr_sects: number of sectors to write
1744 * Zero-fill a block range in a zone (@sector must be equal to the zone write
1745 * pointer), handling potential errors due to the (initially unknown) lack of
1754 return -EIO; in blk_zone_issue_zeroout()
1758 if (ret != -EOPNOTSUPP) in blk_zone_issue_zeroout()
1762 * The failed call to blkdev_issue_zeroout() advanced the zone write in blk_zone_issue_zeroout()
1763 * pointer. Undo this using a report zone to update the zone write in blk_zone_issue_zeroout()
1764 * pointer to the correct current value. in blk_zone_issue_zeroout()
1766 ret = disk_zone_sync_wp_offset(bdev->bd_disk, sector); in blk_zone_issue_zeroout()
1768 return ret < 0 ? ret : -EIO; in blk_zone_issue_zeroout()
1771 * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a in blk_zone_issue_zeroout()
1772 * regular write with zero-pages. in blk_zone_issue_zeroout()
1783 struct gendisk *disk = q->disk; in queue_zone_wplugs_show()
1790 if (!disk->zone_wplugs_hash) in queue_zone_wplugs_show()
1796 &disk->zone_wplugs_hash[i], node) { in queue_zone_wplugs_show()
1797 spin_lock_irqsave(&zwplug->lock, flags); in queue_zone_wplugs_show()
1798 zwp_zone_no = zwplug->zone_no; in queue_zone_wplugs_show()
1799 zwp_flags = zwplug->flags; in queue_zone_wplugs_show()
1800 zwp_ref = refcount_read(&zwplug->ref); in queue_zone_wplugs_show()
1801 zwp_wp_offset = zwplug->wp_offset; in queue_zone_wplugs_show()
1802 zwp_bio_list_size = bio_list_size(&zwplug->bio_list); in queue_zone_wplugs_show()
1803 spin_unlock_irqrestore(&zwplug->lock, flags); in queue_zone_wplugs_show()