xref: /linux/block/blk-zoned.c (revision 539fb773a3f7c07cf7fd00617f33ed4e33058d72)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Zoned block device handling
4  *
5  * Copyright (c) 2015, Hannes Reinecke
6  * Copyright (c) 2015, SUSE Linux GmbH
7  *
8  * Copyright (c) 2016, Damien Le Moal
9  * Copyright (c) 2016, Western Digital
10  * Copyright (c) 2024, Western Digital Corporation or its affiliates.
11  */
12 
13 #include <linux/kernel.h>
14 #include <linux/blkdev.h>
15 #include <linux/blk-mq.h>
16 #include <linux/spinlock.h>
17 #include <linux/refcount.h>
18 #include <linux/mempool.h>
19 #include <linux/kthread.h>
20 #include <linux/freezer.h>
21 
22 #include <trace/events/block.h>
23 
24 #include "blk.h"
25 #include "blk-mq-sched.h"
26 #include "blk-mq-debugfs.h"
27 
/*
 * Table mapping BLK_ZONE_COND_* values to their names, for debugging and
 * tracing. ZONE_COND_NAME(X) expands to the designated initializer
 * [BLK_ZONE_COND_X] = "X", so unlisted condition values are NULL.
 */
#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
	ZONE_COND_NAME(NOT_WP),
	ZONE_COND_NAME(EMPTY),
	ZONE_COND_NAME(IMP_OPEN),
	ZONE_COND_NAME(EXP_OPEN),
	ZONE_COND_NAME(CLOSED),
	ZONE_COND_NAME(READONLY),
	ZONE_COND_NAME(FULL),
	ZONE_COND_NAME(OFFLINE),
	ZONE_COND_NAME(ACTIVE),
};
#undef ZONE_COND_NAME
41 
/*
 * Per-zone write plug.
 * @node: hlist_node structure for managing the plug using a hash table.
 * @entry: list_head structure for listing the plug in the disk list of active
 *         zone write plugs.
 * @bio_list: The list of BIOs that are currently plugged.
 * @bio_work: Work struct to handle issuing of plugged BIOs
 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
 * @disk: The gendisk the plug belongs to.
 * @lock: Spinlock to atomically manipulate the plug.
 * @ref: Zone write plug reference counter. A zone write plug reference is
 *       always at least 1 when the plug is hashed in the disk plug hash table.
 *       The reference is incremented whenever a new BIO needing plugging is
 *       submitted and when a function needs to manipulate a plug. The
 *       reference count is decremented whenever a plugged BIO completes and
 *       when a function that referenced the plug returns. The initial
 *       reference is dropped whenever the zone of the zone write plug is reset,
 *       finished and when the zone becomes full (last write BIO to the zone
 *       completes).
 * @flags: Flags indicating the plug state (BLK_ZONE_WPLUG_* bits below).
 * @zone_no: The number of the zone the plug is managing.
 * @wp_offset: The zone write pointer location relative to the start of the zone
 *             as a number of 512B sectors.
 * @cond: Condition of the zone. Updated while holding @lock and saved back
 *        into the disk zones_cond array when the plug is freed.
 */
struct blk_zone_wplug {
	struct hlist_node	node;
	struct list_head	entry;
	struct bio_list		bio_list;
	struct work_struct	bio_work;
	struct rcu_head		rcu_head;
	struct gendisk		*disk;
	spinlock_t		lock;
	refcount_t		ref;
	unsigned int		flags;
	unsigned int		zone_no;
	unsigned int		wp_offset;
	enum blk_zone_cond	cond;
};
81 
82 static inline bool disk_need_zone_resources(struct gendisk *disk)
83 {
84 	/*
85 	 * All request-based zoned devices need zone resources so that the
86 	 * block layer can automatically handle write BIO plugging. BIO-based
87 	 * device drivers (e.g. DM devices) are normally responsible for
88 	 * handling zone write ordering and do not need zone resources, unless
89 	 * the driver requires zone append emulation.
90 	 */
91 	return queue_is_mq(disk->queue) ||
92 		queue_emulates_zone_append(disk->queue);
93 }
94 
95 static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
96 {
97 	return 1U << disk->zone_wplugs_hash_bits;
98 }
99 
/*
 * Zone write plug flags bits (stored in struct blk_zone_wplug.flags and
 * manipulated while holding the plug's spinlock):
 *  - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
 *    that is, that write BIOs are being throttled due to a write BIO already
 *    being executed or the zone write plug bio list is not empty.
 *  - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
 *    write pointer offset and need to update it.
 *  - BLK_ZONE_WPLUG_DEAD: Indicates that the zone write plug will be
 *    removed from the disk hash table of zone write plugs when the last
 *    reference on the zone write plug is dropped. If set, this flag also
 *    indicates that the initial extra reference on the zone write plug was
 *    dropped, meaning that the reference count indicates the current number of
 *    active users (code context or BIOs and requests in flight). This flag is
 *    set when a zone is reset, finished or becomes full.
 */
#define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)
#define BLK_ZONE_WPLUG_NEED_WP_UPDATE	(1U << 1)
#define BLK_ZONE_WPLUG_DEAD		(1U << 2)
118 
119 /**
120  * blk_zone_cond_str - Return a zone condition name string
121  * @zone_cond: a zone condition BLK_ZONE_COND_name
122  *
123  * Convert a BLK_ZONE_COND_name zone condition into the string "name". Useful
124  * for the debugging and tracing zone conditions. For an invalid zone
125  * conditions, the string "UNKNOWN" is returned.
126  */
127 const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
128 {
129 	static const char *zone_cond_str = "UNKNOWN";
130 
131 	if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
132 		zone_cond_str = zone_cond_name[zone_cond];
133 
134 	return zone_cond_str;
135 }
136 EXPORT_SYMBOL_GPL(blk_zone_cond_str);
137 
138 static void blk_zone_set_cond(u8 *zones_cond, unsigned int zno,
139 			      enum blk_zone_cond cond)
140 {
141 	if (!zones_cond)
142 		return;
143 
144 	switch (cond) {
145 	case BLK_ZONE_COND_IMP_OPEN:
146 	case BLK_ZONE_COND_EXP_OPEN:
147 	case BLK_ZONE_COND_CLOSED:
148 		zones_cond[zno] = BLK_ZONE_COND_ACTIVE;
149 		return;
150 	case BLK_ZONE_COND_NOT_WP:
151 	case BLK_ZONE_COND_EMPTY:
152 	case BLK_ZONE_COND_FULL:
153 	case BLK_ZONE_COND_OFFLINE:
154 	case BLK_ZONE_COND_READONLY:
155 	default:
156 		zones_cond[zno] = cond;
157 		return;
158 	}
159 }
160 
/*
 * Update the cached condition of the zone containing @sector in the disk
 * zones_cond array, if that array is attached to the disk. The array is
 * accessed under RCU protection as it may be replaced/freed concurrently.
 */
static void disk_zone_set_cond(struct gendisk *disk, sector_t sector,
			       enum blk_zone_cond cond)
{
	u8 *zones_cond;

	rcu_read_lock();
	zones_cond = rcu_dereference(disk->zones_cond);
	if (zones_cond) {
		unsigned int zno = disk_zone_no(disk, sector);

		/*
		 * The condition of a conventional, readonly and offline zones
		 * never changes, so do nothing if the target zone is in one of
		 * these conditions.
		 */
		switch (zones_cond[zno]) {
		case BLK_ZONE_COND_NOT_WP:
		case BLK_ZONE_COND_READONLY:
		case BLK_ZONE_COND_OFFLINE:
			break;
		default:
			blk_zone_set_cond(zones_cond, zno, cond);
			break;
		}
	}
	rcu_read_unlock();
}
188 
189 /**
190  * bdev_zone_is_seq - check if a sector belongs to a sequential write zone
191  * @bdev:       block device to check
192  * @sector:     sector number
193  *
194  * Check if @sector on @bdev is contained in a sequential write required zone.
195  */
196 bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
197 {
198 	struct gendisk *disk = bdev->bd_disk;
199 	unsigned int zno = disk_zone_no(disk, sector);
200 	bool is_seq = false;
201 	u8 *zones_cond;
202 
203 	if (!bdev_is_zoned(bdev))
204 		return false;
205 
206 	rcu_read_lock();
207 	zones_cond = rcu_dereference(disk->zones_cond);
208 	if (zones_cond && zno < disk->nr_zones)
209 		is_seq = zones_cond[zno] != BLK_ZONE_COND_NOT_WP;
210 	rcu_read_unlock();
211 
212 	return is_seq;
213 }
214 EXPORT_SYMBOL_GPL(bdev_zone_is_seq);
215 
/*
 * Zone report arguments for block device drivers report_zones operation.
 * @cb: report_zones_cb callback for each reported zone.
 * @data: Private data passed to report_zones_cb.
 * @report_active: If true, collapse the implicit open, explicit open and
 *                 closed zone conditions into BLK_ZONE_COND_ACTIVE when a
 *                 zone is reported (used when a device report is a fallback
 *                 for a cached report; see disk_report_zone()).
 */
struct blk_report_zones_args {
	report_zones_cb cb;
	void		*data;
	bool		report_active;
};
226 
227 static int blkdev_do_report_zones(struct block_device *bdev, sector_t sector,
228 				  unsigned int nr_zones,
229 				  struct blk_report_zones_args *args)
230 {
231 	struct gendisk *disk = bdev->bd_disk;
232 
233 	if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
234 		return -EOPNOTSUPP;
235 
236 	if (!nr_zones || sector >= get_capacity(disk))
237 		return 0;
238 
239 	return disk->fops->report_zones(disk, sector, nr_zones, args);
240 }
241 
242 /**
243  * blkdev_report_zones - Get zones information
244  * @bdev:	Target block device
245  * @sector:	Sector from which to report zones
246  * @nr_zones:	Maximum number of zones to report
247  * @cb:		Callback function called for each reported zone
248  * @data:	Private data for the callback
249  *
250  * Description:
251  *    Get zone information starting from the zone containing @sector for at most
252  *    @nr_zones, and call @cb for each zone reported by the device.
253  *    To report all zones in a device starting from @sector, the BLK_ALL_ZONES
254  *    constant can be passed to @nr_zones.
255  *    Returns the number of zones reported by the device, or a negative errno
256  *    value in case of failure.
257  *
258  *    Note: The caller must use memalloc_noXX_save/restore() calls to control
259  *    memory allocations done within this function.
260  */
261 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
262 			unsigned int nr_zones, report_zones_cb cb, void *data)
263 {
264 	struct blk_report_zones_args args = {
265 		.cb = cb,
266 		.data = data,
267 	};
268 
269 	return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
270 }
271 EXPORT_SYMBOL_GPL(blkdev_report_zones);
272 
273 static int blkdev_zone_reset_all(struct block_device *bdev)
274 {
275 	struct bio bio;
276 
277 	bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
278 	trace_blkdev_zone_mgmt(&bio, 0);
279 	return submit_bio_wait(&bio);
280 }
281 
/**
 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
 * @bdev:	Target block device
 * @op:		Operation to be performed on the zones
 * @sector:	Start sector of the first zone to operate on
 * @nr_sectors:	Number of sectors, should be at least the length of one zone and
 *		must be zone size aligned.
 *
 * Description:
 *    Perform the specified operation on the range of zones specified by
 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 *    is valid, but the specified range should not contain conventional zones.
 *    The operation to execute on each zone can be a zone reset, open, close
 *    or finish request.
 *
 * Return: 0 on success, or a negative errno value on failure.
 */
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
		     sector_t sector, sector_t nr_sectors)
{
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	sector_t capacity = bdev_nr_sectors(bdev);
	sector_t end_sector = sector + nr_sectors;
	struct bio *bio = NULL;
	int ret = 0;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	if (bdev_read_only(bdev))
		return -EPERM;

	if (!op_is_zone_mgmt(op))
		return -EOPNOTSUPP;

	if (end_sector <= sector || end_sector > capacity)
		/* Out of range */
		return -EINVAL;

	/* Check alignment (handle eventual smaller last zone) */
	if (!bdev_is_zone_start(bdev, sector))
		return -EINVAL;

	if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity)
		return -EINVAL;

	/*
	 * In the case of a zone reset operation over all zones, use
	 * REQ_OP_ZONE_RESET_ALL.
	 */
	if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity)
		return blkdev_zone_reset_all(bdev);

	/* Build one chained BIO per zone in the range. */
	while (sector < end_sector) {
		bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
		bio->bi_iter.bi_sector = sector;
		sector += zone_sectors;

		/* This may take a while, so be nice to others */
		cond_resched();
	}

	/* Submit the BIO chain and wait for all of it to complete. */
	trace_blkdev_zone_mgmt(bio, nr_sectors);
	ret = submit_bio_wait(bio);
	bio_put(bio);

	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
349 
/*
 * BLKREPORTZONE(V2) ioctl user buffer.
 * @zones: User space array of zone descriptors, located immediately after
 *         the struct blk_zone_report header passed to the ioctl.
 */
struct zone_report_args {
	struct blk_zone __user *zones;
};
353 
354 static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
355 				    void *data)
356 {
357 	struct zone_report_args *args = data;
358 
359 	if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
360 		return -EFAULT;
361 	return 0;
362 }
363 
/*
 * Mask of valid input flags for the BLKREPORTZONEV2 ioctl: only the cached
 * report flag may be set by user space on input.
 */
#define BLK_ZONE_REPV2_INPUT_FLAGS	BLK_ZONE_REP_CACHED
368 
/*
 * BLKREPORTZONE and BLKREPORTZONEV2 ioctl processing.
 * Called from blkdev_ioctl.
 *
 * The user argument is a struct blk_zone_report header immediately followed
 * by an array of rep.nr_zones struct blk_zone entries. On success, the header
 * is copied back with nr_zones set to the number of zones actually reported.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
		unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct zone_report_args args;
	struct blk_zone_report rep;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
		return -EFAULT;

	if (!rep.nr_zones)
		return -EINVAL;

	/* The zone array follows the report header in the user buffer. */
	args.zones = argp + sizeof(struct blk_zone_report);

	switch (cmd) {
	case BLKREPORTZONE:
		ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
					  blkdev_copy_zone_to_user, &args);
		break;
	case BLKREPORTZONEV2:
		/* Reject unknown input flags. */
		if (rep.flags & ~BLK_ZONE_REPV2_INPUT_FLAGS)
			return -EINVAL;
		ret = blkdev_report_zones_cached(bdev, rep.sector, rep.nr_zones,
					 blkdev_copy_zone_to_user, &args);
		break;
	default:
		return -EINVAL;
	}

	if (ret < 0)
		return ret;

	/* Copy back the number of reported zones and the output flags. */
	rep.nr_zones = ret;
	rep.flags = BLK_ZONE_REP_CAPACITY;
	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
		return -EFAULT;
	return 0;
}
419 
/*
 * BLKRESETZONE ioctl helper: validate the zone range, truncate the page
 * cache pages over that range, and reset the zones. The bdev inode lock and
 * the mapping invalidate lock are both held across the truncate and reset.
 */
static int blkdev_reset_zone(struct block_device *bdev, blk_mode_t mode,
			     struct blk_zone_range *zrange)
{
	loff_t start, end;
	int ret = -EINVAL;

	inode_lock(bdev->bd_mapping->host);
	filemap_invalidate_lock(bdev->bd_mapping);
	/* The first check also catches sector + nr_sectors overflow. */
	if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
	    zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
		/* Out of range */
		goto out_unlock;

	/* Invalidate the page cache, including dirty pages. */
	start = zrange->sector << SECTOR_SHIFT;
	end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;

	ret = truncate_bdev_range(bdev, mode, start, end);
	if (ret)
		goto out_unlock;

	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zrange->sector,
			       zrange->nr_sectors);
out_unlock:
	filemap_invalidate_unlock(bdev->bd_mapping);
	inode_unlock(bdev->bd_mapping->host);
	return ret;
}
447 
448 /*
449  * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
450  * Called from blkdev_ioctl.
451  */
452 int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
453 			   unsigned int cmd, unsigned long arg)
454 {
455 	void __user *argp = (void __user *)arg;
456 	struct blk_zone_range zrange;
457 	enum req_op op;
458 
459 	if (!argp)
460 		return -EINVAL;
461 
462 	if (!bdev_is_zoned(bdev))
463 		return -ENOTTY;
464 
465 	if (!(mode & BLK_OPEN_WRITE))
466 		return -EBADF;
467 
468 	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
469 		return -EFAULT;
470 
471 	switch (cmd) {
472 	case BLKRESETZONE:
473 		return blkdev_reset_zone(bdev, mode, &zrange);
474 	case BLKOPENZONE:
475 		op = REQ_OP_ZONE_OPEN;
476 		break;
477 	case BLKCLOSEZONE:
478 		op = REQ_OP_ZONE_CLOSE;
479 		break;
480 	case BLKFINISHZONE:
481 		op = REQ_OP_ZONE_FINISH;
482 		break;
483 	default:
484 		return -ENOTTY;
485 	}
486 
487 	return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
488 }
489 
490 static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
491 {
492 	return zone->start + zone->len >= get_capacity(disk);
493 }
494 
495 static bool disk_zone_wplug_is_full(struct gendisk *disk,
496 				    struct blk_zone_wplug *zwplug)
497 {
498 	if (zwplug->zone_no < disk->nr_zones - 1)
499 		return zwplug->wp_offset >= disk->zone_capacity;
500 	return zwplug->wp_offset >= disk->last_zone_capacity;
501 }
502 
/*
 * Insert a zone write plug in the disk hash table of zone write plugs.
 * Return true if the plug was inserted, or false if a plug for the same zone
 * already exists (insertion raced with another submission context).
 */
static bool disk_insert_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	struct blk_zone_wplug *zwplg;
	unsigned long flags;
	u8 *zones_cond;
	unsigned int idx =
		hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);

	/*
	 * Add the new zone write plug to the hash table, but carefully as we
	 * are racing with other submission context, so we may already have a
	 * zone write plug for the same zone.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
	hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
		if (zwplg->zone_no == zwplug->zone_no) {
			spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock,
					       flags);
			return false;
		}
	}

	/*
	 * Set the zone condition: if we do not yet have a zones_cond array
	 * attached to the disk, then this is a zone write plug insert from the
	 * first call to blk_revalidate_disk_zones(), in which case the zone is
	 * necessarilly in the active condition.
	 */
	zones_cond = rcu_dereference_check(disk->zones_cond,
				lockdep_is_held(&disk->zone_wplugs_hash_lock));
	if (zones_cond)
		zwplug->cond = zones_cond[zwplug->zone_no];
	else
		zwplug->cond = BLK_ZONE_COND_ACTIVE;

	/* Publish the plug for lockless RCU lookups before unlocking. */
	hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
	atomic_inc(&disk->nr_zone_wplugs);
	spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);

	return true;
}
545 
/*
 * Look up the zone write plug of the zone containing @sector in the disk
 * hash table and return it with a reference taken, or NULL if no plug exists.
 * The lockless hlist walk is protected by RCU, and refcount_inc_not_zero()
 * guards against racing with the plug being freed.
 */
static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk,
							 sector_t sector)
{
	unsigned int zno = disk_zone_no(disk, sector);
	unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
	struct blk_zone_wplug *zwplug;

	rcu_read_lock();

	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
		if (zwplug->zone_no == zno &&
		    refcount_inc_not_zero(&zwplug->ref)) {
			rcu_read_unlock();
			return zwplug;
		}
	}

	rcu_read_unlock();

	return NULL;
}
567 
568 static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
569 							 sector_t sector)
570 {
571 	if (!atomic_read(&disk->nr_zone_wplugs))
572 		return NULL;
573 
574 	return disk_get_hashed_zone_wplug(disk, sector);
575 }
576 
577 static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
578 {
579 	struct blk_zone_wplug *zwplug =
580 		container_of(rcu_head, struct blk_zone_wplug, rcu_head);
581 
582 	mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
583 }
584 
/*
 * Remove a zone write plug from the disk hash table and free it, deferring
 * the actual free to an RCU callback so that concurrent lockless lookups can
 * complete. The plug's last known zone condition is saved into the disk
 * zones_cond array (if present) before unhashing.
 */
static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug)
{
	struct gendisk *disk = zwplug->disk;
	unsigned long flags;

	/* A freed plug must be dead, unplugged and have no plugged BIOs. */
	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_DEAD));
	WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
	WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));

	spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
	blk_zone_set_cond(rcu_dereference_check(disk->zones_cond,
				lockdep_is_held(&disk->zone_wplugs_hash_lock)),
			  zwplug->zone_no, zwplug->cond);
	hlist_del_init_rcu(&zwplug->node);
	atomic_dec(&disk->nr_zone_wplugs);
	spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);

	call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
}
604 
605 static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
606 {
607 	if (refcount_dec_and_test(&zwplug->ref))
608 		disk_free_zone_wplug(zwplug);
609 }
610 
611 /*
612  * Flag the zone write plug as dead and drop the initial reference we got when
613  * the zone write plug was added to the hash table. The zone write plug will be
614  * unhashed when its last reference is dropped.
615  */
616 static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug)
617 {
618 	lockdep_assert_held(&zwplug->lock);
619 
620 	if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)) {
621 		zwplug->flags |= BLK_ZONE_WPLUG_DEAD;
622 		disk_put_zone_wplug(zwplug);
623 	}
624 }
625 
626 static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
627 				       struct blk_zone_wplug *zwplug);
628 
629 static void blk_zone_wplug_bio_work(struct work_struct *work)
630 {
631 	struct blk_zone_wplug *zwplug =
632 		container_of(work, struct blk_zone_wplug, bio_work);
633 
634 	disk_zone_wplug_submit_bio(zwplug->disk, zwplug);
635 
636 	/* Drop the reference we took in disk_zone_wplug_schedule_work(). */
637 	disk_put_zone_wplug(zwplug);
638 }
639 
/*
 * Get a zone write plug for the zone containing @sector.
 * If the plug does not exist, it is allocated and inserted in the disk hash
 * table. Returns the plug with a reference held, or NULL if allocation from
 * the mempool failed.
 */
static struct blk_zone_wplug *disk_get_or_alloc_zone_wplug(struct gendisk *disk,
					sector_t sector, gfp_t gfp_mask)
{
	unsigned int zno = disk_zone_no(disk, sector);
	struct blk_zone_wplug *zwplug;

again:
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug)
		return zwplug;

	/*
	 * Allocate and initialize a zone write plug with an extra reference
	 * so that it is not freed when the zone write plug becomes idle without
	 * the zone being full.
	 */
	zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
	if (!zwplug)
		return NULL;

	INIT_HLIST_NODE(&zwplug->node);
	/* One reference for being hashed, one for the caller. */
	refcount_set(&zwplug->ref, 2);
	spin_lock_init(&zwplug->lock);
	zwplug->flags = 0;
	zwplug->zone_no = zno;
	zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
	bio_list_init(&zwplug->bio_list);
	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
	INIT_LIST_HEAD(&zwplug->entry);
	zwplug->disk = disk;

	/*
	 * Insert the new zone write plug in the hash table. This can fail only
	 * if another context already inserted a plug. Retry from the beginning
	 * in such case.
	 */
	if (!disk_insert_zone_wplug(disk, zwplug)) {
		mempool_free(zwplug, disk->zone_wplugs_pool);
		goto again;
	}

	return zwplug;
}
688 
/*
 * Fail a plugged BIO with an I/O error and release the zone write plug
 * reference and the queue usage counter reference that were taken when the
 * BIO was plugged.
 */
static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
					       struct bio *bio)
{
	struct request_queue *q = zwplug->disk->queue;

	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
	bio_io_error(bio);
	/* Drop the plug reference held on behalf of the failed BIO. */
	disk_put_zone_wplug(zwplug);
	/* Drop the reference taken by disk_zone_wplug_add_bio(). */
	blk_queue_exit(q);
}
700 
/*
 * Abort (fail) all plugged BIOs of a zone write plug.
 * Called with the plug spinlock held; also takes the disk zone write plug
 * list lock when the per-disk QD=1 writes worker is in use.
 */
static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
	struct gendisk *disk = zwplug->disk;
	struct bio *bio;

	lockdep_assert_held(&zwplug->lock);

	if (bio_list_empty(&zwplug->bio_list))
		return;

	pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n",
			    zwplug->disk->disk_name, zwplug->zone_no);
	while ((bio = bio_list_pop(&zwplug->bio_list)))
		blk_zone_wplug_bio_io_error(zwplug, bio);

	/* No plugged BIOs are left: the plug is no longer plugged. */
	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

	/*
	 * If we are using the per disk zone write plugs worker thread, remove
	 * the zone write plug from the work list and drop the reference we
	 * took when the zone write plug was added to that list.
	 */
	if (blk_queue_zoned_qd1_writes(disk->queue)) {
		spin_lock(&disk->zone_wplugs_list_lock);
		if (!list_empty(&zwplug->entry)) {
			list_del_init(&zwplug->entry);
			disk_put_zone_wplug(zwplug);
		}
		spin_unlock(&disk->zone_wplugs_list_lock);
	}
}
735 
736 /*
737  * Update a zone write plug condition based on the write pointer offset.
738  */
739 static void disk_zone_wplug_update_cond(struct gendisk *disk,
740 					struct blk_zone_wplug *zwplug)
741 {
742 	lockdep_assert_held(&zwplug->lock);
743 
744 	if (disk_zone_wplug_is_full(disk, zwplug))
745 		zwplug->cond = BLK_ZONE_COND_FULL;
746 	else if (!zwplug->wp_offset)
747 		zwplug->cond = BLK_ZONE_COND_EMPTY;
748 	else
749 		zwplug->cond = BLK_ZONE_COND_ACTIVE;
750 }
751 
/*
 * Set a zone write plug write pointer offset to the specified value.
 * This aborts all plugged BIOs, which is fine as this function is called for
 * a zone reset operation, a zone finish operation or if the zone needs a wp
 * update from a report zone after a write error.
 */
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug,
					  unsigned int wp_offset)
{
	lockdep_assert_held(&zwplug->lock);

	/* Update the zone write pointer and abort all plugged BIOs. */
	zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE;
	zwplug->wp_offset = wp_offset;
	disk_zone_wplug_update_cond(disk, zwplug);

	disk_zone_wplug_abort(zwplug);
	/*
	 * An empty or full zone no longer needs a write plug: drop the
	 * initial (hash table) reference so the plug is freed once unused.
	 */
	if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
		disk_mark_zone_wplug_dead(zwplug);
}
773 
774 static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
775 {
776 	switch (zone->cond) {
777 	case BLK_ZONE_COND_IMP_OPEN:
778 	case BLK_ZONE_COND_EXP_OPEN:
779 	case BLK_ZONE_COND_CLOSED:
780 	case BLK_ZONE_COND_ACTIVE:
781 		return zone->wp - zone->start;
782 	case BLK_ZONE_COND_EMPTY:
783 		return 0;
784 	case BLK_ZONE_COND_FULL:
785 	case BLK_ZONE_COND_NOT_WP:
786 	case BLK_ZONE_COND_OFFLINE:
787 	case BLK_ZONE_COND_READONLY:
788 	default:
789 		/*
790 		 * Conventional, full, offline and read-only zones do not have
791 		 * a valid write pointer.
792 		 */
793 		return UINT_MAX;
794 	}
795 }
796 
797 static unsigned int disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
798 						   struct blk_zone *zone)
799 {
800 	struct blk_zone_wplug *zwplug;
801 	unsigned int wp_offset = blk_zone_wp_offset(zone);
802 
803 	zwplug = disk_get_zone_wplug(disk, zone->start);
804 	if (zwplug) {
805 		unsigned long flags;
806 
807 		spin_lock_irqsave(&zwplug->lock, flags);
808 		if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
809 			disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
810 		spin_unlock_irqrestore(&zwplug->lock, flags);
811 		disk_put_zone_wplug(zwplug);
812 	}
813 
814 	return wp_offset;
815 }
816 
817 /**
818  * disk_report_zone - Report one zone
819  * @disk:	Target disk
820  * @zone:	The zone to report
821  * @idx:	The index of the zone in the overall zone report
822  * @args:	report zones callback and data
823  *
824  * Description:
825  *    Helper function for block device drivers to report one zone of a zone
826  *    report initiated with blkdev_report_zones(). The zone being reported is
827  *    specified by @zone and used to update, if necessary, the zone write plug
828  *    information for the zone. If @args specifies a user callback function,
829  *    this callback is executed.
830  */
831 int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
832 		     unsigned int idx, struct blk_report_zones_args *args)
833 {
834 	if (args && args->report_active) {
835 		/*
836 		 * If we come here, then this is a report zones as a fallback
837 		 * for a cached report. So collapse the implicit open, explicit
838 		 * open and closed conditions into the active zone condition.
839 		 */
840 		switch (zone->cond) {
841 		case BLK_ZONE_COND_IMP_OPEN:
842 		case BLK_ZONE_COND_EXP_OPEN:
843 		case BLK_ZONE_COND_CLOSED:
844 			zone->cond = BLK_ZONE_COND_ACTIVE;
845 			break;
846 		default:
847 			break;
848 		}
849 	}
850 
851 	if (disk->zone_wplugs_hash)
852 		disk_zone_wplug_sync_wp_offset(disk, zone);
853 
854 	if (args && args->cb)
855 		return args->cb(zone, idx, args->data);
856 
857 	return 0;
858 }
859 EXPORT_SYMBOL_GPL(disk_report_zone);
860 
861 static int blkdev_report_zone_cb(struct blk_zone *zone, unsigned int idx,
862 				 void *data)
863 {
864 	memcpy(data, zone, sizeof(struct blk_zone));
865 	return 0;
866 }
867 
868 static int blkdev_report_zone_fallback(struct block_device *bdev,
869 				       sector_t sector, struct blk_zone *zone)
870 {
871 	struct blk_report_zones_args args = {
872 		.cb = blkdev_report_zone_cb,
873 		.data = zone,
874 		.report_active = true,
875 	};
876 	int error;
877 
878 	error = blkdev_do_report_zones(bdev, sector, 1, &args);
879 	if (error < 0)
880 		return error;
881 	if (error == 0)
882 		return -EIO;
883 	return 0;
884 }
885 
886 /*
887  * For devices that natively support zone append operations, we do not use zone
888  * write plugging for zone append writes, which makes the zone condition
889  * tracking invalid once zone append was used.  In that case fall back to a
890  * regular report zones to get correct information.
891  */
892 static inline bool blkdev_has_cached_report_zones(struct block_device *bdev)
893 {
894 	return disk_need_zone_resources(bdev->bd_disk) &&
895 		(bdev_emulates_zone_append(bdev) ||
896 		 !test_bit(GD_ZONE_APPEND_USED, &bdev->bd_disk->state));
897 }
898 
/**
 * blkdev_get_zone_info - Get a single zone information from cached data
 * @bdev:   Target block device
 * @sector: Sector contained by the target zone
 * @zone:   zone structure to return the zone information
 *
 * Description:
 *    Get the zone information for the zone containing @sector using the zone
 *    write plug of the target zone, if one exist, or the disk zone condition
 *    array otherwise. The zone condition may be reported as being
 *    the BLK_ZONE_COND_ACTIVE condition for a zone that is in the implicit
 *    open, explicit open or closed condition.
 *
 *    Returns 0 on success and a negative error code on failure.
 */
int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
			 struct blk_zone *zone)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	struct blk_zone_wplug *zwplug;
	unsigned long flags;
	u8 *zones_cond;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	if (sector >= get_capacity(disk))
		return -EINVAL;

	memset(zone, 0, sizeof(*zone));
	sector = bdev_zone_start(bdev, sector);

	/* Cached information unusable: issue a real one-zone device report. */
	if (!blkdev_has_cached_report_zones(bdev))
		return blkdev_report_zone_fallback(bdev, sector, zone);

	/* The zones_cond array is read under RCU as it may be replaced. */
	rcu_read_lock();
	zones_cond = rcu_dereference(disk->zones_cond);
	if (!disk->zone_wplugs_hash || !zones_cond) {
		rcu_read_unlock();
		return blkdev_report_zone_fallback(bdev, sector, zone);
	}
	zone->cond = zones_cond[disk_zone_no(disk, sector)];
	rcu_read_unlock();

	zone->start = sector;
	zone->len = zone_sectors;

	/*
	 * If this is a conventional zone, we do not have a zone write plug and
	 * can report the zone immediately.
	 */
	if (zone->cond == BLK_ZONE_COND_NOT_WP) {
		zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
		zone->capacity = zone_sectors;
		zone->wp = ULLONG_MAX;
		return 0;
	}

	/*
	 * This is a sequential write required zone. If the zone is read-only or
	 * offline, only set the zone write pointer to an invalid value and
	 * report the zone.
	 */
	zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
	if (disk_zone_is_last(disk, zone))
		zone->capacity = disk->last_zone_capacity;
	else
		zone->capacity = disk->zone_capacity;

	if (zone->cond == BLK_ZONE_COND_READONLY ||
	    zone->cond == BLK_ZONE_COND_OFFLINE) {
		zone->wp = ULLONG_MAX;
		return 0;
	}

	/*
	 * If the zone does not have a zone write plug, it is either full or
	 * empty, as we otherwise would have a zone write plug for it. In this
	 * case, set the write pointer accordingly and report the zone.
	 * Otherwise, if we have a zone write plug, use it.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (!zwplug) {
		if (zone->cond == BLK_ZONE_COND_FULL)
			zone->wp = ULLONG_MAX;
		else
			zone->wp = sector;
		return 0;
	}

	spin_lock_irqsave(&zwplug->lock, flags);
	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) {
		/* The plug lost track of the write pointer: do a real report. */
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
		return blkdev_report_zone_fallback(bdev, sector, zone);
	}
	zone->cond = zwplug->cond;
	zone->wp = sector + zwplug->wp_offset;
	spin_unlock_irqrestore(&zwplug->lock, flags);

	disk_put_zone_wplug(zwplug);

	return 0;
}
EXPORT_SYMBOL_GPL(blkdev_get_zone_info);
1005 
1006 /**
1007  * blkdev_report_zones_cached - Get cached zones information
1008  * @bdev:     Target block device
1009  * @sector:   Sector from which to report zones
1010  * @nr_zones: Maximum number of zones to report
1011  * @cb:       Callback function called for each reported zone
1012  * @data:     Private data for the callback function
1013  *
1014  * Description:
1015  *    Similar to blkdev_report_zones() but instead of calling into the low level
1016  *    device driver to get the zone report from the device, use
1017  *    blkdev_get_zone_info() to generate the report from the disk zone write
1018  *    plugs and zones condition array. Since calling this function without a
1019  *    callback does not make sense, @cb must be specified.
1020  */
1021 int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector,
1022 			unsigned int nr_zones, report_zones_cb cb, void *data)
1023 {
1024 	struct gendisk *disk = bdev->bd_disk;
1025 	sector_t capacity = get_capacity(disk);
1026 	sector_t zone_sectors = bdev_zone_sectors(bdev);
1027 	unsigned int idx = 0;
1028 	struct blk_zone zone;
1029 	int ret;
1030 
1031 	if (!cb || !bdev_is_zoned(bdev) ||
1032 	    WARN_ON_ONCE(!disk->fops->report_zones))
1033 		return -EOPNOTSUPP;
1034 
1035 	if (!nr_zones || sector >= capacity)
1036 		return 0;
1037 
1038 	if (!blkdev_has_cached_report_zones(bdev)) {
1039 		struct blk_report_zones_args args = {
1040 			.cb = cb,
1041 			.data = data,
1042 			.report_active = true,
1043 		};
1044 
1045 		return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
1046 	}
1047 
1048 	for (sector = bdev_zone_start(bdev, sector);
1049 	     sector < capacity && idx < nr_zones;
1050 	     sector += zone_sectors, idx++) {
1051 		ret = blkdev_get_zone_info(bdev, sector, &zone);
1052 		if (ret)
1053 			return ret;
1054 
1055 		ret = cb(&zone, idx, data);
1056 		if (ret)
1057 			return ret;
1058 	}
1059 
1060 	return idx;
1061 }
1062 EXPORT_SYMBOL_GPL(blkdev_report_zones_cached);
1063 
1064 static void blk_zone_reset_bio_endio(struct bio *bio)
1065 {
1066 	struct gendisk *disk = bio->bi_bdev->bd_disk;
1067 	sector_t sector = bio->bi_iter.bi_sector;
1068 	struct blk_zone_wplug *zwplug;
1069 
1070 	/*
1071 	 * If we have a zone write plug, set its write pointer offset to 0.
1072 	 * This will abort all BIOs plugged for the target zone. It is fine as
1073 	 * resetting zones while writes are still in-flight will result in the
1074 	 * writes failing anyway.
1075 	 */
1076 	zwplug = disk_get_zone_wplug(disk, sector);
1077 	if (zwplug) {
1078 		unsigned long flags;
1079 
1080 		spin_lock_irqsave(&zwplug->lock, flags);
1081 		disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
1082 		spin_unlock_irqrestore(&zwplug->lock, flags);
1083 		disk_put_zone_wplug(zwplug);
1084 	} else {
1085 		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
1086 	}
1087 }
1088 
/*
 * Handle the successful completion of a zone reset all BIO: set the write
 * pointer offset of every existing zone write plug to 0 and mark all zones
 * as empty in the cached zone conditions.
 */
static void blk_zone_reset_all_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t capacity = get_capacity(disk);
	struct blk_zone_wplug *zwplug;
	unsigned long flags;
	sector_t sector;
	unsigned int i;

	if (atomic_read(&disk->nr_zone_wplugs)) {
		/*
		 * Update the condition of all zone write plugs: walk every
		 * bucket of the plug hash table under RCU protection and zero
		 * each plug write pointer offset under the plug lock.
		 */
		rcu_read_lock();
		for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
			hlist_for_each_entry_rcu(zwplug,
						 &disk->zone_wplugs_hash[i],
						 node) {
				spin_lock_irqsave(&zwplug->lock, flags);
				disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
				spin_unlock_irqrestore(&zwplug->lock, flags);
			}
		}
		rcu_read_unlock();
	}

	/* Update the cached zone conditions. */
	for (sector = 0; sector < capacity;
	     sector += bdev_zone_sectors(bio->bi_bdev))
		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
	clear_bit(GD_ZONE_APPEND_USED, &disk->state);
}
1119 
1120 static void blk_zone_finish_bio_endio(struct bio *bio)
1121 {
1122 	struct block_device *bdev = bio->bi_bdev;
1123 	struct gendisk *disk = bdev->bd_disk;
1124 	sector_t sector = bio->bi_iter.bi_sector;
1125 	struct blk_zone_wplug *zwplug;
1126 
1127 	/*
1128 	 * If we have a zone write plug, set its write pointer offset to the
1129 	 * zone size. This will abort all BIOs plugged for the target zone. It
1130 	 * is fine as resetting zones while writes are still in-flight will
1131 	 * result in the writes failing anyway.
1132 	 */
1133 	zwplug = disk_get_zone_wplug(disk, sector);
1134 	if (zwplug) {
1135 		unsigned long flags;
1136 
1137 		spin_lock_irqsave(&zwplug->lock, flags);
1138 		disk_zone_wplug_set_wp_offset(disk, zwplug,
1139 					      bdev_zone_sectors(bdev));
1140 		spin_unlock_irqrestore(&zwplug->lock, flags);
1141 		disk_put_zone_wplug(zwplug);
1142 	} else {
1143 		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_FULL);
1144 	}
1145 }
1146 
1147 void blk_zone_mgmt_bio_endio(struct bio *bio)
1148 {
1149 	/* If the BIO failed, we have nothing to do. */
1150 	if (bio->bi_status != BLK_STS_OK)
1151 		return;
1152 
1153 	switch (bio_op(bio)) {
1154 	case REQ_OP_ZONE_RESET:
1155 		blk_zone_reset_bio_endio(bio);
1156 		return;
1157 	case REQ_OP_ZONE_RESET_ALL:
1158 		blk_zone_reset_all_bio_endio(bio);
1159 		return;
1160 	case REQ_OP_ZONE_FINISH:
1161 		blk_zone_finish_bio_endio(bio);
1162 		return;
1163 	default:
1164 		return;
1165 	}
1166 }
1167 
/*
 * Schedule the BIO work of a zone write plug to submit its next plugged BIO.
 * Must be called with the plug lock held and never for queues using the
 * disk-wide zone write plugs worker (qd1 writes), as the WARN below checks.
 */
static void disk_zone_wplug_schedule_work(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug)
{
	lockdep_assert_held(&zwplug->lock);

	/*
	 * Schedule the submission of the next plugged BIO. Taking a reference
	 * to the zone write plug is required as the bio_work belongs to the
	 * plug, and thus we must ensure that the write plug does not go away
	 * while the work is being scheduled but has not run yet.
	 * blk_zone_wplug_bio_work() will release the reference we take here,
	 * and we also drop this reference if the work is already scheduled.
	 */
	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
	WARN_ON_ONCE(blk_queue_zoned_qd1_writes(disk->queue));
	refcount_inc(&zwplug->ref);
	if (!queue_work(disk->zone_wplugs_wq, &zwplug->bio_work))
		disk_put_zone_wplug(zwplug);
}
1187 
/*
 * Add @bio to the tail of the plugged BIO list of @zwplug for later
 * submission. Must be called with the plug lock held.
 */
static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
				struct blk_zone_wplug *zwplug,
				struct bio *bio, unsigned int nr_segs)
{
	/*
	 * Grab an extra reference on the BIO request queue usage counter.
	 * This reference will be reused to submit a request for the BIO for
	 * blk-mq devices and dropped when the BIO is failed and after
	 * it is issued in the case of BIO-based devices.
	 */
	percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);

	/*
	 * The BIO is being plugged and thus will have to wait for the on-going
	 * write and for all other writes already plugged. So polling makes
	 * no sense.
	 */
	bio_clear_polled(bio);

	/*
	 * Reuse the poll cookie field to store the number of segments when
	 * split to the hardware limits.
	 */
	bio->__bi_nr_segments = nr_segs;

	/*
	 * We always receive BIOs after they are split and ready to be issued.
	 * The block layer passes the parts of a split BIO in order, and the
	 * user must also issue writes sequentially. So simply add the new BIO
	 * at the tail of the list to preserve the sequential write order.
	 */
	bio_list_add(&zwplug->bio_list, bio);
	trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no,
				      bio->bi_iter.bi_sector, bio_sectors(bio));

	/*
	 * If we are using the disk zone write plugs worker instead of the per
	 * zone write plug BIO work, add the zone write plug to the work list
	 * if it is not already there. Make sure to also get an extra reference
	 * on the zone write plug so that it does not go away until it is
	 * removed from the work list.
	 */
	if (blk_queue_zoned_qd1_writes(disk->queue)) {
		spin_lock(&disk->zone_wplugs_list_lock);
		if (list_empty(&zwplug->entry)) {
			list_add_tail(&zwplug->entry, &disk->zone_wplugs_list);
			refcount_inc(&zwplug->ref);
		}
		spin_unlock(&disk->zone_wplugs_list_lock);
	}
}
1239 
1240 /*
1241  * Called from bio_attempt_back_merge() when a BIO was merged with a request.
1242  */
1243 void blk_zone_write_plug_bio_merged(struct bio *bio)
1244 {
1245 	struct gendisk *disk = bio->bi_bdev->bd_disk;
1246 	struct blk_zone_wplug *zwplug;
1247 	unsigned long flags;
1248 
1249 	/*
1250 	 * If the BIO was already plugged, then we were called through
1251 	 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
1252 	 * For this case, we already hold a reference on the zone write plug for
1253 	 * the BIO and blk_zone_write_plug_init_request() will handle the
1254 	 * zone write pointer offset update.
1255 	 */
1256 	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
1257 		return;
1258 
1259 	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
1260 
1261 	/*
1262 	 * Get a reference on the zone write plug of the target zone and advance
1263 	 * the zone write pointer offset. Given that this is a merge, we already
1264 	 * have at least one request and one BIO referencing the zone write
1265 	 * plug. So this should not fail.
1266 	 */
1267 	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
1268 	if (WARN_ON_ONCE(!zwplug))
1269 		return;
1270 
1271 	spin_lock_irqsave(&zwplug->lock, flags);
1272 	zwplug->wp_offset += bio_sectors(bio);
1273 	disk_zone_wplug_update_cond(disk, zwplug);
1274 	spin_unlock_irqrestore(&zwplug->lock, flags);
1275 }
1276 
1277 /*
1278  * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
1279  * already went through zone write plugging (either a new BIO or one that was
1280  * unplugged).
1281  */
1282 void blk_zone_write_plug_init_request(struct request *req)
1283 {
1284 	sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
1285 	struct request_queue *q = req->q;
1286 	struct gendisk *disk = q->disk;
1287 	struct blk_zone_wplug *zwplug =
1288 		disk_get_zone_wplug(disk, blk_rq_pos(req));
1289 	unsigned long flags;
1290 	struct bio *bio;
1291 
1292 	if (WARN_ON_ONCE(!zwplug))
1293 		return;
1294 
1295 	/*
1296 	 * Indicate that completion of this request needs to be handled with
1297 	 * blk_zone_write_plug_finish_request(), which will drop the reference
1298 	 * on the zone write plug we took above on entry to this function.
1299 	 */
1300 	req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;
1301 
1302 	if (blk_queue_nomerges(q))
1303 		return;
1304 
1305 	/*
1306 	 * Walk through the list of plugged BIOs to check if they can be merged
1307 	 * into the back of the request.
1308 	 */
1309 	spin_lock_irqsave(&zwplug->lock, flags);
1310 	while (!disk_zone_wplug_is_full(disk, zwplug)) {
1311 		bio = bio_list_peek(&zwplug->bio_list);
1312 		if (!bio)
1313 			break;
1314 
1315 		if (bio->bi_iter.bi_sector != req_back_sector ||
1316 		    !blk_rq_merge_ok(req, bio))
1317 			break;
1318 
1319 		WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
1320 			     !bio->__bi_nr_segments);
1321 
1322 		bio_list_pop(&zwplug->bio_list);
1323 		if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
1324 		    BIO_MERGE_OK) {
1325 			bio_list_add_head(&zwplug->bio_list, bio);
1326 			break;
1327 		}
1328 
1329 		/* Drop the reference taken by disk_zone_wplug_add_bio(). */
1330 		blk_queue_exit(q);
1331 		zwplug->wp_offset += bio_sectors(bio);
1332 		disk_zone_wplug_update_cond(disk, zwplug);
1333 
1334 		req_back_sector += bio_sectors(bio);
1335 	}
1336 	spin_unlock_irqrestore(&zwplug->lock, flags);
1337 }
1338 
1339 /*
1340  * Check and prepare a BIO for submission by incrementing the write pointer
1341  * offset of its zone write plug and changing zone append operations into
1342  * regular write when zone append emulation is needed.
1343  */
1344 static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
1345 				       struct bio *bio)
1346 {
1347 	struct gendisk *disk = bio->bi_bdev->bd_disk;
1348 
1349 	lockdep_assert_held(&zwplug->lock);
1350 
1351 	/*
1352 	 * If we lost track of the zone write pointer due to a write error,
1353 	 * the user must either execute a report zones, reset the zone or finish
1354 	 * the to recover a reliable write pointer position. Fail BIOs if the
1355 	 * user did not do that as we cannot handle emulated zone append
1356 	 * otherwise.
1357 	 */
1358 	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
1359 		return false;
1360 
1361 	/*
1362 	 * Check that the user is not attempting to write to a full zone.
1363 	 * We know such BIO will fail, and that would potentially overflow our
1364 	 * write pointer offset beyond the end of the zone.
1365 	 */
1366 	if (disk_zone_wplug_is_full(disk, zwplug))
1367 		return false;
1368 
1369 	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
1370 		/*
1371 		 * Use a regular write starting at the current write pointer.
1372 		 * Similarly to native zone append operations, do not allow
1373 		 * merging.
1374 		 */
1375 		bio->bi_opf &= ~REQ_OP_MASK;
1376 		bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
1377 		bio->bi_iter.bi_sector += zwplug->wp_offset;
1378 
1379 		/*
1380 		 * Remember that this BIO is in fact a zone append operation
1381 		 * so that we can restore its operation code on completion.
1382 		 */
1383 		bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
1384 	} else {
1385 		/*
1386 		 * Check for non-sequential writes early as we know that BIOs
1387 		 * with a start sector not unaligned to the zone write pointer
1388 		 * will fail.
1389 		 */
1390 		if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
1391 			return false;
1392 	}
1393 
1394 	/* Advance the zone write pointer offset. */
1395 	zwplug->wp_offset += bio_sectors(bio);
1396 	disk_zone_wplug_update_cond(disk, zwplug);
1397 
1398 	return true;
1399 }
1400 
/*
 * Handle a write, write zeroes or emulated zone append BIO with zone write
 * plugging. Return true if the BIO was consumed here (plugged for delayed
 * submission or failed), or false to let the caller submit the BIO directly.
 */
static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;
	gfp_t gfp_mask = GFP_NOIO;
	unsigned long flags;

	/*
	 * BIOs must be fully contained within a zone so that we use the correct
	 * zone write plug for the entire BIO. For blk-mq devices, the block
	 * layer should already have done any splitting required to ensure this
	 * and this BIO should thus not be straddling zone boundaries. For
	 * BIO-based devices, it is the responsibility of the driver to split
	 * the bio before submitting it.
	 */
	if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
		bio_io_error(bio);
		return true;
	}

	/* Conventional zones do not need write plugging. */
	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
		/* Zone append to conventional zones is not allowed. */
		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
			bio_io_error(bio);
			return true;
		}
		return false;
	}

	if (bio->bi_opf & REQ_NOWAIT)
		gfp_mask = GFP_NOWAIT;

	zwplug = disk_get_or_alloc_zone_wplug(disk, sector, gfp_mask);
	if (!zwplug) {
		if (bio->bi_opf & REQ_NOWAIT)
			bio_wouldblock_error(bio);
		else
			bio_io_error(bio);
		return true;
	}

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * If we got a zone write plug marked as dead, then the user is issuing
	 * writes to a full zone, or without synchronizing with zone reset or
	 * zone finish operations. In such case, fail the BIO to signal this
	 * invalid usage.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_DEAD) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
		bio_io_error(bio);
		return true;
	}

	/* Indicate that this BIO is being handled using zone write plugging. */
	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * Add REQ_NOWAIT BIOs to the plug list to ensure that we will not see a
	 * BLK_STS_AGAIN failure if we let the caller submit the BIO.
	 */
	if (bio->bi_opf & REQ_NOWAIT) {
		bio->bi_opf &= ~REQ_NOWAIT;
		goto queue_bio;
	}

	/*
	 * For rotational devices, we will use the gendisk zone write plugs
	 * work instead of the per zone write plug BIO work, so queue the BIO.
	 */
	if (blk_queue_zoned_qd1_writes(disk->queue))
		goto queue_bio;

	/* If the zone is already plugged, add the BIO to the BIO plug list. */
	if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
		goto queue_bio;

	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		bio_io_error(bio);
		return true;
	}

	/* Otherwise, plug and let the caller submit the BIO. */
	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return false;

queue_bio:
	disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs);

	/*
	 * If the zone was not plugged yet, mark it plugged and kick either the
	 * disk worker (qd1 writes) or the per-plug BIO work to start issuing.
	 */
	if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) {
		zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
		if (blk_queue_zoned_qd1_writes(disk->queue))
			wake_up_process(disk->zone_wplugs_worker);
		else
			disk_zone_wplug_schedule_work(disk, zwplug);
	}

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return true;
}
1510 
/*
 * Handle a zone append BIO for a device with native zone append support:
 * record zone append usage and remove any stale zone write plug for the
 * target zone.
 */
static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/* Test first to avoid an atomic write when the bit is already set. */
	if (!test_bit(GD_ZONE_APPEND_USED, &disk->state))
		set_bit(GD_ZONE_APPEND_USED, &disk->state);

	/*
	 * We have native support for zone append operations, so we are not
	 * going to handle @bio through plugging. However, we may already have a
	 * zone write plug for the target zone if that zone was previously
	 * partially written using regular writes. In such case, we risk leaving
	 * the plug in the disk hash table if the zone is fully written using
	 * zone append operations. Avoid this by removing the zone write plug.
	 */
	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	if (likely(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * We are about to remove the zone write plug. But if the user
	 * (mistakenly) has issued regular writes together with native zone
	 * append, we must abort the writes as otherwise the plugged BIOs would
	 * not be executed by the plug BIO work as disk_get_zone_wplug() will
	 * return NULL after the plug is removed. Aborting the plugged write
	 * BIOs is consistent with the fact that these writes will most likely
	 * fail anyway as there is no ordering guarantees between zone append
	 * operations and regular write operations.
	 */
	if (!bio_list_empty(&zwplug->bio_list)) {
		pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n",
				    disk->disk_name, zwplug->zone_no);
		disk_zone_wplug_abort(zwplug);
	}
	disk_mark_zone_wplug_dead(zwplug);
	spin_unlock_irqrestore(&zwplug->lock, flags);

	disk_put_zone_wplug(zwplug);
}
1554 
1555 static bool blk_zone_wplug_handle_zone_mgmt(struct bio *bio)
1556 {
1557 	if (bio_op(bio) != REQ_OP_ZONE_RESET_ALL &&
1558 	    !bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
1559 		/*
1560 		 * Zone reset and zone finish operations do not apply to
1561 		 * conventional zones.
1562 		 */
1563 		bio_io_error(bio);
1564 		return true;
1565 	}
1566 
1567 	/*
1568 	 * No-wait zone management BIOs do not make much sense as the callers
1569 	 * issue these as blocking operations in most cases. To avoid issues
1570 	 * with the BIO execution potentially failing with BLK_STS_AGAIN, warn
1571 	 * about REQ_NOWAIT being set and ignore that flag.
1572 	 */
1573 	if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT))
1574 		bio->bi_opf &= ~REQ_NOWAIT;
1575 
1576 	return false;
1577 }
1578 
1579 /**
1580  * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
1581  * @bio: The BIO being submitted
1582  * @nr_segs: The number of physical segments of @bio
1583  *
1584  * Handle write, write zeroes and zone append operations requiring emulation
1585  * using zone write plugging.
1586  *
1587  * Return true whenever @bio execution needs to be delayed through the zone
1588  * write plug. Otherwise, return false to let the submission path process
1589  * @bio normally.
1590  */
1591 bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
1592 {
1593 	struct block_device *bdev = bio->bi_bdev;
1594 
1595 	if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash))
1596 		return false;
1597 
1598 	/*
1599 	 * Regular writes and write zeroes need to be handled through the target
1600 	 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
1601 	 * which may need to go through the flush machinery depending on the
1602 	 * target device capabilities. Plugging such writes is fine as the flush
1603 	 * machinery operates at the request level, below the plug, and
1604 	 * completion of the flush sequence will go through the regular BIO
1605 	 * completion, which will handle zone write plugging.
1606 	 * Zone append operations for devices that requested emulation must
1607 	 * also be plugged so that these BIOs can be changed into regular
1608 	 * write BIOs.
1609 	 * Zone reset, reset all and finish commands need special treatment
1610 	 * to correctly track the write pointer offset of zones. These commands
1611 	 * are not plugged as we do not need serialization with write
1612 	 * operations. It is the responsibility of the user to not issue reset
1613 	 * and finish commands when write operations are in flight.
1614 	 */
1615 	switch (bio_op(bio)) {
1616 	case REQ_OP_ZONE_APPEND:
1617 		if (!bdev_emulates_zone_append(bdev)) {
1618 			blk_zone_wplug_handle_native_zone_append(bio);
1619 			return false;
1620 		}
1621 		fallthrough;
1622 	case REQ_OP_WRITE:
1623 	case REQ_OP_WRITE_ZEROES:
1624 		return blk_zone_wplug_handle_write(bio, nr_segs);
1625 	case REQ_OP_ZONE_RESET:
1626 	case REQ_OP_ZONE_FINISH:
1627 	case REQ_OP_ZONE_RESET_ALL:
1628 		return blk_zone_wplug_handle_zone_mgmt(bio);
1629 	default:
1630 		return false;
1631 	}
1632 
1633 	return false;
1634 }
1635 EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
1636 
/*
 * Handle the completion of a plugged BIO for a zone write plug: clear the
 * plugged state when the BIO list is empty, kick the next plugged BIO, and
 * mark the plug dead once the zone is empty or full.
 */
static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
				       struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * For rotational devices, signal the BIO completion to the zone write
	 * plug work. Otherwise, schedule submission of the next plugged BIO
	 * if we have one.
	 */
	if (bio_list_empty(&zwplug->bio_list))
		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

	if (blk_queue_zoned_qd1_writes(disk->queue))
		complete(&disk->zone_wplugs_worker_bio_done);
	else if (!bio_list_empty(&zwplug->bio_list))
		disk_zone_wplug_schedule_work(disk, zwplug);

	/* An empty or full zone no longer needs a write plug. */
	if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
		disk_mark_zone_wplug_dead(zwplug);

	spin_unlock_irqrestore(&zwplug->lock, flags);
}
1662 
/*
 * Propagate the request sector back to the BIO of a completed zone write
 * request.
 */
void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio)
{
	/*
	 * For zone append requests, the request sector indicates the location
	 * at which the BIO data was written. Return this value to the BIO
	 * issuer through the BIO iter sector.
	 * For plugged zone writes, which include emulated zone append, we need
	 * the original BIO sector so that blk_zone_write_plug_bio_endio() can
	 * lookup the zone write plug.
	 */
	bio->bi_iter.bi_sector = rq->__sector;
	trace_blk_zone_append_update_request_bio(rq);
}
1676 
/*
 * BIO completion handling for plugged zone write BIOs: restore emulated zone
 * append operations, abort the plug on error, and drop the plug references.
 */
void blk_zone_write_plug_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	unsigned long flags;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/* Make sure we do not see this BIO again by clearing the plug flag. */
	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If this is a regular write emulating a zone append operation,
	 * restore the original operation code.
	 */
	if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_ZONE_APPEND;
		bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND);
	}

	/*
	 * If the BIO failed, abort all plugged BIOs and mark the plug as
	 * needing a write pointer update.
	 */
	if (bio->bi_status != BLK_STS_OK) {
		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_abort(zwplug);
		zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE;
		spin_unlock_irqrestore(&zwplug->lock, flags);
	}

	/* Drop the reference we took when the BIO was issued. */
	disk_put_zone_wplug(zwplug);

	/*
	 * For BIO-based devices, blk_zone_write_plug_finish_request()
	 * is not called. So we need to schedule execution of the next
	 * plugged BIO here.
	 */
	if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
		disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}
1725 
/*
 * Request completion handling for plugged zone write requests: release the
 * plug reference taken at request initialization and unplug the next BIO.
 */
void blk_zone_write_plug_finish_request(struct request *req)
{
	struct gendisk *disk = req->q->disk;
	struct blk_zone_wplug *zwplug;

	zwplug = disk_get_zone_wplug(disk, req->__sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;

	/*
	 * Drop the reference we took when the request was initialized in
	 * blk_zone_write_plug_init_request().
	 */
	disk_put_zone_wplug(zwplug);

	disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}
1748 
/*
 * Submit the next plugged BIO of a zone write plug, failing BIOs that cannot
 * be prepared. Return true if a BIO was submitted, or false when the plug BIO
 * list is empty (in which case the plugged flag is also cleared).
 */
static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
				       struct blk_zone_wplug *zwplug)
{
	struct block_device *bdev;
	unsigned long flags;
	struct bio *bio;
	bool prepared;

	/*
	 * Submit the next plugged BIO. If we do not have any, clear
	 * the plugged flag.
	 */
again:
	spin_lock_irqsave(&zwplug->lock, flags);
	bio = bio_list_pop(&zwplug->bio_list);
	if (!bio) {
		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
		spin_unlock_irqrestore(&zwplug->lock, flags);
		return false;
	}

	trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no,
				 bio->bi_iter.bi_sector, bio_sectors(bio));

	prepared = blk_zone_wplug_prepare_bio(zwplug, bio);
	spin_unlock_irqrestore(&zwplug->lock, flags);

	/* Fail unpreparable BIOs and retry with the next plugged one. */
	if (!prepared) {
		blk_zone_wplug_bio_io_error(zwplug, bio);
		goto again;
	}

	/*
	 * blk-mq devices will reuse the extra reference on the request queue
	 * usage counter we took when the BIO was plugged, but the submission
	 * path for BIO-based devices will not do that. So drop this extra
	 * reference in the BD_HAS_SUBMIT_BIO branch below.
	 */
	if (blk_queue_zoned_qd1_writes(disk->queue))
		reinit_completion(&disk->zone_wplugs_worker_bio_done);
	bdev = bio->bi_bdev;
	if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
		bdev->bd_disk->fops->submit_bio(bio);
		blk_queue_exit(bdev->bd_disk->queue);
	} else {
		blk_mq_submit_bio(bio);
	}

	return true;
}
1799 
1800 static struct blk_zone_wplug *disk_get_zone_wplugs_work(struct gendisk *disk)
1801 {
1802 	struct blk_zone_wplug *zwplug;
1803 
1804 	spin_lock_irq(&disk->zone_wplugs_list_lock);
1805 	zwplug = list_first_entry_or_null(&disk->zone_wplugs_list,
1806 					  struct blk_zone_wplug, entry);
1807 	if (zwplug)
1808 		list_del_init(&zwplug->entry);
1809 	spin_unlock_irq(&disk->zone_wplugs_list_lock);
1810 
1811 	return zwplug;
1812 }
1813 
1814 static int disk_zone_wplugs_worker(void *data)
1815 {
1816 	struct gendisk *disk = data;
1817 	struct blk_zone_wplug *zwplug;
1818 	unsigned int noio_flag;
1819 
1820 	noio_flag = memalloc_noio_save();
1821 	set_user_nice(current, MIN_NICE);
1822 	set_freezable();
1823 
1824 	for (;;) {
1825 		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
1826 
1827 		zwplug = disk_get_zone_wplugs_work(disk);
1828 		if (zwplug) {
1829 			/*
1830 			 * Process all BIOs of this zone write plug and then
1831 			 * drop the reference we took when adding the zone write
1832 			 * plug to the active list.
1833 			 */
1834 			set_current_state(TASK_RUNNING);
1835 			while (disk_zone_wplug_submit_bio(disk, zwplug))
1836 				blk_wait_io(&disk->zone_wplugs_worker_bio_done);
1837 			disk_put_zone_wplug(zwplug);
1838 			continue;
1839 		}
1840 
1841 		/*
1842 		 * Only sleep if nothing sets the state to running. Else check
1843 		 * for zone write plugs work again as a newly submitted BIO
1844 		 * might have added a zone write plug to the work list.
1845 		 */
1846 		if (get_current_state() == TASK_RUNNING) {
1847 			try_to_freeze();
1848 		} else {
1849 			if (kthread_should_stop()) {
1850 				set_current_state(TASK_RUNNING);
1851 				break;
1852 			}
1853 			schedule();
1854 		}
1855 	}
1856 
1857 	WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));
1858 	memalloc_noio_restore(noio_flag);
1859 
1860 	return 0;
1861 }
1862 
1863 void disk_init_zone_resources(struct gendisk *disk)
1864 {
1865 	spin_lock_init(&disk->zone_wplugs_hash_lock);
1866 	spin_lock_init(&disk->zone_wplugs_list_lock);
1867 	INIT_LIST_HEAD(&disk->zone_wplugs_list);
1868 	init_completion(&disk->zone_wplugs_worker_bio_done);
1869 }
1870 
1871 /*
1872  * For the size of a disk zone write plug hash table, use the size of the
1873  * zone write plug mempool, which is the maximum of the disk open zones and
1874  * active zones limits. But do not exceed 4KB (512 hlist head entries), that is,
1875  * 9 bits. For a disk that has no limits, mempool size defaults to 128.
1876  */
1877 #define BLK_ZONE_WPLUG_MAX_HASH_BITS		9
1878 #define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE	128
1879 
/*
 * Allocate the zone write plugging resources of a disk: the plug hash table,
 * the plug mempool (sized to @pool_size entries), the plug BIO workqueue and
 * the zone write plugs worker kthread. On failure, everything allocated so
 * far is unwound and an error code is returned.
 */
static int disk_alloc_zone_resources(struct gendisk *disk,
				     unsigned int pool_size)
{
	unsigned int i;
	int ret = -ENOMEM;

	atomic_set(&disk->nr_zone_wplugs, 0);
	disk->zone_wplugs_hash_bits =
		min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);

	disk->zone_wplugs_hash =
		kzalloc_objs(struct hlist_head,
			     disk_zone_wplugs_hash_size(disk));
	if (!disk->zone_wplugs_hash)
		return -ENOMEM;

	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
		INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);

	disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
						sizeof(struct blk_zone_wplug));
	if (!disk->zone_wplugs_pool)
		goto free_hash;

	disk->zone_wplugs_wq =
		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
				pool_size, disk->disk_name);
	if (!disk->zone_wplugs_wq)
		goto destroy_pool;

	disk->zone_wplugs_worker =
		kthread_create(disk_zone_wplugs_worker, disk,
			       "%s_zwplugs_worker", disk->disk_name);
	if (IS_ERR(disk->zone_wplugs_worker)) {
		ret = PTR_ERR(disk->zone_wplugs_worker);
		disk->zone_wplugs_worker = NULL;
		goto destroy_wq;
	}
	wake_up_process(disk->zone_wplugs_worker);

	return 0;

destroy_wq:
	destroy_workqueue(disk->zone_wplugs_wq);
	disk->zone_wplugs_wq = NULL;
destroy_pool:
	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;
free_hash:
	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
	return ret;
}
1934 
/*
 * Free the zone write plug hash table of @disk and destroy the zone write
 * plug mempool. Any zone write plug still hashed is marked dead first; the
 * bucket drain loop relies on disk_mark_zone_wplug_dead() removing the plug
 * from its bucket to make forward progress.
 */
static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
{
	struct blk_zone_wplug *zwplug;
	unsigned int i;

	/* Nothing to do if the hash table was never allocated. */
	if (!disk->zone_wplugs_hash)
		return;

	/* Free all the zone write plugs we have. */
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
		while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
			zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
					     struct blk_zone_wplug, node);
			/* The plug lock must be held to mark it dead. */
			spin_lock_irq(&zwplug->lock);
			disk_mark_zone_wplug_dead(zwplug);
			spin_unlock_irq(&zwplug->lock);
		}
	}

	/* All plugs must be gone once every bucket has been drained. */
	WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs));
	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;

	/*
	 * Wait for the zone write plugs to be RCU-freed before destroying the
	 * mempool.
	 */
	rcu_barrier();
	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;
}
1967 
/*
 * Install @zones_cond as the zone condition array of @disk, and free the
 * previous array (if any) after an RCU grace period so that lockless readers
 * of disk->zones_cond can complete first. Passing a NULL @zones_cond only
 * frees the current array.
 */
static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond)
{
	unsigned long flags;

	spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
	/* zones_cond is reused to hold the old array pointer (maybe NULL). */
	zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond,
				lockdep_is_held(&disk->zone_wplugs_hash_lock));
	spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);

	/* kfree(NULL) after a grace period is a no-op, so no NULL check. */
	kfree_rcu_mightsleep(zones_cond);
}
1979 
/*
 * Free all zone resources of @disk: stop the zone write plug worker thread,
 * destroy the plugged BIO workqueue, free the zone write plug hash table and
 * mempool, free the zone condition array and reset the disk zone fields.
 */
void disk_free_zone_resources(struct gendisk *disk)
{
	/* Stop the worker thread, if it was created. */
	if (disk->zone_wplugs_worker)
		kthread_stop(disk->zone_wplugs_worker);
	/* The stopped worker must have emptied the active plug list. */
	WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));

	if (disk->zone_wplugs_wq) {
		destroy_workqueue(disk->zone_wplugs_wq);
		disk->zone_wplugs_wq = NULL;
	}

	disk_destroy_zone_wplugs_hash_table(disk);

	/* Install a NULL condition array, freeing the current one (if any). */
	disk_set_zones_cond_array(disk, NULL);
	disk->zone_capacity = 0;
	disk->last_zone_capacity = 0;
	disk->nr_zones = 0;
}
1998 
/*
 * Arguments passed to the zone report callback when revalidating the zones
 * of a zoned block device.
 * @disk: the disk being revalidated.
 * @zones_cond: zone condition array being built (one byte per zone).
 * @nr_zones: total number of zones (disk capacity / zone size, rounded up).
 * @nr_conv_zones: number of conventional zones found in the zone report.
 * @zone_capacity: capacity of the sequential zones (all sequential zones but
 *	possibly the last one must have this capacity).
 * @last_zone_capacity: capacity of the last zone of the device.
 * @sector: expected start sector of the next reported zone.
 */
struct blk_revalidate_zone_args {
	struct gendisk	*disk;
	u8		*zones_cond;
	unsigned int	nr_zones;
	unsigned int	nr_conv_zones;
	unsigned int	zone_capacity;
	unsigned int	last_zone_capacity;
	sector_t	sector;
};
2008 
2009 static int disk_revalidate_zone_resources(struct gendisk *disk,
2010 				struct blk_revalidate_zone_args *args)
2011 {
2012 	struct queue_limits *lim = &disk->queue->limits;
2013 	unsigned int pool_size;
2014 	int ret = 0;
2015 
2016 	args->disk = disk;
2017 	args->nr_zones =
2018 		DIV_ROUND_UP_ULL(get_capacity(disk), lim->chunk_sectors);
2019 
2020 	/* Cached zone conditions: 1 byte per zone */
2021 	args->zones_cond = kzalloc(args->nr_zones, GFP_NOIO);
2022 	if (!args->zones_cond)
2023 		return -ENOMEM;
2024 
2025 	if (!disk_need_zone_resources(disk))
2026 		return 0;
2027 
2028 	/*
2029 	 * If the device has no limit on the maximum number of open and active
2030 	 * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
2031 	 */
2032 	pool_size = max(lim->max_open_zones, lim->max_active_zones);
2033 	if (!pool_size)
2034 		pool_size =
2035 			min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones);
2036 
2037 	if (!disk->zone_wplugs_hash) {
2038 		ret = disk_alloc_zone_resources(disk, pool_size);
2039 		if (ret)
2040 			kfree(args->zones_cond);
2041 	}
2042 
2043 	return ret;
2044 }
2045 
2046 /*
2047  * Update the disk zone resources information and device queue limits.
2048  * The disk queue is frozen when this is executed.
2049  */
2050 static int disk_update_zone_resources(struct gendisk *disk,
2051 				      struct blk_revalidate_zone_args *args)
2052 {
2053 	struct request_queue *q = disk->queue;
2054 	unsigned int nr_seq_zones;
2055 	unsigned int pool_size, memflags;
2056 	struct queue_limits lim;
2057 	int ret = 0;
2058 
2059 	lim = queue_limits_start_update(q);
2060 
2061 	memflags = blk_mq_freeze_queue(q);
2062 
2063 	disk->nr_zones = args->nr_zones;
2064 	if (args->nr_conv_zones >= disk->nr_zones) {
2065 		queue_limits_cancel_update(q);
2066 		pr_warn("%s: Invalid number of conventional zones %u / %u\n",
2067 			disk->disk_name, args->nr_conv_zones, disk->nr_zones);
2068 		ret = -ENODEV;
2069 		goto unfreeze;
2070 	}
2071 
2072 	disk->zone_capacity = args->zone_capacity;
2073 	disk->last_zone_capacity = args->last_zone_capacity;
2074 	disk_set_zones_cond_array(disk, args->zones_cond);
2075 	args->zones_cond = NULL;
2076 
2077 	/*
2078 	 * Some devices can advertise zone resource limits that are larger than
2079 	 * the number of sequential zones of the zoned block device, e.g. a
2080 	 * small ZNS namespace. For such case, assume that the zoned device has
2081 	 * no zone resource limits.
2082 	 */
2083 	nr_seq_zones = disk->nr_zones - args->nr_conv_zones;
2084 	if (lim.max_open_zones >= nr_seq_zones)
2085 		lim.max_open_zones = 0;
2086 	if (lim.max_active_zones >= nr_seq_zones)
2087 		lim.max_active_zones = 0;
2088 
2089 	if (!disk->zone_wplugs_pool)
2090 		goto commit;
2091 
2092 	/*
2093 	 * If the device has no limit on the maximum number of open and active
2094 	 * zones, set its max open zone limit to the mempool size to indicate
2095 	 * to the user that there is a potential performance impact due to
2096 	 * dynamic zone write plug allocation when simultaneously writing to
2097 	 * more zones than the size of the mempool.
2098 	 */
2099 	pool_size = max(lim.max_open_zones, lim.max_active_zones);
2100 	if (!pool_size)
2101 		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);
2102 
2103 	mempool_resize(disk->zone_wplugs_pool, pool_size);
2104 
2105 	if (!lim.max_open_zones && !lim.max_active_zones) {
2106 		if (pool_size < nr_seq_zones)
2107 			lim.max_open_zones = pool_size;
2108 		else
2109 			lim.max_open_zones = 0;
2110 	}
2111 
2112 commit:
2113 	ret = queue_limits_commit_update(q, &lim);
2114 
2115 unfreeze:
2116 	if (ret)
2117 		disk_free_zone_resources(disk);
2118 
2119 	blk_mq_unfreeze_queue(q, memflags);
2120 
2121 	return ret;
2122 }
2123 
2124 static int blk_revalidate_zone_cond(struct blk_zone *zone, unsigned int idx,
2125 				    struct blk_revalidate_zone_args *args)
2126 {
2127 	enum blk_zone_cond cond = zone->cond;
2128 
2129 	/* Check that the zone condition is consistent with the zone type. */
2130 	switch (cond) {
2131 	case BLK_ZONE_COND_NOT_WP:
2132 		if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
2133 			goto invalid_condition;
2134 		break;
2135 	case BLK_ZONE_COND_IMP_OPEN:
2136 	case BLK_ZONE_COND_EXP_OPEN:
2137 	case BLK_ZONE_COND_CLOSED:
2138 	case BLK_ZONE_COND_EMPTY:
2139 	case BLK_ZONE_COND_FULL:
2140 	case BLK_ZONE_COND_OFFLINE:
2141 	case BLK_ZONE_COND_READONLY:
2142 		if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
2143 			goto invalid_condition;
2144 		break;
2145 	default:
2146 		pr_warn("%s: Invalid zone condition 0x%X\n",
2147 			args->disk->disk_name, cond);
2148 		return -ENODEV;
2149 	}
2150 
2151 	blk_zone_set_cond(args->zones_cond, idx, cond);
2152 
2153 	return 0;
2154 
2155 invalid_condition:
2156 	pr_warn("%s: Invalid zone condition 0x%x for type 0x%x\n",
2157 		args->disk->disk_name, cond, zone->type);
2158 
2159 	return -ENODEV;
2160 }
2161 
2162 static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
2163 				    struct blk_revalidate_zone_args *args)
2164 {
2165 	struct gendisk *disk = args->disk;
2166 
2167 	if (zone->capacity != zone->len) {
2168 		pr_warn("%s: Invalid conventional zone capacity\n",
2169 			disk->disk_name);
2170 		return -ENODEV;
2171 	}
2172 
2173 	if (disk_zone_is_last(disk, zone))
2174 		args->last_zone_capacity = zone->capacity;
2175 
2176 	args->nr_conv_zones++;
2177 
2178 	return 0;
2179 }
2180 
2181 static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
2182 				   struct blk_revalidate_zone_args *args)
2183 {
2184 	struct gendisk *disk = args->disk;
2185 	struct blk_zone_wplug *zwplug;
2186 	unsigned int wp_offset;
2187 
2188 	/*
2189 	 * Remember the capacity of the first sequential zone and check
2190 	 * if it is constant for all zones, ignoring the last zone as it can be
2191 	 * smaller.
2192 	 */
2193 	if (!args->zone_capacity)
2194 		args->zone_capacity = zone->capacity;
2195 	if (disk_zone_is_last(disk, zone)) {
2196 		args->last_zone_capacity = zone->capacity;
2197 	} else if (zone->capacity != args->zone_capacity) {
2198 		pr_warn("%s: Invalid variable zone capacity\n",
2199 			disk->disk_name);
2200 		return -ENODEV;
2201 	}
2202 
2203 	/*
2204 	 * If the device needs zone append emulation, we need to track the
2205 	 * write pointer of all zones that are not empty nor full. So make sure
2206 	 * we have a zone write plug for such zone if the device has a zone
2207 	 * write plug hash table.
2208 	 */
2209 	if (!disk->zone_wplugs_hash)
2210 		return 0;
2211 
2212 	wp_offset = disk_zone_wplug_sync_wp_offset(disk, zone);
2213 	if (!wp_offset || wp_offset >= zone->capacity)
2214 		return 0;
2215 
2216 	zwplug = disk_get_or_alloc_zone_wplug(disk, zone->wp, GFP_NOIO);
2217 	if (!zwplug)
2218 		return -ENOMEM;
2219 	disk_put_zone_wplug(zwplug);
2220 
2221 	return 0;
2222 }
2223 
2224 /*
2225  * Helper function to check the validity of zones of a zoned block device.
2226  */
2227 static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
2228 				  void *data)
2229 {
2230 	struct blk_revalidate_zone_args *args = data;
2231 	struct gendisk *disk = args->disk;
2232 	sector_t zone_sectors = disk->queue->limits.chunk_sectors;
2233 	int ret;
2234 
2235 	/* Check for bad zones and holes in the zone report */
2236 	if (zone->start != args->sector) {
2237 		pr_warn("%s: Zone gap at sectors %llu..%llu\n",
2238 			disk->disk_name, args->sector, zone->start);
2239 		return -ENODEV;
2240 	}
2241 
2242 	if (zone->start >= get_capacity(disk) || !zone->len) {
2243 		pr_warn("%s: Invalid zone start %llu, length %llu\n",
2244 			disk->disk_name, zone->start, zone->len);
2245 		return -ENODEV;
2246 	}
2247 
2248 	/*
2249 	 * All zones must have the same size, with the exception on an eventual
2250 	 * smaller last zone.
2251 	 */
2252 	if (!disk_zone_is_last(disk, zone)) {
2253 		if (zone->len != zone_sectors) {
2254 			pr_warn("%s: Invalid zoned device with non constant zone size\n",
2255 				disk->disk_name);
2256 			return -ENODEV;
2257 		}
2258 	} else if (zone->len > zone_sectors) {
2259 		pr_warn("%s: Invalid zoned device with larger last zone size\n",
2260 			disk->disk_name);
2261 		return -ENODEV;
2262 	}
2263 
2264 	if (!zone->capacity || zone->capacity > zone->len) {
2265 		pr_warn("%s: Invalid zone capacity\n",
2266 			disk->disk_name);
2267 		return -ENODEV;
2268 	}
2269 
2270 	/* Check zone condition */
2271 	ret = blk_revalidate_zone_cond(zone, idx, args);
2272 	if (ret)
2273 		return ret;
2274 
2275 	/* Check zone type */
2276 	switch (zone->type) {
2277 	case BLK_ZONE_TYPE_CONVENTIONAL:
2278 		ret = blk_revalidate_conv_zone(zone, idx, args);
2279 		break;
2280 	case BLK_ZONE_TYPE_SEQWRITE_REQ:
2281 		ret = blk_revalidate_seq_zone(zone, idx, args);
2282 		break;
2283 	case BLK_ZONE_TYPE_SEQWRITE_PREF:
2284 	default:
2285 		pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
2286 			disk->disk_name, (int)zone->type, zone->start);
2287 		ret = -ENODEV;
2288 	}
2289 
2290 	if (!ret)
2291 		args->sector += zone->len;
2292 
2293 	return ret;
2294 }
2295 
2296 /**
2297  * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
2298  * @disk:	Target disk
2299  *
2300  * Helper function for low-level device drivers to check, (re) allocate and
2301  * initialize resources used for managing zoned disks. This function should
2302  * normally be called by blk-mq based drivers when a zoned gendisk is probed
2303  * and when the zone configuration of the gendisk changes (e.g. after a format).
2304  * Before calling this function, the device driver must already have set the
2305  * device zone size (chunk_sector limit) and the max zone append limit.
2306  * BIO based drivers can also use this function as long as the device queue
2307  * can be safely frozen.
2308  */
2309 int blk_revalidate_disk_zones(struct gendisk *disk)
2310 {
2311 	struct request_queue *q = disk->queue;
2312 	sector_t zone_sectors = q->limits.chunk_sectors;
2313 	sector_t capacity = get_capacity(disk);
2314 	struct blk_revalidate_zone_args args = { };
2315 	unsigned int memflags, noio_flag;
2316 	struct blk_report_zones_args rep_args = {
2317 		.cb = blk_revalidate_zone_cb,
2318 		.data = &args,
2319 	};
2320 	int ret = -ENOMEM;
2321 
2322 	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
2323 		return -EIO;
2324 
2325 	if (!capacity)
2326 		return -ENODEV;
2327 
2328 	/*
2329 	 * Checks that the device driver indicated a valid zone size and that
2330 	 * the max zone append limit is set.
2331 	 */
2332 	if (!zone_sectors || !is_power_of_2(zone_sectors)) {
2333 		pr_warn("%s: Invalid non power of two zone size (%llu)\n",
2334 			disk->disk_name, zone_sectors);
2335 		return -ENODEV;
2336 	}
2337 
2338 	/*
2339 	 * Ensure that all memory allocations in this context are done as if
2340 	 * GFP_NOIO was specified.
2341 	 */
2342 	noio_flag = memalloc_noio_save();
2343 	ret = disk_revalidate_zone_resources(disk, &args);
2344 	if (ret) {
2345 		memalloc_noio_restore(noio_flag);
2346 		return ret;
2347 	}
2348 
2349 	ret = disk->fops->report_zones(disk, 0, UINT_MAX, &rep_args);
2350 	if (!ret) {
2351 		pr_warn("%s: No zones reported\n", disk->disk_name);
2352 		ret = -ENODEV;
2353 	}
2354 	memalloc_noio_restore(noio_flag);
2355 
2356 	if (ret <= 0)
2357 		goto free_resources;
2358 
2359 	/*
2360 	 * If zones where reported, make sure that the entire disk capacity
2361 	 * has been checked.
2362 	 */
2363 	if (args.sector != capacity) {
2364 		pr_warn("%s: Missing zones from sector %llu\n",
2365 			disk->disk_name, args.sector);
2366 		ret = -ENODEV;
2367 		goto free_resources;
2368 	}
2369 
2370 	ret = disk_update_zone_resources(disk, &args);
2371 	if (ret)
2372 		goto free_resources;
2373 
2374 	return 0;
2375 
2376 free_resources:
2377 	pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
2378 
2379 	kfree(args.zones_cond);
2380 	memflags = blk_mq_freeze_queue(q);
2381 	disk_free_zone_resources(disk);
2382 	blk_mq_unfreeze_queue(q, memflags);
2383 
2384 	return ret;
2385 }
2386 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
2387 
2388 /**
2389  * blk_zone_issue_zeroout - zero-fill a block range in a zone
2390  * @bdev:	blockdev to write
2391  * @sector:	start sector
2392  * @nr_sects:	number of sectors to write
2393  * @gfp_mask:	memory allocation flags (for bio_alloc)
2394  *
2395  * Description:
2396  *  Zero-fill a block range in a zone (@sector must be equal to the zone write
2397  *  pointer), handling potential errors due to the (initially unknown) lack of
2398  *  hardware offload (See blkdev_issue_zeroout()).
2399  */
2400 int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
2401 			   sector_t nr_sects, gfp_t gfp_mask)
2402 {
2403 	struct gendisk *disk = bdev->bd_disk;
2404 	int ret;
2405 
2406 	if (WARN_ON_ONCE(!bdev_is_zoned(bdev)))
2407 		return -EIO;
2408 
2409 	ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
2410 				   BLKDEV_ZERO_NOFALLBACK);
2411 	if (ret != -EOPNOTSUPP)
2412 		return ret;
2413 
2414 	/*
2415 	 * The failed call to blkdev_issue_zeroout() advanced the zone write
2416 	 * pointer. Undo this using a report zone to update the zone write
2417 	 * pointer to the correct current value.
2418 	 */
2419 	ret = disk->fops->report_zones(disk, sector, 1, NULL);
2420 	if (ret != 1)
2421 		return ret < 0 ? ret : -EIO;
2422 
2423 	/*
2424 	 * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a
2425 	 * regular write with zero-pages.
2426 	 */
2427 	return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0);
2428 }
2429 EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout);
2430 
2431 #ifdef CONFIG_BLK_DEBUG_FS
2432 static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
2433 				  struct seq_file *m)
2434 {
2435 	unsigned int zwp_wp_offset, zwp_flags;
2436 	unsigned int zwp_zone_no, zwp_ref;
2437 	unsigned int zwp_bio_list_size;
2438 	enum blk_zone_cond zwp_cond;
2439 	unsigned long flags;
2440 
2441 	spin_lock_irqsave(&zwplug->lock, flags);
2442 	zwp_zone_no = zwplug->zone_no;
2443 	zwp_flags = zwplug->flags;
2444 	zwp_ref = refcount_read(&zwplug->ref);
2445 	zwp_cond = zwplug->cond;
2446 	zwp_wp_offset = zwplug->wp_offset;
2447 	zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
2448 	spin_unlock_irqrestore(&zwplug->lock, flags);
2449 
2450 	seq_printf(m,
2451 		"Zone no: %u, flags: 0x%x, ref: %u, cond: %s, wp ofst: %u, pending BIO: %u\n",
2452 		zwp_zone_no, zwp_flags, zwp_ref, blk_zone_cond_str(zwp_cond),
2453 		zwp_wp_offset, zwp_bio_list_size);
2454 }
2455 
2456 int queue_zone_wplugs_show(void *data, struct seq_file *m)
2457 {
2458 	struct request_queue *q = data;
2459 	struct gendisk *disk = q->disk;
2460 	struct blk_zone_wplug *zwplug;
2461 	unsigned int i;
2462 
2463 	if (!disk->zone_wplugs_hash)
2464 		return 0;
2465 
2466 	rcu_read_lock();
2467 	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
2468 		hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i],
2469 					 node)
2470 			queue_zone_wplug_show(zwplug, m);
2471 	rcu_read_unlock();
2472 
2473 	return 0;
2474 }
2475 
2476 #endif
2477