1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Zoned block device handling
4  *
5  * Copyright (c) 2015, Hannes Reinecke
6  * Copyright (c) 2015, SUSE Linux GmbH
7  *
8  * Copyright (c) 2016, Damien Le Moal
9  * Copyright (c) 2016, Western Digital
10  * Copyright (c) 2024, Western Digital Corporation or its affiliates.
11  */
12 
13 #include <linux/kernel.h>
14 #include <linux/blkdev.h>
15 #include <linux/blk-mq.h>
16 #include <linux/spinlock.h>
17 #include <linux/refcount.h>
18 #include <linux/mempool.h>
19 #include <linux/kthread.h>
20 #include <linux/freezer.h>
21 
22 #include <trace/events/block.h>
23 
24 #include "blk.h"
25 #include "blk-mq-sched.h"
26 #include "blk-mq-debugfs.h"
27 
28 #define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
29 static const char *const zone_cond_name[] = {
30 	ZONE_COND_NAME(NOT_WP),
31 	ZONE_COND_NAME(EMPTY),
32 	ZONE_COND_NAME(IMP_OPEN),
33 	ZONE_COND_NAME(EXP_OPEN),
34 	ZONE_COND_NAME(CLOSED),
35 	ZONE_COND_NAME(READONLY),
36 	ZONE_COND_NAME(FULL),
37 	ZONE_COND_NAME(OFFLINE),
38 	ZONE_COND_NAME(ACTIVE),
39 };
40 #undef ZONE_COND_NAME
41 
42 /*
43  * Per-zone write plug.
44  * @node: hlist_node structure for managing the plug using a hash table.
45  * @entry: list_head structure for listing the plug in the disk list of active
46  *         zone write plugs.
47  * @bio_list: The list of BIOs that are currently plugged.
48  * @bio_work: Work struct to handle issuing of plugged BIOs.
49  * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
50  * @disk: The gendisk the plug belongs to.
51  * @lock: Spinlock to atomically manipulate the plug.
52  * @ref: Zone write plug reference counter. A zone write plug reference is
53  *       always at least 1 when the plug is hashed in the disk plug hash table.
54  *       The reference is incremented whenever a new BIO needing plugging is
55  *       submitted and when a function needs to manipulate a plug. The
56  *       reference count is decremented whenever a plugged BIO completes and
57  *       when a function that referenced the plug returns. The initial
58  *       reference is dropped whenever the zone of the zone write plug is
59  *       reset or finished, and when the zone becomes full (that is, when
60  *       the last write BIO to the zone completes).
61  * @flags: Flags indicating the plug state.
62  * @zone_no: The number of the zone the plug is managing.
63  * @wp_offset: The zone write pointer location relative to the start of the zone
64  *             as a number of 512B sectors.
65  * @cond: Condition of the zone.
66  */
67 struct blk_zone_wplug {
68 	struct hlist_node	node;
69 	struct list_head	entry;
70 	struct bio_list		bio_list;
71 	struct work_struct	bio_work;
72 	struct rcu_head		rcu_head;
73 	struct gendisk		*disk;
74 	spinlock_t		lock;
75 	refcount_t		ref;
76 	unsigned int		flags;
77 	unsigned int		zone_no;
78 	unsigned int		wp_offset;
79 	enum blk_zone_cond	cond;
80 };
81 
82 static inline bool disk_need_zone_resources(struct gendisk *disk)
83 {
84 	/*
85 	 * All request-based zoned devices need zone resources so that the
86 	 * block layer can automatically handle write BIO plugging. BIO-based
87 	 * device drivers (e.g. DM devices) are normally responsible for
88 	 * handling zone write ordering and do not need zone resources, unless
89 	 * the driver requires zone append emulation.
90 	 */
91 	return queue_is_mq(disk->queue) ||
92 		queue_emulates_zone_append(disk->queue);
93 }
94 
95 static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
96 {
97 	return 1U << disk->zone_wplugs_hash_bits;
98 }
99 
100 /*
101  * Zone write plug flags bits:
102  *  - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
103  *    that is, that write BIOs are being throttled because a write BIO is
104  *    already being executed or the zone write plug BIO list is not empty.
105  *  - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
106  *    write pointer offset and need to update it.
107  *  - BLK_ZONE_WPLUG_DEAD: Indicates that the zone write plug will be
108  *    removed from the disk hash table of zone write plugs when the last
109  *    reference on the zone write plug is dropped. If set, this flag also
110  *    indicates that the initial extra reference on the zone write plug was
111  *    dropped, meaning that the reference count indicates the current number of
112  *    active users (code context or BIOs and requests in flight). This flag is
113  *    set when a zone is reset, finished or becomes full.
114  */
115 #define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)
116 #define BLK_ZONE_WPLUG_NEED_WP_UPDATE	(1U << 1)
117 #define BLK_ZONE_WPLUG_DEAD		(1U << 2)
118 
119 /**
120  * blk_zone_cond_str - Return a zone condition name string
121  * @zone_cond: a zone condition BLK_ZONE_COND_name
122  *
123  * Convert a BLK_ZONE_COND_name zone condition into the string "name". Useful
124  * for debugging and tracing zone conditions. For an invalid zone condition,
125  * the string "UNKNOWN" is returned.
126  */
127 const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
128 {
129 	static const char *zone_cond_str = "UNKNOWN";
130 
131 	if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
132 		zone_cond_str = zone_cond_name[zone_cond];
133 
134 	return zone_cond_str;
135 }
136 EXPORT_SYMBOL_GPL(blk_zone_cond_str);
137 
138 static void blk_zone_set_cond(u8 *zones_cond, unsigned int zno,
139 			      enum blk_zone_cond cond)
140 {
141 	if (!zones_cond)
142 		return;
143 
144 	switch (cond) {
145 	case BLK_ZONE_COND_IMP_OPEN:
146 	case BLK_ZONE_COND_EXP_OPEN:
147 	case BLK_ZONE_COND_CLOSED:
148 		zones_cond[zno] = BLK_ZONE_COND_ACTIVE;
149 		return;
150 	case BLK_ZONE_COND_NOT_WP:
151 	case BLK_ZONE_COND_EMPTY:
152 	case BLK_ZONE_COND_FULL:
153 	case BLK_ZONE_COND_OFFLINE:
154 	case BLK_ZONE_COND_READONLY:
155 	default:
156 		zones_cond[zno] = cond;
157 		return;
158 	}
159 }
160 
161 static void disk_zone_set_cond(struct gendisk *disk, sector_t sector,
162 			       enum blk_zone_cond cond)
163 {
164 	u8 *zones_cond;
165 
166 	rcu_read_lock();
167 	zones_cond = rcu_dereference(disk->zones_cond);
168 	if (zones_cond) {
169 		unsigned int zno = disk_zone_no(disk, sector);
170 
171 		/*
172 		 * The condition of conventional, read-only and offline zones
173 		 * never changes, so do nothing if the target zone is in one of
174 		 * these conditions.
175 		 */
176 		switch (zones_cond[zno]) {
177 		case BLK_ZONE_COND_NOT_WP:
178 		case BLK_ZONE_COND_READONLY:
179 		case BLK_ZONE_COND_OFFLINE:
180 			break;
181 		default:
182 			blk_zone_set_cond(zones_cond, zno, cond);
183 			break;
184 		}
185 	}
186 	rcu_read_unlock();
187 }
188 
189 /**
190  * bdev_zone_is_seq - check if a sector belongs to a sequential write zone
191  * @bdev:       block device to check
192  * @sector:     sector number
193  *
194  * Check if @sector on @bdev is contained in a sequential write required zone.
195  */
196 bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
197 {
198 	struct gendisk *disk = bdev->bd_disk;
199 	unsigned int zno = disk_zone_no(disk, sector);
200 	bool is_seq = false;
201 	u8 *zones_cond;
202 
203 	if (!bdev_is_zoned(bdev))
204 		return false;
205 
206 	rcu_read_lock();
207 	zones_cond = rcu_dereference(disk->zones_cond);
208 	if (zones_cond && zno < disk->nr_zones)
209 		is_seq = zones_cond[zno] != BLK_ZONE_COND_NOT_WP;
210 	rcu_read_unlock();
211 
212 	return is_seq;
213 }
214 EXPORT_SYMBOL_GPL(bdev_zone_is_seq);
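
/*
 * Example (illustrative sketch, not part of the original file): a caller can
 * use bdev_zone_is_seq() to reject in-place overwrites that target a
 * sequential write required zone. The function name below is hypothetical.
 */
static int __maybe_unused example_check_overwrite(struct block_device *bdev,
						  sector_t sector)
{
	/*
	 * Sequential write required zones must be written at the write
	 * pointer; only conventional zones accept in-place overwrites.
	 */
	if (bdev_zone_is_seq(bdev, sector))
		return -EINVAL;
	return 0;
}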
215 
216 /*
217  * Zone report arguments for a block device driver's report_zones operation.
218  * @cb: report_zones_cb callback for each reported zone.
219  * @data: Private data passed to report_zones_cb.
 * @report_active: If true, collapse the implicit open, explicit open and
 *                 closed zone conditions into the active condition.
220  */
221 struct blk_report_zones_args {
222 	report_zones_cb cb;
223 	void		*data;
224 	bool		report_active;
225 };
226 
227 static int blkdev_do_report_zones(struct block_device *bdev, sector_t sector,
228 				  unsigned int nr_zones,
229 				  struct blk_report_zones_args *args)
230 {
231 	struct gendisk *disk = bdev->bd_disk;
232 
233 	if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
234 		return -EOPNOTSUPP;
235 
236 	if (!nr_zones || sector >= get_capacity(disk))
237 		return 0;
238 
239 	return disk->fops->report_zones(disk, sector, nr_zones, args);
240 }
241 
242 /**
243  * blkdev_report_zones - Get zones information
244  * @bdev:	Target block device
245  * @sector:	Sector from which to report zones
246  * @nr_zones:	Maximum number of zones to report
247  * @cb:		Callback function called for each reported zone
248  * @data:	Private data for the callback
249  *
250  * Description:
251  *    Get zone information starting from the zone containing @sector for at most
252  *    @nr_zones, and call @cb for each zone reported by the device.
253  *    To report all zones in a device starting from @sector, the BLK_ALL_ZONES
254  *    constant can be passed to @nr_zones.
255  *    Returns the number of zones reported by the device, or a negative errno
256  *    value in case of failure.
257  *
258  *    Note: The caller must use memalloc_noXX_save/restore() calls to control
259  *    memory allocations done within this function.
260  */
261 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
262 			unsigned int nr_zones, report_zones_cb cb, void *data)
263 {
264 	struct blk_report_zones_args args = {
265 		.cb = cb,
266 		.data = data,
267 	};
268 
269 	return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
270 }
271 EXPORT_SYMBOL_GPL(blkdev_report_zones);
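
/*
 * Example (illustrative sketch, not part of the original file): counting the
 * empty zones of a zoned device with blkdev_report_zones(). The callback and
 * function names are hypothetical.
 */
static int example_count_empty_cb(struct blk_zone *zone, unsigned int idx,
				  void *data)
{
	unsigned int *nr_empty = data;

	if (zone->cond == BLK_ZONE_COND_EMPTY)
		(*nr_empty)++;
	return 0;
}

static int __maybe_unused example_count_empty_zones(struct block_device *bdev)
{
	unsigned int nr_empty = 0;
	int ret;

	/* Report all zones of the device, starting from sector 0. */
	ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
				  example_count_empty_cb, &nr_empty);
	if (ret < 0)
		return ret;

	return nr_empty;
}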
272 
273 static int blkdev_zone_reset_all(struct block_device *bdev)
274 {
275 	struct bio bio;
276 
277 	bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
278 	trace_blkdev_zone_mgmt(&bio, 0);
279 	return submit_bio_wait(&bio);
280 }
281 
282 /**
283  * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
284  * @bdev:	Target block device
285  * @op:		Operation to be performed on the zones
286  * @sector:	Start sector of the first zone to operate on
287  * @nr_sectors:	Number of sectors. Must be at least the length of one zone
288  *		and must be zone size aligned.
289  *
290  * Description:
291  *    Perform the specified operation on the range of zones specified by
292  *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
293  *    is valid, but the specified range should not contain conventional zones.
294  *    The operation to execute on each zone can be a zone reset, open, close
295  *    or finish request.
296  */
297 int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
298 		     sector_t sector, sector_t nr_sectors)
299 {
300 	sector_t zone_sectors = bdev_zone_sectors(bdev);
301 	sector_t capacity = bdev_nr_sectors(bdev);
302 	sector_t end_sector = sector + nr_sectors;
303 	struct bio *bio = NULL;
304 	int ret = 0;
305 
306 	if (!bdev_is_zoned(bdev))
307 		return -EOPNOTSUPP;
308 
309 	if (bdev_read_only(bdev))
310 		return -EPERM;
311 
312 	if (!op_is_zone_mgmt(op))
313 		return -EOPNOTSUPP;
314 
315 	if (end_sector <= sector || end_sector > capacity)
316 		/* Out of range */
317 		return -EINVAL;
318 
319 	/* Check alignment (handle a possibly smaller last zone) */
320 	if (!bdev_is_zone_start(bdev, sector))
321 		return -EINVAL;
322 
323 	if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity)
324 		return -EINVAL;
325 
326 	/*
327 	 * In the case of a zone reset operation over all zones, use
328 	 * REQ_OP_ZONE_RESET_ALL.
329 	 */
330 	if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity)
331 		return blkdev_zone_reset_all(bdev);
332 
333 	while (sector < end_sector) {
334 		bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
335 		bio->bi_iter.bi_sector = sector;
336 		sector += zone_sectors;
337 
338 		/* This may take a while, so be nice to others */
339 		cond_resched();
340 	}
341 
342 	trace_blkdev_zone_mgmt(bio, nr_sectors);
343 	ret = submit_bio_wait(bio);
344 	bio_put(bio);
345 
346 	return ret;
347 }
348 EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
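
/*
 * Example (illustrative sketch, not part of the original file): resetting a
 * single zone with blkdev_zone_mgmt(). The wrapper name is hypothetical, and
 * this assumes @zone_start_sector does not point at a smaller last zone.
 */
static int __maybe_unused example_reset_one_zone(struct block_device *bdev,
						 sector_t zone_start_sector)
{
	/*
	 * The range must be zone aligned: one full zone, starting at the
	 * first sector of the target zone.
	 */
	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zone_start_sector,
				bdev_zone_sectors(bdev));
}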
349 
350 struct zone_report_args {
351 	struct blk_zone __user *zones;
352 };
353 
354 static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
355 				    void *data)
356 {
357 	struct zone_report_args *args = data;
358 
359 	if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
360 		return -EFAULT;
361 	return 0;
362 }
363 
364 /*
365  * Mask of valid input flags for BLKREPORTZONEV2 ioctl.
366  */
367 #define BLK_ZONE_REPV2_INPUT_FLAGS	BLK_ZONE_REP_CACHED
368 
369 /*
370  * BLKREPORTZONE and BLKREPORTZONEV2 ioctl processing.
371  * Called from blkdev_ioctl.
372  */
373 int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
374 		unsigned long arg)
375 {
376 	void __user *argp = (void __user *)arg;
377 	struct zone_report_args args;
378 	struct blk_zone_report rep;
379 	int ret;
380 
381 	if (!argp)
382 		return -EINVAL;
383 
384 	if (!bdev_is_zoned(bdev))
385 		return -ENOTTY;
386 
387 	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
388 		return -EFAULT;
389 
390 	if (!rep.nr_zones)
391 		return -EINVAL;
392 
393 	args.zones = argp + sizeof(struct blk_zone_report);
394 
395 	switch (cmd) {
396 	case BLKREPORTZONE:
397 		ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
398 					  blkdev_copy_zone_to_user, &args);
399 		break;
400 	case BLKREPORTZONEV2:
401 		if (rep.flags & ~BLK_ZONE_REPV2_INPUT_FLAGS)
402 			return -EINVAL;
403 		ret = blkdev_report_zones_cached(bdev, rep.sector, rep.nr_zones,
404 					 blkdev_copy_zone_to_user, &args);
405 		break;
406 	default:
407 		return -EINVAL;
408 	}
409 
410 	if (ret < 0)
411 		return ret;
412 
413 	rep.nr_zones = ret;
414 	rep.flags = BLK_ZONE_REP_CAPACITY;
415 	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
416 		return -EFAULT;
417 	return 0;
418 }
419 
420 static int blkdev_truncate_zone_range(struct block_device *bdev,
421 		blk_mode_t mode, const struct blk_zone_range *zrange)
422 {
423 	loff_t start, end;
424 
425 	if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
426 	    zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
427 		/* Out of range */
428 		return -EINVAL;
429 
430 	start = zrange->sector << SECTOR_SHIFT;
431 	end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;
432 
433 	return truncate_bdev_range(bdev, mode, start, end);
434 }
435 
436 /*
437  * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
438  * Called from blkdev_ioctl.
439  */
440 int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
441 			   unsigned int cmd, unsigned long arg)
442 {
443 	void __user *argp = (void __user *)arg;
444 	struct blk_zone_range zrange;
445 	enum req_op op;
446 	int ret;
447 
448 	if (!argp)
449 		return -EINVAL;
450 
451 	if (!bdev_is_zoned(bdev))
452 		return -ENOTTY;
453 
454 	if (!(mode & BLK_OPEN_WRITE))
455 		return -EBADF;
456 
457 	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
458 		return -EFAULT;
459 
460 	switch (cmd) {
461 	case BLKRESETZONE:
462 		op = REQ_OP_ZONE_RESET;
463 
464 		/* Invalidate the page cache, including dirty pages. */
465 		inode_lock(bdev->bd_mapping->host);
466 		filemap_invalidate_lock(bdev->bd_mapping);
467 		ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
468 		if (ret)
469 			goto fail;
470 		break;
471 	case BLKOPENZONE:
472 		op = REQ_OP_ZONE_OPEN;
473 		break;
474 	case BLKCLOSEZONE:
475 		op = REQ_OP_ZONE_CLOSE;
476 		break;
477 	case BLKFINISHZONE:
478 		op = REQ_OP_ZONE_FINISH;
479 		break;
480 	default:
481 		return -ENOTTY;
482 	}
483 
484 	ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
485 
486 fail:
487 	if (cmd == BLKRESETZONE) {
488 		filemap_invalidate_unlock(bdev->bd_mapping);
489 		inode_unlock(bdev->bd_mapping->host);
490 	}
491 
492 	return ret;
493 }
494 
495 static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
496 {
497 	return zone->start + zone->len >= get_capacity(disk);
498 }
499 
500 static bool disk_zone_wplug_is_full(struct gendisk *disk,
501 				    struct blk_zone_wplug *zwplug)
502 {
503 	if (zwplug->zone_no < disk->nr_zones - 1)
504 		return zwplug->wp_offset >= disk->zone_capacity;
505 	return zwplug->wp_offset >= disk->last_zone_capacity;
506 }
507 
508 static bool disk_insert_zone_wplug(struct gendisk *disk,
509 				   struct blk_zone_wplug *zwplug)
510 {
511 	struct blk_zone_wplug *zwplg;
512 	unsigned long flags;
513 	u8 *zones_cond;
514 	unsigned int idx =
515 		hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);
516 
517 	/*
518 	 * Add the new zone write plug to the hash table, but carefully as we
519 	 * may be racing with other submission contexts, so a zone write plug
520 	 * for the same zone may already exist.
521 	 */
522 	spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
523 	hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
524 		if (zwplg->zone_no == zwplug->zone_no) {
525 			spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock,
526 					       flags);
527 			return false;
528 		}
529 	}
530 
531 	/*
532 	 * Set the zone condition: if we do not yet have a zones_cond array
533 	 * attached to the disk, then this is a zone write plug insert from the
534 	 * first call to blk_revalidate_disk_zones(), in which case the zone is
535 	 * necessarily in the active condition.
536 	 */
537 	zones_cond = rcu_dereference_check(disk->zones_cond,
538 				lockdep_is_held(&disk->zone_wplugs_hash_lock));
539 	if (zones_cond)
540 		zwplug->cond = zones_cond[zwplug->zone_no];
541 	else
542 		zwplug->cond = BLK_ZONE_COND_ACTIVE;
543 
544 	hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
545 	atomic_inc(&disk->nr_zone_wplugs);
546 	spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);
547 
548 	return true;
549 }
550 
551 static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk,
552 							 sector_t sector)
553 {
554 	unsigned int zno = disk_zone_no(disk, sector);
555 	unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
556 	struct blk_zone_wplug *zwplug;
557 
558 	rcu_read_lock();
559 
560 	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
561 		if (zwplug->zone_no == zno &&
562 		    refcount_inc_not_zero(&zwplug->ref)) {
563 			rcu_read_unlock();
564 			return zwplug;
565 		}
566 	}
567 
568 	rcu_read_unlock();
569 
570 	return NULL;
571 }
572 
573 static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
574 							 sector_t sector)
575 {
576 	if (!atomic_read(&disk->nr_zone_wplugs))
577 		return NULL;
578 
579 	return disk_get_hashed_zone_wplug(disk, sector);
580 }
581 
582 static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
583 {
584 	struct blk_zone_wplug *zwplug =
585 		container_of(rcu_head, struct blk_zone_wplug, rcu_head);
586 
587 	mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
588 }
589 
590 static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug)
591 {
592 	struct gendisk *disk = zwplug->disk;
593 	unsigned long flags;
594 
595 	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_DEAD));
596 	WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
597 	WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
598 
599 	spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
600 	blk_zone_set_cond(rcu_dereference_check(disk->zones_cond,
601 				lockdep_is_held(&disk->zone_wplugs_hash_lock)),
602 			  zwplug->zone_no, zwplug->cond);
603 	hlist_del_init_rcu(&zwplug->node);
604 	atomic_dec(&disk->nr_zone_wplugs);
605 	spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);
606 
607 	call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
608 }
609 
610 static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
611 {
612 	if (refcount_dec_and_test(&zwplug->ref))
613 		disk_free_zone_wplug(zwplug);
614 }
615 
616 /*
617  * Flag the zone write plug as dead and drop the initial reference we got when
618  * the zone write plug was added to the hash table. The zone write plug will be
619  * unhashed when its last reference is dropped.
620  */
621 static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug)
622 {
623 	lockdep_assert_held(&zwplug->lock);
624 
625 	if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)) {
626 		zwplug->flags |= BLK_ZONE_WPLUG_DEAD;
627 		disk_put_zone_wplug(zwplug);
628 	}
629 }
630 
631 static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
632 				       struct blk_zone_wplug *zwplug);
633 
634 static void blk_zone_wplug_bio_work(struct work_struct *work)
635 {
636 	struct blk_zone_wplug *zwplug =
637 		container_of(work, struct blk_zone_wplug, bio_work);
638 
639 	disk_zone_wplug_submit_bio(zwplug->disk, zwplug);
640 
641 	/* Drop the reference we took in disk_zone_wplug_schedule_work(). */
642 	disk_put_zone_wplug(zwplug);
643 }
644 
645 /*
646  * Get a zone write plug for the zone containing @sector.
647  * If the plug does not exist, it is allocated and inserted in the disk hash
648  * table.
649  */
650 static struct blk_zone_wplug *disk_get_or_alloc_zone_wplug(struct gendisk *disk,
651 					sector_t sector, gfp_t gfp_mask)
652 {
653 	unsigned int zno = disk_zone_no(disk, sector);
654 	struct blk_zone_wplug *zwplug;
655 
656 again:
657 	zwplug = disk_get_zone_wplug(disk, sector);
658 	if (zwplug)
659 		return zwplug;
660 
661 	/*
662 	 * Allocate and initialize a zone write plug with an extra reference
663 	 * so that it is not freed when the zone write plug becomes idle without
664 	 * the zone being full.
665 	 */
666 	zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
667 	if (!zwplug)
668 		return NULL;
669 
670 	INIT_HLIST_NODE(&zwplug->node);
671 	refcount_set(&zwplug->ref, 2);
672 	spin_lock_init(&zwplug->lock);
673 	zwplug->flags = 0;
674 	zwplug->zone_no = zno;
675 	zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
676 	bio_list_init(&zwplug->bio_list);
677 	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
678 	INIT_LIST_HEAD(&zwplug->entry);
679 	zwplug->disk = disk;
680 
681 	/*
682 	 * Insert the new zone write plug in the hash table. This can fail only
683 	 * if another context already inserted a plug. Retry from the beginning
684 	 * in that case.
685 	 */
686 	if (!disk_insert_zone_wplug(disk, zwplug)) {
687 		mempool_free(zwplug, disk->zone_wplugs_pool);
688 		goto again;
689 	}
690 
691 	return zwplug;
692 }
693 
694 static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
695 					       struct bio *bio)
696 {
697 	struct request_queue *q = zwplug->disk->queue;
698 
699 	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
700 	bio_io_error(bio);
701 	disk_put_zone_wplug(zwplug);
702 	/* Drop the reference taken by disk_zone_wplug_add_bio(). */
703 	blk_queue_exit(q);
704 }
705 
706 /*
707  * Abort (fail) all plugged BIOs of a zone write plug.
708  */
709 static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
710 {
711 	struct gendisk *disk = zwplug->disk;
712 	struct bio *bio;
713 
714 	lockdep_assert_held(&zwplug->lock);
715 
716 	if (bio_list_empty(&zwplug->bio_list))
717 		return;
718 
719 	pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n",
720 			    zwplug->disk->disk_name, zwplug->zone_no);
721 	while ((bio = bio_list_pop(&zwplug->bio_list)))
722 		blk_zone_wplug_bio_io_error(zwplug, bio);
723 
724 	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
725 
726 	/*
727 	 * If we are using the per disk zone write plugs worker thread, remove
728 	 * the zone write plug from the work list and drop the reference we
729 	 * took when the zone write plug was added to that list.
730 	 */
731 	if (blk_queue_zoned_qd1_writes(disk->queue)) {
732 		spin_lock(&disk->zone_wplugs_list_lock);
733 		if (!list_empty(&zwplug->entry)) {
734 			list_del_init(&zwplug->entry);
735 			disk_put_zone_wplug(zwplug);
736 		}
737 		spin_unlock(&disk->zone_wplugs_list_lock);
738 	}
739 }
740 
741 /*
742  * Update a zone write plug condition based on the write pointer offset.
743  */
744 static void disk_zone_wplug_update_cond(struct gendisk *disk,
745 					struct blk_zone_wplug *zwplug)
746 {
747 	lockdep_assert_held(&zwplug->lock);
748 
749 	if (disk_zone_wplug_is_full(disk, zwplug))
750 		zwplug->cond = BLK_ZONE_COND_FULL;
751 	else if (!zwplug->wp_offset)
752 		zwplug->cond = BLK_ZONE_COND_EMPTY;
753 	else
754 		zwplug->cond = BLK_ZONE_COND_ACTIVE;
755 }
756 
757 /*
758  * Set a zone write plug write pointer offset to the specified value.
759  * This aborts all plugged BIOs, which is fine as this function is called for
760  * a zone reset operation, a zone finish operation, or when the zone needs a
761  * write pointer update from a zone report after a write error.
762  */
763 static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
764 					  struct blk_zone_wplug *zwplug,
765 					  unsigned int wp_offset)
766 {
767 	lockdep_assert_held(&zwplug->lock);
768 
769 	/* Update the zone write pointer and abort all plugged BIOs. */
770 	zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE;
771 	zwplug->wp_offset = wp_offset;
772 	disk_zone_wplug_update_cond(disk, zwplug);
773 
774 	disk_zone_wplug_abort(zwplug);
775 	if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
776 		disk_mark_zone_wplug_dead(zwplug);
777 }
778 
779 static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
780 {
781 	switch (zone->cond) {
782 	case BLK_ZONE_COND_IMP_OPEN:
783 	case BLK_ZONE_COND_EXP_OPEN:
784 	case BLK_ZONE_COND_CLOSED:
785 	case BLK_ZONE_COND_ACTIVE:
786 		return zone->wp - zone->start;
787 	case BLK_ZONE_COND_EMPTY:
788 		return 0;
789 	case BLK_ZONE_COND_FULL:
790 	case BLK_ZONE_COND_NOT_WP:
791 	case BLK_ZONE_COND_OFFLINE:
792 	case BLK_ZONE_COND_READONLY:
793 	default:
794 		/*
795 		 * Conventional, full, offline and read-only zones do not have
796 		 * a valid write pointer.
797 		 */
798 		return UINT_MAX;
799 	}
800 }
801 
802 static unsigned int disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
803 						   struct blk_zone *zone)
804 {
805 	struct blk_zone_wplug *zwplug;
806 	unsigned int wp_offset = blk_zone_wp_offset(zone);
807 
808 	zwplug = disk_get_zone_wplug(disk, zone->start);
809 	if (zwplug) {
810 		unsigned long flags;
811 
812 		spin_lock_irqsave(&zwplug->lock, flags);
813 		if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
814 			disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
815 		spin_unlock_irqrestore(&zwplug->lock, flags);
816 		disk_put_zone_wplug(zwplug);
817 	}
818 
819 	return wp_offset;
820 }
821 
822 /**
823  * disk_report_zone - Report one zone
824  * @disk:	Target disk
825  * @zone:	The zone to report
826  * @idx:	The index of the zone in the overall zone report
827  * @args:	report zones callback and data
828  *
829  * Description:
830  *    Helper function for block device drivers to report one zone of a zone
831  *    report initiated with blkdev_report_zones(). The zone being reported is
832  *    specified by @zone and used to update, if necessary, the zone write plug
833  *    information for the zone. If @args specifies a user callback function,
834  *    this callback is executed.
835  */
836 int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
837 		     unsigned int idx, struct blk_report_zones_args *args)
838 {
839 	if (args && args->report_active) {
840 		/*
841 		 * If we get here, this is a regular zone report used as a
842 		 * fallback for a cached report. So collapse the implicit open,
843 		 * explicit open and closed conditions into the active condition.
844 		 */
845 		switch (zone->cond) {
846 		case BLK_ZONE_COND_IMP_OPEN:
847 		case BLK_ZONE_COND_EXP_OPEN:
848 		case BLK_ZONE_COND_CLOSED:
849 			zone->cond = BLK_ZONE_COND_ACTIVE;
850 			break;
851 		default:
852 			break;
853 		}
854 	}
855 
856 	if (disk->zone_wplugs_hash)
857 		disk_zone_wplug_sync_wp_offset(disk, zone);
858 
859 	if (args && args->cb)
860 		return args->cb(zone, idx, args->data);
861 
862 	return 0;
863 }
864 EXPORT_SYMBOL_GPL(disk_report_zone);
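
/*
 * Example (illustrative sketch, not part of the original file): the shape of
 * a block device driver ->report_zones() method using disk_report_zone() for
 * every reported zone. The function name and the zone information built here
 * are hypothetical; a real driver would fill struct blk_zone from the device
 * response.
 */
static int __maybe_unused example_driver_report_zones(struct gendisk *disk,
			sector_t sector, unsigned int nr_zones,
			struct blk_report_zones_args *args)
{
	sector_t capacity = get_capacity(disk);
	struct blk_zone zone;
	unsigned int idx;
	int ret;

	for (idx = 0; idx < nr_zones && sector < capacity; idx++) {
		/* Hypothetical zone data: a real driver asks the device. */
		memset(&zone, 0, sizeof(zone));
		zone.start = bdev_zone_start(disk->part0, sector);
		zone.len = bdev_zone_sectors(disk->part0);
		zone.capacity = zone.len;
		zone.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
		zone.cond = BLK_ZONE_COND_EMPTY;
		zone.wp = zone.start;

		/* Update the zone write plug state and run the callback. */
		ret = disk_report_zone(disk, &zone, idx, args);
		if (ret)
			return ret;

		sector = zone.start + zone.len;
	}

	/* Return the number of zones reported. */
	return idx;
}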
865 
866 static int blkdev_report_zone_cb(struct blk_zone *zone, unsigned int idx,
867 				 void *data)
868 {
869 	memcpy(data, zone, sizeof(struct blk_zone));
870 	return 0;
871 }
872 
873 static int blkdev_report_zone_fallback(struct block_device *bdev,
874 				       sector_t sector, struct blk_zone *zone)
875 {
876 	struct blk_report_zones_args args = {
877 		.cb = blkdev_report_zone_cb,
878 		.data = zone,
879 		.report_active = true,
880 	};
881 	int error;
882 
883 	error = blkdev_do_report_zones(bdev, sector, 1, &args);
884 	if (error < 0)
885 		return error;
886 	if (error == 0)
887 		return -EIO;
888 	return 0;
889 }
890 
891 /*
892  * For devices that natively support zone append operations, we do not use zone
893  * write plugging for zone append writes, which makes the zone condition
894  * tracking invalid once zone append has been used. In that case, fall back
895  * to a regular zone report to get correct information.
896  */
897 static inline bool blkdev_has_cached_report_zones(struct block_device *bdev)
898 {
899 	return disk_need_zone_resources(bdev->bd_disk) &&
900 		(bdev_emulates_zone_append(bdev) ||
901 		 !test_bit(GD_ZONE_APPEND_USED, &bdev->bd_disk->state));
902 }
903 
904 /**
905  * blkdev_get_zone_info - Get a single zone information from cached data
906  * @bdev:   Target block device
907  * @sector: Sector contained by the target zone
908  * @zone:   Zone structure used to return the zone information
909  *
910  * Description:
911  *    Get the zone information for the zone containing @sector using the zone
912  *    write plug of the target zone, if one exists, or the disk zone condition
913  *    array otherwise. The zone condition may be reported as
914  *    BLK_ZONE_COND_ACTIVE for a zone that is in the implicit open,
915  *    explicit open or closed condition.
916  *
917  *    Returns 0 on success and a negative error code on failure.
918  */
919 int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
920 			 struct blk_zone *zone)
921 {
922 	struct gendisk *disk = bdev->bd_disk;
923 	sector_t zone_sectors = bdev_zone_sectors(bdev);
924 	struct blk_zone_wplug *zwplug;
925 	unsigned long flags;
926 	u8 *zones_cond;
927 
928 	if (!bdev_is_zoned(bdev))
929 		return -EOPNOTSUPP;
930 
931 	if (sector >= get_capacity(disk))
932 		return -EINVAL;
933 
934 	memset(zone, 0, sizeof(*zone));
935 	sector = bdev_zone_start(bdev, sector);
936 
937 	if (!blkdev_has_cached_report_zones(bdev))
938 		return blkdev_report_zone_fallback(bdev, sector, zone);
939 
940 	rcu_read_lock();
941 	zones_cond = rcu_dereference(disk->zones_cond);
942 	if (!disk->zone_wplugs_hash || !zones_cond) {
943 		rcu_read_unlock();
944 		return blkdev_report_zone_fallback(bdev, sector, zone);
945 	}
946 	zone->cond = zones_cond[disk_zone_no(disk, sector)];
947 	rcu_read_unlock();
948 
949 	zone->start = sector;
950 	zone->len = zone_sectors;
951 
952 	/*
953 	 * If this is a conventional zone, we do not have a zone write plug and
954 	 * can report the zone immediately.
955 	 */
956 	if (zone->cond == BLK_ZONE_COND_NOT_WP) {
957 		zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
958 		zone->capacity = zone_sectors;
959 		zone->wp = ULLONG_MAX;
960 		return 0;
961 	}
962 
963 	/*
964 	 * This is a sequential write required zone. If the zone is read-only or
965 	 * offline, only set the zone write pointer to an invalid value and
966 	 * report the zone.
967 	 */
968 	zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
969 	if (disk_zone_is_last(disk, zone))
970 		zone->capacity = disk->last_zone_capacity;
971 	else
972 		zone->capacity = disk->zone_capacity;
973 
974 	if (zone->cond == BLK_ZONE_COND_READONLY ||
975 	    zone->cond == BLK_ZONE_COND_OFFLINE) {
976 		zone->wp = ULLONG_MAX;
977 		return 0;
978 	}
979 
980 	/*
981 	 * If the zone does not have a zone write plug, it is either full or
982 	 * empty, as we otherwise would have a zone write plug for it. In this
983 	 * case, set the write pointer accordingly and report the zone.
984 	 * Otherwise, if we have a zone write plug, use it.
985 	 */
986 	zwplug = disk_get_zone_wplug(disk, sector);
987 	if (!zwplug) {
988 		if (zone->cond == BLK_ZONE_COND_FULL)
989 			zone->wp = ULLONG_MAX;
990 		else
991 			zone->wp = sector;
992 		return 0;
993 	}
994 
995 	spin_lock_irqsave(&zwplug->lock, flags);
996 	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) {
997 		spin_unlock_irqrestore(&zwplug->lock, flags);
998 		disk_put_zone_wplug(zwplug);
999 		return blkdev_report_zone_fallback(bdev, sector, zone);
1000 	}
1001 	zone->cond = zwplug->cond;
1002 	zone->wp = sector + zwplug->wp_offset;
1003 	spin_unlock_irqrestore(&zwplug->lock, flags);
1004 
1005 	disk_put_zone_wplug(zwplug);
1006 
1007 	return 0;
1008 }
1009 EXPORT_SYMBOL_GPL(blkdev_get_zone_info);
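
/*
 * Example (illustrative sketch, not part of the original file): getting the
 * write pointer position of the zone containing @sector using
 * blkdev_get_zone_info(). The function name is hypothetical.
 */
static int __maybe_unused example_get_zone_wp(struct block_device *bdev,
					      sector_t sector, sector_t *wp)
{
	struct blk_zone zone;
	int ret;

	ret = blkdev_get_zone_info(bdev, sector, &zone);
	if (ret)
		return ret;

	/*
	 * Conventional, read-only, offline and full zones do not have a
	 * valid write pointer.
	 */
	if (zone.wp == ULLONG_MAX)
		return -ENXIO;

	*wp = zone.wp;
	return 0;
}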
1010 
1011 /**
1012  * blkdev_report_zones_cached - Get cached zones information
1013  * @bdev:     Target block device
1014  * @sector:   Sector from which to report zones
1015  * @nr_zones: Maximum number of zones to report
1016  * @cb:       Callback function called for each reported zone
1017  * @data:     Private data for the callback function
1018  *
1019  * Description:
1020  *    Similar to blkdev_report_zones() but instead of calling into the low level
1021  *    device driver to get the zone report from the device, use
1022  *    blkdev_get_zone_info() to generate the report from the disk zone write
1023  *    plugs and the zone condition array. Since calling this function without a
1024  *    callback does not make sense, @cb must be specified.
1025  */
1026 int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector,
1027 			unsigned int nr_zones, report_zones_cb cb, void *data)
1028 {
1029 	struct gendisk *disk = bdev->bd_disk;
1030 	sector_t capacity = get_capacity(disk);
1031 	sector_t zone_sectors = bdev_zone_sectors(bdev);
1032 	unsigned int idx = 0;
1033 	struct blk_zone zone;
1034 	int ret;
1035 
1036 	if (!cb || !bdev_is_zoned(bdev) ||
1037 	    WARN_ON_ONCE(!disk->fops->report_zones))
1038 		return -EOPNOTSUPP;
1039 
1040 	if (!nr_zones || sector >= capacity)
1041 		return 0;
1042 
1043 	if (!blkdev_has_cached_report_zones(bdev)) {
1044 		struct blk_report_zones_args args = {
1045 			.cb = cb,
1046 			.data = data,
1047 			.report_active = true,
1048 		};
1049 
1050 		return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
1051 	}
1052 
1053 	for (sector = bdev_zone_start(bdev, sector);
1054 	     sector < capacity && idx < nr_zones;
1055 	     sector += zone_sectors, idx++) {
1056 		ret = blkdev_get_zone_info(bdev, sector, &zone);
1057 		if (ret)
1058 			return ret;
1059 
1060 		ret = cb(&zone, idx, data);
1061 		if (ret)
1062 			return ret;
1063 	}
1064 
1065 	return idx;
1066 }
1067 EXPORT_SYMBOL_GPL(blkdev_report_zones_cached);
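
/*
 * Example (illustrative sketch, not part of the original file): counting
 * active zones with blkdev_report_zones_cached(). A cached report collapses
 * the implicit open, explicit open and closed conditions into
 * BLK_ZONE_COND_ACTIVE, so a single test is enough. The names used are
 * hypothetical.
 */
static int example_count_active_cb(struct blk_zone *zone, unsigned int idx,
				   void *data)
{
	unsigned int *nr_active = data;

	if (zone->cond == BLK_ZONE_COND_ACTIVE)
		(*nr_active)++;
	return 0;
}

static int __maybe_unused example_count_active_zones(struct block_device *bdev)
{
	unsigned int nr_active = 0;
	int ret;

	ret = blkdev_report_zones_cached(bdev, 0, BLK_ALL_ZONES,
					 example_count_active_cb, &nr_active);
	if (ret < 0)
		return ret;

	return nr_active;
}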
1068 
1069 static void blk_zone_reset_bio_endio(struct bio *bio)
1070 {
1071 	struct gendisk *disk = bio->bi_bdev->bd_disk;
1072 	sector_t sector = bio->bi_iter.bi_sector;
1073 	struct blk_zone_wplug *zwplug;
1074 
1075 	/*
1076 	 * If we have a zone write plug, set its write pointer offset to 0.
1077 	 * This will abort all BIOs plugged for the target zone. It is fine as
1078 	 * resetting zones while writes are still in-flight will result in the
1079 	 * writes failing anyway.
1080 	 */
1081 	zwplug = disk_get_zone_wplug(disk, sector);
1082 	if (zwplug) {
1083 		unsigned long flags;
1084 
1085 		spin_lock_irqsave(&zwplug->lock, flags);
1086 		disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
1087 		spin_unlock_irqrestore(&zwplug->lock, flags);
1088 		disk_put_zone_wplug(zwplug);
1089 	} else {
1090 		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
1091 	}
1092 }
1093 
1094 static void blk_zone_reset_all_bio_endio(struct bio *bio)
1095 {
1096 	struct gendisk *disk = bio->bi_bdev->bd_disk;
1097 	sector_t capacity = get_capacity(disk);
1098 	struct blk_zone_wplug *zwplug;
1099 	unsigned long flags;
1100 	sector_t sector;
1101 	unsigned int i;
1102 
1103 	if (atomic_read(&disk->nr_zone_wplugs)) {
1104 		/* Reset the write pointer offset of all zone write plugs. */
1105 		rcu_read_lock();
1106 		for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
1107 			hlist_for_each_entry_rcu(zwplug,
1108 						 &disk->zone_wplugs_hash[i],
1109 						 node) {
1110 				spin_lock_irqsave(&zwplug->lock, flags);
1111 				disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
1112 				spin_unlock_irqrestore(&zwplug->lock, flags);
1113 			}
1114 		}
1115 		rcu_read_unlock();
1116 	}
1117 
1118 	/* Update the cached zone conditions. */
1119 	for (sector = 0; sector < capacity;
1120 	     sector += bdev_zone_sectors(bio->bi_bdev))
1121 		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
1122 	clear_bit(GD_ZONE_APPEND_USED, &disk->state);
1123 }
1124 
1125 static void blk_zone_finish_bio_endio(struct bio *bio)
1126 {
1127 	struct block_device *bdev = bio->bi_bdev;
1128 	struct gendisk *disk = bdev->bd_disk;
1129 	sector_t sector = bio->bi_iter.bi_sector;
1130 	struct blk_zone_wplug *zwplug;
1131 
1132 	/*
1133 	 * If we have a zone write plug, set its write pointer offset to the
1134 	 * zone size. This will abort all BIOs plugged for the target zone. It
1135 	 * is fine as finishing zones while writes are still in-flight will
1136 	 * result in the writes failing anyway.
1137 	 */
1138 	zwplug = disk_get_zone_wplug(disk, sector);
1139 	if (zwplug) {
1140 		unsigned long flags;
1141 
1142 		spin_lock_irqsave(&zwplug->lock, flags);
1143 		disk_zone_wplug_set_wp_offset(disk, zwplug,
1144 					      bdev_zone_sectors(bdev));
1145 		spin_unlock_irqrestore(&zwplug->lock, flags);
1146 		disk_put_zone_wplug(zwplug);
1147 	} else {
1148 		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_FULL);
1149 	}
1150 }
1151 
1152 void blk_zone_mgmt_bio_endio(struct bio *bio)
1153 {
1154 	/* If the BIO failed, we have nothing to do. */
1155 	if (bio->bi_status != BLK_STS_OK)
1156 		return;
1157 
1158 	switch (bio_op(bio)) {
1159 	case REQ_OP_ZONE_RESET:
1160 		blk_zone_reset_bio_endio(bio);
1161 		return;
1162 	case REQ_OP_ZONE_RESET_ALL:
1163 		blk_zone_reset_all_bio_endio(bio);
1164 		return;
1165 	case REQ_OP_ZONE_FINISH:
1166 		blk_zone_finish_bio_endio(bio);
1167 		return;
1168 	default:
1169 		return;
1170 	}
1171 }
1172 
1173 static void disk_zone_wplug_schedule_work(struct gendisk *disk,
1174 					  struct blk_zone_wplug *zwplug)
1175 {
1176 	lockdep_assert_held(&zwplug->lock);
1177 
1178 	/*
1179 	 * Schedule the submission of the next plugged BIO. Taking a reference
1180 	 * to the zone write plug is required as the bio_work belongs to the
1181 	 * plug, and thus we must ensure that the write plug does not go away
1182 	 * while the work is being scheduled but has not run yet.
1183 	 * blk_zone_wplug_bio_work() will release the reference we take here,
1184 	 * and we also drop this reference if the work is already scheduled.
1185 	 */
1186 	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
1187 	WARN_ON_ONCE(blk_queue_zoned_qd1_writes(disk->queue));
1188 	refcount_inc(&zwplug->ref);
1189 	if (!queue_work(disk->zone_wplugs_wq, &zwplug->bio_work))
1190 		disk_put_zone_wplug(zwplug);
1191 }
1192 
1193 static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
1194 				struct blk_zone_wplug *zwplug,
1195 				struct bio *bio, unsigned int nr_segs)
1196 {
1197 	/*
1198 	 * Grab an extra reference on the BIO request queue usage counter.
1199 	 * This reference is reused to submit a request for the BIO for blk-mq
1200 	 * devices, and is dropped when the BIO is failed or, in the case of
1201 	 * BIO-based devices, after the BIO is issued.
1202 	 */
1203 	percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);
1204 
1205 	/*
1206 	 * The BIO is being plugged and thus will have to wait for the on-going
1207 	 * write and for all other writes already plugged. So polling makes
1208 	 * no sense.
1209 	 */
1210 	bio_clear_polled(bio);
1211 
1212 	/*
1213 	 * Reuse the poll cookie field to store the number of segments of the
1214 	 * BIO after it was split to the hardware limits.
1215 	 */
1216 	bio->__bi_nr_segments = nr_segs;
1217 
1218 	/*
1219 	 * We always receive BIOs after they are split and ready to be issued.
1220 	 * The block layer passes the parts of a split BIO in order, and the
1221 	 * user must also issue writes sequentially. So simply add the new BIO
1222 	 * at the tail of the list to preserve the sequential write order.
1223 	 */
1224 	bio_list_add(&zwplug->bio_list, bio);
1225 	trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no,
1226 				      bio->bi_iter.bi_sector, bio_sectors(bio));
1227 
1228 	/*
1229 	 * If we are using the disk zone write plugs worker instead of the per
1230 	 * zone write plug BIO work, add the zone write plug to the work list
1231 	 * if it is not already there. Make sure to also get an extra reference
1232 	 * on the zone write plug so that it does not go away until it is
1233 	 * removed from the work list.
1234 	 */
1235 	if (blk_queue_zoned_qd1_writes(disk->queue)) {
1236 		spin_lock(&disk->zone_wplugs_list_lock);
1237 		if (list_empty(&zwplug->entry)) {
1238 			list_add_tail(&zwplug->entry, &disk->zone_wplugs_list);
1239 			refcount_inc(&zwplug->ref);
1240 		}
1241 		spin_unlock(&disk->zone_wplugs_list_lock);
1242 	}
1243 }
1244 
1245 /*
1246  * Called from bio_attempt_back_merge() when a BIO was merged with a request.
1247  */
1248 void blk_zone_write_plug_bio_merged(struct bio *bio)
1249 {
1250 	struct gendisk *disk = bio->bi_bdev->bd_disk;
1251 	struct blk_zone_wplug *zwplug;
1252 	unsigned long flags;
1253 
1254 	/*
1255 	 * If the BIO was already plugged, then we were called through
1256 	 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
1257 	 * For this case, we already hold a reference on the zone write plug for
1258 	 * the BIO and blk_zone_write_plug_init_request() will handle the
1259 	 * zone write pointer offset update.
1260 	 */
1261 	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
1262 		return;
1263 
1264 	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
1265 
1266 	/*
1267 	 * Get a reference on the zone write plug of the target zone and advance
1268 	 * the zone write pointer offset. Given that this is a merge, we already
1269 	 * have at least one request and one BIO referencing the zone write
1270 	 * plug. So this should not fail.
1271 	 */
1272 	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
1273 	if (WARN_ON_ONCE(!zwplug))
1274 		return;
1275 
1276 	spin_lock_irqsave(&zwplug->lock, flags);
1277 	zwplug->wp_offset += bio_sectors(bio);
1278 	disk_zone_wplug_update_cond(disk, zwplug);
1279 	spin_unlock_irqrestore(&zwplug->lock, flags);
1280 }
1281 
1282 /*
1283  * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
1284  * already went through zone write plugging (either a new BIO or one that was
1285  * unplugged).
1286  */
1287 void blk_zone_write_plug_init_request(struct request *req)
1288 {
1289 	sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
1290 	struct request_queue *q = req->q;
1291 	struct gendisk *disk = q->disk;
1292 	struct blk_zone_wplug *zwplug =
1293 		disk_get_zone_wplug(disk, blk_rq_pos(req));
1294 	unsigned long flags;
1295 	struct bio *bio;
1296 
1297 	if (WARN_ON_ONCE(!zwplug))
1298 		return;
1299 
1300 	/*
1301 	 * Indicate that completion of this request needs to be handled with
1302 	 * blk_zone_write_plug_finish_request(), which will drop the reference
1303 	 * on the zone write plug we took above on entry to this function.
1304 	 */
1305 	req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;
1306 
1307 	if (blk_queue_nomerges(q))
1308 		return;
1309 
1310 	/*
1311 	 * Walk through the list of plugged BIOs to check if they can be merged
1312 	 * into the back of the request.
1313 	 */
1314 	spin_lock_irqsave(&zwplug->lock, flags);
1315 	while (!disk_zone_wplug_is_full(disk, zwplug)) {
1316 		bio = bio_list_peek(&zwplug->bio_list);
1317 		if (!bio)
1318 			break;
1319 
1320 		if (bio->bi_iter.bi_sector != req_back_sector ||
1321 		    !blk_rq_merge_ok(req, bio))
1322 			break;
1323 
1324 		WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
1325 			     !bio->__bi_nr_segments);
1326 
1327 		bio_list_pop(&zwplug->bio_list);
1328 		if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
1329 		    BIO_MERGE_OK) {
1330 			bio_list_add_head(&zwplug->bio_list, bio);
1331 			break;
1332 		}
1333 
1334 		/* Drop the reference taken by disk_zone_wplug_add_bio(). */
1335 		blk_queue_exit(q);
1336 		zwplug->wp_offset += bio_sectors(bio);
1337 		disk_zone_wplug_update_cond(disk, zwplug);
1338 
1339 		req_back_sector += bio_sectors(bio);
1340 	}
1341 	spin_unlock_irqrestore(&zwplug->lock, flags);
1342 }
1343 
1344 /*
1345  * Check and prepare a BIO for submission by incrementing the write pointer
1346  * offset of its zone write plug and changing zone append operations into
1347  * regular writes when zone append emulation is needed.
1348  */
1349 static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
1350 				       struct bio *bio)
1351 {
1352 	struct gendisk *disk = bio->bi_bdev->bd_disk;
1353 
1354 	lockdep_assert_held(&zwplug->lock);
1355 
1356 	/*
1357 	 * If we lost track of the zone write pointer due to a write error,
1358 	 * the user must either execute a report zones, reset the zone or finish
1359 	 * the zone to recover a reliable write pointer position. Fail BIOs if
1360 	 * the user did not do so, as we cannot handle emulated zone append
1361 	 * otherwise.
1362 	 */
1363 	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
1364 		return false;
1365 
1366 	/*
1367 	 * Check that the user is not attempting to write to a full zone.
1368 	 * We know such a BIO will fail, and that would potentially overflow our
1369 	 * write pointer offset beyond the end of the zone.
1370 	 */
1371 	if (disk_zone_wplug_is_full(disk, zwplug))
1372 		return false;
1373 
1374 	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
1375 		/*
1376 		 * Use a regular write starting at the current write pointer.
1377 		 * Similarly to native zone append operations, do not allow
1378 		 * merging.
1379 		 */
1380 		bio->bi_opf &= ~REQ_OP_MASK;
1381 		bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
1382 		bio->bi_iter.bi_sector += zwplug->wp_offset;
1383 
1384 		/*
1385 		 * Remember that this BIO is in fact a zone append operation
1386 		 * so that we can restore its operation code on completion.
1387 		 */
1388 		bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
1389 	} else {
1390 		/*
1391 		 * Check for non-sequential writes early as we know that BIOs
1392 		 * with a start sector not aligned to the zone write pointer
1393 		 * will fail.
1394 		 */
1395 		if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
1396 			return false;
1397 	}
1398 
1399 	/* Advance the zone write pointer offset. */
1400 	zwplug->wp_offset += bio_sectors(bio);
1401 	disk_zone_wplug_update_cond(disk, zwplug);
1402 
1403 	return true;
1404 }
1405 
1406 static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
1407 {
1408 	struct gendisk *disk = bio->bi_bdev->bd_disk;
1409 	sector_t sector = bio->bi_iter.bi_sector;
1410 	struct blk_zone_wplug *zwplug;
1411 	gfp_t gfp_mask = GFP_NOIO;
1412 	unsigned long flags;
1413 
1414 	/*
1415 	 * BIOs must be fully contained within a zone so that we use the correct
1416 	 * zone write plug for the entire BIO. For blk-mq devices, the block
1417 	 * layer should already have done any splitting required to ensure this,
1418 	 * so this BIO should not be straddling zone boundaries. For
1419 	 * BIO-based devices, it is the responsibility of the driver to split
1420 	 * the bio before submitting it.
1421 	 */
1422 	if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
1423 		bio_io_error(bio);
1424 		return true;
1425 	}
1426 
1427 	/* Conventional zones do not need write plugging. */
1428 	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
1429 		/* Zone append to conventional zones is not allowed. */
1430 		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
1431 			bio_io_error(bio);
1432 			return true;
1433 		}
1434 		return false;
1435 	}
1436 
1437 	if (bio->bi_opf & REQ_NOWAIT)
1438 		gfp_mask = GFP_NOWAIT;
1439 
1440 	zwplug = disk_get_or_alloc_zone_wplug(disk, sector, gfp_mask);
1441 	if (!zwplug) {
1442 		if (bio->bi_opf & REQ_NOWAIT)
1443 			bio_wouldblock_error(bio);
1444 		else
1445 			bio_io_error(bio);
1446 		return true;
1447 	}
1448 
1449 	spin_lock_irqsave(&zwplug->lock, flags);
1450 
1451 	/*
1452 	 * If we got a zone write plug marked as dead, then the user is issuing
1453 	 * writes to a full zone, or without synchronizing with zone reset or
1454 	 * zone finish operations. In that case, fail the BIO to signal this
1455 	 * invalid usage.
1456 	 */
1457 	if (zwplug->flags & BLK_ZONE_WPLUG_DEAD) {
1458 		spin_unlock_irqrestore(&zwplug->lock, flags);
1459 		disk_put_zone_wplug(zwplug);
1460 		bio_io_error(bio);
1461 		return true;
1462 	}
1463 
1464 	/* Indicate that this BIO is being handled using zone write plugging. */
1465 	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
1466 
1467 	/*
1468 	 * Add REQ_NOWAIT BIOs to the plug list to ensure that we will not see a
1469 	 * BLK_STS_AGAIN failure if we let the caller submit the BIO.
1470 	 */
1471 	if (bio->bi_opf & REQ_NOWAIT) {
1472 		bio->bi_opf &= ~REQ_NOWAIT;
1473 		goto queue_bio;
1474 	}
1475 
1476 	/*
1477 	 * For rotational devices, we will use the gendisk zone write plugs
1478 	 * work instead of the per zone write plug BIO work, so queue the BIO.
1479 	 */
1480 	if (blk_queue_zoned_qd1_writes(disk->queue))
1481 		goto queue_bio;
1482 
1483 	/* If the zone is already plugged, add the BIO to the BIO plug list. */
1484 	if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
1485 		goto queue_bio;
1486 
1487 	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
1488 		spin_unlock_irqrestore(&zwplug->lock, flags);
1489 		bio_io_error(bio);
1490 		return true;
1491 	}
1492 
1493 	/* Otherwise, plug and let the caller submit the BIO. */
1494 	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
1495 
1496 	spin_unlock_irqrestore(&zwplug->lock, flags);
1497 
1498 	return false;
1499 
1500 queue_bio:
1501 	disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs);
1502 
1503 	if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) {
1504 		zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
1505 		if (blk_queue_zoned_qd1_writes(disk->queue))
1506 			wake_up_process(disk->zone_wplugs_worker);
1507 		else
1508 			disk_zone_wplug_schedule_work(disk, zwplug);
1509 	}
1510 
1511 	spin_unlock_irqrestore(&zwplug->lock, flags);
1512 
1513 	return true;
1514 }
1515 
1516 static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
1517 {
1518 	struct gendisk *disk = bio->bi_bdev->bd_disk;
1519 	struct blk_zone_wplug *zwplug;
1520 	unsigned long flags;
1521 
1522 	if (!test_bit(GD_ZONE_APPEND_USED, &disk->state))
1523 		set_bit(GD_ZONE_APPEND_USED, &disk->state);
1524 
1525 	/*
1526 	 * We have native support for zone append operations, so we are not
1527 	 * going to handle @bio through plugging. However, we may already have a
1528 	 * zone write plug for the target zone if that zone was previously
1529 	 * partially written using regular writes. In that case, we risk leaving
1530 	 * the plug in the disk hash table if the zone is fully written using
1531 	 * zone append operations. Avoid this by removing the zone write plug.
1532 	 */
1533 	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
1534 	if (likely(!zwplug))
1535 		return;
1536 
1537 	spin_lock_irqsave(&zwplug->lock, flags);
1538 
1539 	/*
1540 	 * We are about to remove the zone write plug. But if the user
1541 	 * (mistakenly) has issued regular writes together with native zone
1542 	 * append, we must abort the writes, as otherwise the plugged BIOs would
1543 	 * not be executed by the plug BIO work since disk_get_zone_wplug() will
1544 	 * return NULL after the plug is removed. Aborting the plugged write
1545 	 * BIOs is consistent with the fact that these writes will most likely
1546 	 * fail anyway as there are no ordering guarantees between zone append
1547 	 * operations and regular write operations.
1548 	 */
1549 	if (!bio_list_empty(&zwplug->bio_list)) {
1550 		pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n",
1551 				    disk->disk_name, zwplug->zone_no);
1552 		disk_zone_wplug_abort(zwplug);
1553 	}
1554 	disk_mark_zone_wplug_dead(zwplug);
1555 	spin_unlock_irqrestore(&zwplug->lock, flags);
1556 
1557 	disk_put_zone_wplug(zwplug);
1558 }
1559 
1560 static bool blk_zone_wplug_handle_zone_mgmt(struct bio *bio)
1561 {
1562 	if (bio_op(bio) != REQ_OP_ZONE_RESET_ALL &&
1563 	    !bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
1564 		/*
1565 		 * Zone reset and zone finish operations do not apply to
1566 		 * conventional zones.
1567 		 */
1568 		bio_io_error(bio);
1569 		return true;
1570 	}
1571 
1572 	/*
1573 	 * No-wait zone management BIOs do not make much sense as the callers
1574 	 * issue these as blocking operations in most cases. To avoid issues
1575 	 * with the BIO execution potentially failing with BLK_STS_AGAIN, warn
1576 	 * about REQ_NOWAIT being set and ignore that flag.
1577 	 */
1578 	if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT))
1579 		bio->bi_opf &= ~REQ_NOWAIT;
1580 
1581 	return false;
1582 }
1583 
1584 /**
1585  * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
1586  * @bio: The BIO being submitted
1587  * @nr_segs: The number of physical segments of @bio
1588  *
1589  * Handle write, write zeroes and zone append operations requiring emulation
1590  * using zone write plugging.
1591  *
1592  * Return true whenever @bio execution needs to be delayed through the zone
1593  * write plug. Otherwise, return false to let the submission path process
1594  * @bio normally.
1595  */
1596 bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
1597 {
1598 	struct block_device *bdev = bio->bi_bdev;
1599 
1600 	if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash))
1601 		return false;
1602 
1603 	/*
1604 	 * Regular writes and write zeroes need to be handled through the target
1605 	 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
1606 	 * which may need to go through the flush machinery depending on the
1607 	 * target device capabilities. Plugging such writes is fine as the flush
1608 	 * machinery operates at the request level, below the plug, and
1609 	 * completion of the flush sequence will go through the regular BIO
1610 	 * completion, which will handle zone write plugging.
1611 	 * Zone append operations for devices that requested emulation must
1612 	 * also be plugged so that these BIOs can be changed into regular
1613 	 * write BIOs.
1614 	 * Zone reset, reset all and finish commands need special treatment
1615 	 * to correctly track the write pointer offset of zones. These commands
1616 	 * are not plugged as we do not need serialization with write
1617 	 * operations. It is the responsibility of the user to not issue reset
1618 	 * and finish commands when write operations are in flight.
1619 	 */
1620 	switch (bio_op(bio)) {
1621 	case REQ_OP_ZONE_APPEND:
1622 		if (!bdev_emulates_zone_append(bdev)) {
1623 			blk_zone_wplug_handle_native_zone_append(bio);
1624 			return false;
1625 		}
1626 		fallthrough;
1627 	case REQ_OP_WRITE:
1628 	case REQ_OP_WRITE_ZEROES:
1629 		return blk_zone_wplug_handle_write(bio, nr_segs);
1630 	case REQ_OP_ZONE_RESET:
1631 	case REQ_OP_ZONE_FINISH:
1632 	case REQ_OP_ZONE_RESET_ALL:
1633 		return blk_zone_wplug_handle_zone_mgmt(bio);
1634 	default:
1635 		return false;
1636 	}
1637 
1638 	return false;
1639 }
1640 EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
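
/*
 * Example (illustrative sketch, not part of the original file): how the
 * submission path of a BIO-based driver that relies on zone write plugging
 * can use blk_zone_plug_bio(). The function name and the final submission
 * step are hypothetical.
 */
static void __maybe_unused example_bio_driver_submit(struct bio *bio)
{
	/*
	 * BIO-based drivers pass nr_segs = 0 as the BIO was not split against
	 * the hardware limits. If blk_zone_plug_bio() returns true, the BIO
	 * was plugged and will be resubmitted later through the zone write
	 * plug BIO work, so it must not be issued now.
	 */
	if (bdev_is_zoned(bio->bi_bdev) && blk_zone_plug_bio(bio, 0))
		return;

	/* Hypothetical: remap and issue the BIO to the backing device. */
	submit_bio_noacct(bio);
}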
1641 
1642 static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
1643 				       struct blk_zone_wplug *zwplug)
1644 {
1645 	unsigned long flags;
1646 
1647 	spin_lock_irqsave(&zwplug->lock, flags);
1648 
1649 	/*
1650 	 * For rotational devices, signal the BIO completion to the zone write
1651 	 * plug work. Otherwise, schedule submission of the next plugged BIO
1652 	 * if we have one.
1653 	 */
1654 	if (bio_list_empty(&zwplug->bio_list))
1655 		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
1656 
1657 	if (blk_queue_zoned_qd1_writes(disk->queue))
1658 		complete(&disk->zone_wplugs_worker_bio_done);
1659 	else if (!bio_list_empty(&zwplug->bio_list))
1660 		disk_zone_wplug_schedule_work(disk, zwplug);
1661 
1662 	if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
1663 		disk_mark_zone_wplug_dead(zwplug);
1664 
1665 	spin_unlock_irqrestore(&zwplug->lock, flags);
1666 }
1667 
1668 void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio)
1669 {
1670 	/*
1671 	 * For zone append requests, the request sector indicates the location
1672 	 * at which the BIO data was written. Return this value to the BIO
1673 	 * issuer through the BIO iter sector.
1674 	 * For plugged zone writes, which include emulated zone append, we need
1675 	 * the original BIO sector so that blk_zone_write_plug_bio_endio() can
	 * look up the zone write plug.
1677 	 */
1678 	bio->bi_iter.bi_sector = rq->__sector;
1679 	trace_blk_zone_append_update_request_bio(rq);
1680 }
1681 
1682 void blk_zone_write_plug_bio_endio(struct bio *bio)
1683 {
1684 	struct gendisk *disk = bio->bi_bdev->bd_disk;
1685 	struct blk_zone_wplug *zwplug =
1686 		disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
1687 	unsigned long flags;
1688 
1689 	if (WARN_ON_ONCE(!zwplug))
1690 		return;
1691 
1692 	/* Make sure we do not see this BIO again by clearing the plug flag. */
1693 	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
1694 
1695 	/*
1696 	 * If this is a regular write emulating a zone append operation,
1697 	 * restore the original operation code.
1698 	 */
1699 	if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
1700 		bio->bi_opf &= ~REQ_OP_MASK;
1701 		bio->bi_opf |= REQ_OP_ZONE_APPEND;
1702 		bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND);
1703 	}
1704 
1705 	/*
1706 	 * If the BIO failed, abort all plugged BIOs and mark the plug as
1707 	 * needing a write pointer update.
1708 	 */
1709 	if (bio->bi_status != BLK_STS_OK) {
1710 		spin_lock_irqsave(&zwplug->lock, flags);
1711 		disk_zone_wplug_abort(zwplug);
1712 		zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE;
1713 		spin_unlock_irqrestore(&zwplug->lock, flags);
1714 	}
1715 
1716 	/* Drop the reference we took when the BIO was issued. */
1717 	disk_put_zone_wplug(zwplug);
1718 
1719 	/*
1720 	 * For BIO-based devices, blk_zone_write_plug_finish_request()
1721 	 * is not called. So we need to schedule execution of the next
1722 	 * plugged BIO here.
1723 	 */
1724 	if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
1725 		disk_zone_wplug_unplug_bio(disk, zwplug);
1726 
1727 	/* Drop the reference we took when entering this function. */
1728 	disk_put_zone_wplug(zwplug);
1729 }
1730 
1731 void blk_zone_write_plug_finish_request(struct request *req)
1732 {
1733 	struct gendisk *disk = req->q->disk;
1734 	struct blk_zone_wplug *zwplug;
1735 
1736 	zwplug = disk_get_zone_wplug(disk, req->__sector);
1737 	if (WARN_ON_ONCE(!zwplug))
1738 		return;
1739 
1740 	req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;
1741 
1742 	/*
1743 	 * Drop the reference we took when the request was initialized in
1744 	 * blk_zone_write_plug_init_request().
1745 	 */
1746 	disk_put_zone_wplug(zwplug);
1747 
1748 	disk_zone_wplug_unplug_bio(disk, zwplug);
1749 
1750 	/* Drop the reference we took when entering this function. */
1751 	disk_put_zone_wplug(zwplug);
1752 }
1753 
1754 static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
1755 				       struct blk_zone_wplug *zwplug)
1756 {
1757 	struct block_device *bdev;
1758 	unsigned long flags;
1759 	struct bio *bio;
1760 	bool prepared;
1761 
1762 	/*
1763 	 * Submit the next plugged BIO. If we do not have any, clear
1764 	 * the plugged flag.
1765 	 */
1766 again:
1767 	spin_lock_irqsave(&zwplug->lock, flags);
1768 	bio = bio_list_pop(&zwplug->bio_list);
1769 	if (!bio) {
1770 		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
1771 		spin_unlock_irqrestore(&zwplug->lock, flags);
1772 		return false;
1773 	}
1774 
1775 	trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no,
1776 				 bio->bi_iter.bi_sector, bio_sectors(bio));
1777 
1778 	prepared = blk_zone_wplug_prepare_bio(zwplug, bio);
1779 	spin_unlock_irqrestore(&zwplug->lock, flags);
1780 
1781 	if (!prepared) {
1782 		blk_zone_wplug_bio_io_error(zwplug, bio);
1783 		goto again;
1784 	}
1785 
1786 	/*
1787 	 * blk-mq devices will reuse the extra reference on the request queue
1788 	 * usage counter we took when the BIO was plugged, but the submission
1789 	 * path for BIO-based devices will not do that. So drop this extra
1790 	 * reference here.
1791 	 */
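	/*
	 * For devices restricted to queue depth 1 zone writes, rearm the
	 * completion that the zone write plug worker waits on, so that the
	 * worker can wait for the BIO submitted below to complete before
	 * submitting the next plugged BIO.
	 */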
1792 	if (blk_queue_zoned_qd1_writes(disk->queue))
1793 		reinit_completion(&disk->zone_wplugs_worker_bio_done);
1794 	bdev = bio->bi_bdev;
1795 	if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
1796 		bdev->bd_disk->fops->submit_bio(bio);
1797 		blk_queue_exit(bdev->bd_disk->queue);
1798 	} else {
1799 		blk_mq_submit_bio(bio);
1800 	}
1801 
1802 	return true;
1803 }
1804 
1805 static struct blk_zone_wplug *disk_get_zone_wplugs_work(struct gendisk *disk)
1806 {
1807 	struct blk_zone_wplug *zwplug;
1808 
1809 	spin_lock_irq(&disk->zone_wplugs_list_lock);
1810 	zwplug = list_first_entry_or_null(&disk->zone_wplugs_list,
1811 					  struct blk_zone_wplug, entry);
1812 	if (zwplug)
1813 		list_del_init(&zwplug->entry);
1814 	spin_unlock_irq(&disk->zone_wplugs_list_lock);
1815 
1816 	return zwplug;
1817 }
1818 
1819 static int disk_zone_wplugs_worker(void *data)
1820 {
1821 	struct gendisk *disk = data;
1822 	struct blk_zone_wplug *zwplug;
1823 	unsigned int noio_flag;
1824 
1825 	noio_flag = memalloc_noio_save();
1826 	set_user_nice(current, MIN_NICE);
1827 	set_freezable();
1828 
1829 	for (;;) {
1830 		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
1831 
1832 		zwplug = disk_get_zone_wplugs_work(disk);
1833 		if (zwplug) {
1834 			/*
1835 			 * Process all BIOs of this zone write plug and then
1836 			 * drop the reference we took when adding the zone write
1837 			 * plug to the active list.
1838 			 */
1839 			set_current_state(TASK_RUNNING);
1840 			while (disk_zone_wplug_submit_bio(disk, zwplug))
1841 				blk_wait_io(&disk->zone_wplugs_worker_bio_done);
1842 			disk_put_zone_wplug(zwplug);
1843 			continue;
1844 		}
1845 
		/*
		 * Only sleep if nothing set our state back to running.
		 * Otherwise, check again for zone write plug work, as a newly
		 * submitted BIO might have added a zone write plug to the
		 * work list.
		 */
1851 		if (get_current_state() == TASK_RUNNING) {
1852 			try_to_freeze();
1853 		} else {
1854 			if (kthread_should_stop()) {
1855 				set_current_state(TASK_RUNNING);
1856 				break;
1857 			}
1858 			schedule();
1859 		}
1860 	}
1861 
1862 	WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));
1863 	memalloc_noio_restore(noio_flag);
1864 
1865 	return 0;
1866 }
1867 
1868 void disk_init_zone_resources(struct gendisk *disk)
1869 {
1870 	spin_lock_init(&disk->zone_wplugs_hash_lock);
1871 	spin_lock_init(&disk->zone_wplugs_list_lock);
1872 	INIT_LIST_HEAD(&disk->zone_wplugs_list);
1873 	init_completion(&disk->zone_wplugs_worker_bio_done);
1874 }
1875 
1876 /*
1877  * For the size of a disk zone write plug hash table, use the size of the
1878  * zone write plug mempool, which is the maximum of the disk open zones and
1879  * active zones limits. But do not exceed 4KB (512 hlist head entries), that is,
1880  * 9 bits. For a disk that has no limits, mempool size defaults to 128.
1881  */
1882 #define BLK_ZONE_WPLUG_MAX_HASH_BITS		9
1883 #define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE	128
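
/*
 * For example (illustrative arithmetic only): with the default pool size of
 * 128, disk_alloc_zone_resources() below uses ilog2(128) + 1 = 8 hash bits,
 * that is, 256 hlist head entries. A device advertising 1024 open zones
 * would be capped at BLK_ZONE_WPLUG_MAX_HASH_BITS = 9 bits (512 entries).
 */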
1884 
1885 static int disk_alloc_zone_resources(struct gendisk *disk,
1886 				     unsigned int pool_size)
1887 {
1888 	unsigned int i;
1889 	int ret = -ENOMEM;
1890 
1891 	atomic_set(&disk->nr_zone_wplugs, 0);
1892 	disk->zone_wplugs_hash_bits =
1893 		min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);
1894 
1895 	disk->zone_wplugs_hash =
1896 		kzalloc_objs(struct hlist_head,
1897 			     disk_zone_wplugs_hash_size(disk));
1898 	if (!disk->zone_wplugs_hash)
1899 		return -ENOMEM;
1900 
1901 	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
1902 		INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);
1903 
1904 	disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
1905 						sizeof(struct blk_zone_wplug));
1906 	if (!disk->zone_wplugs_pool)
1907 		goto free_hash;
1908 
1909 	disk->zone_wplugs_wq =
1910 		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
1911 				pool_size, disk->disk_name);
1912 	if (!disk->zone_wplugs_wq)
1913 		goto destroy_pool;
1914 
1915 	disk->zone_wplugs_worker =
1916 		kthread_create(disk_zone_wplugs_worker, disk,
1917 			       "%s_zwplugs_worker", disk->disk_name);
1918 	if (IS_ERR(disk->zone_wplugs_worker)) {
1919 		ret = PTR_ERR(disk->zone_wplugs_worker);
1920 		disk->zone_wplugs_worker = NULL;
1921 		goto destroy_wq;
1922 	}
1923 	wake_up_process(disk->zone_wplugs_worker);
1924 
1925 	return 0;
1926 
1927 destroy_wq:
1928 	destroy_workqueue(disk->zone_wplugs_wq);
1929 	disk->zone_wplugs_wq = NULL;
1930 destroy_pool:
1931 	mempool_destroy(disk->zone_wplugs_pool);
1932 	disk->zone_wplugs_pool = NULL;
1933 free_hash:
1934 	kfree(disk->zone_wplugs_hash);
1935 	disk->zone_wplugs_hash = NULL;
1936 	disk->zone_wplugs_hash_bits = 0;
1937 	return ret;
1938 }
1939 
1940 static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
1941 {
1942 	struct blk_zone_wplug *zwplug;
1943 	unsigned int i;
1944 
1945 	if (!disk->zone_wplugs_hash)
1946 		return;
1947 
1948 	/* Free all the zone write plugs we have. */
1949 	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
1950 		while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
1951 			zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
1952 					     struct blk_zone_wplug, node);
1953 			spin_lock_irq(&zwplug->lock);
1954 			disk_mark_zone_wplug_dead(zwplug);
1955 			spin_unlock_irq(&zwplug->lock);
1956 		}
1957 	}
1958 
1959 	WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs));
1960 	kfree(disk->zone_wplugs_hash);
1961 	disk->zone_wplugs_hash = NULL;
1962 	disk->zone_wplugs_hash_bits = 0;
1963 
1964 	/*
1965 	 * Wait for the zone write plugs to be RCU-freed before destroying the
1966 	 * mempool.
1967 	 */
1968 	rcu_barrier();
1969 	mempool_destroy(disk->zone_wplugs_pool);
1970 	disk->zone_wplugs_pool = NULL;
1971 }
1972 
1973 static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond)
1974 {
1975 	unsigned long flags;
1976 
1977 	spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
1978 	zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond,
1979 				lockdep_is_held(&disk->zone_wplugs_hash_lock));
1980 	spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);
1981 
1982 	kfree_rcu_mightsleep(zones_cond);
1983 }
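
/*
 * A sketch of the matching RCU reader side (an assumption for illustration:
 * readers dereference the conditions array under rcu_read_lock(), and the
 * accessor blk_zone_get_cond() is the hypothetical counterpart of the
 * blk_zone_set_cond() helper used during zone revalidation):
 *
 *	rcu_read_lock();
 *	zones_cond = rcu_dereference(disk->zones_cond);
 *	if (zones_cond)
 *		cond = blk_zone_get_cond(zones_cond, zno);
 *	rcu_read_unlock();
 */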
1984 
1985 void disk_free_zone_resources(struct gendisk *disk)
1986 {
1987 	if (disk->zone_wplugs_worker)
1988 		kthread_stop(disk->zone_wplugs_worker);
1989 	WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));
1990 
1991 	if (disk->zone_wplugs_wq) {
1992 		destroy_workqueue(disk->zone_wplugs_wq);
1993 		disk->zone_wplugs_wq = NULL;
1994 	}
1995 
1996 	disk_destroy_zone_wplugs_hash_table(disk);
1997 
1998 	disk_set_zones_cond_array(disk, NULL);
1999 	disk->zone_capacity = 0;
2000 	disk->last_zone_capacity = 0;
2001 	disk->nr_zones = 0;
2002 }
2003 
2004 struct blk_revalidate_zone_args {
2005 	struct gendisk	*disk;
2006 	u8		*zones_cond;
2007 	unsigned int	nr_zones;
2008 	unsigned int	nr_conv_zones;
2009 	unsigned int	zone_capacity;
2010 	unsigned int	last_zone_capacity;
2011 	sector_t	sector;
2012 };
2013 
2014 static int disk_revalidate_zone_resources(struct gendisk *disk,
2015 				struct blk_revalidate_zone_args *args)
2016 {
2017 	struct queue_limits *lim = &disk->queue->limits;
2018 	unsigned int pool_size;
2019 
2020 	args->disk = disk;
2021 	args->nr_zones =
2022 		DIV_ROUND_UP_ULL(get_capacity(disk), lim->chunk_sectors);
2023 
2024 	/* Cached zone conditions: 1 byte per zone */
2025 	args->zones_cond = kzalloc(args->nr_zones, GFP_NOIO);
2026 	if (!args->zones_cond)
2027 		return -ENOMEM;
2028 
2029 	if (!disk_need_zone_resources(disk))
2030 		return 0;
2031 
2032 	/*
2033 	 * If the device has no limit on the maximum number of open and active
2034 	 * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
2035 	 */
2036 	pool_size = max(lim->max_open_zones, lim->max_active_zones);
2037 	if (!pool_size)
2038 		pool_size =
2039 			min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones);
2040 
2041 	if (!disk->zone_wplugs_hash)
2042 		return disk_alloc_zone_resources(disk, pool_size);
2043 
2044 	return 0;
2045 }
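
/*
 * For instance (illustrative only): a device reporting max_open_zones = 128
 * and max_active_zones = 256 gets a mempool of 256 zone write plugs, while
 * a device with no zone resource limits and 32 zones gets
 * min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, 32) = 32 plugs.
 */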
2046 
2047 /*
2048  * Update the disk zone resources information and device queue limits.
2049  * The disk queue is frozen when this is executed.
2050  */
2051 static int disk_update_zone_resources(struct gendisk *disk,
2052 				      struct blk_revalidate_zone_args *args)
2053 {
2054 	struct request_queue *q = disk->queue;
2055 	unsigned int nr_seq_zones;
2056 	unsigned int pool_size, memflags;
2057 	struct queue_limits lim;
2058 	int ret = 0;
2059 
2060 	lim = queue_limits_start_update(q);
2061 
2062 	memflags = blk_mq_freeze_queue(q);
2063 
2064 	disk->nr_zones = args->nr_zones;
2065 	if (args->nr_conv_zones >= disk->nr_zones) {
2066 		queue_limits_cancel_update(q);
2067 		pr_warn("%s: Invalid number of conventional zones %u / %u\n",
2068 			disk->disk_name, args->nr_conv_zones, disk->nr_zones);
2069 		ret = -ENODEV;
2070 		goto unfreeze;
2071 	}
2072 
2073 	disk->zone_capacity = args->zone_capacity;
2074 	disk->last_zone_capacity = args->last_zone_capacity;
2075 	disk_set_zones_cond_array(disk, args->zones_cond);
2076 
2077 	/*
2078 	 * Some devices can advertise zone resource limits that are larger than
2079 	 * the number of sequential zones of the zoned block device, e.g. a
	 * small ZNS namespace. In that case, assume that the zoned device has
	 * no zone resource limits.
2082 	 */
2083 	nr_seq_zones = disk->nr_zones - args->nr_conv_zones;
2084 	if (lim.max_open_zones >= nr_seq_zones)
2085 		lim.max_open_zones = 0;
2086 	if (lim.max_active_zones >= nr_seq_zones)
2087 		lim.max_active_zones = 0;
2088 
2089 	if (!disk->zone_wplugs_pool)
2090 		goto commit;
2091 
2092 	/*
2093 	 * If the device has no limit on the maximum number of open and active
2094 	 * zones, set its max open zone limit to the mempool size to indicate
2095 	 * to the user that there is a potential performance impact due to
2096 	 * dynamic zone write plug allocation when simultaneously writing to
2097 	 * more zones than the size of the mempool.
2098 	 */
2099 	pool_size = max(lim.max_open_zones, lim.max_active_zones);
2100 	if (!pool_size)
2101 		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);
2102 
2103 	mempool_resize(disk->zone_wplugs_pool, pool_size);
2104 
2105 	if (!lim.max_open_zones && !lim.max_active_zones) {
2106 		if (pool_size < nr_seq_zones)
2107 			lim.max_open_zones = pool_size;
2108 		else
2109 			lim.max_open_zones = 0;
2110 	}
2111 
2112 commit:
2113 	ret = queue_limits_commit_update(q, &lim);
2114 
2115 unfreeze:
2116 	if (ret)
2117 		disk_free_zone_resources(disk);
2118 
2119 	blk_mq_unfreeze_queue(q, memflags);
2120 
2121 	return ret;
2122 }
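
/*
 * For example (illustrative only): a small namespace with 32 sequential
 * zones but an advertised max_open_zones of 64 has that limit cleared
 * above as it is meaningless. Conversely, a large device with no limits
 * ends up advertising max_open_zones equal to the mempool size, hinting
 * at the cost of dynamic zone write plug allocation beyond that point.
 */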
2123 
2124 static int blk_revalidate_zone_cond(struct blk_zone *zone, unsigned int idx,
2125 				    struct blk_revalidate_zone_args *args)
2126 {
2127 	enum blk_zone_cond cond = zone->cond;
2128 
2129 	/* Check that the zone condition is consistent with the zone type. */
2130 	switch (cond) {
2131 	case BLK_ZONE_COND_NOT_WP:
2132 		if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
2133 			goto invalid_condition;
2134 		break;
2135 	case BLK_ZONE_COND_IMP_OPEN:
2136 	case BLK_ZONE_COND_EXP_OPEN:
2137 	case BLK_ZONE_COND_CLOSED:
2138 	case BLK_ZONE_COND_EMPTY:
2139 	case BLK_ZONE_COND_FULL:
2140 	case BLK_ZONE_COND_OFFLINE:
2141 	case BLK_ZONE_COND_READONLY:
2142 		if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
2143 			goto invalid_condition;
2144 		break;
2145 	default:
		pr_warn("%s: Invalid zone condition 0x%x\n",
2147 			args->disk->disk_name, cond);
2148 		return -ENODEV;
2149 	}
2150 
2151 	blk_zone_set_cond(args->zones_cond, idx, cond);
2152 
2153 	return 0;
2154 
2155 invalid_condition:
2156 	pr_warn("%s: Invalid zone condition 0x%x for type 0x%x\n",
2157 		args->disk->disk_name, cond, zone->type);
2158 
2159 	return -ENODEV;
2160 }
2161 
2162 static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
2163 				    struct blk_revalidate_zone_args *args)
2164 {
2165 	struct gendisk *disk = args->disk;
2166 
2167 	if (zone->capacity != zone->len) {
2168 		pr_warn("%s: Invalid conventional zone capacity\n",
2169 			disk->disk_name);
2170 		return -ENODEV;
2171 	}
2172 
2173 	if (disk_zone_is_last(disk, zone))
2174 		args->last_zone_capacity = zone->capacity;
2175 
2176 	args->nr_conv_zones++;
2177 
2178 	return 0;
2179 }
2180 
2181 static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
2182 				   struct blk_revalidate_zone_args *args)
2183 {
2184 	struct gendisk *disk = args->disk;
2185 	struct blk_zone_wplug *zwplug;
2186 	unsigned int wp_offset;
2187 
2188 	/*
2189 	 * Remember the capacity of the first sequential zone and check
2190 	 * if it is constant for all zones, ignoring the last zone as it can be
2191 	 * smaller.
2192 	 */
2193 	if (!args->zone_capacity)
2194 		args->zone_capacity = zone->capacity;
2195 	if (disk_zone_is_last(disk, zone)) {
2196 		args->last_zone_capacity = zone->capacity;
2197 	} else if (zone->capacity != args->zone_capacity) {
2198 		pr_warn("%s: Invalid variable zone capacity\n",
2199 			disk->disk_name);
2200 		return -ENODEV;
2201 	}
2202 
2203 	/*
2204 	 * If the device needs zone append emulation, we need to track the
	 * write pointer of all zones that are neither empty nor full. So make
	 * sure we have a zone write plug for such zones if the device has a
	 * zone write plug hash table.
2208 	 */
2209 	if (!disk->zone_wplugs_hash)
2210 		return 0;
2211 
2212 	wp_offset = disk_zone_wplug_sync_wp_offset(disk, zone);
2213 	if (!wp_offset || wp_offset >= zone->capacity)
2214 		return 0;
2215 
2216 	zwplug = disk_get_or_alloc_zone_wplug(disk, zone->wp, GFP_NOIO);
2217 	if (!zwplug)
2218 		return -ENOMEM;
2219 	disk_put_zone_wplug(zwplug);
2220 
2221 	return 0;
2222 }
2223 
2224 /*
2225  * Helper function to check the validity of zones of a zoned block device.
2226  */
2227 static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
2228 				  void *data)
2229 {
2230 	struct blk_revalidate_zone_args *args = data;
2231 	struct gendisk *disk = args->disk;
2232 	sector_t zone_sectors = disk->queue->limits.chunk_sectors;
2233 	int ret;
2234 
2235 	/* Check for bad zones and holes in the zone report */
2236 	if (zone->start != args->sector) {
2237 		pr_warn("%s: Zone gap at sectors %llu..%llu\n",
2238 			disk->disk_name, args->sector, zone->start);
2239 		return -ENODEV;
2240 	}
2241 
2242 	if (zone->start >= get_capacity(disk) || !zone->len) {
2243 		pr_warn("%s: Invalid zone start %llu, length %llu\n",
2244 			disk->disk_name, zone->start, zone->len);
2245 		return -ENODEV;
2246 	}
2247 
2248 	/*
	 * All zones must have the same size, with the possible exception of a
	 * smaller last zone.
2251 	 */
2252 	if (!disk_zone_is_last(disk, zone)) {
2253 		if (zone->len != zone_sectors) {
2254 			pr_warn("%s: Invalid zoned device with non constant zone size\n",
2255 				disk->disk_name);
2256 			return -ENODEV;
2257 		}
2258 	} else if (zone->len > zone_sectors) {
2259 		pr_warn("%s: Invalid zoned device with larger last zone size\n",
2260 			disk->disk_name);
2261 		return -ENODEV;
2262 	}
2263 
2264 	if (!zone->capacity || zone->capacity > zone->len) {
2265 		pr_warn("%s: Invalid zone capacity\n",
2266 			disk->disk_name);
2267 		return -ENODEV;
2268 	}
2269 
2270 	/* Check zone condition */
2271 	ret = blk_revalidate_zone_cond(zone, idx, args);
2272 	if (ret)
2273 		return ret;
2274 
2275 	/* Check zone type */
2276 	switch (zone->type) {
2277 	case BLK_ZONE_TYPE_CONVENTIONAL:
2278 		ret = blk_revalidate_conv_zone(zone, idx, args);
2279 		break;
2280 	case BLK_ZONE_TYPE_SEQWRITE_REQ:
2281 		ret = blk_revalidate_seq_zone(zone, idx, args);
2282 		break;
2283 	case BLK_ZONE_TYPE_SEQWRITE_PREF:
2284 	default:
2285 		pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
2286 			disk->disk_name, (int)zone->type, zone->start);
2287 		ret = -ENODEV;
2288 	}
2289 
2290 	if (!ret)
2291 		args->sector += zone->len;
2292 
2293 	return ret;
2294 }
2295 
2296 /**
2297  * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
2298  * @disk:	Target disk
2299  *
 * Helper function for low-level device drivers to check, (re)allocate and
 * initialize resources used for managing zoned disks. This function should
 * normally be called by blk-mq based drivers when a zoned gendisk is probed
 * and when the zone configuration of the gendisk changes (e.g. after a format).
 * Before calling this function, the device driver must already have set the
 * device zone size (chunk_sectors limit) and the max zone append limit.
 * BIO-based drivers can also use this function as long as the device queue
 * can be safely frozen.
2308  */
2309 int blk_revalidate_disk_zones(struct gendisk *disk)
2310 {
2311 	struct request_queue *q = disk->queue;
2312 	sector_t zone_sectors = q->limits.chunk_sectors;
2313 	sector_t capacity = get_capacity(disk);
2314 	struct blk_revalidate_zone_args args = { };
2315 	unsigned int memflags, noio_flag;
2316 	struct blk_report_zones_args rep_args = {
2317 		.cb = blk_revalidate_zone_cb,
2318 		.data = &args,
2319 	};
2320 	int ret = -ENOMEM;
2321 
2322 	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
2323 		return -EIO;
2324 
2325 	if (!capacity)
2326 		return -ENODEV;
2327 
2328 	/*
	 * Check that the device driver indicated a valid zone size, that is,
	 * a non-zero power of two.
2331 	 */
2332 	if (!zone_sectors || !is_power_of_2(zone_sectors)) {
2333 		pr_warn("%s: Invalid non power of two zone size (%llu)\n",
2334 			disk->disk_name, zone_sectors);
2335 		return -ENODEV;
2336 	}
2337 
2338 	/*
2339 	 * Ensure that all memory allocations in this context are done as if
2340 	 * GFP_NOIO was specified.
2341 	 */
2342 	noio_flag = memalloc_noio_save();
2343 	ret = disk_revalidate_zone_resources(disk, &args);
2344 	if (ret) {
2345 		memalloc_noio_restore(noio_flag);
2346 		return ret;
2347 	}
2348 
2349 	ret = disk->fops->report_zones(disk, 0, UINT_MAX, &rep_args);
2350 	if (!ret) {
2351 		pr_warn("%s: No zones reported\n", disk->disk_name);
2352 		ret = -ENODEV;
2353 	}
2354 	memalloc_noio_restore(noio_flag);
2355 
2356 	/*
	 * If zones were reported, make sure that the entire disk capacity
2358 	 * has been checked.
2359 	 */
2360 	if (ret > 0 && args.sector != capacity) {
2361 		pr_warn("%s: Missing zones from sector %llu\n",
2362 			disk->disk_name, args.sector);
2363 		ret = -ENODEV;
2364 	}
2365 
2366 	if (ret > 0)
2367 		return disk_update_zone_resources(disk, &args);
2368 
2369 	pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
2370 
2371 	memflags = blk_mq_freeze_queue(q);
2372 	disk_free_zone_resources(disk);
2373 	blk_mq_unfreeze_queue(q, memflags);
2374 
2375 	return ret;
2376 }
2377 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
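
/*
 * A minimal usage sketch (hypothetical driver code; the limit field names
 * are assumptions and may differ between kernel versions): a blk-mq driver
 * sets the zone limits before revalidating, typically from its probe path:
 *
 *	lim = queue_limits_start_update(q);
 *	lim.features |= BLK_FEAT_ZONED;
 *	lim.chunk_sectors = zone_size_sectors;
 *	lim.max_hw_zone_append_sectors = append_sectors;
 *	ret = queue_limits_commit_update(q, &lim);
 *	if (!ret)
 *		ret = blk_revalidate_disk_zones(disk);
 */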
2378 
2379 /**
2380  * blk_zone_issue_zeroout - zero-fill a block range in a zone
2381  * @bdev:	blockdev to write
2382  * @sector:	start sector
2383  * @nr_sects:	number of sectors to write
2384  * @gfp_mask:	memory allocation flags (for bio_alloc)
2385  *
2386  * Description:
2387  *  Zero-fill a block range in a zone (@sector must be equal to the zone write
2388  *  pointer), handling potential errors due to the (initially unknown) lack of
 *  hardware offload (see blkdev_issue_zeroout()).
2390  */
2391 int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
2392 			   sector_t nr_sects, gfp_t gfp_mask)
2393 {
2394 	struct gendisk *disk = bdev->bd_disk;
2395 	int ret;
2396 
2397 	if (WARN_ON_ONCE(!bdev_is_zoned(bdev)))
2398 		return -EIO;
2399 
2400 	ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
2401 				   BLKDEV_ZERO_NOFALLBACK);
2402 	if (ret != -EOPNOTSUPP)
2403 		return ret;
2404 
2405 	/*
2406 	 * The failed call to blkdev_issue_zeroout() advanced the zone write
2407 	 * pointer. Undo this using a report zone to update the zone write
2408 	 * pointer to the correct current value.
2409 	 */
2410 	ret = disk->fops->report_zones(disk, sector, 1, NULL);
2411 	if (ret != 1)
2412 		return ret < 0 ? ret : -EIO;
2413 
2414 	/*
2415 	 * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a
2416 	 * regular write with zero-pages.
2417 	 */
2418 	return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0);
2419 }
2420 EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout);
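
/*
 * Example (a sketch, hypothetical caller code): a filesystem finding a zone
 * write pointer behind the position it expects can advance it with zeroes:
 *
 *	if (wp_sector < expected_sector)
 *		ret = blk_zone_issue_zeroout(bdev, wp_sector,
 *					     expected_sector - wp_sector,
 *					     GFP_NOFS);
 */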
2421 
2422 #ifdef CONFIG_BLK_DEBUG_FS
2423 static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
2424 				  struct seq_file *m)
2425 {
2426 	unsigned int zwp_wp_offset, zwp_flags;
2427 	unsigned int zwp_zone_no, zwp_ref;
2428 	unsigned int zwp_bio_list_size;
2429 	enum blk_zone_cond zwp_cond;
2430 	unsigned long flags;
2431 
2432 	spin_lock_irqsave(&zwplug->lock, flags);
2433 	zwp_zone_no = zwplug->zone_no;
2434 	zwp_flags = zwplug->flags;
2435 	zwp_ref = refcount_read(&zwplug->ref);
2436 	zwp_cond = zwplug->cond;
2437 	zwp_wp_offset = zwplug->wp_offset;
2438 	zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
2439 	spin_unlock_irqrestore(&zwplug->lock, flags);
2440 
2441 	seq_printf(m,
2442 		"Zone no: %u, flags: 0x%x, ref: %u, cond: %s, wp ofst: %u, pending BIO: %u\n",
2443 		zwp_zone_no, zwp_flags, zwp_ref, blk_zone_cond_str(zwp_cond),
2444 		zwp_wp_offset, zwp_bio_list_size);
2445 }
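
/*
 * Sample output line (values illustrative), as emitted by
 * queue_zone_wplugs_show() below through the zone_wplugs debugfs attribute:
 *
 *	Zone no: 42, flags: 0x1, ref: 2, cond: IMP_OPEN, wp ofst: 1024, pending BIO: 1
 */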
2446 
2447 int queue_zone_wplugs_show(void *data, struct seq_file *m)
2448 {
2449 	struct request_queue *q = data;
2450 	struct gendisk *disk = q->disk;
2451 	struct blk_zone_wplug *zwplug;
2452 	unsigned int i;
2453 
2454 	if (!disk->zone_wplugs_hash)
2455 		return 0;
2456 
2457 	rcu_read_lock();
2458 	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
2459 		hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i],
2460 					 node)
2461 			queue_zone_wplug_show(zwplug, m);
2462 	rcu_read_unlock();
2463 
2464 	return 0;
2465 }
2466 
2467 #endif
2468