xref: /linux/block/blk-zoned.c (revision d458a240344c4369bf6f3da203f2779515177738)
// SPDX-License-Identifier: GPL-2.0
/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 * Copyright (c) 2024, Western Digital Corporation or its affiliates.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/mempool.h>
#include <linux/kthread.h>
#include <linux/freezer.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-mq-debugfs.h"

#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
	ZONE_COND_NAME(NOT_WP),
	ZONE_COND_NAME(EMPTY),
	ZONE_COND_NAME(IMP_OPEN),
	ZONE_COND_NAME(EXP_OPEN),
	ZONE_COND_NAME(CLOSED),
	ZONE_COND_NAME(READONLY),
	ZONE_COND_NAME(FULL),
	ZONE_COND_NAME(OFFLINE),
	ZONE_COND_NAME(ACTIVE),
};
#undef ZONE_COND_NAME

/*
 * Per-zone write plug.
 * @node: hlist_node structure for managing the plug using a hash table.
 * @entry: list_head structure for listing the plug in the disk list of active
 *         zone write plugs.
 * @bio_list: The list of BIOs that are currently plugged.
 * @bio_work: Work struct to handle issuing of plugged BIOs.
 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
 * @disk: The gendisk the plug belongs to.
 * @lock: Spinlock to atomically manipulate the plug.
 * @ref: Zone write plug reference counter. A zone write plug reference is
 *       always at least 1 when the plug is hashed in the disk plug hash table.
 *       The reference is incremented whenever a new BIO needing plugging is
 *       submitted and when a function needs to manipulate a plug. The
 *       reference count is decremented whenever a plugged BIO completes and
 *       when a function that referenced the plug returns. The initial
 *       reference is dropped whenever the zone of the zone write plug is
 *       reset, finished, or when the zone becomes full (the last write BIO to
 *       the zone completes).
 * @flags: Flags indicating the plug state.
 * @zone_no: The number of the zone the plug is managing.
 * @wp_offset: The zone write pointer location relative to the start of the zone
 *             as a number of 512B sectors.
 * @cond: Condition of the zone.
 */
struct blk_zone_wplug {
	struct hlist_node	node;
	struct list_head	entry;
	struct bio_list		bio_list;
	struct work_struct	bio_work;
	struct rcu_head		rcu_head;
	struct gendisk		*disk;
	spinlock_t		lock;
	refcount_t		ref;
	unsigned int		flags;
	unsigned int		zone_no;
	unsigned int		wp_offset;
	enum blk_zone_cond	cond;
};

static inline bool disk_need_zone_resources(struct gendisk *disk)
{
	/*
	 * All request-based zoned devices need zone resources so that the
	 * block layer can automatically handle write BIO plugging. BIO-based
	 * device drivers (e.g. DM devices) are normally responsible for
	 * handling zone write ordering and do not need zone resources, unless
	 * the driver requires zone append emulation.
	 */
	return queue_is_mq(disk->queue) ||
		queue_emulates_zone_append(disk->queue);
}

static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
{
	return 1U << disk->zone_wplugs_hash_bits;
}

/*
 * Zone write plug flags bits:
 *  - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
 *    that is, that write BIOs are being throttled because a write BIO is
 *    already being executed or because the zone write plug BIO list is not
 *    empty.
 *  - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
 *    write pointer offset and need to update it.
 *  - BLK_ZONE_WPLUG_DEAD: Indicates that the zone write plug will be
 *    removed from the disk hash table of zone write plugs when the last
 *    reference on the zone write plug is dropped. If set, this flag also
 *    indicates that the initial extra reference on the zone write plug was
 *    dropped, meaning that the reference count indicates the current number of
 *    active users (code contexts or BIOs and requests in flight). This flag is
 *    set when a zone is reset, finished or becomes full.
 */
#define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)
#define BLK_ZONE_WPLUG_NEED_WP_UPDATE	(1U << 1)
#define BLK_ZONE_WPLUG_DEAD		(1U << 2)

/**
 * blk_zone_cond_str - Return a zone condition name string
 * @zone_cond: a zone condition BLK_ZONE_COND_name
 *
 * Convert a BLK_ZONE_COND_name zone condition into the string "name". Useful
 * for debugging and tracing zone conditions. For an invalid zone condition,
 * the string "UNKNOWN" is returned.
 */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
{
	static const char *zone_cond_str = "UNKNOWN";

	if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
		zone_cond_str = zone_cond_name[zone_cond];

	return zone_cond_str;
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);
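
/*
 * Example (illustrative sketch, not part of this file): printing the
 * condition of a reported zone using blk_zone_cond_str(). The zone variable
 * is hypothetical.
 *
 *	struct blk_zone *zone = ...;
 *
 *	pr_info("zone at sector %llu is %s\n",
 *		(unsigned long long)zone->start,
 *		blk_zone_cond_str(zone->cond));
 */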

static void blk_zone_set_cond(u8 *zones_cond, unsigned int zno,
			      enum blk_zone_cond cond)
{
	if (!zones_cond)
		return;

	switch (cond) {
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
	case BLK_ZONE_COND_CLOSED:
		zones_cond[zno] = BLK_ZONE_COND_ACTIVE;
		return;
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_FULL:
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
	default:
		zones_cond[zno] = cond;
		return;
	}
}

static void disk_zone_set_cond(struct gendisk *disk, sector_t sector,
			       enum blk_zone_cond cond)
{
	u8 *zones_cond;

	rcu_read_lock();
	zones_cond = rcu_dereference(disk->zones_cond);
	if (zones_cond) {
		unsigned int zno = disk_zone_no(disk, sector);

		/*
		 * The condition of conventional, read-only and offline zones
		 * never changes, so do nothing if the target zone is in one of
		 * these conditions.
		 */
		switch (zones_cond[zno]) {
		case BLK_ZONE_COND_NOT_WP:
		case BLK_ZONE_COND_READONLY:
		case BLK_ZONE_COND_OFFLINE:
			break;
		default:
			blk_zone_set_cond(zones_cond, zno, cond);
			break;
		}
	}
	rcu_read_unlock();
}

/**
 * bdev_zone_is_seq - check if a sector belongs to a sequential write zone
 * @bdev:       block device to check
 * @sector:     sector number
 *
 * Check if @sector on @bdev is contained in a sequential write required zone.
 */
bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
{
	struct gendisk *disk = bdev->bd_disk;
	unsigned int zno = disk_zone_no(disk, sector);
	bool is_seq = false;
	u8 *zones_cond;

	if (!bdev_is_zoned(bdev))
		return false;

	rcu_read_lock();
	zones_cond = rcu_dereference(disk->zones_cond);
	if (zones_cond && zno < disk->nr_zones)
		is_seq = zones_cond[zno] != BLK_ZONE_COND_NOT_WP;
	rcu_read_unlock();

	return is_seq;
}
EXPORT_SYMBOL_GPL(bdev_zone_is_seq);
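
/*
 * Example (illustrative sketch): a caller deciding whether sequential write
 * constraints apply to a sector. The bdev and sector variables are
 * hypothetical.
 *
 *	if (bdev_zone_is_seq(bdev, sector))
 *		pr_debug("sector %llu: sequential write required zone\n",
 *			 (unsigned long long)sector);
 *	else
 *		pr_debug("sector %llu: conventional zone\n",
 *			 (unsigned long long)sector);
 */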

/*
 * Zone report arguments for block device drivers' report_zones operation.
 * @cb: report_zones_cb callback for each reported zone.
 * @data: Private data passed to report_zones_cb.
 * @report_active: If true, collapse the implicit open, explicit open and
 *                 closed zone conditions into the active condition.
 */
struct blk_report_zones_args {
	report_zones_cb cb;
	void		*data;
	bool		report_active;
};

static int blkdev_do_report_zones(struct block_device *bdev, sector_t sector,
				  unsigned int nr_zones,
				  struct blk_report_zones_args *args)
{
	struct gendisk *disk = bdev->bd_disk;

	if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
		return -EOPNOTSUPP;

	if (!nr_zones || sector >= get_capacity(disk))
		return 0;

	return disk->fops->report_zones(disk, sector, nr_zones, args);
}

/**
 * blkdev_report_zones - Get zones information
 * @bdev:	Target block device
 * @sector:	Sector from which to report zones
 * @nr_zones:	Maximum number of zones to report
 * @cb:		Callback function called for each reported zone
 * @data:	Private data for the callback
 *
 * Description:
 *    Get zone information starting from the zone containing @sector for at most
 *    @nr_zones, and call @cb for each zone reported by the device.
 *    To report all zones in a device starting from @sector, the BLK_ALL_ZONES
 *    constant can be passed to @nr_zones.
 *    Returns the number of zones reported by the device, or a negative errno
 *    value in case of failure.
 *
 *    Note: The caller must use memalloc_noXX_save/restore() calls to control
 *    memory allocations done within this function.
 */
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
			unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct blk_report_zones_args args = {
		.cb = cb,
		.data = data,
	};

	return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
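
/*
 * Example (illustrative sketch, not part of this file): counting the
 * sequential write required zones of a device with blkdev_report_zones().
 * The callback and variable names are hypothetical.
 *
 *	static int count_seq_zones_cb(struct blk_zone *zone, unsigned int idx,
 *				      void *data)
 *	{
 *		unsigned int *nr_seq_zones = data;
 *
 *		if (zone->type == BLK_ZONE_TYPE_SEQWRITE_REQ)
 *			(*nr_seq_zones)++;
 *		return 0;
 *	}
 *
 *	unsigned int nr_seq_zones = 0;
 *	int ret;
 *
 *	ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
 *				  count_seq_zones_cb, &nr_seq_zones);
 *	if (ret < 0)
 *		return ret;
 */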

static int blkdev_zone_reset_all(struct block_device *bdev)
{
	struct bio bio;

	bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
	trace_blkdev_zone_mgmt(&bio, 0);
	return submit_bio_wait(&bio);
}

/**
 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
 * @bdev:	Target block device
 * @op:		Operation to be performed on the zones
 * @sector:	Start sector of the first zone to operate on
 * @nr_sectors:	Number of sectors, should be at least the length of one zone and
 *		must be zone size aligned.
 *
 * Description:
 *    Perform the specified operation on the range of zones specified by
 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 *    is valid, but the specified range should not contain conventional zones.
 *    The operation to execute on each zone can be a zone reset, open, close
 *    or finish request.
 */
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
		     sector_t sector, sector_t nr_sectors)
{
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	sector_t capacity = bdev_nr_sectors(bdev);
	sector_t end_sector = sector + nr_sectors;
	struct bio *bio = NULL;
	int ret = 0;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	if (bdev_read_only(bdev))
		return -EPERM;

	if (!op_is_zone_mgmt(op))
		return -EOPNOTSUPP;

	if (end_sector <= sector || end_sector > capacity)
		/* Out of range */
		return -EINVAL;

	/* Check alignment (handle a possibly smaller last zone) */
	if (!bdev_is_zone_start(bdev, sector))
		return -EINVAL;

	if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity)
		return -EINVAL;

	/*
	 * In the case of a zone reset operation over all zones, use
	 * REQ_OP_ZONE_RESET_ALL.
	 */
	if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity)
		return blkdev_zone_reset_all(bdev);

	while (sector < end_sector) {
		bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
		bio->bi_iter.bi_sector = sector;
		sector += zone_sectors;

		/* This may take a while, so be nice to others */
		cond_resched();
	}

	trace_blkdev_zone_mgmt(bio, nr_sectors);
	ret = submit_bio_wait(bio);
	bio_put(bio);

	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
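
/*
 * Example (illustrative sketch): resetting the single zone containing a
 * given sector with blkdev_zone_mgmt(). The bdev and sector variables are
 * hypothetical; per the kernel-doc above, the start sector must be zone
 * aligned and the number of sectors must cover at least one zone.
 *
 *	sector_t zone_start = bdev_zone_start(bdev, sector);
 *	int ret;
 *
 *	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zone_start,
 *			       bdev_zone_sectors(bdev));
 *	if (ret)
 *		pr_err("zone reset failed: %d\n", ret);
 */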

struct zone_report_args {
	struct blk_zone __user *zones;
};

static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
				    void *data)
{
	struct zone_report_args *args = data;

	if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
		return -EFAULT;
	return 0;
}

/*
 * Mask of valid input flags for the BLKREPORTZONEV2 ioctl.
 */
#define BLK_ZONE_REPV2_INPUT_FLAGS	BLK_ZONE_REP_CACHED

/*
 * BLKREPORTZONE and BLKREPORTZONEV2 ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
		unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct zone_report_args args;
	struct blk_zone_report rep;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
		return -EFAULT;

	if (!rep.nr_zones)
		return -EINVAL;

	args.zones = argp + sizeof(struct blk_zone_report);

	switch (cmd) {
	case BLKREPORTZONE:
		ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
					  blkdev_copy_zone_to_user, &args);
		break;
	case BLKREPORTZONEV2:
		if (rep.flags & ~BLK_ZONE_REPV2_INPUT_FLAGS)
			return -EINVAL;
		ret = blkdev_report_zones_cached(bdev, rep.sector, rep.nr_zones,
					 blkdev_copy_zone_to_user, &args);
		break;
	default:
		return -EINVAL;
	}

	if (ret < 0)
		return ret;

	rep.nr_zones = ret;
	rep.flags = BLK_ZONE_REP_CAPACITY;
	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
		return -EFAULT;
	return 0;
}

static int blkdev_reset_zone(struct block_device *bdev, blk_mode_t mode,
			     struct blk_zone_range *zrange)
{
	loff_t start, end;
	int ret = -EINVAL;

	inode_lock(bdev->bd_mapping->host);
	filemap_invalidate_lock(bdev->bd_mapping);
	if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
	    zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
		/* Out of range */
		goto out_unlock;

	start = zrange->sector << SECTOR_SHIFT;
	end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;

	ret = truncate_bdev_range(bdev, mode, start, end);
	if (ret)
		goto out_unlock;

	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zrange->sector,
			       zrange->nr_sectors);
out_unlock:
	filemap_invalidate_unlock(bdev->bd_mapping);
	inode_unlock(bdev->bd_mapping->host);
	return ret;
}

/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct blk_zone_range zrange;
	enum req_op op;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (!(mode & BLK_OPEN_WRITE))
		return -EBADF;

	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
		return -EFAULT;

	switch (cmd) {
	case BLKRESETZONE:
		return blkdev_reset_zone(bdev, mode, &zrange);
	case BLKOPENZONE:
		op = REQ_OP_ZONE_OPEN;
		break;
	case BLKCLOSEZONE:
		op = REQ_OP_ZONE_CLOSE;
		break;
	case BLKFINISHZONE:
		op = REQ_OP_ZONE_FINISH;
		break;
	default:
		return -ENOTTY;
	}

	return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
}

static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
{
	return zone->start + zone->len >= get_capacity(disk);
}

static bool disk_zone_wplug_is_full(struct gendisk *disk,
				    struct blk_zone_wplug *zwplug)
{
	if (zwplug->zone_no < disk->nr_zones - 1)
		return zwplug->wp_offset >= disk->zone_capacity;
	return zwplug->wp_offset >= disk->last_zone_capacity;
}

static bool disk_insert_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	struct blk_zone_wplug *zwplg;
	unsigned long flags;
	u8 *zones_cond;
	unsigned int idx =
		hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);

	/*
	 * Add the new zone write plug to the hash table, but carefully as we
	 * are racing with other submission contexts, so we may already have a
	 * zone write plug for the same zone.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
	hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
		if (zwplg->zone_no == zwplug->zone_no) {
			spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock,
					       flags);
			return false;
		}
	}

	/*
	 * Set the zone condition: if we do not yet have a zones_cond array
	 * attached to the disk, then this is a zone write plug insert from the
	 * first call to blk_revalidate_disk_zones(), in which case the zone is
	 * necessarily in the active condition.
	 */
	zones_cond = rcu_dereference_check(disk->zones_cond,
				lockdep_is_held(&disk->zone_wplugs_hash_lock));
	if (zones_cond)
		zwplug->cond = zones_cond[zwplug->zone_no];
	else
		zwplug->cond = BLK_ZONE_COND_ACTIVE;

	hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
	atomic_inc(&disk->nr_zone_wplugs);
	spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);

	return true;
}

static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk,
							 sector_t sector)
{
	unsigned int zno = disk_zone_no(disk, sector);
	unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
	struct blk_zone_wplug *zwplug;

	rcu_read_lock();

	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
		if (zwplug->zone_no == zno &&
		    refcount_inc_not_zero(&zwplug->ref)) {
			rcu_read_unlock();
			return zwplug;
		}
	}

	rcu_read_unlock();

	return NULL;
}

static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
							 sector_t sector)
{
	if (!atomic_read(&disk->nr_zone_wplugs))
		return NULL;

	return disk_get_hashed_zone_wplug(disk, sector);
}

static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
{
	struct blk_zone_wplug *zwplug =
		container_of(rcu_head, struct blk_zone_wplug, rcu_head);

	mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
}

static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug)
{
	struct gendisk *disk = zwplug->disk;
	unsigned long flags;

	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_DEAD));
	WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
	WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));

	spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
	blk_zone_set_cond(rcu_dereference_check(disk->zones_cond,
				lockdep_is_held(&disk->zone_wplugs_hash_lock)),
			  zwplug->zone_no, zwplug->cond);
	hlist_del_init_rcu(&zwplug->node);
	atomic_dec(&disk->nr_zone_wplugs);
	spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);

	call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
}

static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
	if (refcount_dec_and_test(&zwplug->ref))
		disk_free_zone_wplug(zwplug);
}
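
/*
 * Example (illustrative sketch): the lookup/lock/unlock/put pattern used
 * throughout this file to safely access a zone write plug. The sector
 * variable is hypothetical.
 *
 *	struct blk_zone_wplug *zwplug;
 *	unsigned long flags;
 *
 *	zwplug = disk_get_zone_wplug(disk, sector);
 *	if (zwplug) {
 *		spin_lock_irqsave(&zwplug->lock, flags);
 *		... inspect or update zwplug->wp_offset, zwplug->cond ...
 *		spin_unlock_irqrestore(&zwplug->lock, flags);
 *		disk_put_zone_wplug(zwplug);
 *	}
 */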

/*
 * Flag the zone write plug as dead and drop the initial reference we got when
 * the zone write plug was added to the hash table. The zone write plug will be
 * unhashed when its last reference is dropped.
 */
static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug)
{
	lockdep_assert_held(&zwplug->lock);

	if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)) {
		zwplug->flags |= BLK_ZONE_WPLUG_DEAD;
		disk_put_zone_wplug(zwplug);
	}
}

static inline bool disk_check_zone_wplug_dead(struct blk_zone_wplug *zwplug)
{
	if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD))
		return false;

	/*
	 * If a new write is received right after a zone reset completes and
	 * while the disk_zone_wplugs_worker() thread has not yet released the
	 * reference on the zone write plug after processing the last write to
	 * the zone, then the new write BIO will see the zone write plug marked
	 * as dead. This case is however a false positive and a perfectly valid
	 * pattern. In that case, restore the zone write plug to a live one.
	 */
	if (!zwplug->wp_offset && bio_list_empty(&zwplug->bio_list)) {
		zwplug->flags &= ~BLK_ZONE_WPLUG_DEAD;
		refcount_inc(&zwplug->ref);
		return false;
	}

	return true;
}

static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
				       struct blk_zone_wplug *zwplug);

static void blk_zone_wplug_bio_work(struct work_struct *work)
{
	struct blk_zone_wplug *zwplug =
		container_of(work, struct blk_zone_wplug, bio_work);

	disk_zone_wplug_submit_bio(zwplug->disk, zwplug);

	/* Drop the reference we took in disk_zone_wplug_schedule_work(). */
	disk_put_zone_wplug(zwplug);
}

/*
 * Get a zone write plug for the zone containing @sector.
 * If the plug does not exist, it is allocated and inserted in the disk hash
 * table.
 */
static struct blk_zone_wplug *disk_get_or_alloc_zone_wplug(struct gendisk *disk,
					sector_t sector, gfp_t gfp_mask)
{
	unsigned int zno = disk_zone_no(disk, sector);
	struct blk_zone_wplug *zwplug;

again:
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug)
		return zwplug;

	/*
	 * Allocate and initialize a zone write plug with an extra reference
	 * so that it is not freed when the zone write plug becomes idle without
	 * the zone being full.
	 */
	zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
	if (!zwplug)
		return NULL;

	INIT_HLIST_NODE(&zwplug->node);
	refcount_set(&zwplug->ref, 2);
	spin_lock_init(&zwplug->lock);
	zwplug->flags = 0;
	zwplug->zone_no = zno;
	zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
	bio_list_init(&zwplug->bio_list);
	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
	INIT_LIST_HEAD(&zwplug->entry);
	zwplug->disk = disk;

	/*
	 * Insert the new zone write plug in the hash table. This can fail only
	 * if another context already inserted a plug. Retry from the beginning
	 * in that case.
	 */
	if (!disk_insert_zone_wplug(disk, zwplug)) {
		mempool_free(zwplug, disk->zone_wplugs_pool);
		goto again;
	}

	return zwplug;
}

static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
					       struct bio *bio)
{
	struct request_queue *q = zwplug->disk->queue;

	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
	bio_io_error(bio);
	disk_put_zone_wplug(zwplug);
	/* Drop the reference taken by disk_zone_wplug_add_bio(). */
	blk_queue_exit(q);
}

/*
 * Abort (fail) all plugged BIOs of a zone write plug.
 */
static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
	struct gendisk *disk = zwplug->disk;
	struct bio *bio;

	lockdep_assert_held(&zwplug->lock);

	if (bio_list_empty(&zwplug->bio_list))
		return;

	pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n",
			    zwplug->disk->disk_name, zwplug->zone_no);
	while ((bio = bio_list_pop(&zwplug->bio_list)))
		blk_zone_wplug_bio_io_error(zwplug, bio);

	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

	/*
	 * If we are using the per-disk zone write plugs worker thread, remove
	 * the zone write plug from the work list and drop the reference we
	 * took when the zone write plug was added to that list.
	 */
	if (blk_queue_zoned_qd1_writes(disk->queue)) {
		spin_lock(&disk->zone_wplugs_list_lock);
		if (!list_empty(&zwplug->entry)) {
			list_del_init(&zwplug->entry);
			disk_put_zone_wplug(zwplug);
		}
		spin_unlock(&disk->zone_wplugs_list_lock);
	}
}

/*
 * Update a zone write plug condition based on its write pointer offset.
 */
static void disk_zone_wplug_update_cond(struct gendisk *disk,
					struct blk_zone_wplug *zwplug)
{
	lockdep_assert_held(&zwplug->lock);

	if (disk_zone_wplug_is_full(disk, zwplug))
		zwplug->cond = BLK_ZONE_COND_FULL;
	else if (!zwplug->wp_offset)
		zwplug->cond = BLK_ZONE_COND_EMPTY;
	else
		zwplug->cond = BLK_ZONE_COND_ACTIVE;
}

/*
 * Set a zone write plug write pointer offset to the specified value.
 * This aborts all plugged BIOs, which is fine as this function is called for
 * a zone reset operation, a zone finish operation or if the zone needs a wp
 * update from a report zone after a write error.
 */
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug,
					  unsigned int wp_offset)
{
	lockdep_assert_held(&zwplug->lock);

	/* Update the zone write pointer and abort all plugged BIOs. */
	zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE;
	zwplug->wp_offset = wp_offset;
	disk_zone_wplug_update_cond(disk, zwplug);

	disk_zone_wplug_abort(zwplug);
	if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
		disk_mark_zone_wplug_dead(zwplug);
}

static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
{
	switch (zone->cond) {
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
	case BLK_ZONE_COND_CLOSED:
	case BLK_ZONE_COND_ACTIVE:
		return zone->wp - zone->start;
	case BLK_ZONE_COND_EMPTY:
		return 0;
	case BLK_ZONE_COND_FULL:
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
	default:
		/*
		 * Conventional, full, offline and read-only zones do not have
		 * a valid write pointer.
		 */
		return UINT_MAX;
	}
}

static unsigned int disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
						   struct blk_zone *zone)
{
	struct blk_zone_wplug *zwplug;
	unsigned int wp_offset = blk_zone_wp_offset(zone);

	zwplug = disk_get_zone_wplug(disk, zone->start);
	if (zwplug) {
		unsigned long flags;

		spin_lock_irqsave(&zwplug->lock, flags);
		if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
			disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
	}

	return wp_offset;
}

/**
 * disk_report_zone - Report one zone
 * @disk:	Target disk
 * @zone:	The zone to report
 * @idx:	The index of the zone in the overall zone report
 * @args:	report zones callback and data
 *
 * Description:
 *    Helper function for block device drivers to report one zone of a zone
 *    report initiated with blkdev_report_zones(). The zone being reported is
 *    specified by @zone and used to update, if necessary, the zone write plug
 *    information for the zone. If @args specifies a user callback function,
 *    this callback is executed.
 */
int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
		     unsigned int idx, struct blk_report_zones_args *args)
{
	if (args && args->report_active) {
		/*
		 * If we come here, then this is a report zones executed as a
		 * fallback for a cached report. So collapse the implicit open,
		 * explicit open and closed conditions into the active zone
		 * condition.
		 */
		switch (zone->cond) {
		case BLK_ZONE_COND_IMP_OPEN:
		case BLK_ZONE_COND_EXP_OPEN:
		case BLK_ZONE_COND_CLOSED:
			zone->cond = BLK_ZONE_COND_ACTIVE;
			break;
		default:
			break;
		}
	}

	if (disk->zone_wplugs_hash)
		disk_zone_wplug_sync_wp_offset(disk, zone);

	if (args && args->cb)
		return args->cb(zone, idx, args->data);

	return 0;
}
EXPORT_SYMBOL_GPL(disk_report_zone);
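
/*
 * Example (illustrative sketch, not part of this file): a block device
 * driver's report_zones method calling disk_report_zone() for each zone it
 * parses from the device response. The my_report_zones name is hypothetical.
 *
 *	static int my_report_zones(struct gendisk *disk, sector_t sector,
 *				   unsigned int nr_zones,
 *				   struct blk_report_zones_args *args)
 *	{
 *		struct blk_zone zone;
 *		unsigned int i;
 *		int ret;
 *
 *		for (i = 0; i < nr_zones; i++) {
 *			... fill in zone from the device response ...
 *			ret = disk_report_zone(disk, &zone, i, args);
 *			if (ret)
 *				return ret;
 *		}
 *
 *		return i;
 *	}
 */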

static int blkdev_report_zone_cb(struct blk_zone *zone, unsigned int idx,
				 void *data)
{
	memcpy(data, zone, sizeof(struct blk_zone));
	return 0;
}

static int blkdev_report_zone_fallback(struct block_device *bdev,
				       sector_t sector, struct blk_zone *zone)
{
	struct blk_report_zones_args args = {
		.cb = blkdev_report_zone_cb,
		.data = zone,
		.report_active = true,
	};
	int error;

	error = blkdev_do_report_zones(bdev, sector, 1, &args);
	if (error < 0)
		return error;
	if (error == 0)
		return -EIO;
	return 0;
}

/*
 * For devices that natively support zone append operations, we do not use zone
 * write plugging for zone append writes, which makes the zone condition
 * tracking invalid once zone append has been used. In that case, fall back to
 * a regular report zones to get correct information.
 */
static inline bool blkdev_has_cached_report_zones(struct block_device *bdev)
{
	return disk_need_zone_resources(bdev->bd_disk) &&
		(bdev_emulates_zone_append(bdev) ||
		 !test_bit(GD_ZONE_APPEND_USED, &bdev->bd_disk->state));
}

/**
 * blkdev_get_zone_info - Get information for a single zone from cached data
 * @bdev:   Target block device
 * @sector: Sector contained by the target zone
 * @zone:   zone structure to return the zone information
 *
 * Description:
 *    Get the zone information for the zone containing @sector using the zone
 *    write plug of the target zone, if one exists, or the disk zone condition
 *    array otherwise. The zone condition may be reported as being
 *    the BLK_ZONE_COND_ACTIVE condition for a zone that is in the implicit
 *    open, explicit open or closed condition.
 *
 *    Returns 0 on success and a negative error code on failure.
 */
int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
			 struct blk_zone *zone)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	struct blk_zone_wplug *zwplug;
	unsigned long flags;
	u8 *zones_cond;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	if (sector >= get_capacity(disk))
		return -EINVAL;

	memset(zone, 0, sizeof(*zone));
	sector = bdev_zone_start(bdev, sector);

	if (!blkdev_has_cached_report_zones(bdev))
		return blkdev_report_zone_fallback(bdev, sector, zone);

	rcu_read_lock();
	zones_cond = rcu_dereference(disk->zones_cond);
	if (!disk->zone_wplugs_hash || !zones_cond) {
		rcu_read_unlock();
		return blkdev_report_zone_fallback(bdev, sector, zone);
	}
	zone->cond = zones_cond[disk_zone_no(disk, sector)];
	rcu_read_unlock();

	zone->start = sector;
	zone->len = zone_sectors;

	/*
	 * If this is a conventional zone, we do not have a zone write plug and
	 * can report the zone immediately.
	 */
	if (zone->cond == BLK_ZONE_COND_NOT_WP) {
		zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
		zone->capacity = zone_sectors;
		zone->wp = ULLONG_MAX;
		return 0;
	}

	/*
	 * This is a sequential write required zone. If the zone is read-only or
	 * offline, only set the zone write pointer to an invalid value and
	 * report the zone.
	 */
	zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
	if (disk_zone_is_last(disk, zone))
		zone->capacity = disk->last_zone_capacity;
	else
		zone->capacity = disk->zone_capacity;

	if (zone->cond == BLK_ZONE_COND_READONLY ||
	    zone->cond == BLK_ZONE_COND_OFFLINE) {
		zone->wp = ULLONG_MAX;
		return 0;
	}

	/*
	 * If the zone does not have a zone write plug, it is either full or
	 * empty, as we otherwise would have a zone write plug for it. In this
	 * case, set the write pointer accordingly and report the zone.
	 * Otherwise, if we have a zone write plug, use it.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (!zwplug) {
		if (zone->cond == BLK_ZONE_COND_FULL)
			zone->wp = ULLONG_MAX;
		else
			zone->wp = sector;
		return 0;
	}

	spin_lock_irqsave(&zwplug->lock, flags);
	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
		return blkdev_report_zone_fallback(bdev, sector, zone);
	}
	zone->cond = zwplug->cond;
	zone->wp = sector + zwplug->wp_offset;
	spin_unlock_irqrestore(&zwplug->lock, flags);

	disk_put_zone_wplug(zwplug);

	return 0;
}
EXPORT_SYMBOL_GPL(blkdev_get_zone_info);
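
/*
 * Example (illustrative sketch): querying the write pointer of the zone
 * containing a given sector. The bdev and sector variables are hypothetical.
 *
 *	struct blk_zone zone;
 *	int ret;
 *
 *	ret = blkdev_get_zone_info(bdev, sector, &zone);
 *	if (ret)
 *		return ret;
 *	if (zone.wp != ULLONG_MAX)
 *		pr_debug("zone %llu: wp at %llu\n",
 *			 (unsigned long long)zone.start,
 *			 (unsigned long long)zone.wp);
 */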
1027 
1028 /**
1029  * blkdev_report_zones_cached - Get cached zones information
1030  * @bdev:     Target block device
1031  * @sector:   Sector from which to report zones
1032  * @nr_zones: Maximum number of zones to report
1033  * @cb:       Callback function called for each reported zone
1034  * @data:     Private data for the callback function
1035  *
1036  * Description:
1037  *    Similar to blkdev_report_zones() but instead of calling into the low level
1038  *    device driver to get the zone report from the device, use
1039  *    blkdev_get_zone_info() to generate the report from the disk zone write
1040  *    plugs and zones condition array. Since calling this function without a
1041  *    callback does not make sense, @cb must be specified.
1042  */
blkdev_report_zones_cached(struct block_device * bdev,sector_t sector,unsigned int nr_zones,report_zones_cb cb,void * data)1043 int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector,
1044 			unsigned int nr_zones, report_zones_cb cb, void *data)
1045 {
1046 	struct gendisk *disk = bdev->bd_disk;
1047 	sector_t capacity = get_capacity(disk);
1048 	sector_t zone_sectors = bdev_zone_sectors(bdev);
1049 	unsigned int idx = 0;
1050 	struct blk_zone zone;
1051 	int ret;
1052 
1053 	if (!cb || !bdev_is_zoned(bdev) ||
1054 	    WARN_ON_ONCE(!disk->fops->report_zones))
1055 		return -EOPNOTSUPP;
1056 
1057 	if (!nr_zones || sector >= capacity)
1058 		return 0;
1059 
1060 	if (!blkdev_has_cached_report_zones(bdev)) {
1061 		struct blk_report_zones_args args = {
1062 			.cb = cb,
1063 			.data = data,
1064 			.report_active = true,
1065 		};
1066 
1067 		return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
1068 	}
1069 
1070 	for (sector = bdev_zone_start(bdev, sector);
1071 	     sector < capacity && idx < nr_zones;
1072 	     sector += zone_sectors, idx++) {
1073 		ret = blkdev_get_zone_info(bdev, sector, &zone);
1074 		if (ret)
1075 			return ret;
1076 
1077 		ret = cb(&zone, idx, data);
1078 		if (ret)
1079 			return ret;
1080 	}
1081 
1082 	return idx;
1083 }
1084 EXPORT_SYMBOL_GPL(blkdev_report_zones_cached);
1085 
blk_zone_reset_bio_endio(struct bio * bio)1086 static void blk_zone_reset_bio_endio(struct bio *bio)
1087 {
1088 	struct gendisk *disk = bio->bi_bdev->bd_disk;
1089 	sector_t sector = bio->bi_iter.bi_sector;
1090 	struct blk_zone_wplug *zwplug;
1091 
1092 	/*
1093 	 * If we have a zone write plug, set its write pointer offset to 0.
1094 	 * This will abort all BIOs plugged for the target zone. It is fine as
1095 	 * resetting zones while writes are still in-flight will result in the
1096 	 * writes failing anyway.
1097 	 */
1098 	zwplug = disk_get_zone_wplug(disk, sector);
1099 	if (zwplug) {
1100 		unsigned long flags;
1101 
1102 		spin_lock_irqsave(&zwplug->lock, flags);
1103 		disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
1104 		spin_unlock_irqrestore(&zwplug->lock, flags);
1105 		disk_put_zone_wplug(zwplug);
1106 	} else {
1107 		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
1108 	}
1109 }
1110 
blk_zone_reset_all_bio_endio(struct bio * bio)1111 static void blk_zone_reset_all_bio_endio(struct bio *bio)
1112 {
1113 	struct gendisk *disk = bio->bi_bdev->bd_disk;
1114 	sector_t capacity = get_capacity(disk);
1115 	struct blk_zone_wplug *zwplug;
1116 	unsigned long flags;
1117 	sector_t sector;
1118 	unsigned int i;
1119 
1120 	if (atomic_read(&disk->nr_zone_wplugs)) {
1121 		/* Update the condition of all zone write plugs. */
1122 		rcu_read_lock();
1123 		for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
1124 			hlist_for_each_entry_rcu(zwplug,
1125 						 &disk->zone_wplugs_hash[i],
1126 						 node) {
1127 				spin_lock_irqsave(&zwplug->lock, flags);
1128 				disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
1129 				spin_unlock_irqrestore(&zwplug->lock, flags);
1130 			}
1131 		}
1132 		rcu_read_unlock();
1133 	}
1134 
1135 	/* Update the cached zone conditions. */
1136 	for (sector = 0; sector < capacity;
1137 	     sector += bdev_zone_sectors(bio->bi_bdev))
1138 		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
1139 	clear_bit(GD_ZONE_APPEND_USED, &disk->state);
1140 }
1141 
blk_zone_finish_bio_endio(struct bio * bio)1142 static void blk_zone_finish_bio_endio(struct bio *bio)
1143 {
1144 	struct block_device *bdev = bio->bi_bdev;
1145 	struct gendisk *disk = bdev->bd_disk;
1146 	sector_t sector = bio->bi_iter.bi_sector;
1147 	struct blk_zone_wplug *zwplug;
1148 
1149 	/*
1150 	 * If we have a zone write plug, set its write pointer offset to the
1151 	 * zone size. This will abort all BIOs plugged for the target zone. It
1152 	 * is fine as resetting zones while writes are still in-flight will
1153 	 * result in the writes failing anyway.
1154 	 */
1155 	zwplug = disk_get_zone_wplug(disk, sector);
1156 	if (zwplug) {
1157 		unsigned long flags;
1158 
1159 		spin_lock_irqsave(&zwplug->lock, flags);
1160 		disk_zone_wplug_set_wp_offset(disk, zwplug,
1161 					      bdev_zone_sectors(bdev));
1162 		spin_unlock_irqrestore(&zwplug->lock, flags);
1163 		disk_put_zone_wplug(zwplug);
1164 	} else {
1165 		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_FULL);
1166 	}
1167 }
1168 
blk_zone_mgmt_bio_endio(struct bio * bio)1169 void blk_zone_mgmt_bio_endio(struct bio *bio)
1170 {
1171 	/* If the BIO failed, we have nothing to do. */
1172 	if (bio->bi_status != BLK_STS_OK)
1173 		return;
1174 
1175 	switch (bio_op(bio)) {
1176 	case REQ_OP_ZONE_RESET:
1177 		blk_zone_reset_bio_endio(bio);
1178 		return;
1179 	case REQ_OP_ZONE_RESET_ALL:
1180 		blk_zone_reset_all_bio_endio(bio);
1181 		return;
1182 	case REQ_OP_ZONE_FINISH:
1183 		blk_zone_finish_bio_endio(bio);
1184 		return;
1185 	default:
1186 		return;
1187 	}
1188 }
1189 
disk_zone_wplug_schedule_work(struct gendisk * disk,struct blk_zone_wplug * zwplug)1190 static void disk_zone_wplug_schedule_work(struct gendisk *disk,
1191 					  struct blk_zone_wplug *zwplug)
1192 {
1193 	lockdep_assert_held(&zwplug->lock);
1194 
1195 	/*
1196 	 * Schedule the submission of the next plugged BIO. Taking a reference
1197 	 * to the zone write plug is required as the bio_work belongs to the
1198 	 * plug, and thus we must ensure that the write plug does not go away
1199 	 * while the work is being scheduled but has not run yet.
1200 	 * blk_zone_wplug_bio_work() will release the reference we take here,
1201 	 * and we also drop this reference if the work is already scheduled.
1202 	 */
1203 	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
1204 	WARN_ON_ONCE(blk_queue_zoned_qd1_writes(disk->queue));
1205 	refcount_inc(&zwplug->ref);
1206 	if (!queue_work(disk->zone_wplugs_wq, &zwplug->bio_work))
1207 		disk_put_zone_wplug(zwplug);
1208 }
1209 
disk_zone_wplug_add_bio(struct gendisk * disk,struct blk_zone_wplug * zwplug,struct bio * bio,unsigned int nr_segs)1210 static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
1211 				struct blk_zone_wplug *zwplug,
1212 				struct bio *bio, unsigned int nr_segs)
1213 {
1214 	/*
1215 	 * Grab an extra reference on the BIO request queue usage counter.
1216 	 * This reference will be reused to submit a request for the BIO for
1217 	 * blk-mq devices and dropped when the BIO is failed and after
1218 	 * it is issued in the case of BIO-based devices.
1219 	 */
1220 	percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);
1221 
1222 	/*
1223 	 * The BIO is being plugged and thus will have to wait for the on-going
1224 	 * write and for all other writes already plugged. So polling makes
1225 	 * no sense.
1226 	 */
1227 	bio_clear_polled(bio);
1228 
1229 	/*
1230 	 * Reuse the poll cookie field to store the number of segments when
1231 	 * split to the hardware limits.
1232 	 */
1233 	bio->__bi_nr_segments = nr_segs;
1234 
1235 	/*
1236 	 * We always receive BIOs after they are split and ready to be issued.
1237 	 * The block layer passes the parts of a split BIO in order, and the
1238 	 * user must also issue write sequentially. So simply add the new BIO
1239 	 * at the tail of the list to preserve the sequential write order.
1240 	 */
1241 	bio_list_add(&zwplug->bio_list, bio);
1242 	trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no,
1243 				      bio->bi_iter.bi_sector, bio_sectors(bio));
1244 
1245 	/*
1246 	 * If we are using the disk zone write plugs worker instead of the per
1247 	 * zone write plug BIO work, add the zone write plug to the work list
1248 	 * if it is not already there. Make sure to also get an extra reference
1249 	 * on the zone write plug so that it does not go away until it is
1250 	 * removed from the work list.
1251 	 */
1252 	if (blk_queue_zoned_qd1_writes(disk->queue)) {
1253 		spin_lock(&disk->zone_wplugs_list_lock);
1254 		if (list_empty(&zwplug->entry)) {
1255 			list_add_tail(&zwplug->entry, &disk->zone_wplugs_list);
1256 			refcount_inc(&zwplug->ref);
1257 		}
1258 		spin_unlock(&disk->zone_wplugs_list_lock);
1259 	}
1260 }
1261 
1262 /*
1263  * Called from bio_attempt_back_merge() when a BIO was merged with a request.
1264  */
blk_zone_write_plug_bio_merged(struct bio * bio)1265 void blk_zone_write_plug_bio_merged(struct bio *bio)
1266 {
1267 	struct gendisk *disk = bio->bi_bdev->bd_disk;
1268 	struct blk_zone_wplug *zwplug;
1269 	unsigned long flags;
1270 
1271 	/*
1272 	 * If the BIO was already plugged, then we were called through
1273 	 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
1274 	 * For this case, we already hold a reference on the zone write plug for
1275 	 * the BIO and blk_zone_write_plug_init_request() will handle the
1276 	 * zone write pointer offset update.
1277 	 */
1278 	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
1279 		return;
1280 
1281 	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
1282 
1283 	/*
1284 	 * Get a reference on the zone write plug of the target zone and advance
1285 	 * the zone write pointer offset. Given that this is a merge, we already
1286 	 * have at least one request and one BIO referencing the zone write
1287 	 * plug. So this should not fail.
1288 	 */
1289 	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
1290 	if (WARN_ON_ONCE(!zwplug))
1291 		return;
1292 
1293 	spin_lock_irqsave(&zwplug->lock, flags);
1294 	zwplug->wp_offset += bio_sectors(bio);
1295 	disk_zone_wplug_update_cond(disk, zwplug);
1296 	spin_unlock_irqrestore(&zwplug->lock, flags);
1297 }
1298 
1299 /*
1300  * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
1301  * already went through zone write plugging (either a new BIO or one that was
1302  * unplugged).
1303  */
blk_zone_write_plug_init_request(struct request * req)1304 void blk_zone_write_plug_init_request(struct request *req)
1305 {
1306 	sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
1307 	struct request_queue *q = req->q;
1308 	struct gendisk *disk = q->disk;
1309 	struct blk_zone_wplug *zwplug =
1310 		disk_get_zone_wplug(disk, blk_rq_pos(req));
1311 	unsigned long flags;
1312 	struct bio *bio;
1313 
1314 	if (WARN_ON_ONCE(!zwplug))
1315 		return;
1316 
1317 	/*
1318 	 * Indicate that completion of this request needs to be handled with
1319 	 * blk_zone_write_plug_finish_request(), which will drop the reference
1320 	 * on the zone write plug we took above on entry to this function.
1321 	 */
1322 	req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;
1323 
1324 	if (blk_queue_nomerges(q))
1325 		return;
1326 
1327 	/*
1328 	 * Walk through the list of plugged BIOs to check if they can be merged
1329 	 * into the back of the request.
1330 	 */
1331 	spin_lock_irqsave(&zwplug->lock, flags);
1332 	while (!disk_zone_wplug_is_full(disk, zwplug)) {
1333 		bio = bio_list_peek(&zwplug->bio_list);
1334 		if (!bio)
1335 			break;
1336 
1337 		if (bio->bi_iter.bi_sector != req_back_sector ||
1338 		    !blk_rq_merge_ok(req, bio))
1339 			break;
1340 
1341 		WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
1342 			     !bio->__bi_nr_segments);
1343 
1344 		bio_list_pop(&zwplug->bio_list);
1345 		if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
1346 		    BIO_MERGE_OK) {
1347 			bio_list_add_head(&zwplug->bio_list, bio);
1348 			break;
1349 		}
1350 
1351 		/* Drop the reference taken by disk_zone_wplug_add_bio(). */
1352 		blk_queue_exit(q);
1353 		zwplug->wp_offset += bio_sectors(bio);
1354 		disk_zone_wplug_update_cond(disk, zwplug);
1355 
1356 		req_back_sector += bio_sectors(bio);
1357 	}
1358 	spin_unlock_irqrestore(&zwplug->lock, flags);
1359 }
1360 
1361 /*
1362  * Check and prepare a BIO for submission by incrementing the write pointer
1363  * offset of its zone write plug and changing zone append operations into
1364  * regular write when zone append emulation is needed.
1365  */
blk_zone_wplug_prepare_bio(struct blk_zone_wplug * zwplug,struct bio * bio)1366 static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
1367 				       struct bio *bio)
1368 {
1369 	struct gendisk *disk = bio->bi_bdev->bd_disk;
1370 
1371 	lockdep_assert_held(&zwplug->lock);
1372 
1373 	/*
1374 	 * If we lost track of the zone write pointer due to a write error,
1375 	 * the user must either execute a report zones, reset the zone or finish
1376 	 * the to recover a reliable write pointer position. Fail BIOs if the
1377 	 * user did not do that as we cannot handle emulated zone append
1378 	 * otherwise.
1379 	 */
1380 	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
1381 		return false;
1382 
1383 	/*
1384 	 * Check that the user is not attempting to write to a full zone.
1385 	 * We know such BIO will fail, and that would potentially overflow our
1386 	 * write pointer offset beyond the end of the zone.
1387 	 */
1388 	if (disk_zone_wplug_is_full(disk, zwplug))
1389 		return false;
1390 
1391 	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
1392 		/*
1393 		 * Use a regular write starting at the current write pointer.
1394 		 * Similarly to native zone append operations, do not allow
1395 		 * merging.
1396 		 */
1397 		bio->bi_opf &= ~REQ_OP_MASK;
1398 		bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
1399 		bio->bi_iter.bi_sector += zwplug->wp_offset;
1400 
1401 		/*
1402 		 * Remember that this BIO is in fact a zone append operation
1403 		 * so that we can restore its operation code on completion.
1404 		 */
1405 		bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
1406 	} else {
1407 		/*
1408 		 * Check for non-sequential writes early as we know that BIOs
1409 		 * with a start sector not unaligned to the zone write pointer
1410 		 * will fail.
1411 		 */
1412 		if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
1413 			return false;
1414 	}
1415 
1416 	/* Advance the zone write pointer offset. */
1417 	zwplug->wp_offset += bio_sectors(bio);
1418 	disk_zone_wplug_update_cond(disk, zwplug);
1419 
1420 	return true;
1421 }
1422 
blk_zone_wplug_handle_write(struct bio * bio,unsigned int nr_segs)1423 static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
1424 {
1425 	struct gendisk *disk = bio->bi_bdev->bd_disk;
1426 	sector_t sector = bio->bi_iter.bi_sector;
1427 	struct blk_zone_wplug *zwplug;
1428 	gfp_t gfp_mask = GFP_NOIO;
1429 	unsigned long flags;
1430 
1431 	/*
1432 	 * BIOs must be fully contained within a zone so that we use the correct
1433 	 * zone write plug for the entire BIO. For blk-mq devices, the block
1434 	 * layer should already have done any splitting required to ensure this
1435 	 * and this BIO should thus not be straddling zone boundaries. For
1436 	 * BIO-based devices, it is the responsibility of the driver to split
1437 	 * the bio before submitting it.
1438 	 */
1439 	if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
1440 		bio_io_error(bio);
1441 		return true;
1442 	}
1443 
1444 	/* Conventional zones do not need write plugging. */
1445 	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
1446 		/* Zone append to conventional zones is not allowed. */
1447 		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
1448 			bio_io_error(bio);
1449 			return true;
1450 		}
1451 		return false;
1452 	}
1453 
1454 	if (bio->bi_opf & REQ_NOWAIT)
1455 		gfp_mask = GFP_NOWAIT;
1456 
1457 	zwplug = disk_get_or_alloc_zone_wplug(disk, sector, gfp_mask);
1458 	if (!zwplug) {
1459 		if (bio->bi_opf & REQ_NOWAIT)
1460 			bio_wouldblock_error(bio);
1461 		else
1462 			bio_io_error(bio);
1463 		return true;
1464 	}
1465 
1466 	spin_lock_irqsave(&zwplug->lock, flags);
1467 
1468 	/*
1469 	 * Check if we got a zone write plug marked as dead. If yes, then the
1470 	 * user is likely issuing writes to a full zone, or without
1471 	 * synchronizing with zone reset or zone finish operations. In such
1472 	 * case, fail the BIO to signal this invalid usage.
1473 	 */
1474 	if (disk_check_zone_wplug_dead(zwplug)) {
1475 		spin_unlock_irqrestore(&zwplug->lock, flags);
1476 		disk_put_zone_wplug(zwplug);
1477 		bio_io_error(bio);
1478 		return true;
1479 	}
1480 
1481 	/* Indicate that this BIO is being handled using zone write plugging. */
1482 	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
1483 
1484 	/*
1485 	 * Add REQ_NOWAIT BIOs to the plug list to ensure that we will not see a
1486 	 * BLK_STS_AGAIN failure if we let the caller submit the BIO.
1487 	 */
1488 	if (bio->bi_opf & REQ_NOWAIT) {
1489 		bio->bi_opf &= ~REQ_NOWAIT;
1490 		goto queue_bio;
1491 	}
1492 
1493 	/*
1494 	 * For rotational devices, we will use the gendisk zone write plugs
1495 	 * work instead of the per zone write plug BIO work, so queue the BIO.
1496 	 */
1497 	if (blk_queue_zoned_qd1_writes(disk->queue))
1498 		goto queue_bio;
1499 
1500 	/* If the zone is already plugged, add the BIO to the BIO plug list. */
1501 	if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
1502 		goto queue_bio;
1503 
1504 	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
1505 		spin_unlock_irqrestore(&zwplug->lock, flags);
1506 		bio_io_error(bio);
1507 		return true;
1508 	}
1509 
1510 	/* Otherwise, plug and let the caller submit the BIO. */
1511 	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
1512 
1513 	spin_unlock_irqrestore(&zwplug->lock, flags);
1514 
1515 	return false;
1516 
1517 queue_bio:
1518 	disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs);
1519 
1520 	if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) {
1521 		zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
1522 		if (blk_queue_zoned_qd1_writes(disk->queue))
1523 			wake_up_process(disk->zone_wplugs_worker);
1524 		else
1525 			disk_zone_wplug_schedule_work(disk, zwplug);
1526 	}
1527 
1528 	spin_unlock_irqrestore(&zwplug->lock, flags);
1529 
1530 	return true;
1531 }
1532 
blk_zone_wplug_handle_native_zone_append(struct bio * bio)1533 static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
1534 {
1535 	struct gendisk *disk = bio->bi_bdev->bd_disk;
1536 	struct blk_zone_wplug *zwplug;
1537 	unsigned long flags;
1538 
1539 	if (!test_bit(GD_ZONE_APPEND_USED, &disk->state))
1540 		set_bit(GD_ZONE_APPEND_USED, &disk->state);
1541 
1542 	/*
1543 	 * We have native support for zone append operations, so we are not
1544 	 * going to handle @bio through plugging. However, we may already have a
1545 	 * zone write plug for the target zone if that zone was previously
1546 	 * partially written using regular writes. In such case, we risk leaving
1547 	 * the plug in the disk hash table if the zone is fully written using
1548 	 * zone append operations. Avoid this by removing the zone write plug.
1549 	 */
1550 	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
1551 	if (likely(!zwplug))
1552 		return;
1553 
1554 	spin_lock_irqsave(&zwplug->lock, flags);
1555 
1556 	/*
1557 	 * We are about to remove the zone write plug. But if the user
1558 	 * (mistakenly) has issued regular writes together with native zone
1559 	 * append, we must aborts the writes as otherwise the plugged BIOs would
1560 	 * not be executed by the plug BIO work as disk_get_zone_wplug() will
1561 	 * return NULL after the plug is removed. Aborting the plugged write
1562 	 * BIOs is consistent with the fact that these writes will most likely
1563 	 * fail anyway as there is no ordering guarantees between zone append
1564 	 * operations and regular write operations.
1565 	 */
1566 	if (!bio_list_empty(&zwplug->bio_list)) {
1567 		pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n",
1568 				    disk->disk_name, zwplug->zone_no);
1569 		disk_zone_wplug_abort(zwplug);
1570 	}
1571 	disk_mark_zone_wplug_dead(zwplug);
1572 	spin_unlock_irqrestore(&zwplug->lock, flags);
1573 
1574 	disk_put_zone_wplug(zwplug);
1575 }
1576 
1577 static bool blk_zone_wplug_handle_zone_mgmt(struct bio *bio)
1578 {
1579 	if (bio_op(bio) != REQ_OP_ZONE_RESET_ALL &&
1580 	    !bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
1581 		/*
1582 		 * Zone reset and zone finish operations do not apply to
1583 		 * conventional zones.
1584 		 */
1585 		bio_io_error(bio);
1586 		return true;
1587 	}
1588 
1589 	/*
1590 	 * No-wait zone management BIOs do not make much sense as the callers
1591 	 * issue these as blocking operations in most cases. To avoid issues
1592 	 * with the BIO execution potentially failing with BLK_STS_AGAIN, warn
1593 	 * about REQ_NOWAIT being set and ignore that flag.
1594 	 */
1595 	if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT))
1596 		bio->bi_opf &= ~REQ_NOWAIT;
1597 
1598 	return false;
1599 }
1600 
1601 /**
1602  * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
1603  * @bio: The BIO being submitted
1604  * @nr_segs: The number of physical segments of @bio
1605  *
1606  * Handle write, write zeroes and zone append operations requiring emulation
1607  * using zone write plugging.
1608  *
1609  * Return true whenever @bio execution needs to be delayed through the zone
1610  * write plug. Otherwise, return false to let the submission path process
1611  * @bio normally.
1612  */
1613 bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
1614 {
1615 	struct block_device *bdev = bio->bi_bdev;
1616 
1617 	if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash))
1618 		return false;
1619 
1620 	/*
1621 	 * Regular writes and write zeroes need to be handled through the target
1622 	 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
1623 	 * which may need to go through the flush machinery depending on the
1624 	 * target device capabilities. Plugging such writes is fine as the flush
1625 	 * machinery operates at the request level, below the plug, and
1626 	 * completion of the flush sequence will go through the regular BIO
1627 	 * completion, which will handle zone write plugging.
1628 	 * Zone append operations for devices that requested emulation must
1629 	 * also be plugged so that these BIOs can be changed into regular
1630 	 * write BIOs.
1631 	 * Zone reset, reset all and finish commands need special treatment
1632 	 * to correctly track the write pointer offset of zones. These commands
1633 	 * are not plugged as we do not need serialization with write
1634 	 * operations. It is the responsibility of the user to not issue reset
1635 	 * and finish commands when write operations are in flight.
1636 	 */
1637 	switch (bio_op(bio)) {
1638 	case REQ_OP_ZONE_APPEND:
1639 		if (!bdev_emulates_zone_append(bdev)) {
1640 			blk_zone_wplug_handle_native_zone_append(bio);
1641 			return false;
1642 		}
1643 		fallthrough;
1644 	case REQ_OP_WRITE:
1645 	case REQ_OP_WRITE_ZEROES:
1646 		return blk_zone_wplug_handle_write(bio, nr_segs);
1647 	case REQ_OP_ZONE_RESET:
1648 	case REQ_OP_ZONE_FINISH:
1649 	case REQ_OP_ZONE_RESET_ALL:
1650 		return blk_zone_wplug_handle_zone_mgmt(bio);
1651 	default:
1652 		return false;
1653 	}
1654 
1655 	return false;
1656 }
1657 EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
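
/*
 * Illustrative sketch, not part of this file: a BIO-based zoned driver
 * (assuming its disk was revalidated so that the zone write plug hash
 * table exists) would typically call blk_zone_plug_bio() from its
 * ->submit_bio() handler and stop processing any BIO that was plugged.
 * The helpers my_dev_submit_bio() and my_dev_issue() are hypothetical.
 *
 *	static void my_dev_submit_bio(struct bio *bio)
 *	{
 *		// A true return means @bio is now owned by its zone write
 *		// plug and will be resubmitted later by the plug BIO work:
 *		// do not touch it anymore.
 *		if (blk_zone_plug_bio(bio, 0))
 *			return;
 *		my_dev_issue(bio);
 *	}
 *
 * A plugged BIO completes through the regular BIO completion path, which
 * calls blk_zone_write_plug_bio_endio() below.
 */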
1658 
1659 static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
1660 				       struct blk_zone_wplug *zwplug)
1661 {
1662 	unsigned long flags;
1663 
1664 	spin_lock_irqsave(&zwplug->lock, flags);
1665 
1666 	/*
1667 	 * For rotational devices, signal the BIO completion to the zone write
1668 	 * plug worker. Otherwise, schedule submission of the next plugged BIO
1669 	 * if we have one.
1670 	 */
1671 	if (bio_list_empty(&zwplug->bio_list))
1672 		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
1673 
1674 	if (blk_queue_zoned_qd1_writes(disk->queue))
1675 		complete(&disk->zone_wplugs_worker_bio_done);
1676 	else if (!bio_list_empty(&zwplug->bio_list))
1677 		disk_zone_wplug_schedule_work(disk, zwplug);
1678 
1679 	if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
1680 		disk_mark_zone_wplug_dead(zwplug);
1681 
1682 	spin_unlock_irqrestore(&zwplug->lock, flags);
1683 }
1684 
1685 void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio)
1686 {
1687 	/*
1688 	 * For zone append requests, the request sector indicates the location
1689 	 * at which the BIO data was written. Return this value to the BIO
1690 	 * issuer through the BIO iter sector.
1691 	 * For plugged zone writes, which include emulated zone append, we need
1692 	 * the original BIO sector so that blk_zone_write_plug_bio_endio() can
1693 	 * look up the zone write plug.
1694 	 */
1695 	bio->bi_iter.bi_sector = rq->__sector;
1696 	trace_blk_zone_append_update_request_bio(rq);
1697 }
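
/*
 * Illustrative note, with a hypothetical end_io callback: this is how the
 * issuer of a zone append BIO learns where its data was actually written.
 *
 *	static void my_append_end_io(struct bio *bio)
 *	{
 *		// After completion, bi_sector holds the sector at which the
 *		// appended data landed.
 *		sector_t written = bio->bi_iter.bi_sector;
 *		...
 *	}
 */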
1698 
1699 void blk_zone_write_plug_bio_endio(struct bio *bio)
1700 {
1701 	struct gendisk *disk = bio->bi_bdev->bd_disk;
1702 	struct blk_zone_wplug *zwplug =
1703 		disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
1704 	unsigned long flags;
1705 
1706 	if (WARN_ON_ONCE(!zwplug))
1707 		return;
1708 
1709 	/* Make sure we do not see this BIO again by clearing the plug flag. */
1710 	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
1711 
1712 	/*
1713 	 * If this is a regular write emulating a zone append operation,
1714 	 * restore the original operation code.
1715 	 */
1716 	if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
1717 		bio->bi_opf &= ~REQ_OP_MASK;
1718 		bio->bi_opf |= REQ_OP_ZONE_APPEND;
1719 		bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND);
1720 	}
1721 
1722 	/*
1723 	 * If the BIO failed, abort all plugged BIOs and mark the plug as
1724 	 * needing a write pointer update.
1725 	 */
1726 	if (bio->bi_status != BLK_STS_OK) {
1727 		spin_lock_irqsave(&zwplug->lock, flags);
1728 		disk_zone_wplug_abort(zwplug);
1729 		zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE;
1730 		spin_unlock_irqrestore(&zwplug->lock, flags);
1731 	}
1732 
1733 	/* Drop the reference we took when the BIO was issued. */
1734 	disk_put_zone_wplug(zwplug);
1735 
1736 	/*
1737 	 * For BIO-based devices, blk_zone_write_plug_finish_request()
1738 	 * is not called. So we need to schedule execution of the next
1739 	 * plugged BIO here.
1740 	 */
1741 	if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
1742 		disk_zone_wplug_unplug_bio(disk, zwplug);
1743 
1744 	/* Drop the reference we took when entering this function. */
1745 	disk_put_zone_wplug(zwplug);
1746 }
1747 
1748 void blk_zone_write_plug_finish_request(struct request *req)
1749 {
1750 	struct gendisk *disk = req->q->disk;
1751 	struct blk_zone_wplug *zwplug;
1752 
1753 	zwplug = disk_get_zone_wplug(disk, req->__sector);
1754 	if (WARN_ON_ONCE(!zwplug))
1755 		return;
1756 
1757 	req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;
1758 
1759 	/*
1760 	 * Drop the reference we took when the request was initialized in
1761 	 * blk_zone_write_plug_init_request().
1762 	 */
1763 	disk_put_zone_wplug(zwplug);
1764 
1765 	disk_zone_wplug_unplug_bio(disk, zwplug);
1766 
1767 	/* Drop the reference we took when entering this function. */
1768 	disk_put_zone_wplug(zwplug);
1769 }
1770 
1771 static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
1772 				       struct blk_zone_wplug *zwplug)
1773 {
1774 	struct block_device *bdev;
1775 	unsigned long flags;
1776 	struct bio *bio;
1777 	bool prepared;
1778 
1779 	/*
1780 	 * Submit the next plugged BIO. If we do not have any, clear
1781 	 * the plugged flag.
1782 	 */
1783 again:
1784 	spin_lock_irqsave(&zwplug->lock, flags);
1785 	bio = bio_list_pop(&zwplug->bio_list);
1786 	if (!bio) {
1787 		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
1788 		spin_unlock_irqrestore(&zwplug->lock, flags);
1789 		return false;
1790 	}
1791 
1792 	trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no,
1793 				 bio->bi_iter.bi_sector, bio_sectors(bio));
1794 
1795 	prepared = blk_zone_wplug_prepare_bio(zwplug, bio);
1796 	spin_unlock_irqrestore(&zwplug->lock, flags);
1797 
1798 	if (!prepared) {
1799 		blk_zone_wplug_bio_io_error(zwplug, bio);
1800 		goto again;
1801 	}
1802 
1803 	/*
1804 	 * blk-mq devices will reuse the extra reference on the request queue
1805 	 * usage counter we took when the BIO was plugged, but the submission
1806 	 * path for BIO-based devices will not do that. So drop this extra
1807 	 * reference here.
1808 	 */
1809 	if (blk_queue_zoned_qd1_writes(disk->queue))
1810 		reinit_completion(&disk->zone_wplugs_worker_bio_done);
1811 	bdev = bio->bi_bdev;
1812 	if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
1813 		bdev->bd_disk->fops->submit_bio(bio);
1814 		blk_queue_exit(bdev->bd_disk->queue);
1815 	} else {
1816 		blk_mq_submit_bio(bio);
1817 	}
1818 
1819 	return true;
1820 }
1821 
1822 static struct blk_zone_wplug *disk_get_zone_wplugs_work(struct gendisk *disk)
1823 {
1824 	struct blk_zone_wplug *zwplug;
1825 
1826 	spin_lock_irq(&disk->zone_wplugs_list_lock);
1827 	zwplug = list_first_entry_or_null(&disk->zone_wplugs_list,
1828 					  struct blk_zone_wplug, entry);
1829 	if (zwplug)
1830 		list_del_init(&zwplug->entry);
1831 	spin_unlock_irq(&disk->zone_wplugs_list_lock);
1832 
1833 	return zwplug;
1834 }
1835 
1836 static int disk_zone_wplugs_worker(void *data)
1837 {
1838 	struct gendisk *disk = data;
1839 	struct blk_zone_wplug *zwplug;
1840 	unsigned int noio_flag;
1841 
1842 	noio_flag = memalloc_noio_save();
1843 	set_user_nice(current, MIN_NICE);
1844 	set_freezable();
1845 
1846 	for (;;) {
1847 		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
1848 
1849 		zwplug = disk_get_zone_wplugs_work(disk);
1850 		if (zwplug) {
1851 			/*
1852 			 * Process all BIOs of this zone write plug and then
1853 			 * drop the reference we took when adding the zone write
1854 			 * plug to the active list.
1855 			 */
1856 			set_current_state(TASK_RUNNING);
1857 			while (disk_zone_wplug_submit_bio(disk, zwplug))
1858 				blk_wait_io(&disk->zone_wplugs_worker_bio_done);
1859 			disk_put_zone_wplug(zwplug);
1860 			continue;
1861 		}
1862 
1863 		/*
1864 		 * Only sleep if nothing set the task state back to running.
1865 		 * Otherwise, check the work list again, as a newly submitted
1866 		 * BIO might have added a zone write plug to it.
1867 		 */
1868 		if (get_current_state() == TASK_RUNNING) {
1869 			try_to_freeze();
1870 		} else {
1871 			if (kthread_should_stop()) {
1872 				set_current_state(TASK_RUNNING);
1873 				break;
1874 			}
1875 			schedule();
1876 		}
1877 	}
1878 
1879 	WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));
1880 	memalloc_noio_restore(noio_flag);
1881 
1882 	return 0;
1883 }
1884 
1885 void disk_init_zone_resources(struct gendisk *disk)
1886 {
1887 	spin_lock_init(&disk->zone_wplugs_hash_lock);
1888 	spin_lock_init(&disk->zone_wplugs_list_lock);
1889 	INIT_LIST_HEAD(&disk->zone_wplugs_list);
1890 	init_completion(&disk->zone_wplugs_worker_bio_done);
1891 }
1892 
1893 /*
1894  * For the size of a disk zone write plug hash table, use the size of the
1895  * zone write plug mempool, which is the maximum of the disk open zone and
1896  * active zone limits. But do not exceed 4KB (512 hlist head entries), that is,
1897  * 9 bits. For a disk that has no limits, the mempool size defaults to 128.
1898  */
1899 #define BLK_ZONE_WPLUG_MAX_HASH_BITS		9
1900 #define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE	128
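
/*
 * For example, following the sizing logic in disk_alloc_zone_resources()
 * below: with the default pool size of 128 plugs, zone_wplugs_hash_bits is
 * min(ilog2(128) + 1, 9) = 8, that is, 256 hlist heads (2KB with 8-byte
 * pointers). The 9-bit maximum (512 heads, 4KB) is reached once the pool
 * size reaches 256.
 */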
1901 
1902 static int disk_alloc_zone_resources(struct gendisk *disk,
1903 				     unsigned int pool_size)
1904 {
1905 	unsigned int i;
1906 	int ret = -ENOMEM;
1907 
1908 	atomic_set(&disk->nr_zone_wplugs, 0);
1909 	disk->zone_wplugs_hash_bits =
1910 		min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);
1911 
1912 	disk->zone_wplugs_hash =
1913 		kzalloc_objs(struct hlist_head,
1914 			     disk_zone_wplugs_hash_size(disk));
1915 	if (!disk->zone_wplugs_hash)
1916 		return -ENOMEM;
1917 
1918 	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
1919 		INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);
1920 
1921 	disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
1922 						sizeof(struct blk_zone_wplug));
1923 	if (!disk->zone_wplugs_pool)
1924 		goto free_hash;
1925 
1926 	disk->zone_wplugs_wq =
1927 		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
1928 				pool_size, disk->disk_name);
1929 	if (!disk->zone_wplugs_wq)
1930 		goto destroy_pool;
1931 
1932 	disk->zone_wplugs_worker =
1933 		kthread_create(disk_zone_wplugs_worker, disk,
1934 			       "%s_zwplugs_worker", disk->disk_name);
1935 	if (IS_ERR(disk->zone_wplugs_worker)) {
1936 		ret = PTR_ERR(disk->zone_wplugs_worker);
1937 		disk->zone_wplugs_worker = NULL;
1938 		goto destroy_wq;
1939 	}
1940 	wake_up_process(disk->zone_wplugs_worker);
1941 
1942 	return 0;
1943 
1944 destroy_wq:
1945 	destroy_workqueue(disk->zone_wplugs_wq);
1946 	disk->zone_wplugs_wq = NULL;
1947 destroy_pool:
1948 	mempool_destroy(disk->zone_wplugs_pool);
1949 	disk->zone_wplugs_pool = NULL;
1950 free_hash:
1951 	kfree(disk->zone_wplugs_hash);
1952 	disk->zone_wplugs_hash = NULL;
1953 	disk->zone_wplugs_hash_bits = 0;
1954 	return ret;
1955 }
1956 
1957 static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
1958 {
1959 	struct blk_zone_wplug *zwplug;
1960 	unsigned int i;
1961 
1962 	if (!disk->zone_wplugs_hash)
1963 		return;
1964 
1965 	/* Free all the zone write plugs we have. */
1966 	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
1967 		while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
1968 			zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
1969 					     struct blk_zone_wplug, node);
1970 			spin_lock_irq(&zwplug->lock);
1971 			disk_mark_zone_wplug_dead(zwplug);
1972 			spin_unlock_irq(&zwplug->lock);
1973 		}
1974 	}
1975 
1976 	WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs));
1977 	kfree(disk->zone_wplugs_hash);
1978 	disk->zone_wplugs_hash = NULL;
1979 	disk->zone_wplugs_hash_bits = 0;
1980 
1981 	/*
1982 	 * Wait for the zone write plugs to be RCU-freed before destroying the
1983 	 * mempool.
1984 	 */
1985 	rcu_barrier();
1986 	mempool_destroy(disk->zone_wplugs_pool);
1987 	disk->zone_wplugs_pool = NULL;
1988 }
1989 
1990 static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond)
1991 {
1992 	unsigned long flags;
1993 
1994 	spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
1995 	zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond,
1996 				lockdep_is_held(&disk->zone_wplugs_hash_lock));
1997 	spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);
1998 
1999 	kfree_rcu_mightsleep(zones_cond);
2000 }
2001 
2002 void disk_free_zone_resources(struct gendisk *disk)
2003 {
2004 	if (disk->zone_wplugs_worker)
2005 		kthread_stop(disk->zone_wplugs_worker);
2006 	WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));
2007 
2008 	if (disk->zone_wplugs_wq) {
2009 		destroy_workqueue(disk->zone_wplugs_wq);
2010 		disk->zone_wplugs_wq = NULL;
2011 	}
2012 
2013 	disk_destroy_zone_wplugs_hash_table(disk);
2014 
2015 	disk_set_zones_cond_array(disk, NULL);
2016 	disk->zone_capacity = 0;
2017 	disk->last_zone_capacity = 0;
2018 	disk->nr_zones = 0;
2019 }
2020 
2021 struct blk_revalidate_zone_args {
2022 	struct gendisk	*disk;
2023 	u8		*zones_cond;
2024 	unsigned int	nr_zones;
2025 	unsigned int	nr_conv_zones;
2026 	unsigned int	zone_capacity;
2027 	unsigned int	last_zone_capacity;
2028 	sector_t	sector;
2029 };
2030 
2031 static int disk_revalidate_zone_resources(struct gendisk *disk,
2032 				struct blk_revalidate_zone_args *args)
2033 {
2034 	struct queue_limits *lim = &disk->queue->limits;
2035 	unsigned int pool_size;
2036 	int ret = 0;
2037 
2038 	args->disk = disk;
2039 	args->nr_zones =
2040 		DIV_ROUND_UP_ULL(get_capacity(disk), lim->chunk_sectors);
2041 
2042 	/* Cached zone conditions: 1 byte per zone */
2043 	args->zones_cond = kzalloc(args->nr_zones, GFP_NOIO);
2044 	if (!args->zones_cond)
2045 		return -ENOMEM;
2046 
2047 	if (!disk_need_zone_resources(disk))
2048 		return 0;
2049 
2050 	/*
2051 	 * If the device has no limit on the maximum number of open and active
2052 	 * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
2053 	 */
2054 	pool_size = max(lim->max_open_zones, lim->max_active_zones);
2055 	if (!pool_size)
2056 		pool_size =
2057 			min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones);
2058 
2059 	if (!disk->zone_wplugs_hash) {
2060 		ret = disk_alloc_zone_resources(disk, pool_size);
2061 		if (ret)
2062 			kfree(args->zones_cond);
2063 	}
2064 
2065 	return ret;
2066 }
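
/*
 * Example pool sizing for hypothetical disks, following the logic above:
 * a disk advertising max_open_zones = 14 and max_active_zones = 0 gets a
 * mempool of max(14, 0) = 14 zone write plugs. A disk with no open/active
 * zone limits and only 32 zones gets min(128, 32) = 32 plugs, while a
 * large disk with no limits gets the default of 128.
 */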
2067 
2068 /*
2069  * Update the disk zone resources information and device queue limits.
2070  * The disk queue is frozen when this is executed.
2071  */
2072 static int disk_update_zone_resources(struct gendisk *disk,
2073 				      struct blk_revalidate_zone_args *args)
2074 {
2075 	struct request_queue *q = disk->queue;
2076 	unsigned int nr_seq_zones;
2077 	unsigned int pool_size, memflags;
2078 	struct queue_limits lim;
2079 	int ret = 0;
2080 
2081 	lim = queue_limits_start_update(q);
2082 
2083 	memflags = blk_mq_freeze_queue(q);
2084 
2085 	disk->nr_zones = args->nr_zones;
2086 	if (args->nr_conv_zones >= disk->nr_zones) {
2087 		queue_limits_cancel_update(q);
2088 		pr_warn("%s: Invalid number of conventional zones %u / %u\n",
2089 			disk->disk_name, args->nr_conv_zones, disk->nr_zones);
2090 		ret = -ENODEV;
2091 		goto unfreeze;
2092 	}
2093 
2094 	disk->zone_capacity = args->zone_capacity;
2095 	disk->last_zone_capacity = args->last_zone_capacity;
2096 	disk_set_zones_cond_array(disk, args->zones_cond);
2097 	args->zones_cond = NULL;
2098 
2099 	/*
2100 	 * Some devices can advertise zone resource limits that are larger than
2101 	 * the number of sequential zones of the zoned block device, e.g. a
2102 	 * small ZNS namespace. For such case, assume that the zoned device has
2103 	 * no zone resource limits.
2104 	 */
2105 	nr_seq_zones = disk->nr_zones - args->nr_conv_zones;
2106 	if (lim.max_open_zones >= nr_seq_zones)
2107 		lim.max_open_zones = 0;
2108 	if (lim.max_active_zones >= nr_seq_zones)
2109 		lim.max_active_zones = 0;
2110 
2111 	if (!disk->zone_wplugs_pool)
2112 		goto commit;
2113 
2114 	/*
2115 	 * If the device has no limit on the maximum number of open and active
2116 	 * zones, set its max open zone limit to the mempool size to indicate
2117 	 * to the user that there is a potential performance impact due to
2118 	 * dynamic zone write plug allocation when simultaneously writing to
2119 	 * more zones than the size of the mempool.
2120 	 */
2121 	pool_size = max(lim.max_open_zones, lim.max_active_zones);
2122 	if (!pool_size)
2123 		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);
2124 
2125 	mempool_resize(disk->zone_wplugs_pool, pool_size);
2126 
2127 	if (!lim.max_open_zones && !lim.max_active_zones) {
2128 		if (pool_size < nr_seq_zones)
2129 			lim.max_open_zones = pool_size;
2130 		else
2131 			lim.max_open_zones = 0;
2132 	}
2133 
2134 commit:
2135 	ret = queue_limits_commit_update(q, &lim);
2136 
2137 unfreeze:
2138 	if (ret)
2139 		disk_free_zone_resources(disk);
2140 
2141 	blk_mq_unfreeze_queue(q, memflags);
2142 
2143 	return ret;
2144 }
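
/*
 * Worked example for the adjustments above, using a hypothetical disk with
 * 1000 sequential zones that reports max_open_zones = 1024: since
 * 1024 >= 1000, the limit is not a real resource limit and is cleared.
 * With both limits then at 0, the mempool is resized to min(128, 1000) =
 * 128 plugs and, since 128 < 1000, max_open_zones is reported as 128 to
 * signal the cost of dynamic plug allocation when simultaneously writing
 * to more zones than that.
 */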
2145 
2146 static int blk_revalidate_zone_cond(struct blk_zone *zone, unsigned int idx,
2147 				    struct blk_revalidate_zone_args *args)
2148 {
2149 	enum blk_zone_cond cond = zone->cond;
2150 
2151 	/* Check that the zone condition is consistent with the zone type. */
2152 	switch (cond) {
2153 	case BLK_ZONE_COND_NOT_WP:
2154 		if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
2155 			goto invalid_condition;
2156 		break;
2157 	case BLK_ZONE_COND_IMP_OPEN:
2158 	case BLK_ZONE_COND_EXP_OPEN:
2159 	case BLK_ZONE_COND_CLOSED:
2160 	case BLK_ZONE_COND_EMPTY:
2161 	case BLK_ZONE_COND_FULL:
2162 	case BLK_ZONE_COND_OFFLINE:
2163 	case BLK_ZONE_COND_READONLY:
2164 		if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
2165 			goto invalid_condition;
2166 		break;
2167 	default:
2168 		pr_warn("%s: Invalid zone condition 0x%X\n",
2169 			args->disk->disk_name, cond);
2170 		return -ENODEV;
2171 	}
2172 
2173 	blk_zone_set_cond(args->zones_cond, idx, cond);
2174 
2175 	return 0;
2176 
2177 invalid_condition:
2178 	pr_warn("%s: Invalid zone condition 0x%x for type 0x%x\n",
2179 		args->disk->disk_name, cond, zone->type);
2180 
2181 	return -ENODEV;
2182 }
2183 
2184 static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
2185 				    struct blk_revalidate_zone_args *args)
2186 {
2187 	struct gendisk *disk = args->disk;
2188 
2189 	if (zone->capacity != zone->len) {
2190 		pr_warn("%s: Invalid conventional zone capacity\n",
2191 			disk->disk_name);
2192 		return -ENODEV;
2193 	}
2194 
2195 	if (disk_zone_is_last(disk, zone))
2196 		args->last_zone_capacity = zone->capacity;
2197 
2198 	args->nr_conv_zones++;
2199 
2200 	return 0;
2201 }
2202 
2203 static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
2204 				   struct blk_revalidate_zone_args *args)
2205 {
2206 	struct gendisk *disk = args->disk;
2207 	struct blk_zone_wplug *zwplug;
2208 	unsigned int wp_offset;
2209 
2210 	/*
2211 	 * Remember the capacity of the first sequential zone and check
2212 	 * if it is constant for all zones, ignoring the last zone as it can be
2213 	 * smaller.
2214 	 */
2215 	if (!args->zone_capacity)
2216 		args->zone_capacity = zone->capacity;
2217 	if (disk_zone_is_last(disk, zone)) {
2218 		args->last_zone_capacity = zone->capacity;
2219 	} else if (zone->capacity != args->zone_capacity) {
2220 		pr_warn("%s: Invalid variable zone capacity\n",
2221 			disk->disk_name);
2222 		return -ENODEV;
2223 	}
2224 
2225 	/*
2226 	 * If the device needs zone append emulation, we need to track the
2227 	 * write pointer of all zones that are neither empty nor full. So make
2228 	 * sure we have a zone write plug for such zones if the device has a zone
2229 	 * write plug hash table.
2230 	 */
2231 	if (!disk->zone_wplugs_hash)
2232 		return 0;
2233 
2234 	wp_offset = disk_zone_wplug_sync_wp_offset(disk, zone);
2235 	if (!wp_offset || wp_offset >= zone->capacity)
2236 		return 0;
2237 
2238 	zwplug = disk_get_or_alloc_zone_wplug(disk, zone->wp, GFP_NOIO);
2239 	if (!zwplug)
2240 		return -ENOMEM;
2241 	disk_put_zone_wplug(zwplug);
2242 
2243 	return 0;
2244 }
2245 
2246 /*
2247  * Helper function to check the validity of zones of a zoned block device.
2248  */
2249 static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
2250 				  void *data)
2251 {
2252 	struct blk_revalidate_zone_args *args = data;
2253 	struct gendisk *disk = args->disk;
2254 	sector_t zone_sectors = disk->queue->limits.chunk_sectors;
2255 	int ret;
2256 
2257 	/* Check for bad zones and holes in the zone report */
2258 	if (zone->start != args->sector) {
2259 		pr_warn("%s: Zone gap at sectors %llu..%llu\n",
2260 			disk->disk_name, args->sector, zone->start);
2261 		return -ENODEV;
2262 	}
2263 
2264 	if (zone->start >= get_capacity(disk) || !zone->len) {
2265 		pr_warn("%s: Invalid zone start %llu, length %llu\n",
2266 			disk->disk_name, zone->start, zone->len);
2267 		return -ENODEV;
2268 	}
2269 
2270 	/*
2271 	 * All zones must have the same size, with the exception of a possibly
2272 	 * smaller last zone.
2273 	 */
2274 	if (!disk_zone_is_last(disk, zone)) {
2275 		if (zone->len != zone_sectors) {
2276 			pr_warn("%s: Invalid zoned device with non constant zone size\n",
2277 				disk->disk_name);
2278 			return -ENODEV;
2279 		}
2280 	} else if (zone->len > zone_sectors) {
2281 		pr_warn("%s: Invalid zoned device with larger last zone size\n",
2282 			disk->disk_name);
2283 		return -ENODEV;
2284 	}
2285 
2286 	if (!zone->capacity || zone->capacity > zone->len) {
2287 		pr_warn("%s: Invalid zone capacity\n",
2288 			disk->disk_name);
2289 		return -ENODEV;
2290 	}
2291 
2292 	/* Check zone condition */
2293 	ret = blk_revalidate_zone_cond(zone, idx, args);
2294 	if (ret)
2295 		return ret;
2296 
2297 	/* Check zone type */
2298 	switch (zone->type) {
2299 	case BLK_ZONE_TYPE_CONVENTIONAL:
2300 		ret = blk_revalidate_conv_zone(zone, idx, args);
2301 		break;
2302 	case BLK_ZONE_TYPE_SEQWRITE_REQ:
2303 		ret = blk_revalidate_seq_zone(zone, idx, args);
2304 		break;
2305 	case BLK_ZONE_TYPE_SEQWRITE_PREF:
2306 	default:
2307 		pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
2308 			disk->disk_name, (int)zone->type, zone->start);
2309 		ret = -ENODEV;
2310 	}
2311 
2312 	if (!ret)
2313 		args->sector += zone->len;
2314 
2315 	return ret;
2316 }
2317 
2318 /**
2319  * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
2320  * @disk:	Target disk
2321  *
2322  * Helper function for low-level device drivers to check, (re)allocate and
2323  * initialize resources used for managing zoned disks. This function should
2324  * normally be called by blk-mq based drivers when a zoned gendisk is probed
2325  * and when the zone configuration of the gendisk changes (e.g. after a format).
2326  * Before calling this function, the device driver must already have set the
2327  * device zone size (chunk_sector limit) and the max zone append limit.
2328  * BIO based drivers can also use this function as long as the device queue
2329  * can be safely frozen.
2330  */
2331 int blk_revalidate_disk_zones(struct gendisk *disk)
2332 {
2333 	struct request_queue *q = disk->queue;
2334 	sector_t zone_sectors = q->limits.chunk_sectors;
2335 	sector_t capacity = get_capacity(disk);
2336 	struct blk_revalidate_zone_args args = { };
2337 	unsigned int memflags, noio_flag;
2338 	struct blk_report_zones_args rep_args = {
2339 		.cb = blk_revalidate_zone_cb,
2340 		.data = &args,
2341 	};
2342 	int ret = -ENOMEM;
2343 
2344 	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
2345 		return -EIO;
2346 
2347 	if (!capacity)
2348 		return -ENODEV;
2349 
2350 	/*
2351 	 * Check that the device driver indicated a valid zone size and that
2352 	 * the max zone append limit is set.
2353 	 */
2354 	if (!zone_sectors || !is_power_of_2(zone_sectors)) {
2355 		pr_warn("%s: Invalid non power of two zone size (%llu)\n",
2356 			disk->disk_name, zone_sectors);
2357 		return -ENODEV;
2358 	}
2359 
2360 	/*
2361 	 * Ensure that all memory allocations in this context are done as if
2362 	 * GFP_NOIO was specified.
2363 	 */
2364 	noio_flag = memalloc_noio_save();
2365 	ret = disk_revalidate_zone_resources(disk, &args);
2366 	if (ret) {
2367 		memalloc_noio_restore(noio_flag);
2368 		return ret;
2369 	}
2370 
2371 	ret = disk->fops->report_zones(disk, 0, UINT_MAX, &rep_args);
2372 	if (!ret) {
2373 		pr_warn("%s: No zones reported\n", disk->disk_name);
2374 		ret = -ENODEV;
2375 	}
2376 	memalloc_noio_restore(noio_flag);
2377 
2378 	if (ret <= 0)
2379 		goto free_resources;
2380 
2381 	/*
2382 	 * If zones were reported, make sure that the entire disk capacity
2383 	 * has been checked.
2384 	 */
2385 	if (args.sector != capacity) {
2386 		pr_warn("%s: Missing zones from sector %llu\n",
2387 			disk->disk_name, args.sector);
2388 		ret = -ENODEV;
2389 		goto free_resources;
2390 	}
2391 
2392 	ret = disk_update_zone_resources(disk, &args);
2393 	if (ret)
2394 		goto free_resources;
2395 
2396 	return 0;
2397 
2398 free_resources:
2399 	pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
2400 
2401 	kfree(args.zones_cond);
2402 	memflags = blk_mq_freeze_queue(q);
2403 	disk_free_zone_resources(disk);
2404 	blk_mq_unfreeze_queue(q, memflags);
2405 
2406 	return ret;
2407 }
2408 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
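
/*
 * Illustrative sketch of the driver side, with hypothetical my_* values,
 * field names as in recent kernels, and error handling elided: a blk-mq
 * driver probing a zoned disk sets the zone limits before revalidating.
 *
 *	struct queue_limits lim = queue_limits_start_update(disk->queue);
 *
 *	lim.features |= BLK_FEAT_ZONED;
 *	lim.chunk_sectors = my_zone_sectors;
 *	lim.max_hw_zone_append_sectors = my_max_append;	// 0 => emulation
 *	ret = queue_limits_commit_update(disk->queue, &lim);
 *	if (!ret)
 *		ret = blk_revalidate_disk_zones(disk);
 */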
2409 
2410 /**
2411  * blk_zone_issue_zeroout - zero-fill a block range in a zone
2412  * @bdev:	blockdev to write
2413  * @sector:	start sector
2414  * @nr_sects:	number of sectors to write
2415  * @gfp_mask:	memory allocation flags (for bio_alloc)
2416  *
2417  * Description:
2418  *  Zero-fill a block range in a zone (@sector must be equal to the zone write
2419  *  pointer), handling potential errors due to the (initially unknown) lack of
2420  *  hardware offload (See blkdev_issue_zeroout()).
2421  */
2422 int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
2423 			   sector_t nr_sects, gfp_t gfp_mask)
2424 {
2425 	struct gendisk *disk = bdev->bd_disk;
2426 	int ret;
2427 
2428 	if (WARN_ON_ONCE(!bdev_is_zoned(bdev)))
2429 		return -EIO;
2430 
2431 	ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
2432 				   BLKDEV_ZERO_NOFALLBACK);
2433 	if (ret != -EOPNOTSUPP)
2434 		return ret;
2435 
2436 	/*
2437 	 * The failed call to blkdev_issue_zeroout() advanced the zone write
2438 	 * pointer. Undo this using a zone report to update the zone write
2439 	 * pointer to the correct current value.
2440 	 */
2441 	ret = disk->fops->report_zones(disk, sector, 1, NULL);
2442 	if (ret != 1)
2443 		return ret < 0 ? ret : -EIO;
2444 
2445 	/*
2446 	 * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a
2447 	 * regular write with zero-pages.
2448 	 */
2449 	return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0);
2450 }
2451 EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout);
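
/*
 * Illustrative sketch: a file system zeroing a range starting at the write
 * pointer of a zone could use (wp_sector and nr_sects being hypothetical
 * values obtained from a zone report):
 *
 *	ret = blk_zone_issue_zeroout(bdev, wp_sector, nr_sects, GFP_NOFS);
 *
 * On a device without a zero-out offload, the first attempt inside this
 * helper fails with -EOPNOTSUPP and the helper transparently falls back to
 * a regular write using zero-pages.
 */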
2452 
2453 #ifdef CONFIG_BLK_DEBUG_FS
2454 static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
2455 				  struct seq_file *m)
2456 {
2457 	unsigned int zwp_wp_offset, zwp_flags;
2458 	unsigned int zwp_zone_no, zwp_ref;
2459 	unsigned int zwp_bio_list_size;
2460 	enum blk_zone_cond zwp_cond;
2461 	unsigned long flags;
2462 
2463 	spin_lock_irqsave(&zwplug->lock, flags);
2464 	zwp_zone_no = zwplug->zone_no;
2465 	zwp_flags = zwplug->flags;
2466 	zwp_ref = refcount_read(&zwplug->ref);
2467 	zwp_cond = zwplug->cond;
2468 	zwp_wp_offset = zwplug->wp_offset;
2469 	zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
2470 	spin_unlock_irqrestore(&zwplug->lock, flags);
2471 
2472 	seq_printf(m,
2473 		"Zone no: %u, flags: 0x%x, ref: %u, cond: %s, wp ofst: %u, pending BIO: %u\n",
2474 		zwp_zone_no, zwp_flags, zwp_ref, blk_zone_cond_str(zwp_cond),
2475 		zwp_wp_offset, zwp_bio_list_size);
2476 }
2477 
2478 int queue_zone_wplugs_show(void *data, struct seq_file *m)
2479 {
2480 	struct request_queue *q = data;
2481 	struct gendisk *disk = q->disk;
2482 	struct blk_zone_wplug *zwplug;
2483 	unsigned int i;
2484 
2485 	if (!disk->zone_wplugs_hash)
2486 		return 0;
2487 
2488 	rcu_read_lock();
2489 	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
2490 		hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i],
2491 					 node)
2492 			queue_zone_wplug_show(zwplug, m);
2493 	rcu_read_unlock();
2494 
2495 	return 0;
2496 }
2497 
2498 #endif
2499