1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Zoned block device handling
4 *
5 * Copyright (c) 2015, Hannes Reinecke
6 * Copyright (c) 2015, SUSE Linux GmbH
7 *
8 * Copyright (c) 2016, Damien Le Moal
9 * Copyright (c) 2016, Western Digital
10 * Copyright (c) 2024, Western Digital Corporation or its affiliates.
11 */
12
13 #include <linux/kernel.h>
14 #include <linux/blkdev.h>
15 #include <linux/blk-mq.h>
16 #include <linux/spinlock.h>
17 #include <linux/refcount.h>
18 #include <linux/mempool.h>
19
20 #include "blk.h"
21 #include "blk-mq-sched.h"
22 #include "blk-mq-debugfs.h"
23
24 #define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
25 static const char *const zone_cond_name[] = {
26 ZONE_COND_NAME(NOT_WP),
27 ZONE_COND_NAME(EMPTY),
28 ZONE_COND_NAME(IMP_OPEN),
29 ZONE_COND_NAME(EXP_OPEN),
30 ZONE_COND_NAME(CLOSED),
31 ZONE_COND_NAME(READONLY),
32 ZONE_COND_NAME(FULL),
33 ZONE_COND_NAME(OFFLINE),
34 };
35 #undef ZONE_COND_NAME
36
37 /*
38 * Per-zone write plug.
39 * @node: hlist_node structure for managing the plug using a hash table.
40 * @ref: Zone write plug reference counter. A zone write plug reference is
41 * always at least 1 when the plug is hashed in the disk plug hash table.
42 * The reference is incremented whenever a new BIO needing plugging is
43 * submitted and when a function needs to manipulate a plug. The
44 * reference count is decremented whenever a plugged BIO completes and
45 * when a function that referenced the plug returns. The initial
46 * reference is dropped whenever the zone of the zone write plug is reset or
47 * finished, or when the zone becomes full (the last write BIO to the zone
48 * completes).
49 * @lock: Spinlock to atomically manipulate the plug.
50 * @flags: Flags indicating the plug state.
51 * @zone_no: The number of the zone the plug is managing.
52 * @wp_offset: The zone write pointer location relative to the start of the zone
53 * as a number of 512B sectors.
54 * @bio_list: The list of BIOs that are currently plugged.
55 * @bio_work: Work struct to handle issuing of plugged BIOs
56 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
57 * @disk: The gendisk the plug belongs to.
58 */
59 struct blk_zone_wplug {
60 struct hlist_node node;
61 refcount_t ref;
62 spinlock_t lock;
63 unsigned int flags;
64 unsigned int zone_no;
65 unsigned int wp_offset;
66 struct bio_list bio_list;
67 struct work_struct bio_work;
68 struct rcu_head rcu_head;
69 struct gendisk *disk;
70 };
71
72 /*
73 * Zone write plug flags bits:
74 * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
75 * that is, that write BIOs are being throttled due to a write BIO already
76 * being executed or the zone write plug bio list is not empty.
77 * - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
78 * write pointer offset and need to update it.
79 * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
80 * from the disk hash table and that the initial reference to the zone
81 * write plug set when the plug was first added to the hash table has been
82 * dropped. This flag is set when a zone is reset, finished or becomes full,
83 * to prevent new references to the zone write plug from being taken for
84 * newly incoming BIOs. A zone write plug flagged with this flag will be
85 * freed once all remaining references from BIOs or functions are dropped.
86 */
87 #define BLK_ZONE_WPLUG_PLUGGED (1U << 0)
88 #define BLK_ZONE_WPLUG_NEED_WP_UPDATE (1U << 1)
89 #define BLK_ZONE_WPLUG_UNHASHED (1U << 2)
90
91 /**
92 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
93 * @zone_cond: BLK_ZONE_COND_XXX.
94 *
95 * Description: Centralized block layer function to convert BLK_ZONE_COND_XXX
96 * into string format. Useful for debugging and tracing zone conditions. For an
97 * invalid BLK_ZONE_COND_XXX it returns the string "UNKNOWN".
98 */
99 const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
100 {
101 static const char *zone_cond_str = "UNKNOWN";
102
103 if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
104 zone_cond_str = zone_cond_name[zone_cond];
105
106 return zone_cond_str;
107 }
108 EXPORT_SYMBOL_GPL(blk_zone_cond_str);
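
/*
 * Illustrative sketch (hypothetical, not built): a report_zones_cb callback
 * could use blk_zone_cond_str() to trace the condition of each zone it
 * inspects. The callback name is made up for this example.
 */
#if 0
static int example_trace_zone_cb(struct blk_zone *zone, unsigned int idx,
				 void *data)
{
	pr_debug("zone %u: start %llu, wp %llu, cond %s\n",
		 idx, zone->start, zone->wp, blk_zone_cond_str(zone->cond));
	return 0;
}
#endif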
109
110 struct disk_report_zones_cb_args {
111 struct gendisk *disk;
112 report_zones_cb user_cb;
113 void *user_data;
114 };
115
116 static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
117 struct blk_zone *zone);
118
119 static int disk_report_zones_cb(struct blk_zone *zone, unsigned int idx,
120 void *data)
121 {
122 struct disk_report_zones_cb_args *args = data;
123 struct gendisk *disk = args->disk;
124
125 if (disk->zone_wplugs_hash)
126 disk_zone_wplug_sync_wp_offset(disk, zone);
127
128 if (!args->user_cb)
129 return 0;
130
131 return args->user_cb(zone, idx, args->user_data);
132 }
133
134 /**
135 * blkdev_report_zones - Get zones information
136 * @bdev: Target block device
137 * @sector: Sector from which to report zones
138 * @nr_zones: Maximum number of zones to report
139 * @cb: Callback function called for each reported zone
140 * @data: Private data for the callback
141 *
142 * Description:
143 * Get zone information starting from the zone containing @sector for at most
144 * @nr_zones, and call @cb for each zone reported by the device.
145 * To report all zones in a device starting from @sector, the BLK_ALL_ZONES
146 * constant can be passed to @nr_zones.
147 * Returns the number of zones reported by the device, or a negative errno
148 * value in case of failure.
149 *
150 * Note: The caller must use memalloc_noXX_save/restore() calls to control
151 * memory allocations done within this function.
152 */
153 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
154 unsigned int nr_zones, report_zones_cb cb, void *data)
155 {
156 struct gendisk *disk = bdev->bd_disk;
157 sector_t capacity = get_capacity(disk);
158 struct disk_report_zones_cb_args args = {
159 .disk = disk,
160 .user_cb = cb,
161 .user_data = data,
162 };
163
164 if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
165 return -EOPNOTSUPP;
166
167 if (!nr_zones || sector >= capacity)
168 return 0;
169
170 return disk->fops->report_zones(disk, sector, nr_zones,
171 disk_report_zones_cb, &args);
172 }
173 EXPORT_SYMBOL_GPL(blkdev_report_zones);
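
/*
 * Illustrative sketch (hypothetical, not built): a caller might count the
 * sequential write required zones of a device as follows, using
 * memalloc_noio_save/restore() as required by the note above. The helper
 * names are made up for this example.
 */
#if 0
static int example_count_seq_zones_cb(struct blk_zone *zone, unsigned int idx,
				      void *data)
{
	unsigned int *nr_seq_zones = data;

	if (zone->type == BLK_ZONE_TYPE_SEQWRITE_REQ)
		(*nr_seq_zones)++;
	return 0;
}

static int example_count_seq_zones(struct block_device *bdev,
				   unsigned int *nr_seq_zones)
{
	unsigned int noio_flag;
	int ret;

	*nr_seq_zones = 0;
	noio_flag = memalloc_noio_save();
	ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
				  example_count_seq_zones_cb, nr_seq_zones);
	memalloc_noio_restore(noio_flag);

	return ret < 0 ? ret : 0;
}
#endif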
174
175 static int blkdev_zone_reset_all(struct block_device *bdev)
176 {
177 struct bio bio;
178
179 bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
180 return submit_bio_wait(&bio);
181 }
182
183 /**
184 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
185 * @bdev: Target block device
186 * @op: Operation to be performed on the zones
187 * @sector: Start sector of the first zone to operate on
188 * @nr_sectors: Number of sectors, should be at least the length of one zone and
189 * must be zone size aligned.
190 *
191 * Description:
192 * Perform the specified operation on the range of zones specified by
193 * @sector..@sector+@nr_sectors. Specifying the entire disk sector range
194 * is valid, but the specified range should not contain conventional zones.
195 * The operation to execute on each zone can be a zone reset, open, close
196 * or finish request.
197 */
198 int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
199 sector_t sector, sector_t nr_sectors)
200 {
201 sector_t zone_sectors = bdev_zone_sectors(bdev);
202 sector_t capacity = bdev_nr_sectors(bdev);
203 sector_t end_sector = sector + nr_sectors;
204 struct bio *bio = NULL;
205 int ret = 0;
206
207 if (!bdev_is_zoned(bdev))
208 return -EOPNOTSUPP;
209
210 if (bdev_read_only(bdev))
211 return -EPERM;
212
213 if (!op_is_zone_mgmt(op))
214 return -EOPNOTSUPP;
215
216 if (end_sector <= sector || end_sector > capacity)
217 /* Out of range */
218 return -EINVAL;
219
220 /* Check alignment (handle a possibly smaller last zone) */
221 if (!bdev_is_zone_start(bdev, sector))
222 return -EINVAL;
223
224 if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity)
225 return -EINVAL;
226
227 /*
228 * In the case of a zone reset operation over all zones, use
229 * REQ_OP_ZONE_RESET_ALL.
230 */
231 if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity)
232 return blkdev_zone_reset_all(bdev);
233
234 while (sector < end_sector) {
235 bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
236 bio->bi_iter.bi_sector = sector;
237 sector += zone_sectors;
238
239 /* This may take a while, so be nice to others */
240 cond_resched();
241 }
242
243 ret = submit_bio_wait(bio);
244 bio_put(bio);
245
246 return ret;
247 }
248 EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
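
/*
 * Illustrative sketch (hypothetical, not built): resetting the single
 * sequential write required zone containing @sector. The wrapper name is
 * made up for this example; the block layer only accepts power-of-two zone
 * sizes, so the zone start can be computed with a simple mask.
 */
#if 0
static int example_reset_zone(struct block_device *bdev, sector_t sector)
{
	sector_t zone_sectors = bdev_zone_sectors(bdev);

	/* Round @sector down to the start of its zone. */
	sector &= ~(zone_sectors - 1);

	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, sector, zone_sectors);
}
#endif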
249
250 struct zone_report_args {
251 struct blk_zone __user *zones;
252 };
253
254 static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
255 void *data)
256 {
257 struct zone_report_args *args = data;
258
259 if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
260 return -EFAULT;
261 return 0;
262 }
263
264 /*
265 * BLKREPORTZONE ioctl processing.
266 * Called from blkdev_ioctl.
267 */
268 int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
269 unsigned long arg)
270 {
271 void __user *argp = (void __user *)arg;
272 struct zone_report_args args;
273 struct blk_zone_report rep;
274 int ret;
275
276 if (!argp)
277 return -EINVAL;
278
279 if (!bdev_is_zoned(bdev))
280 return -ENOTTY;
281
282 if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
283 return -EFAULT;
284
285 if (!rep.nr_zones)
286 return -EINVAL;
287
288 args.zones = argp + sizeof(struct blk_zone_report);
289 ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
290 blkdev_copy_zone_to_user, &args);
291 if (ret < 0)
292 return ret;
293
294 rep.nr_zones = ret;
295 rep.flags = BLK_ZONE_REP_CAPACITY;
296 if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
297 return -EFAULT;
298 return 0;
299 }
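
/*
 * Illustrative sketch (hypothetical, not built, user space side): the
 * BLKREPORTZONE argument is a struct blk_zone_report immediately followed in
 * memory by rep.nr_zones struct blk_zone entries, which is why args.zones
 * above points just past the report header. The function name is made up for
 * this example.
 */
#if 0
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/blkzoned.h>

static int example_report_one_zone(int fd, __u64 sector, struct blk_zone *zone)
{
	size_t bufsz = sizeof(struct blk_zone_report) + sizeof(struct blk_zone);
	struct blk_zone_report *rep = calloc(1, bufsz);
	int ret;

	if (!rep)
		return -1;

	rep->sector = sector;
	rep->nr_zones = 1;

	ret = ioctl(fd, BLKREPORTZONE, rep);
	if (!ret) {
		if (rep->nr_zones)
			*zone = rep->zones[0];
		else
			ret = -1; /* No zone reported at @sector. */
	}

	free(rep);
	return ret;
}
#endif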
300
301 static int blkdev_truncate_zone_range(struct block_device *bdev,
302 blk_mode_t mode, const struct blk_zone_range *zrange)
303 {
304 loff_t start, end;
305
306 if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
307 zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
308 /* Out of range */
309 return -EINVAL;
310
311 start = zrange->sector << SECTOR_SHIFT;
312 end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;
313
314 return truncate_bdev_range(bdev, mode, start, end);
315 }
316
317 /*
318 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
319 * Called from blkdev_ioctl.
320 */
321 int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
322 unsigned int cmd, unsigned long arg)
323 {
324 void __user *argp = (void __user *)arg;
325 struct blk_zone_range zrange;
326 enum req_op op;
327 int ret;
328
329 if (!argp)
330 return -EINVAL;
331
332 if (!bdev_is_zoned(bdev))
333 return -ENOTTY;
334
335 if (!(mode & BLK_OPEN_WRITE))
336 return -EBADF;
337
338 if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
339 return -EFAULT;
340
341 switch (cmd) {
342 case BLKRESETZONE:
343 op = REQ_OP_ZONE_RESET;
344
345 /* Invalidate the page cache, including dirty pages. */
346 filemap_invalidate_lock(bdev->bd_mapping);
347 ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
348 if (ret)
349 goto fail;
350 break;
351 case BLKOPENZONE:
352 op = REQ_OP_ZONE_OPEN;
353 break;
354 case BLKCLOSEZONE:
355 op = REQ_OP_ZONE_CLOSE;
356 break;
357 case BLKFINISHZONE:
358 op = REQ_OP_ZONE_FINISH;
359 break;
360 default:
361 return -ENOTTY;
362 }
363
364 ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
365
366 fail:
367 if (cmd == BLKRESETZONE)
368 filemap_invalidate_unlock(bdev->bd_mapping);
369
370 return ret;
371 }
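
/*
 * Illustrative sketch (hypothetical, not built, user space side): resetting a
 * range of zones with the BLKRESETZONE ioctl. The function name is made up
 * for this example; @sector and @nr_sectors must be zone aligned, as checked
 * by blkdev_zone_mgmt().
 */
#if 0
#include <sys/ioctl.h>
#include <linux/blkzoned.h>

static int example_reset_zone_range(int fd, __u64 sector, __u64 nr_sectors)
{
	struct blk_zone_range zrange = {
		.sector		= sector,
		.nr_sectors	= nr_sectors,
	};

	return ioctl(fd, BLKRESETZONE, &zrange);
}
#endif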
372
373 static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
374 {
375 return zone->start + zone->len >= get_capacity(disk);
376 }
377
378 static bool disk_zone_is_full(struct gendisk *disk,
379 unsigned int zno, unsigned int offset_in_zone)
380 {
381 if (zno < disk->nr_zones - 1)
382 return offset_in_zone >= disk->zone_capacity;
383 return offset_in_zone >= disk->last_zone_capacity;
384 }
385
386 static bool disk_zone_wplug_is_full(struct gendisk *disk,
387 struct blk_zone_wplug *zwplug)
388 {
389 return disk_zone_is_full(disk, zwplug->zone_no, zwplug->wp_offset);
390 }
391
392 static bool disk_insert_zone_wplug(struct gendisk *disk,
393 struct blk_zone_wplug *zwplug)
394 {
395 struct blk_zone_wplug *zwplg;
396 unsigned long flags;
397 unsigned int idx =
398 hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);
399
400 /*
401 * Add the new zone write plug to the hash table, but carefully as we
402 * are racing with other submission contexts, so we may already have a
403 * zone write plug for the same zone.
404 */
405 spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
406 hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
407 if (zwplg->zone_no == zwplug->zone_no) {
408 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
409 return false;
410 }
411 }
412 hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
413 atomic_inc(&disk->nr_zone_wplugs);
414 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
415
416 return true;
417 }
418
419 static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk,
420 sector_t sector)
421 {
422 unsigned int zno = disk_zone_no(disk, sector);
423 unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
424 struct blk_zone_wplug *zwplug;
425
426 rcu_read_lock();
427
428 hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
429 if (zwplug->zone_no == zno &&
430 refcount_inc_not_zero(&zwplug->ref)) {
431 rcu_read_unlock();
432 return zwplug;
433 }
434 }
435
436 rcu_read_unlock();
437
438 return NULL;
439 }
440
441 static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
442 sector_t sector)
443 {
444 if (!atomic_read(&disk->nr_zone_wplugs))
445 return NULL;
446
447 return disk_get_hashed_zone_wplug(disk, sector);
448 }
449
450 static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
451 {
452 struct blk_zone_wplug *zwplug =
453 container_of(rcu_head, struct blk_zone_wplug, rcu_head);
454
455 mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
456 }
457
458 static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
459 {
460 if (refcount_dec_and_test(&zwplug->ref)) {
461 WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
462 WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
463 WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));
464
465 call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
466 }
467 }
468
469 static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
470 struct blk_zone_wplug *zwplug)
471 {
472 lockdep_assert_held(&zwplug->lock);
473
474 /* If the zone write plug was already removed, we are done. */
475 if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
476 return false;
477
478 /* If the zone write plug is still plugged, it cannot be removed. */
479 if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
480 return false;
481
482 /*
483 * Completions of BIOs with blk_zone_write_plug_bio_endio() may
484 * happen after handling a request completion with
485 * blk_zone_write_plug_finish_request() (e.g. with split BIOs
486 * that are chained). In such case, disk_zone_wplug_unplug_bio()
487 * should not attempt to remove the zone write plug until all BIO
488 * completions are seen. Check by looking at the zone write plug
489 * reference count, which is 2 when the plug is unused (one reference
490 * taken when the plug was allocated and another reference taken by the
491 * caller context).
492 */
493 if (refcount_read(&zwplug->ref) > 2)
494 return false;
495
496 /* We can remove zone write plugs for zones that are empty or full. */
497 return !zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug);
498 }
499
500 static void disk_remove_zone_wplug(struct gendisk *disk,
501 struct blk_zone_wplug *zwplug)
502 {
503 unsigned long flags;
504
505 /* If the zone write plug was already removed, we have nothing to do. */
506 if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
507 return;
508
509 /*
510 * Mark the zone write plug as unhashed and drop the extra reference we
511 * took when the plug was inserted in the hash table.
512 */
513 zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
514 spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
515 hlist_del_init_rcu(&zwplug->node);
516 atomic_dec(&disk->nr_zone_wplugs);
517 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
518 disk_put_zone_wplug(zwplug);
519 }
520
521 static void blk_zone_wplug_bio_work(struct work_struct *work);
522
523 /*
524 * Get a reference on the write plug for the zone containing @sector.
525 * If the plug does not exist, it is allocated and hashed.
526 * Return a pointer to the zone write plug with the plug spinlock held.
527 */
528 static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
529 sector_t sector, gfp_t gfp_mask,
530 unsigned long *flags)
531 {
532 unsigned int zno = disk_zone_no(disk, sector);
533 struct blk_zone_wplug *zwplug;
534
535 again:
536 zwplug = disk_get_zone_wplug(disk, sector);
537 if (zwplug) {
538 /*
539 * Check that a BIO completion or a zone reset or finish
540 * operation has not already removed the zone write plug from
541 * the hash table and dropped its reference count. In such case,
542 * we need to get a new plug so start over from the beginning.
543 */
544 spin_lock_irqsave(&zwplug->lock, *flags);
545 if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
546 spin_unlock_irqrestore(&zwplug->lock, *flags);
547 disk_put_zone_wplug(zwplug);
548 goto again;
549 }
550 return zwplug;
551 }
552
553 /*
554 * Allocate and initialize a zone write plug with an extra reference
555 * so that it is not freed when the zone write plug becomes idle without
556 * the zone being full.
557 */
558 zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
559 if (!zwplug)
560 return NULL;
561
562 INIT_HLIST_NODE(&zwplug->node);
563 refcount_set(&zwplug->ref, 2);
564 spin_lock_init(&zwplug->lock);
565 zwplug->flags = 0;
566 zwplug->zone_no = zno;
567 zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
568 bio_list_init(&zwplug->bio_list);
569 INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
570 zwplug->disk = disk;
571
572 spin_lock_irqsave(&zwplug->lock, *flags);
573
574 /*
575 * Insert the new zone write plug in the hash table. This can fail only
576 * if another context already inserted a plug. Retry from the beginning
577 * in such case.
578 */
579 if (!disk_insert_zone_wplug(disk, zwplug)) {
580 spin_unlock_irqrestore(&zwplug->lock, *flags);
581 mempool_free(zwplug, disk->zone_wplugs_pool);
582 goto again;
583 }
584
585 return zwplug;
586 }
587
588 static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
589 struct bio *bio)
590 {
591 struct request_queue *q = zwplug->disk->queue;
592
593 bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
594 bio_io_error(bio);
595 disk_put_zone_wplug(zwplug);
596 /* Drop the reference taken by disk_zone_wplug_add_bio(). */
597 blk_queue_exit(q);
598 }
599
600 /*
601 * Abort (fail) all plugged BIOs of a zone write plug.
602 */
603 static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
604 {
605 struct bio *bio;
606
607 if (bio_list_empty(&zwplug->bio_list))
608 return;
609
610 pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n",
611 zwplug->disk->disk_name, zwplug->zone_no);
612 while ((bio = bio_list_pop(&zwplug->bio_list)))
613 blk_zone_wplug_bio_io_error(zwplug, bio);
614 }
615
616 /*
617 * Set a zone write plug write pointer offset to the specified value.
618 * This aborts all plugged BIOs, which is fine as this function is called for
619 * a zone reset operation, a zone finish operation or if the zone needs a wp
620 * update from a report zone after a write error.
621 */
622 static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
623 struct blk_zone_wplug *zwplug,
624 unsigned int wp_offset)
625 {
626 lockdep_assert_held(&zwplug->lock);
627
628 /* Update the zone write pointer and abort all plugged BIOs. */
629 zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE;
630 zwplug->wp_offset = wp_offset;
631 disk_zone_wplug_abort(zwplug);
632
633 /*
634 * The zone write plug now has no BIO plugged: remove it from the
635 * hash table so that it cannot be seen. The plug will be freed
636 * when the last reference is dropped.
637 */
638 if (disk_should_remove_zone_wplug(disk, zwplug))
639 disk_remove_zone_wplug(disk, zwplug);
640 }
641
642 static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
643 {
644 switch (zone->cond) {
645 case BLK_ZONE_COND_IMP_OPEN:
646 case BLK_ZONE_COND_EXP_OPEN:
647 case BLK_ZONE_COND_CLOSED:
648 return zone->wp - zone->start;
649 case BLK_ZONE_COND_FULL:
650 return zone->len;
651 case BLK_ZONE_COND_EMPTY:
652 return 0;
653 case BLK_ZONE_COND_NOT_WP:
654 case BLK_ZONE_COND_OFFLINE:
655 case BLK_ZONE_COND_READONLY:
656 default:
657 /*
658 * Conventional, offline and read-only zones do not have a valid
659 * write pointer.
660 */
661 return UINT_MAX;
662 }
663 }
664
665 static void disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
666 struct blk_zone *zone)
667 {
668 struct blk_zone_wplug *zwplug;
669 unsigned long flags;
670
671 zwplug = disk_get_zone_wplug(disk, zone->start);
672 if (!zwplug)
673 return;
674
675 spin_lock_irqsave(&zwplug->lock, flags);
676 if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
677 disk_zone_wplug_set_wp_offset(disk, zwplug,
678 blk_zone_wp_offset(zone));
679 spin_unlock_irqrestore(&zwplug->lock, flags);
680
681 disk_put_zone_wplug(zwplug);
682 }
683
684 static int disk_zone_sync_wp_offset(struct gendisk *disk, sector_t sector)
685 {
686 struct disk_report_zones_cb_args args = {
687 .disk = disk,
688 };
689
690 return disk->fops->report_zones(disk, sector, 1,
691 disk_report_zones_cb, &args);
692 }
693
694 static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
695 unsigned int wp_offset)
696 {
697 struct gendisk *disk = bio->bi_bdev->bd_disk;
698 sector_t sector = bio->bi_iter.bi_sector;
699 struct blk_zone_wplug *zwplug;
700 unsigned long flags;
701
702 /* Conventional zones cannot be reset nor finished. */
703 if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
704 bio_io_error(bio);
705 return true;
706 }
707
708 /*
709 * No-wait reset or finish BIOs do not make much sense as the callers
710 * issue these as blocking operations in most cases. To avoid issues with
711 * the BIO execution potentially failing with BLK_STS_AGAIN, warn about
712 * REQ_NOWAIT being set and ignore that flag.
713 */
714 if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT))
715 bio->bi_opf &= ~REQ_NOWAIT;
716
717 /*
718 * If we have a zone write plug, set its write pointer offset to 0
719 * (reset case) or to the zone size (finish case). This will abort all
720 * BIOs plugged for the target zone. It is fine as resetting or
721 * finishing zones while writes are still in-flight will result in the
722 * writes failing anyway.
723 */
724 zwplug = disk_get_zone_wplug(disk, sector);
725 if (zwplug) {
726 spin_lock_irqsave(&zwplug->lock, flags);
727 disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
728 spin_unlock_irqrestore(&zwplug->lock, flags);
729 disk_put_zone_wplug(zwplug);
730 }
731
732 return false;
733 }
734
735 static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
736 {
737 struct gendisk *disk = bio->bi_bdev->bd_disk;
738 struct blk_zone_wplug *zwplug;
739 unsigned long flags;
740 sector_t sector;
741
742 /*
743 * Set the write pointer offset of all zone write plugs to 0. This will
744 * abort all plugged BIOs. It is fine as resetting zones while writes
745 * are still in-flight will result in the writes failing anyway.
746 */
747 for (sector = 0; sector < get_capacity(disk);
748 sector += disk->queue->limits.chunk_sectors) {
749 zwplug = disk_get_zone_wplug(disk, sector);
750 if (zwplug) {
751 spin_lock_irqsave(&zwplug->lock, flags);
752 disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
753 spin_unlock_irqrestore(&zwplug->lock, flags);
754 disk_put_zone_wplug(zwplug);
755 }
756 }
757
758 return false;
759 }
760
761 static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
762 struct blk_zone_wplug *zwplug)
763 {
764 /*
765 * Take a reference on the zone write plug and schedule the submission
766 * of the next plugged BIO. blk_zone_wplug_bio_work() will release the
767 * reference we take here.
768 */
769 WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
770 refcount_inc(&zwplug->ref);
771 queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
772 }
773
774 static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
775 struct blk_zone_wplug *zwplug,
776 struct bio *bio, unsigned int nr_segs)
777 {
778 bool schedule_bio_work = false;
779
780 /*
781 * Grab an extra reference on the BIO request queue usage counter.
782 * This reference will be reused to submit a request for the BIO for
783 * blk-mq devices, and dropped either when the BIO is failed or, for
784 * BIO-based devices, after the BIO is issued.
785 */
786 percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);
787
788 /*
789 * The BIO is being plugged and thus will have to wait for the on-going
790 * write and for all other writes already plugged. So polling makes
791 * no sense.
792 */
793 bio_clear_polled(bio);
794
795 /*
796 * REQ_NOWAIT BIOs are always handled using the zone write plug BIO
797 * work, which can block. So clear the REQ_NOWAIT flag and schedule the
798 * work if this is the first BIO we are plugging.
799 */
800 if (bio->bi_opf & REQ_NOWAIT) {
801 schedule_bio_work = !(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
802 bio->bi_opf &= ~REQ_NOWAIT;
803 }
804
805 /*
806 * Reuse the poll cookie field to store the number of segments when
807 * split to the hardware limits.
808 */
809 bio->__bi_nr_segments = nr_segs;
810
811 /*
812 * We always receive BIOs after they are split and ready to be issued.
813 * The block layer passes the parts of a split BIO in order, and the
814 * user must also issue writes sequentially. So simply add the new BIO
815 * at the tail of the list to preserve the sequential write order.
816 */
817 bio_list_add(&zwplug->bio_list, bio);
818
819 zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
820
821 if (schedule_bio_work)
822 disk_zone_wplug_schedule_bio_work(disk, zwplug);
823 }
824
825 /*
826 * Called from bio_attempt_back_merge() when a BIO was merged with a request.
827 */
828 void blk_zone_write_plug_bio_merged(struct bio *bio)
829 {
830 struct blk_zone_wplug *zwplug;
831 unsigned long flags;
832
833 /*
834 * If the BIO was already plugged, then we were called through
835 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
836 * For this case, we already hold a reference on the zone write plug for
837 * the BIO and blk_zone_write_plug_init_request() will handle the
838 * zone write pointer offset update.
839 */
840 if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
841 return;
842
843 bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
844
845 /*
846 * Get a reference on the zone write plug of the target zone and advance
847 * the zone write pointer offset. Given that this is a merge, we already
848 * have at least one request and one BIO referencing the zone write
849 * plug. So this should not fail.
850 */
851 zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk,
852 bio->bi_iter.bi_sector);
853 if (WARN_ON_ONCE(!zwplug))
854 return;
855
856 spin_lock_irqsave(&zwplug->lock, flags);
857 zwplug->wp_offset += bio_sectors(bio);
858 spin_unlock_irqrestore(&zwplug->lock, flags);
859 }
860
861 /*
862 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
863 * already went through zone write plugging (either a new BIO or one that was
864 * unplugged).
865 */
866 void blk_zone_write_plug_init_request(struct request *req)
867 {
868 sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
869 struct request_queue *q = req->q;
870 struct gendisk *disk = q->disk;
871 struct blk_zone_wplug *zwplug =
872 disk_get_zone_wplug(disk, blk_rq_pos(req));
873 unsigned long flags;
874 struct bio *bio;
875
876 if (WARN_ON_ONCE(!zwplug))
877 return;
878
879 /*
880 * Indicate that completion of this request needs to be handled with
881 * blk_zone_write_plug_finish_request(), which will drop the reference
882 * on the zone write plug we took above on entry to this function.
883 */
884 req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;
885
886 if (blk_queue_nomerges(q))
887 return;
888
889 /*
890 * Walk through the list of plugged BIOs to check if they can be merged
891 * into the back of the request.
892 */
893 spin_lock_irqsave(&zwplug->lock, flags);
894 while (!disk_zone_wplug_is_full(disk, zwplug)) {
895 bio = bio_list_peek(&zwplug->bio_list);
896 if (!bio)
897 break;
898
899 if (bio->bi_iter.bi_sector != req_back_sector ||
900 !blk_rq_merge_ok(req, bio))
901 break;
902
903 WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
904 !bio->__bi_nr_segments);
905
906 bio_list_pop(&zwplug->bio_list);
907 if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
908 BIO_MERGE_OK) {
909 bio_list_add_head(&zwplug->bio_list, bio);
910 break;
911 }
912
913 /* Drop the reference taken by disk_zone_wplug_add_bio(). */
914 blk_queue_exit(q);
915 zwplug->wp_offset += bio_sectors(bio);
916
917 req_back_sector += bio_sectors(bio);
918 }
919 spin_unlock_irqrestore(&zwplug->lock, flags);
920 }
921
922 /*
923 * Check and prepare a BIO for submission by incrementing the write pointer
924 * offset of its zone write plug and changing zone append operations into
925 * regular write when zone append emulation is needed.
926 */
927 static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
928 struct bio *bio)
929 {
930 struct gendisk *disk = bio->bi_bdev->bd_disk;
931
932 lockdep_assert_held(&zwplug->lock);
933
934 /*
935 * If we lost track of the zone write pointer due to a write error,
936 * the user must either execute a report zones, reset the zone, or finish
937 * the zone to recover a reliable write pointer position. Fail BIOs if the
938 * user did not do that as we cannot handle emulated zone append
939 * otherwise.
940 */
941 if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
942 return false;
943
944 /*
945 * Check that the user is not attempting to write to a full zone.
946 * We know such BIO will fail, and that would potentially overflow our
947 * write pointer offset beyond the end of the zone.
948 */
949 if (disk_zone_wplug_is_full(disk, zwplug))
950 return false;
951
952 if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
953 /*
954 * Use a regular write starting at the current write pointer.
955 * Similarly to native zone append operations, do not allow
956 * merging.
957 */
958 bio->bi_opf &= ~REQ_OP_MASK;
959 bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
960 bio->bi_iter.bi_sector += zwplug->wp_offset;
961
962 /*
963 * Remember that this BIO is in fact a zone append operation
964 * so that we can restore its operation code on completion.
965 */
966 bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
967 } else {
968 /*
969 * Check for non-sequential writes early as we know that BIOs
970 * with a start sector not aligned to the zone write pointer
971 * will fail.
972 */
973 if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
974 return false;
975 }
976
977 /* Advance the zone write pointer offset. */
978 zwplug->wp_offset += bio_sectors(bio);
979
980 return true;
981 }
982
983 static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
984 {
985 struct gendisk *disk = bio->bi_bdev->bd_disk;
986 sector_t sector = bio->bi_iter.bi_sector;
987 struct blk_zone_wplug *zwplug;
988 gfp_t gfp_mask = GFP_NOIO;
989 unsigned long flags;
990
991 /*
992 * BIOs must be fully contained within a zone so that we use the correct
993 * zone write plug for the entire BIO. For blk-mq devices, the block
994 * layer should already have done any splitting required to ensure this
995 * and this BIO should thus not be straddling zone boundaries. For
996 * BIO-based devices, it is the responsibility of the driver to split
997 * the bio before submitting it.
998 */
999 if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
1000 bio_io_error(bio);
1001 return true;
1002 }
1003
1004 /* Conventional zones do not need write plugging. */
1005 if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
1006 /* Zone append to conventional zones is not allowed. */
1007 if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
1008 bio_io_error(bio);
1009 return true;
1010 }
1011 return false;
1012 }
1013
1014 if (bio->bi_opf & REQ_NOWAIT)
1015 gfp_mask = GFP_NOWAIT;
1016
1017 zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
1018 if (!zwplug) {
1019 if (bio->bi_opf & REQ_NOWAIT)
1020 bio_wouldblock_error(bio);
1021 else
1022 bio_io_error(bio);
1023 return true;
1024 }
1025
1026 /* Indicate that this BIO is being handled using zone write plugging. */
1027 bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);
1028
1029 /*
1030 * If the zone is already plugged, add the BIO to the plug BIO list.
1031 * Do the same for REQ_NOWAIT BIOs to ensure that we will not see a
1032 * BLK_STS_AGAIN failure if we let the BIO execute.
1033 * Otherwise, plug and let the BIO execute.
1034 */
1035 if ((zwplug->flags & BLK_ZONE_WPLUG_PLUGGED) ||
1036 (bio->bi_opf & REQ_NOWAIT))
1037 goto plug;
1038
1039 if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
1040 spin_unlock_irqrestore(&zwplug->lock, flags);
1041 bio_io_error(bio);
1042 return true;
1043 }
1044
1045 zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
1046
1047 spin_unlock_irqrestore(&zwplug->lock, flags);
1048
1049 return false;
1050
1051 plug:
1052 disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs);
1053
1054 spin_unlock_irqrestore(&zwplug->lock, flags);
1055
1056 return true;
1057 }
1058
1059 static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
1060 {
1061 struct gendisk *disk = bio->bi_bdev->bd_disk;
1062 struct blk_zone_wplug *zwplug;
1063 unsigned long flags;
1064
1065 /*
1066 * We have native support for zone append operations, so we are not
1067 * going to handle @bio through plugging. However, we may already have a
1068 * zone write plug for the target zone if that zone was previously
1069 * partially written using regular writes. In such case, we risk leaving
1070 * the plug in the disk hash table if the zone is fully written using
1071 * zone append operations. Avoid this by removing the zone write plug.
1072 */
1073 zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
1074 if (likely(!zwplug))
1075 return;
1076
1077 spin_lock_irqsave(&zwplug->lock, flags);
1078
1079 /*
1080 * We are about to remove the zone write plug. But if the user
1081 * (mistakenly) has issued regular writes together with native zone
1082 * append, we must abort the writes as otherwise the plugged BIOs would
1083 * not be executed by the plug BIO work as disk_get_zone_wplug() will
1084 * return NULL after the plug is removed. Aborting the plugged write
1085 * BIOs is consistent with the fact that these writes will most likely
1086 * fail anyway as there are no ordering guarantees between zone append
1087 * operations and regular write operations.
1088 */
1089 if (!bio_list_empty(&zwplug->bio_list)) {
1090 pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n",
1091 disk->disk_name, zwplug->zone_no);
1092 disk_zone_wplug_abort(zwplug);
1093 }
1094 disk_remove_zone_wplug(disk, zwplug);
1095 spin_unlock_irqrestore(&zwplug->lock, flags);
1096
1097 disk_put_zone_wplug(zwplug);
1098 }
1099
1100 /**
1101 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
1102 * @bio: The BIO being submitted
1103 * @nr_segs: The number of physical segments of @bio
1104 *
1105 * Handle write, write zeroes and zone append operations requiring emulation
1106 * using zone write plugging.
1107 *
1108 * Return true whenever @bio execution needs to be delayed through the zone
1109 * write plug. Otherwise, return false to let the submission path process
1110 * @bio normally.
1111 */
1112 bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
1113 {
1114 struct block_device *bdev = bio->bi_bdev;
1115
1116 if (!bdev->bd_disk->zone_wplugs_hash)
1117 return false;
1118
1119 /*
1120 * If the BIO already has the plugging flag set, then it was already
1121 * handled through this path and this is a submission from the zone
1122 * plug bio submit work.
1123 */
1124 if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
1125 return false;
1126
1127 /*
1128 * We do not need to do anything special for empty flush BIOs, e.g.
1129 * BIOs such as those issued by blkdev_issue_flush(). This is because it is
1130 * the responsibility of the user to first wait for the completion of
1131 * write operations for flush to have any effect on the persistence of
1132 * the written data.
1133 */
1134 if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
1135 return false;
1136
1137 /*
1138 * Regular writes and write zeroes need to be handled through the target
1139 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
1140 * which may need to go through the flush machinery depending on the
1141 * target device capabilities. Plugging such writes is fine as the flush
1142 * machinery operates at the request level, below the plug, and
1143 * completion of the flush sequence will go through the regular BIO
1144 * completion, which will handle zone write plugging.
1145 * Zone append operations for devices that requested emulation must
1146 * also be plugged so that these BIOs can be changed into regular
1147 * write BIOs.
1148 * Zone reset, reset all and finish commands need special treatment
1149 * to correctly track the write pointer offset of zones. These commands
1150 * are not plugged as we do not need serialization with write
1151 * operations. It is the responsibility of the user to not issue reset
1152 * and finish commands when write operations are in flight.
1153 */
1154 switch (bio_op(bio)) {
1155 case REQ_OP_ZONE_APPEND:
1156 if (!bdev_emulates_zone_append(bdev)) {
1157 blk_zone_wplug_handle_native_zone_append(bio);
1158 return false;
1159 }
1160 fallthrough;
1161 case REQ_OP_WRITE:
1162 case REQ_OP_WRITE_ZEROES:
1163 return blk_zone_wplug_handle_write(bio, nr_segs);
1164 case REQ_OP_ZONE_RESET:
1165 return blk_zone_wplug_handle_reset_or_finish(bio, 0);
1166 case REQ_OP_ZONE_FINISH:
1167 return blk_zone_wplug_handle_reset_or_finish(bio,
1168 bdev_zone_sectors(bdev));
1169 case REQ_OP_ZONE_RESET_ALL:
1170 return blk_zone_wplug_handle_reset_all(bio);
1171 default:
1172 return false;
1173 }
1174
1175 return false;
1176 }
1177 EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
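
/*
 * Illustrative sketch (hypothetical, not built): a BIO-based driver that
 * relies on the block layer for zone write serialization would call
 * blk_zone_plug_bio() from its submit_bio path before processing the BIO.
 * The function name is made up for this example; a zero segment count is
 * fine for BIO-based drivers as the segment count is only used when merging
 * plugged BIOs into blk-mq requests.
 */
#if 0
static void example_submit_bio(struct bio *bio)
{
	/*
	 * If blk_zone_plug_bio() returns true, the BIO was queued on a zone
	 * write plug and will be resubmitted by the plug BIO work later, so
	 * do not process it now.
	 */
	if (blk_zone_plug_bio(bio, 0))
		return;

	/* Remap and submit the BIO as usual. */
}
#endif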
1178
1179 static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
1180 struct blk_zone_wplug *zwplug)
1181 {
1182 unsigned long flags;
1183
1184 spin_lock_irqsave(&zwplug->lock, flags);
1185
1186 /* Schedule submission of the next plugged BIO if we have one. */
1187 if (!bio_list_empty(&zwplug->bio_list)) {
1188 disk_zone_wplug_schedule_bio_work(disk, zwplug);
1189 spin_unlock_irqrestore(&zwplug->lock, flags);
1190 return;
1191 }
1192
1193 zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
1194
1195 /*
1196 * If the zone is full (it was fully written or finished) or empty
1197 * (it was reset), remove its zone write plug from the hash table.
1198 */
1199 if (disk_should_remove_zone_wplug(disk, zwplug))
1200 disk_remove_zone_wplug(disk, zwplug);
1201
1202 spin_unlock_irqrestore(&zwplug->lock, flags);
1203 }
1204
1205 void blk_zone_write_plug_bio_endio(struct bio *bio)
1206 {
1207 struct gendisk *disk = bio->bi_bdev->bd_disk;
1208 struct blk_zone_wplug *zwplug =
1209 disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
1210 unsigned long flags;
1211
1212 if (WARN_ON_ONCE(!zwplug))
1213 return;
1214
1215 /* Make sure we do not see this BIO again by clearing the plug flag. */
1216 bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
1217
1218 /*
1219 * If this is a regular write emulating a zone append operation,
1220 * restore the original operation code.
1221 */
1222 if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
1223 bio->bi_opf &= ~REQ_OP_MASK;
1224 bio->bi_opf |= REQ_OP_ZONE_APPEND;
1225 }
1226
1227 /*
1228 * If the BIO failed, abort all plugged BIOs and mark the plug as
1229 * needing a write pointer update.
1230 */
1231 if (bio->bi_status != BLK_STS_OK) {
1232 spin_lock_irqsave(&zwplug->lock, flags);
1233 disk_zone_wplug_abort(zwplug);
1234 zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE;
1235 spin_unlock_irqrestore(&zwplug->lock, flags);
1236 }
1237
1238 /* Drop the reference we took when the BIO was issued. */
1239 disk_put_zone_wplug(zwplug);
1240
1241 /*
1242 * For BIO-based devices, blk_zone_write_plug_finish_request()
1243 * is not called. So we need to schedule execution of the next
1244 * plugged BIO here.
1245 */
1246 if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
1247 disk_zone_wplug_unplug_bio(disk, zwplug);
1248
1249 /* Drop the reference we took when entering this function. */
1250 disk_put_zone_wplug(zwplug);
1251 }
1252
1253 void blk_zone_write_plug_finish_request(struct request *req)
1254 {
1255 struct gendisk *disk = req->q->disk;
1256 struct blk_zone_wplug *zwplug;
1257
1258 zwplug = disk_get_zone_wplug(disk, req->__sector);
1259 if (WARN_ON_ONCE(!zwplug))
1260 return;
1261
1262 req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;
1263
1264 /*
1265 * Drop the reference we took when the request was initialized in
1266 * blk_zone_write_plug_init_request().
1267 */
1268 disk_put_zone_wplug(zwplug);
1269
1270 disk_zone_wplug_unplug_bio(disk, zwplug);
1271
1272 /* Drop the reference we took when entering this function. */
1273 disk_put_zone_wplug(zwplug);
1274 }
1275
1276 static void blk_zone_wplug_bio_work(struct work_struct *work)
1277 {
1278 struct blk_zone_wplug *zwplug =
1279 container_of(work, struct blk_zone_wplug, bio_work);
1280 struct block_device *bdev;
1281 unsigned long flags;
1282 struct bio *bio;
1283
1284 /*
1285 * Submit the next plugged BIO. If we do not have any, clear
1286 * the plugged flag.
1287 */
1288 spin_lock_irqsave(&zwplug->lock, flags);
1289
1290 again:
1291 bio = bio_list_pop(&zwplug->bio_list);
1292 if (!bio) {
1293 zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
1294 spin_unlock_irqrestore(&zwplug->lock, flags);
1295 goto put_zwplug;
1296 }
1297
1298 if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
1299 blk_zone_wplug_bio_io_error(zwplug, bio);
1300 goto again;
1301 }
1302
1303 spin_unlock_irqrestore(&zwplug->lock, flags);
1304
1305 bdev = bio->bi_bdev;
1306 submit_bio_noacct_nocheck(bio);
1307
1308 /*
1309 * blk-mq devices will reuse the extra reference on the request queue
1310 * usage counter we took when the BIO was plugged, but the submission
1311 * path for BIO-based devices will not do that. So drop this extra
1312 * reference here.
1313 */
1314 if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO))
1315 blk_queue_exit(bdev->bd_disk->queue);
1316
1317 put_zwplug:
1318 /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
1319 disk_put_zone_wplug(zwplug);
1320 }
1321
1322 static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
1323 {
1324 return 1U << disk->zone_wplugs_hash_bits;
1325 }
1326
1327 void disk_init_zone_resources(struct gendisk *disk)
1328 {
1329 spin_lock_init(&disk->zone_wplugs_lock);
1330 }
1331
1332 /*
1333 * For the size of a disk zone write plug hash table, use the size of the
1334 * zone write plug mempool, which is the maximum of the disk open zones and
1335 * active zones limits. But do not exceed 4KB (512 hlist head entries), that is,
1336 * 9 bits. For a disk that has no limits, mempool size defaults to 128.
1337 */
1338 #define BLK_ZONE_WPLUG_MAX_HASH_BITS 9
1339 #define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE 128
1340
1341 static int disk_alloc_zone_resources(struct gendisk *disk,
1342 unsigned int pool_size)
1343 {
1344 unsigned int i;
1345
1346 atomic_set(&disk->nr_zone_wplugs, 0);
1347 disk->zone_wplugs_hash_bits =
1348 min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);
1349
1350 disk->zone_wplugs_hash =
1351 kcalloc(disk_zone_wplugs_hash_size(disk),
1352 sizeof(struct hlist_head), GFP_KERNEL);
1353 if (!disk->zone_wplugs_hash)
1354 return -ENOMEM;
1355
1356 for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
1357 INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);
1358
1359 disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
1360 sizeof(struct blk_zone_wplug));
1361 if (!disk->zone_wplugs_pool)
1362 goto free_hash;
1363
1364 disk->zone_wplugs_wq =
1365 alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
1366 pool_size, disk->disk_name);
1367 if (!disk->zone_wplugs_wq)
1368 goto destroy_pool;
1369
1370 return 0;
1371
1372 destroy_pool:
1373 mempool_destroy(disk->zone_wplugs_pool);
1374 disk->zone_wplugs_pool = NULL;
1375 free_hash:
1376 kfree(disk->zone_wplugs_hash);
1377 disk->zone_wplugs_hash = NULL;
1378 disk->zone_wplugs_hash_bits = 0;
1379 return -ENOMEM;
1380 }
1381
1382 static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
1383 {
1384 struct blk_zone_wplug *zwplug;
1385 unsigned int i;
1386
1387 if (!disk->zone_wplugs_hash)
1388 return;
1389
1390 /* Free all the zone write plugs we have. */
1391 for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
1392 while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
1393 zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
1394 struct blk_zone_wplug, node);
1395 refcount_inc(&zwplug->ref);
1396 disk_remove_zone_wplug(disk, zwplug);
1397 disk_put_zone_wplug(zwplug);
1398 }
1399 }
1400
1401 WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs));
1402 kfree(disk->zone_wplugs_hash);
1403 disk->zone_wplugs_hash = NULL;
1404 disk->zone_wplugs_hash_bits = 0;
1405 }
1406
1407 static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk,
1408 unsigned long *bitmap)
1409 {
1410 unsigned int nr_conv_zones = 0;
1411 unsigned long flags;
1412
1413 spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
1414 if (bitmap)
1415 nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones);
1416 bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap,
1417 lockdep_is_held(&disk->zone_wplugs_lock));
1418 spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
1419
1420 kfree_rcu_mightsleep(bitmap);
1421
1422 return nr_conv_zones;
1423 }
1424
1425 void disk_free_zone_resources(struct gendisk *disk)
1426 {
1427 if (!disk->zone_wplugs_pool)
1428 return;
1429
1430 if (disk->zone_wplugs_wq) {
1431 destroy_workqueue(disk->zone_wplugs_wq);
1432 disk->zone_wplugs_wq = NULL;
1433 }
1434
1435 disk_destroy_zone_wplugs_hash_table(disk);
1436
1437 /*
1438 * Wait for the zone write plugs to be RCU-freed before
1439 * destroying the mempool.
1440 */
1441 rcu_barrier();
1442
1443 mempool_destroy(disk->zone_wplugs_pool);
1444 disk->zone_wplugs_pool = NULL;
1445
1446 disk_set_conv_zones_bitmap(disk, NULL);
1447 disk->zone_capacity = 0;
1448 disk->last_zone_capacity = 0;
1449 disk->nr_zones = 0;
1450 }
1451
1452 static inline bool disk_need_zone_resources(struct gendisk *disk)
1453 {
1454 /*
1455 * All mq zoned devices need zone resources so that the block layer
1456 * can automatically handle write BIO plugging. BIO-based device drivers
1457 * (e.g. DM devices) are normally responsible for handling zone write
1458 * ordering and do not need zone resources, unless the driver requires
1459 * zone append emulation.
1460 */
1461 return queue_is_mq(disk->queue) ||
1462 queue_emulates_zone_append(disk->queue);
1463 }
1464
1465 static int disk_revalidate_zone_resources(struct gendisk *disk,
1466 unsigned int nr_zones)
1467 {
1468 struct queue_limits *lim = &disk->queue->limits;
1469 unsigned int pool_size;
1470
1471 if (!disk_need_zone_resources(disk))
1472 return 0;
1473
1474 /*
1475 * If the device has no limit on the maximum number of open and active
1476 * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
1477 */
1478 pool_size = max(lim->max_open_zones, lim->max_active_zones);
1479 if (!pool_size)
1480 pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones);
1481
1482 if (!disk->zone_wplugs_hash)
1483 return disk_alloc_zone_resources(disk, pool_size);
1484
1485 return 0;
1486 }
1487
1488 struct blk_revalidate_zone_args {
1489 struct gendisk *disk;
1490 unsigned long *conv_zones_bitmap;
1491 unsigned int nr_zones;
1492 unsigned int zone_capacity;
1493 unsigned int last_zone_capacity;
1494 sector_t sector;
1495 };
1496
1497 /*
1498 * Update the disk zone resources information and device queue limits.
1499 * The disk queue is frozen when this is executed.
1500 */
1501 static int disk_update_zone_resources(struct gendisk *disk,
1502 struct blk_revalidate_zone_args *args)
1503 {
1504 struct request_queue *q = disk->queue;
1505 unsigned int nr_seq_zones, nr_conv_zones;
1506 unsigned int pool_size;
1507 struct queue_limits lim;
1508
1509 disk->nr_zones = args->nr_zones;
1510 disk->zone_capacity = args->zone_capacity;
1511 disk->last_zone_capacity = args->last_zone_capacity;
1512 nr_conv_zones =
1513 disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap);
1514 if (nr_conv_zones >= disk->nr_zones) {
1515 pr_warn("%s: Invalid number of conventional zones %u / %u\n",
1516 disk->disk_name, nr_conv_zones, disk->nr_zones);
1517 return -ENODEV;
1518 }
1519
1520 lim = queue_limits_start_update(q);
1521
1522 /*
1523 * Some devices can advertise zone resource limits that are larger than
1524 * the number of sequential zones of the zoned block device, e.g. a
1525 * small ZNS namespace. For such case, assume that the zoned device has
1526 * no zone resource limits.
1527 */
1528 nr_seq_zones = disk->nr_zones - nr_conv_zones;
1529 if (lim.max_open_zones >= nr_seq_zones)
1530 lim.max_open_zones = 0;
1531 if (lim.max_active_zones >= nr_seq_zones)
1532 lim.max_active_zones = 0;
1533
1534 if (!disk->zone_wplugs_pool)
1535 goto commit;
1536
1537 /*
1538 * If the device has no limit on the maximum number of open and active
1539 * zones, set its max open zone limit to the mempool size to indicate
1540 * to the user that there is a potential performance impact due to
1541 * dynamic zone write plug allocation when simultaneously writing to
1542 * more zones than the size of the mempool.
1543 */
1544 pool_size = max(lim.max_open_zones, lim.max_active_zones);
1545 if (!pool_size)
1546 pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);
1547
1548 mempool_resize(disk->zone_wplugs_pool, pool_size);
1549
1550 if (!lim.max_open_zones && !lim.max_active_zones) {
1551 if (pool_size < nr_seq_zones)
1552 lim.max_open_zones = pool_size;
1553 else
1554 lim.max_open_zones = 0;
1555 }
1556
1557 commit:
1558 return queue_limits_commit_update_frozen(q, &lim);
1559 }
1560
1561 static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
1562 struct blk_revalidate_zone_args *args)
1563 {
1564 struct gendisk *disk = args->disk;
1565
1566 if (zone->capacity != zone->len) {
1567 pr_warn("%s: Invalid conventional zone capacity\n",
1568 disk->disk_name);
1569 return -ENODEV;
1570 }
1571
1572 if (disk_zone_is_last(disk, zone))
1573 args->last_zone_capacity = zone->capacity;
1574
1575 if (!disk_need_zone_resources(disk))
1576 return 0;
1577
1578 if (!args->conv_zones_bitmap) {
1579 args->conv_zones_bitmap =
1580 bitmap_zalloc(args->nr_zones, GFP_NOIO);
1581 if (!args->conv_zones_bitmap)
1582 return -ENOMEM;
1583 }
1584
1585 set_bit(idx, args->conv_zones_bitmap);
1586
1587 return 0;
1588 }
1589
1590 static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
1591 struct blk_revalidate_zone_args *args)
1592 {
1593 struct gendisk *disk = args->disk;
1594 struct blk_zone_wplug *zwplug;
1595 unsigned int wp_offset;
1596 unsigned long flags;
1597
1598 /*
1599 * Remember the capacity of the first sequential zone and check
1600 * if it is constant for all zones, ignoring the last zone as it can be
1601 * smaller.
1602 */
1603 if (!args->zone_capacity)
1604 args->zone_capacity = zone->capacity;
1605 if (disk_zone_is_last(disk, zone)) {
1606 args->last_zone_capacity = zone->capacity;
1607 } else if (zone->capacity != args->zone_capacity) {
1608 pr_warn("%s: Invalid variable zone capacity\n",
1609 disk->disk_name);
1610 return -ENODEV;
1611 }
1612
1613 /*
1614 * If the device needs zone append emulation, we need to track the
1615 * write pointer of all zones that are not empty nor full. So make sure
1616 * we have a zone write plug for such zone if the device has a zone
1617 * write plug hash table.
1618 */
1619 if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash)
1620 return 0;
1621
1622 disk_zone_wplug_sync_wp_offset(disk, zone);
1623
1624 wp_offset = blk_zone_wp_offset(zone);
1625 if (!wp_offset || wp_offset >= zone->capacity)
1626 return 0;
1627
1628 zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags);
1629 if (!zwplug)
1630 return -ENOMEM;
1631 spin_unlock_irqrestore(&zwplug->lock, flags);
1632 disk_put_zone_wplug(zwplug);
1633
1634 return 0;
1635 }
1636
1637 /*
1638 * Helper function to check the validity of zones of a zoned block device.
1639 */
1640 static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
1641 void *data)
1642 {
1643 struct blk_revalidate_zone_args *args = data;
1644 struct gendisk *disk = args->disk;
1645 sector_t zone_sectors = disk->queue->limits.chunk_sectors;
1646 int ret;
1647
1648 /* Check for bad zones and holes in the zone report */
1649 if (zone->start != args->sector) {
1650 pr_warn("%s: Zone gap at sectors %llu..%llu\n",
1651 disk->disk_name, args->sector, zone->start);
1652 return -ENODEV;
1653 }
1654
1655 if (zone->start >= get_capacity(disk) || !zone->len) {
1656 pr_warn("%s: Invalid zone start %llu, length %llu\n",
1657 disk->disk_name, zone->start, zone->len);
1658 return -ENODEV;
1659 }
1660
1661 /*
1662 * All zones must have the same size, with the exception of a possibly
1663 * smaller last zone.
1664 */
1665 if (!disk_zone_is_last(disk, zone)) {
1666 if (zone->len != zone_sectors) {
1667 pr_warn("%s: Invalid zoned device with non constant zone size\n",
1668 disk->disk_name);
1669 return -ENODEV;
1670 }
1671 } else if (zone->len > zone_sectors) {
1672 pr_warn("%s: Invalid zoned device with larger last zone size\n",
1673 disk->disk_name);
1674 return -ENODEV;
1675 }
1676
1677 if (!zone->capacity || zone->capacity > zone->len) {
1678 pr_warn("%s: Invalid zone capacity\n",
1679 disk->disk_name);
1680 return -ENODEV;
1681 }
1682
1683 /* Check zone type */
1684 switch (zone->type) {
1685 case BLK_ZONE_TYPE_CONVENTIONAL:
1686 ret = blk_revalidate_conv_zone(zone, idx, args);
1687 break;
1688 case BLK_ZONE_TYPE_SEQWRITE_REQ:
1689 ret = blk_revalidate_seq_zone(zone, idx, args);
1690 break;
1691 case BLK_ZONE_TYPE_SEQWRITE_PREF:
1692 default:
1693 pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
1694 disk->disk_name, (int)zone->type, zone->start);
1695 ret = -ENODEV;
1696 }
1697
1698 if (!ret)
1699 args->sector += zone->len;
1700
1701 return ret;
1702 }
1703
1704 /**
1705 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
1706 * @disk: Target disk
1707 *
1708 * Helper function for low-level device drivers to check, (re)allocate and
1709 * initialize resources used for managing zoned disks. This function should
1710 * normally be called by blk-mq based drivers when a zoned gendisk is probed
1711 * and when the zone configuration of the gendisk changes (e.g. after a format).
1712 * Before calling this function, the device driver must already have set the
1713 * device zone size (chunk_sector limit) and the max zone append limit.
1714 * BIO based drivers can also use this function as long as the device queue
1715 * can be safely frozen.
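 *
 * Return: 0 on success, a negative error code otherwise.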
1716 */
1717 int blk_revalidate_disk_zones(struct gendisk *disk)
1718 {
1719 struct request_queue *q = disk->queue;
1720 sector_t zone_sectors = q->limits.chunk_sectors;
1721 sector_t capacity = get_capacity(disk);
1722 struct blk_revalidate_zone_args args = { };
1723 unsigned int noio_flag;
1724 int ret = -ENOMEM;
1725
1726 if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
1727 return -EIO;
1728
1729 if (!capacity)
1730 return -ENODEV;
1731
1732 /*
1733 * Checks that the device driver indicated a valid zone size and that
1734 * the max zone append limit is set.
1735 */
1736 if (!zone_sectors || !is_power_of_2(zone_sectors)) {
1737 pr_warn("%s: Invalid non power of two zone size (%llu)\n",
1738 disk->disk_name, zone_sectors);
1739 return -ENODEV;
1740 }
1741
1742 /*
1743 * Ensure that all memory allocations in this context are done as if
1744 * GFP_NOIO was specified.
1745 */
1746 args.disk = disk;
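	/*
	 * zone_sectors is a power of two (checked above), so this rounds up
	 * capacity / zone_sectors: a smaller last zone counts as a full zone.
	 */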
1747 args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
1748 noio_flag = memalloc_noio_save();
1749 ret = disk_revalidate_zone_resources(disk, args.nr_zones);
1750 if (ret) {
1751 memalloc_noio_restore(noio_flag);
1752 return ret;
1753 }
1754
1755 ret = disk->fops->report_zones(disk, 0, UINT_MAX,
1756 blk_revalidate_zone_cb, &args);
1757 if (!ret) {
1758 pr_warn("%s: No zones reported\n", disk->disk_name);
1759 ret = -ENODEV;
1760 }
1761 memalloc_noio_restore(noio_flag);
1762
1763 /*
1764 * If zones were reported, make sure that the entire disk capacity
1765 * has been checked.
1766 */
1767 if (ret > 0 && args.sector != capacity) {
1768 pr_warn("%s: Missing zones from sector %llu\n",
1769 disk->disk_name, args.sector);
1770 ret = -ENODEV;
1771 }
1772
1773 /*
1774 * Set the new disk zone parameters only once the queue is frozen and
1775 * all I/Os are completed.
1776 */
1777 if (ret > 0)
1778 ret = disk_update_zone_resources(disk, &args);
1779 else
1780 pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
1781 if (ret) {
1782 unsigned int memflags = blk_mq_freeze_queue(q);
1783
1784 disk_free_zone_resources(disk);
1785 blk_mq_unfreeze_queue(q, memflags);
1786 }
1787
1788 return ret;
1789 }
1790 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
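
/*
 * Illustrative sketch only (not kernel documentation): one way a blk-mq
 * driver might trigger zone revalidation after its zone configuration
 * changed. mydrv_update_zones() and the values used are hypothetical, and
 * queue limits field names may differ between kernel versions.
 *
 *	static int mydrv_update_zones(struct gendisk *disk, sector_t zone_sectors)
 *	{
 *		struct queue_limits lim;
 *		int ret;
 *
 *		lim = queue_limits_start_update(disk->queue);
 *		lim.chunk_sectors = zone_sectors;
 *		lim.max_hw_zone_append_sectors = 0;	// request zone append emulation
 *		ret = queue_limits_commit_update_frozen(disk->queue, &lim);
 *		if (ret)
 *			return ret;
 *
 *		return blk_revalidate_disk_zones(disk);
 *	}
 */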
1791
1792 /**
1793 * blk_zone_issue_zeroout - zero-fill a block range in a zone
1794 * @bdev: blockdev to write
1795 * @sector: start sector
1796 * @nr_sects: number of sectors to write
1797 * @gfp_mask: memory allocation flags (for bio_alloc)
1798 *
1799 * Description:
1800 * Zero-fill a block range in a zone (@sector must be equal to the zone write
1801 * pointer), handling potential errors due to the (initially unknown) lack of
1802 * hardware offload (See blkdev_issue_zeroout()).
1803 */
1804 int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
1805 sector_t nr_sects, gfp_t gfp_mask)
1806 {
1807 int ret;
1808
1809 if (WARN_ON_ONCE(!bdev_is_zoned(bdev)))
1810 return -EIO;
1811
1812 ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
1813 BLKDEV_ZERO_NOFALLBACK);
1814 if (ret != -EOPNOTSUPP)
1815 return ret;
1816
1817 /*
1818 * The failed call to blkdev_issue_zeroout() advanced the zone write
1819 * pointer. Undo this using a zone report to update the zone write
1820 * pointer to the correct current value.
1821 */
1822 ret = disk_zone_sync_wp_offset(bdev->bd_disk, sector);
1823 if (ret != 1)
1824 return ret < 0 ? ret : -EIO;
1825
1826 /*
1827 * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a
1828 * regular write with zero-pages.
1829 */
1830 return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0);
1831 }
1832 EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout);
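
/*
 * Illustrative only: a zoned file system needing a block range zeroed
 * starting at a zone's current write pointer could call, e.g.:
 *
 *	ret = blk_zone_issue_zeroout(bdev, wp_sector, nr_sects, GFP_NOFS);
 *
 * where wp_sector is a hypothetical variable holding the zone write pointer,
 * as required by the description above.
 */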
1833
1834 #ifdef CONFIG_BLK_DEBUG_FS
1835 static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
1836 struct seq_file *m)
1837 {
1838 unsigned int zwp_wp_offset, zwp_flags;
1839 unsigned int zwp_zone_no, zwp_ref;
1840 unsigned int zwp_bio_list_size;
1841 unsigned long flags;
1842
1843 spin_lock_irqsave(&zwplug->lock, flags);
1844 zwp_zone_no = zwplug->zone_no;
1845 zwp_flags = zwplug->flags;
1846 zwp_ref = refcount_read(&zwplug->ref);
1847 zwp_wp_offset = zwplug->wp_offset;
1848 zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
1849 spin_unlock_irqrestore(&zwplug->lock, flags);
1850
1851 seq_printf(m, "%u 0x%x %u %u %u\n", zwp_zone_no, zwp_flags, zwp_ref,
1852 zwp_wp_offset, zwp_bio_list_size);
1853 }
1854
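/*
 * Debugfs helper: print one line per hashed zone write plug, listing the
 * zone number, plug flags (in hex), reference count, write pointer offset
 * and number of plugged BIOs, in the format of queue_zone_wplug_show().
 */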
1855 int queue_zone_wplugs_show(void *data, struct seq_file *m)
1856 {
1857 struct request_queue *q = data;
1858 struct gendisk *disk = q->disk;
1859 struct blk_zone_wplug *zwplug;
1860 unsigned int i;
1861
1862 if (!disk->zone_wplugs_hash)
1863 return 0;
1864
1865 rcu_read_lock();
1866 for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
1867 hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i],
1868 node)
1869 queue_zone_wplug_show(zwplug, m);
1870 rcu_read_unlock();
1871
1872 return 0;
1873 }
1874
1875 #endif
1876