// SPDX-License-Identifier: GPL-2.0
/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 * Copyright (c) 2024, Western Digital Corporation or its affiliates.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/mempool.h>
#include <linux/kthread.h>
#include <linux/freezer.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-mq-debugfs.h"

#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
	ZONE_COND_NAME(NOT_WP),
	ZONE_COND_NAME(EMPTY),
	ZONE_COND_NAME(IMP_OPEN),
	ZONE_COND_NAME(EXP_OPEN),
	ZONE_COND_NAME(CLOSED),
	ZONE_COND_NAME(READONLY),
	ZONE_COND_NAME(FULL),
	ZONE_COND_NAME(OFFLINE),
	ZONE_COND_NAME(ACTIVE),
};
#undef ZONE_COND_NAME
/*
 * Per-zone write plug.
 * @node: hlist_node structure for managing the plug using a hash table.
 * @entry: list_head structure for listing the plug in the disk list of active
 *         zone write plugs.
 * @bio_list: The list of BIOs that are currently plugged.
 * @bio_work: Work struct to handle issuing of plugged BIOs.
 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
 * @disk: The gendisk the plug belongs to.
 * @lock: Spinlock to atomically manipulate the plug.
 * @ref: Zone write plug reference counter. A zone write plug reference is
 *       always at least 1 when the plug is hashed in the disk plug hash table.
 *       The reference is incremented whenever a new BIO needing plugging is
 *       submitted and when a function needs to manipulate a plug. The
 *       reference count is decremented whenever a plugged BIO completes and
 *       when a function that referenced the plug returns. The initial
 *       reference is dropped whenever the zone of the zone write plug is
 *       reset or finished, and when the zone becomes full (the last write BIO
 *       to the zone completes).
 * @flags: Flags indicating the plug state.
 * @zone_no: The number of the zone the plug is managing.
 * @wp_offset: The zone write pointer location relative to the start of the
 *             zone as a number of 512B sectors.
 * @cond: Condition of the zone.
 */
struct blk_zone_wplug {
	struct hlist_node node;
	struct list_head entry;
	struct bio_list bio_list;
	struct work_struct bio_work;
	struct rcu_head rcu_head;
	struct gendisk *disk;
	spinlock_t lock;
	refcount_t ref;
	unsigned int flags;
	unsigned int zone_no;
	unsigned int wp_offset;
	enum blk_zone_cond cond;
};

static inline bool disk_need_zone_resources(struct gendisk *disk)
{
	/*
	 * All request-based zoned devices need zone resources so that the
	 * block layer can automatically handle write BIO plugging. BIO-based
	 * device drivers (e.g. DM devices) are normally responsible for
	 * handling zone write ordering and do not need zone resources, unless
	 * the driver requires zone append emulation.
	 */
	return queue_is_mq(disk->queue) ||
		queue_emulates_zone_append(disk->queue);
}

static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
{
	return 1U << disk->zone_wplugs_hash_bits;
}

/*
 * Zone write plug flags bits:
 * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
 *   that is, that write BIOs are being throttled because a write BIO is
 *   already being executed or because the zone write plug BIO list is not
 *   empty.
 * - BLK_ZONE_WPLUG_NEED_WP_UPDATE: Indicates that we lost track of a zone
 *   write pointer offset and need to update it.
 * - BLK_ZONE_WPLUG_DEAD: Indicates that the zone write plug will be removed
 *   from the disk hash table of zone write plugs when the last reference on
 *   the zone write plug is dropped. If set, this flag also indicates that the
 *   initial extra reference on the zone write plug was dropped, meaning that
 *   the reference count indicates the current number of active users (code
 *   context or BIOs and requests in flight). This flag is set when a zone is
 *   reset, finished or becomes full.
 */
#define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)
#define BLK_ZONE_WPLUG_NEED_WP_UPDATE	(1U << 1)
#define BLK_ZONE_WPLUG_DEAD		(1U << 2)

/**
 * blk_zone_cond_str - Return a zone condition name string
 * @zone_cond: a zone condition BLK_ZONE_COND_name
 *
 * Convert a BLK_ZONE_COND_name zone condition into the string "name". Useful
 * for debugging and tracing zone conditions. For an invalid zone condition,
 * the string "UNKNOWN" is returned.
 */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
{
	static const char *zone_cond_str = "UNKNOWN";

	if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
		zone_cond_str = zone_cond_name[zone_cond];

	return zone_cond_str;
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);

static void blk_zone_set_cond(u8 *zones_cond, unsigned int zno,
			      enum blk_zone_cond cond)
{
	if (!zones_cond)
		return;

	switch (cond) {
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
	case BLK_ZONE_COND_CLOSED:
		zones_cond[zno] = BLK_ZONE_COND_ACTIVE;
		return;
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_FULL:
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
	default:
		zones_cond[zno] = cond;
		return;
	}
}

static void disk_zone_set_cond(struct gendisk *disk, sector_t sector,
			       enum blk_zone_cond cond)
{
	u8 *zones_cond;

	rcu_read_lock();
	zones_cond = rcu_dereference(disk->zones_cond);
	if (zones_cond) {
		unsigned int zno = disk_zone_no(disk, sector);

		/*
		 * The condition of conventional, read-only and offline zones
		 * never changes, so do nothing if the target zone is in one
		 * of these conditions.
		 */
		switch (zones_cond[zno]) {
		case BLK_ZONE_COND_NOT_WP:
		case BLK_ZONE_COND_READONLY:
		case BLK_ZONE_COND_OFFLINE:
			break;
		default:
			blk_zone_set_cond(zones_cond, zno, cond);
			break;
		}
	}
	rcu_read_unlock();
}

/**
 * bdev_zone_is_seq - check if a sector belongs to a sequential write zone
 * @bdev: block device to check
 * @sector: sector number
 *
 * Check if @sector on @bdev is contained in a sequential write required zone.
 */
bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
{
	struct gendisk *disk = bdev->bd_disk;
	unsigned int zno = disk_zone_no(disk, sector);
	bool is_seq = false;
	u8 *zones_cond;

	if (!bdev_is_zoned(bdev))
		return false;

	rcu_read_lock();
	zones_cond = rcu_dereference(disk->zones_cond);
	if (zones_cond && zno < disk->nr_zones)
		is_seq = zones_cond[zno] != BLK_ZONE_COND_NOT_WP;
	rcu_read_unlock();

	return is_seq;
}
EXPORT_SYMBOL_GPL(bdev_zone_is_seq);

/*
 * Zone report arguments for block device drivers report_zones operation.
 * @cb: report_zones_cb callback for each reported zone.
 * @data: Private data passed to report_zones_cb.
 * @report_active: If true, report implicit open, explicit open and closed
 *                 zones using the active zone condition.
 */
struct blk_report_zones_args {
	report_zones_cb cb;
	void *data;
	bool report_active;
};

static int blkdev_do_report_zones(struct block_device *bdev, sector_t sector,
				  unsigned int nr_zones,
				  struct blk_report_zones_args *args)
{
	struct gendisk *disk = bdev->bd_disk;

	if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
		return -EOPNOTSUPP;

	if (!nr_zones || sector >= get_capacity(disk))
		return 0;

	return disk->fops->report_zones(disk, sector, nr_zones, args);
}

/**
 * blkdev_report_zones - Get zones information
 * @bdev: Target block device
 * @sector: Sector from which to report zones
 * @nr_zones: Maximum number of zones to report
 * @cb: Callback function called for each reported zone
 * @data: Private data for the callback
 *
 * Description:
 *    Get zone information starting from the zone containing @sector for at
 *    most @nr_zones, and call @cb for each zone reported by the device.
 *    To report all zones in a device starting from @sector, the BLK_ALL_ZONES
 *    constant can be passed to @nr_zones.
 *    Returns the number of zones reported by the device, or a negative errno
 *    value in case of failure.
 *
 *    Note: The caller must use memalloc_noXX_save/restore() calls to control
 *    memory allocations done within this function.
 */
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
			unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct blk_report_zones_args args = {
		.cb = cb,
		.data = data,
	};

	return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
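
/*
 * Example usage (an illustrative sketch, not part of this file): a minimal
 * report_zones_cb callback counting the sequential write required zones of a
 * device. The callback is invoked once for each reported zone, together with
 * the private data pointer passed to blkdev_report_zones().
 *
 *	static int count_seq_zones_cb(struct blk_zone *zone, unsigned int idx,
 *				      void *data)
 *	{
 *		unsigned int *nr_seq_zones = data;
 *
 *		if (zone->type == BLK_ZONE_TYPE_SEQWRITE_REQ)
 *			(*nr_seq_zones)++;
 *		return 0;
 *	}
 *
 *	nr_zones = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
 *				       count_seq_zones_cb, &nr_seq_zones);
 */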

static int blkdev_zone_reset_all(struct block_device *bdev)
{
	struct bio bio;

	bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
	trace_blkdev_zone_mgmt(&bio, 0);
	return submit_bio_wait(&bio);
}

/**
 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
 * @bdev: Target block device
 * @op: Operation to be performed on the zones
 * @sector: Start sector of the first zone to operate on
 * @nr_sectors: Number of sectors, should be at least the length of one zone
 *              and must be zone size aligned.
 *
 * Description:
 *    Perform the specified operation on the range of zones specified by
 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 *    is valid, but the specified range should not contain conventional zones.
 *    The operation to execute on each zone can be a zone reset, open, close
 *    or finish request.
 */
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
		     sector_t sector, sector_t nr_sectors)
{
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	sector_t capacity = bdev_nr_sectors(bdev);
	sector_t end_sector = sector + nr_sectors;
	struct bio *bio = NULL;
	int ret = 0;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	if (bdev_read_only(bdev))
		return -EPERM;

	if (!op_is_zone_mgmt(op))
		return -EOPNOTSUPP;

	if (end_sector <= sector || end_sector > capacity)
		/* Out of range */
		return -EINVAL;

	/* Check alignment (handle a possibly smaller last zone) */
	if (!bdev_is_zone_start(bdev, sector))
		return -EINVAL;

	if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity)
		return -EINVAL;

	/*
	 * In the case of a zone reset operation over all zones, use
	 * REQ_OP_ZONE_RESET_ALL.
	 */
	if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity)
		return blkdev_zone_reset_all(bdev);

	while (sector < end_sector) {
		bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
		bio->bi_iter.bi_sector = sector;
		sector += zone_sectors;

		/* This may take a while, so be nice to others */
		cond_resched();
	}

	trace_blkdev_zone_mgmt(bio, nr_sectors);
	ret = submit_bio_wait(bio);
	bio_put(bio);

	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
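
/*
 * Example usage (an illustrative sketch): resetting the single zone
 * containing @sector, using the zone geometry helpers to align the range to
 * the zone boundaries:
 *
 *	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
 *			       bdev_zone_start(bdev, sector),
 *			       bdev_zone_sectors(bdev));
 */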

struct zone_report_args {
	struct blk_zone __user *zones;
};

static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
				    void *data)
{
	struct zone_report_args *args = data;

	if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
		return -EFAULT;
	return 0;
}

/*
 * Mask of valid input flags for BLKREPORTZONEV2 ioctl.
 */
#define BLK_ZONE_REPV2_INPUT_FLAGS BLK_ZONE_REP_CACHED

/*
 * BLKREPORTZONE and BLKREPORTZONEV2 ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
			      unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct zone_report_args args;
	struct blk_zone_report rep;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
		return -EFAULT;

	if (!rep.nr_zones)
		return -EINVAL;

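	/*
	 * The zone array of the user buffer immediately follows the report
	 * header, as defined by the struct blk_zone_report layout of the
	 * BLKREPORTZONE ioctl interface.
	 */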
	args.zones = argp + sizeof(struct blk_zone_report);

	switch (cmd) {
	case BLKREPORTZONE:
		ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
					  blkdev_copy_zone_to_user, &args);
		break;
	case BLKREPORTZONEV2:
		if (rep.flags & ~BLK_ZONE_REPV2_INPUT_FLAGS)
			return -EINVAL;
		ret = blkdev_report_zones_cached(bdev, rep.sector, rep.nr_zones,
						 blkdev_copy_zone_to_user,
						 &args);
		break;
	default:
		return -EINVAL;
	}

	if (ret < 0)
		return ret;

	rep.nr_zones = ret;
	rep.flags = BLK_ZONE_REP_CAPACITY;
	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
		return -EFAULT;
	return 0;
}

static int blkdev_reset_zone(struct block_device *bdev, blk_mode_t mode,
			     struct blk_zone_range *zrange)
{
	loff_t start, end;
	int ret = -EINVAL;

	inode_lock(bdev->bd_mapping->host);
	filemap_invalidate_lock(bdev->bd_mapping);
	if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
	    zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
		/* Out of range */
		goto out_unlock;

	start = zrange->sector << SECTOR_SHIFT;
	end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;

	ret = truncate_bdev_range(bdev, mode, start, end);
	if (ret)
		goto out_unlock;

	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zrange->sector,
			       zrange->nr_sectors);
out_unlock:
	filemap_invalidate_unlock(bdev->bd_mapping);
	inode_unlock(bdev->bd_mapping->host);
	return ret;
}

/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct blk_zone_range zrange;
	enum req_op op;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (!(mode & BLK_OPEN_WRITE))
		return -EBADF;

	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
		return -EFAULT;

	switch (cmd) {
	case BLKRESETZONE:
		return blkdev_reset_zone(bdev, mode, &zrange);
	case BLKOPENZONE:
		op = REQ_OP_ZONE_OPEN;
		break;
	case BLKCLOSEZONE:
		op = REQ_OP_ZONE_CLOSE;
		break;
	case BLKFINISHZONE:
		op = REQ_OP_ZONE_FINISH;
		break;
	default:
		return -ENOTTY;
	}

	return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
}

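/*
 * Check if @zone is the last zone of @disk, which may be smaller than the
 * other zones of the device.
 */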
static bool disk_zone_is_last(struct gendisk *disk, struct blk_zone *zone)
{
	return zone->start + zone->len >= get_capacity(disk);
}

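/*
 * Check if a zone write plug points to a full zone, that is, if its write
 * pointer offset reached the zone capacity (which may differ for the last
 * zone of the disk).
 */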
static bool disk_zone_wplug_is_full(struct gendisk *disk,
				    struct blk_zone_wplug *zwplug)
{
	if (zwplug->zone_no < disk->nr_zones - 1)
		return zwplug->wp_offset >= disk->zone_capacity;
	return zwplug->wp_offset >= disk->last_zone_capacity;
}

static bool disk_insert_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	struct blk_zone_wplug *zwplg;
	unsigned long flags;
	u8 *zones_cond;
	unsigned int idx =
		hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);

	/*
	 * Add the new zone write plug to the hash table, but carefully as we
	 * are racing with other submission contexts, so we may already have a
	 * zone write plug for the same zone.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
	hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
		if (zwplg->zone_no == zwplug->zone_no) {
			spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock,
					       flags);
			return false;
		}
	}

	/*
	 * Set the zone condition: if we do not yet have a zones_cond array
	 * attached to the disk, then this is a zone write plug insert from the
	 * first call to blk_revalidate_disk_zones(), in which case the zone is
	 * necessarily in the active condition.
	 */
	zones_cond = rcu_dereference_check(disk->zones_cond,
			lockdep_is_held(&disk->zone_wplugs_hash_lock));
	if (zones_cond)
		zwplug->cond = zones_cond[zwplug->zone_no];
	else
		zwplug->cond = BLK_ZONE_COND_ACTIVE;

	hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
	atomic_inc(&disk->nr_zone_wplugs);
	spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);

	return true;
}

static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk,
							 sector_t sector)
{
	unsigned int zno = disk_zone_no(disk, sector);
	unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
	struct blk_zone_wplug *zwplug;

	rcu_read_lock();

	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
		if (zwplug->zone_no == zno &&
		    refcount_inc_not_zero(&zwplug->ref)) {
			rcu_read_unlock();
			return zwplug;
		}
	}

	rcu_read_unlock();

	return NULL;
}

static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
							 sector_t sector)
{
	if (!atomic_read(&disk->nr_zone_wplugs))
		return NULL;

	return disk_get_hashed_zone_wplug(disk, sector);
}

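/*
 * RCU callback freeing a zone write plug, once a grace period has elapsed
 * after the plug was removed from the disk hash table.
 */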
static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
{
	struct blk_zone_wplug *zwplug =
		container_of(rcu_head, struct blk_zone_wplug, rcu_head);

	mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
}

static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug)
{
	struct gendisk *disk = zwplug->disk;
	unsigned long flags;

	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_DEAD));
	WARN_ON_ONCE(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED);
	WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));

	spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
	blk_zone_set_cond(rcu_dereference_check(disk->zones_cond,
			lockdep_is_held(&disk->zone_wplugs_hash_lock)),
			  zwplug->zone_no, zwplug->cond);
	hlist_del_init_rcu(&zwplug->node);
	atomic_dec(&disk->nr_zone_wplugs);
	spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);

	call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
}

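/* Drop a reference on a zone write plug, freeing it on the last put. */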
static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
	if (refcount_dec_and_test(&zwplug->ref))
		disk_free_zone_wplug(zwplug);
}

/*
 * Flag the zone write plug as dead and drop the initial reference we got when
 * the zone write plug was added to the hash table. The zone write plug will
 * be unhashed when its last reference is dropped.
 */
static void disk_mark_zone_wplug_dead(struct blk_zone_wplug *zwplug)
{
	lockdep_assert_held(&zwplug->lock);

	if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD)) {
		zwplug->flags |= BLK_ZONE_WPLUG_DEAD;
		disk_put_zone_wplug(zwplug);
	}
}

static inline bool disk_check_zone_wplug_dead(struct blk_zone_wplug *zwplug)
{
	if (!(zwplug->flags & BLK_ZONE_WPLUG_DEAD))
		return false;

	/*
	 * If a new write is received right after a zone reset completes and
	 * while the disk_zone_wplugs_worker() thread has not yet released the
	 * reference on the zone write plug after processing the last write to
	 * the zone, then the new write BIO will see the zone write plug marked
	 * as dead. This case is however a false positive and a perfectly valid
	 * pattern. In that case, restore the zone write plug to a live one.
	 */
	if (!zwplug->wp_offset && bio_list_empty(&zwplug->bio_list)) {
		zwplug->flags &= ~BLK_ZONE_WPLUG_DEAD;
		refcount_inc(&zwplug->ref);
		return false;
	}

	return true;
}

static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
				       struct blk_zone_wplug *zwplug);

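/* Per zone write plug work used to submit the next plugged BIO of a zone. */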
static void blk_zone_wplug_bio_work(struct work_struct *work)
{
	struct blk_zone_wplug *zwplug =
		container_of(work, struct blk_zone_wplug, bio_work);

	disk_zone_wplug_submit_bio(zwplug->disk, zwplug);

	/* Drop the reference we took in disk_zone_wplug_schedule_work(). */
	disk_put_zone_wplug(zwplug);
}

/*
 * Get a zone write plug for the zone containing @sector.
 * If the plug does not exist, it is allocated and inserted in the disk hash
 * table.
 */
static struct blk_zone_wplug *disk_get_or_alloc_zone_wplug(struct gendisk *disk,
					sector_t sector, gfp_t gfp_mask)
{
	unsigned int zno = disk_zone_no(disk, sector);
	struct blk_zone_wplug *zwplug;

again:
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug)
		return zwplug;

	/*
	 * Allocate and initialize a zone write plug with an extra reference
	 * so that it is not freed when the zone write plug becomes idle
	 * without the zone being full.
	 */
	zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
	if (!zwplug)
		return NULL;

	INIT_HLIST_NODE(&zwplug->node);
	refcount_set(&zwplug->ref, 2);
	spin_lock_init(&zwplug->lock);
	zwplug->flags = 0;
	zwplug->zone_no = zno;
	zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
	bio_list_init(&zwplug->bio_list);
	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
	INIT_LIST_HEAD(&zwplug->entry);
	zwplug->disk = disk;

	/*
	 * Insert the new zone write plug in the hash table. This can fail only
	 * if another context already inserted a plug. Retry from the beginning
	 * in that case.
	 */
	if (!disk_insert_zone_wplug(disk, zwplug)) {
		mempool_free(zwplug, disk->zone_wplugs_pool);
		goto again;
	}

	return zwplug;
}

static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
					       struct bio *bio)
{
	struct request_queue *q = zwplug->disk->queue;

	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
	bio_io_error(bio);
	disk_put_zone_wplug(zwplug);
	/* Drop the reference taken by disk_zone_wplug_add_bio(). */
	blk_queue_exit(q);
}

/*
 * Abort (fail) all plugged BIOs of a zone write plug.
 */
static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
	struct gendisk *disk = zwplug->disk;
	struct bio *bio;

	lockdep_assert_held(&zwplug->lock);

	if (bio_list_empty(&zwplug->bio_list))
		return;

	pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n",
			    zwplug->disk->disk_name, zwplug->zone_no);
	while ((bio = bio_list_pop(&zwplug->bio_list)))
		blk_zone_wplug_bio_io_error(zwplug, bio);

	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

	/*
	 * If we are using the per disk zone write plugs worker thread, remove
	 * the zone write plug from the work list and drop the reference we
	 * took when the zone write plug was added to that list.
	 */
	if (blk_queue_zoned_qd1_writes(disk->queue)) {
		spin_lock(&disk->zone_wplugs_list_lock);
		if (!list_empty(&zwplug->entry)) {
			list_del_init(&zwplug->entry);
			disk_put_zone_wplug(zwplug);
		}
		spin_unlock(&disk->zone_wplugs_list_lock);
	}
}

/*
 * Update a zone write plug condition based on the write pointer offset.
 */
static void disk_zone_wplug_update_cond(struct gendisk *disk,
					struct blk_zone_wplug *zwplug)
{
	lockdep_assert_held(&zwplug->lock);

	if (disk_zone_wplug_is_full(disk, zwplug))
		zwplug->cond = BLK_ZONE_COND_FULL;
	else if (!zwplug->wp_offset)
		zwplug->cond = BLK_ZONE_COND_EMPTY;
	else
		zwplug->cond = BLK_ZONE_COND_ACTIVE;
}

/*
 * Set a zone write plug write pointer offset to the specified value.
 * This aborts all plugged BIOs, which is fine as this function is called for
 * a zone reset operation, a zone finish operation, or if the zone needs a
 * write pointer update from a zone report after a write error.
 */
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug,
					  unsigned int wp_offset)
{
	lockdep_assert_held(&zwplug->lock);

	/* Update the zone write pointer and abort all plugged BIOs. */
	zwplug->flags &= ~BLK_ZONE_WPLUG_NEED_WP_UPDATE;
	zwplug->wp_offset = wp_offset;
	disk_zone_wplug_update_cond(disk, zwplug);

	disk_zone_wplug_abort(zwplug);
	if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
		disk_mark_zone_wplug_dead(zwplug);
}

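/*
 * Return the write pointer offset of @zone relative to the zone start, as a
 * number of 512B sectors, or UINT_MAX if the zone does not have a valid
 * write pointer.
 */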
static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
{
	switch (zone->cond) {
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
	case BLK_ZONE_COND_CLOSED:
	case BLK_ZONE_COND_ACTIVE:
		return zone->wp - zone->start;
	case BLK_ZONE_COND_EMPTY:
		return 0;
	case BLK_ZONE_COND_FULL:
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
	default:
		/*
		 * Conventional, full, offline and read-only zones do not have
		 * a valid write pointer.
		 */
		return UINT_MAX;
	}
}

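/*
 * Synchronize the write pointer offset of the zone write plug of a zone, if
 * one exists and an update is pending, using the zone information reported
 * by the device driver.
 */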
static unsigned int disk_zone_wplug_sync_wp_offset(struct gendisk *disk,
						   struct blk_zone *zone)
{
	struct blk_zone_wplug *zwplug;
	unsigned int wp_offset = blk_zone_wp_offset(zone);

	zwplug = disk_get_zone_wplug(disk, zone->start);
	if (zwplug) {
		unsigned long flags;

		spin_lock_irqsave(&zwplug->lock, flags);
		if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
			disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
	}

	return wp_offset;
}

/**
 * disk_report_zone - Report one zone
 * @disk: Target disk
 * @zone: The zone to report
 * @idx: The index of the zone in the overall zone report
 * @args: report zones callback and data
 *
 * Description:
 *    Helper function for block device drivers to report one zone of a zone
 *    report initiated with blkdev_report_zones(). The zone being reported is
 *    specified by @zone and used to update, if necessary, the zone write plug
 *    information for the zone. If @args specifies a user callback function,
 *    this callback is executed.
 */
int disk_report_zone(struct gendisk *disk, struct blk_zone *zone,
		     unsigned int idx, struct blk_report_zones_args *args)
{
	if (args && args->report_active) {
		/*
		 * If we get here, then this is a regular zone report used as a
		 * fallback for a cached report. So collapse the implicit open,
		 * explicit open and closed conditions into the active zone
		 * condition.
		 */
		switch (zone->cond) {
		case BLK_ZONE_COND_IMP_OPEN:
		case BLK_ZONE_COND_EXP_OPEN:
		case BLK_ZONE_COND_CLOSED:
			zone->cond = BLK_ZONE_COND_ACTIVE;
			break;
		default:
			break;
		}
	}

	if (disk->zone_wplugs_hash)
		disk_zone_wplug_sync_wp_offset(disk, zone);

	if (args && args->cb)
		return args->cb(zone, idx, args->data);

	return 0;
}
EXPORT_SYMBOL_GPL(disk_report_zone);

static int blkdev_report_zone_cb(struct blk_zone *zone, unsigned int idx,
				 void *data)
{
	memcpy(data, zone, sizeof(struct blk_zone));
	return 0;
}

static int blkdev_report_zone_fallback(struct block_device *bdev,
				       sector_t sector, struct blk_zone *zone)
{
	struct blk_report_zones_args args = {
		.cb = blkdev_report_zone_cb,
		.data = zone,
		.report_active = true,
	};
	int error;

	error = blkdev_do_report_zones(bdev, sector, 1, &args);
	if (error < 0)
		return error;
	if (error == 0)
		return -EIO;
	return 0;
}

/*
 * For devices that natively support zone append operations, we do not use
 * zone write plugging for zone append writes, which makes the zone condition
 * tracking invalid once zone append has been used. In that case, fall back to
 * a regular zone report to get correct information.
 */
static inline bool blkdev_has_cached_report_zones(struct block_device *bdev)
{
	return disk_need_zone_resources(bdev->bd_disk) &&
		(bdev_emulates_zone_append(bdev) ||
		 !test_bit(GD_ZONE_APPEND_USED, &bdev->bd_disk->state));
}

/**
 * blkdev_get_zone_info - Get a single zone information from cached data
 * @bdev: Target block device
 * @sector: Sector contained by the target zone
 * @zone: zone structure to return the zone information
 *
 * Description:
 *    Get the zone information for the zone containing @sector using the zone
 *    write plug of the target zone, if one exists, or the disk zone condition
 *    array otherwise. The zone condition may be reported as
 *    BLK_ZONE_COND_ACTIVE for a zone that is in the implicit open, explicit
 *    open or closed condition.
 *
 *    Returns 0 on success and a negative error code on failure.
 */
int blkdev_get_zone_info(struct block_device *bdev, sector_t sector,
			 struct blk_zone *zone)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	struct blk_zone_wplug *zwplug;
	unsigned long flags;
	u8 *zones_cond;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	if (sector >= get_capacity(disk))
		return -EINVAL;

	memset(zone, 0, sizeof(*zone));
	sector = bdev_zone_start(bdev, sector);

	if (!blkdev_has_cached_report_zones(bdev))
		return blkdev_report_zone_fallback(bdev, sector, zone);

	rcu_read_lock();
	zones_cond = rcu_dereference(disk->zones_cond);
	if (!disk->zone_wplugs_hash || !zones_cond) {
		rcu_read_unlock();
		return blkdev_report_zone_fallback(bdev, sector, zone);
	}
	zone->cond = zones_cond[disk_zone_no(disk, sector)];
	rcu_read_unlock();

	zone->start = sector;
	zone->len = zone_sectors;

	/*
	 * If this is a conventional zone, we do not have a zone write plug and
	 * can report the zone immediately.
	 */
	if (zone->cond == BLK_ZONE_COND_NOT_WP) {
		zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
		zone->capacity = zone_sectors;
		zone->wp = ULLONG_MAX;
		return 0;
	}

	/*
	 * This is a sequential write required zone. If the zone is read-only
	 * or offline, only set the zone write pointer to an invalid value and
	 * report the zone.
	 */
	zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
	if (disk_zone_is_last(disk, zone))
		zone->capacity = disk->last_zone_capacity;
	else
		zone->capacity = disk->zone_capacity;

	if (zone->cond == BLK_ZONE_COND_READONLY ||
	    zone->cond == BLK_ZONE_COND_OFFLINE) {
		zone->wp = ULLONG_MAX;
		return 0;
	}

	/*
	 * If the zone does not have a zone write plug, it is either full or
	 * empty, as we otherwise would have a zone write plug for it. In this
	 * case, set the write pointer accordingly and report the zone.
	 * Otherwise, if we have a zone write plug, use it.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (!zwplug) {
		if (zone->cond == BLK_ZONE_COND_FULL)
			zone->wp = ULLONG_MAX;
		else
			zone->wp = sector;
		return 0;
	}

	spin_lock_irqsave(&zwplug->lock, flags);
	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
		return blkdev_report_zone_fallback(bdev, sector, zone);
	}
	zone->cond = zwplug->cond;
	zone->wp = sector + zwplug->wp_offset;
	spin_unlock_irqrestore(&zwplug->lock, flags);

	disk_put_zone_wplug(zwplug);

	return 0;
}
EXPORT_SYMBOL_GPL(blkdev_get_zone_info);

/**
 * blkdev_report_zones_cached - Get cached zones information
 * @bdev: Target block device
 * @sector: Sector from which to report zones
 * @nr_zones: Maximum number of zones to report
 * @cb: Callback function called for each reported zone
 * @data: Private data for the callback function
 *
 * Description:
 *    Similar to blkdev_report_zones() but instead of calling into the low
 *    level device driver to get the zone report from the device, use
 *    blkdev_get_zone_info() to generate the report from the disk zone write
 *    plugs and zones condition array. Since calling this function without a
 *    callback does not make sense, @cb must be specified.
 */
int blkdev_report_zones_cached(struct block_device *bdev, sector_t sector,
			       unsigned int nr_zones, report_zones_cb cb,
			       void *data)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t capacity = get_capacity(disk);
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	unsigned int idx = 0;
	struct blk_zone zone;
	int ret;

	if (!cb || !bdev_is_zoned(bdev) ||
	    WARN_ON_ONCE(!disk->fops->report_zones))
		return -EOPNOTSUPP;

	if (!nr_zones || sector >= capacity)
		return 0;

	if (!blkdev_has_cached_report_zones(bdev)) {
		struct blk_report_zones_args args = {
			.cb = cb,
			.data = data,
			.report_active = true,
		};

		return blkdev_do_report_zones(bdev, sector, nr_zones, &args);
	}

	for (sector = bdev_zone_start(bdev, sector);
	     sector < capacity && idx < nr_zones;
	     sector += zone_sectors, idx++) {
		ret = blkdev_get_zone_info(bdev, sector, &zone);
		if (ret)
			return ret;

		ret = cb(&zone, idx, data);
		if (ret)
			return ret;
	}

	return idx;
}
EXPORT_SYMBOL_GPL(blkdev_report_zones_cached);

static void blk_zone_reset_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;

	/*
	 * If we have a zone write plug, set its write pointer offset to 0.
	 * This will abort all BIOs plugged for the target zone. It is fine as
	 * resetting zones while writes are still in-flight will result in the
	 * writes failing anyway.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		unsigned long flags;

		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
	} else {
		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
	}
}

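/*
 * On successful completion of a zone reset all operation, set the write
 * pointer offset of all zone write plugs to 0 and mark all zones of the disk
 * as empty in the cached zone conditions array.
 */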
static void blk_zone_reset_all_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t capacity = get_capacity(disk);
	struct blk_zone_wplug *zwplug;
	unsigned long flags;
	sector_t sector;
	unsigned int i;

	if (atomic_read(&disk->nr_zone_wplugs)) {
		/* Update the condition of all zone write plugs. */
		rcu_read_lock();
		for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
			hlist_for_each_entry_rcu(zwplug,
						 &disk->zone_wplugs_hash[i],
						 node) {
				spin_lock_irqsave(&zwplug->lock, flags);
				disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
				spin_unlock_irqrestore(&zwplug->lock, flags);
			}
		}
		rcu_read_unlock();
	}

	/* Update the cached zone conditions. */
	for (sector = 0; sector < capacity;
	     sector += bdev_zone_sectors(bio->bi_bdev))
		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_EMPTY);
	clear_bit(GD_ZONE_APPEND_USED, &disk->state);
}

static void blk_zone_finish_bio_endio(struct bio *bio)
{
	struct block_device *bdev = bio->bi_bdev;
	struct gendisk *disk = bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;

	/*
	 * If we have a zone write plug, set its write pointer offset to the
	 * zone size. This will abort all BIOs plugged for the target zone. It
	 * is fine as finishing zones while writes are still in-flight will
	 * result in the writes failing anyway.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		unsigned long flags;

		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_set_wp_offset(disk, zwplug,
					      bdev_zone_sectors(bdev));
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
	} else {
		disk_zone_set_cond(disk, sector, BLK_ZONE_COND_FULL);
	}
}

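/*
 * Update the zone write plug and cached condition of the target zone(s) on
 * successful completion of a zone management BIO.
 */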
void blk_zone_mgmt_bio_endio(struct bio *bio)
{
	/* If the BIO failed, we have nothing to do. */
	if (bio->bi_status != BLK_STS_OK)
		return;

	switch (bio_op(bio)) {
	case REQ_OP_ZONE_RESET:
		blk_zone_reset_bio_endio(bio);
		return;
	case REQ_OP_ZONE_RESET_ALL:
		blk_zone_reset_all_bio_endio(bio);
		return;
	case REQ_OP_ZONE_FINISH:
		blk_zone_finish_bio_endio(bio);
		return;
	default:
		return;
	}
}

static void disk_zone_wplug_schedule_work(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug)
{
	lockdep_assert_held(&zwplug->lock);

	/*
	 * Schedule the submission of the next plugged BIO. Taking a reference
	 * to the zone write plug is required as the bio_work belongs to the
	 * plug, and thus we must ensure that the write plug does not go away
	 * while the work is being scheduled but has not run yet.
	 * blk_zone_wplug_bio_work() will release the reference we take here,
	 * and we also drop this reference if the work is already scheduled.
	 */
	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
	WARN_ON_ONCE(blk_queue_zoned_qd1_writes(disk->queue));
	refcount_inc(&zwplug->ref);
	if (!queue_work(disk->zone_wplugs_wq, &zwplug->bio_work))
		disk_put_zone_wplug(zwplug);
}

static inline void disk_zone_wplug_add_bio(struct gendisk *disk,
					   struct blk_zone_wplug *zwplug,
					   struct bio *bio,
					   unsigned int nr_segs)
{
	/*
	 * Grab an extra reference on the BIO request queue usage counter.
	 * This reference will be reused to submit a request for the BIO for
	 * blk-mq devices and dropped when the BIO is failed, or after it is
	 * issued in the case of BIO-based devices.
	 */
	percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);

	/*
	 * The BIO is being plugged and thus will have to wait for the on-going
	 * write and for all other writes already plugged. So polling makes
	 * no sense.
	 */
	bio_clear_polled(bio);

	/*
	 * Reuse the poll cookie field to store the number of segments when
	 * split to the hardware limits.
	 */
	bio->__bi_nr_segments = nr_segs;

	/*
	 * We always receive BIOs after they are split and ready to be issued.
	 * The block layer passes the parts of a split BIO in order, and the
	 * user must also issue writes sequentially. So simply add the new BIO
	 * at the tail of the list to preserve the sequential write order.
	 */
	bio_list_add(&zwplug->bio_list, bio);
	trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no,
				      bio->bi_iter.bi_sector,
				      bio_sectors(bio));

	/*
	 * If we are using the disk zone write plugs worker instead of the per
	 * zone write plug BIO work, add the zone write plug to the work list
	 * if it is not already there. Make sure to also get an extra reference
	 * on the zone write plug so that it does not go away until it is
	 * removed from the work list.
	 */
	if (blk_queue_zoned_qd1_writes(disk->queue)) {
		spin_lock(&disk->zone_wplugs_list_lock);
		if (list_empty(&zwplug->entry)) {
			list_add_tail(&zwplug->entry, &disk->zone_wplugs_list);
			refcount_inc(&zwplug->ref);
		}
		spin_unlock(&disk->zone_wplugs_list_lock);
	}
}

/*
 * Called from bio_attempt_back_merge() when a BIO was merged with a request.
 */
void blk_zone_write_plug_bio_merged(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/*
	 * If the BIO was already plugged, then we were called through
	 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
	 * For this case, we already hold a reference on the zone write plug
	 * for the BIO and blk_zone_write_plug_init_request() will handle the
	 * zone write pointer offset update.
	 */
	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
		return;

	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * Get a reference on the zone write plug of the target zone and
	 * advance the zone write pointer offset. Given that this is a merge,
	 * we already have at least one request and one BIO referencing the
	 * zone write plug. So this should not fail.
	 */
	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);
	zwplug->wp_offset += bio_sectors(bio);
	disk_zone_wplug_update_cond(disk, zwplug);
	spin_unlock_irqrestore(&zwplug->lock, flags);
}

/*
 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
 * already went through zone write plugging (either a new BIO or one that was
 * unplugged).
 */
void blk_zone_write_plug_init_request(struct request *req)
{
	sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
	struct request_queue *q = req->q;
	struct gendisk *disk = q->disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, blk_rq_pos(req));
	unsigned long flags;
	struct bio *bio;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/*
	 * Indicate that completion of this request needs to be handled with
	 * blk_zone_write_plug_finish_request(), which will drop the reference
	 * on the zone write plug we took above on entry to this function.
	 */
	req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;

	if (blk_queue_nomerges(q))
		return;

	/*
	 * Walk through the list of plugged BIOs to check if they can be merged
	 * into the back of the request.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);
	while (!disk_zone_wplug_is_full(disk, zwplug)) {
		bio = bio_list_peek(&zwplug->bio_list);
		if (!bio)
			break;

		if (bio->bi_iter.bi_sector != req_back_sector ||
		    !blk_rq_merge_ok(req, bio))
			break;

		WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
			     !bio->__bi_nr_segments);

		bio_list_pop(&zwplug->bio_list);
		if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
		    BIO_MERGE_OK) {
			bio_list_add_head(&zwplug->bio_list, bio);
			break;
		}

		/* Drop the reference taken by disk_zone_wplug_add_bio(). */
		blk_queue_exit(q);
		zwplug->wp_offset += bio_sectors(bio);
		disk_zone_wplug_update_cond(disk, zwplug);

		req_back_sector += bio_sectors(bio);
	}
	spin_unlock_irqrestore(&zwplug->lock, flags);
}

/*
 * Check and prepare a BIO for submission by incrementing the write pointer
 * offset of its zone write plug and changing zone append operations into
 * regular writes when zone append emulation is needed.
 */
static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
				       struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;

	lockdep_assert_held(&zwplug->lock);

	/*
	 * If we lost track of the zone write pointer due to a write error,
	 * the user must either execute a report zones, reset the zone or
	 * finish the zone to recover a reliable write pointer position. Fail
	 * BIOs if the user did not do that as we cannot handle emulated zone
	 * append otherwise.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_NEED_WP_UPDATE)
		return false;

	/*
	 * Check that the user is not attempting to write to a full zone.
	 * We know such BIO will fail, and that would potentially overflow our
	 * write pointer offset beyond the end of the zone.
	 */
	if (disk_zone_wplug_is_full(disk, zwplug))
		return false;

	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		/*
		 * Use a regular write starting at the current write pointer.
		 * Similarly to native zone append operations, do not allow
		 * merging.
		 */
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
		bio->bi_iter.bi_sector += zwplug->wp_offset;

		/*
		 * Remember that this BIO is in fact a zone append operation
		 * so that we can restore its operation code on completion.
		 */
		bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
	} else {
		/*
		 * Check for non-sequential writes early as we know that BIOs
		 * with a start sector not aligned to the zone write pointer
		 * will fail.
		 */
		if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
			return false;
	}

	/* Advance the zone write pointer offset. */
	zwplug->wp_offset += bio_sectors(bio);
	disk_zone_wplug_update_cond(disk, zwplug);

	return true;
}

static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;
	gfp_t gfp_mask = GFP_NOIO;
	unsigned long flags;

	/*
	 * BIOs must be fully contained within a zone so that we use the
	 * correct zone write plug for the entire BIO. For blk-mq devices, the
	 * block layer should already have done any splitting required to
	 * ensure this, and this BIO should thus not straddle zone boundaries.
	 * For BIO-based devices, it is the responsibility of the driver to
	 * split the bio before submitting it.
	 */
	if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
		bio_io_error(bio);
		return true;
	}

	/* Conventional zones do not need write plugging. */
	if (!bdev_zone_is_seq(bio->bi_bdev, sector)) {
		/* Zone append to conventional zones is not allowed. */
		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
			bio_io_error(bio);
			return true;
		}
		return false;
	}

	if (bio->bi_opf & REQ_NOWAIT)
		gfp_mask = GFP_NOWAIT;

	zwplug = disk_get_or_alloc_zone_wplug(disk, sector, gfp_mask);
	if (!zwplug) {
		if (bio->bi_opf & REQ_NOWAIT)
			bio_wouldblock_error(bio);
		else
			bio_io_error(bio);
		return true;
	}

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * Check if we got a zone write plug marked as dead. If yes, then the
	 * user is likely issuing writes to a full zone, or without
	 * synchronizing with zone reset or zone finish operations. In such
	 * case, fail the BIO to signal this invalid usage.
	 */
	if (disk_check_zone_wplug_dead(zwplug)) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		disk_put_zone_wplug(zwplug);
		bio_io_error(bio);
		return true;
	}

	/* Indicate that this BIO is being handled using zone write plugging. */
	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * Add REQ_NOWAIT BIOs to the plug list to ensure that we will not see
	 * a BLK_STS_AGAIN failure if we let the caller submit the BIO.
	 */
	if (bio->bi_opf & REQ_NOWAIT) {
		bio->bi_opf &= ~REQ_NOWAIT;
		goto queue_bio;
	}

	/*
	 * For rotational devices, we will use the gendisk zone write plugs
	 * work instead of the per zone write plug BIO work, so queue the BIO.
	 */
	if (blk_queue_zoned_qd1_writes(disk->queue))
		goto queue_bio;

	/* If the zone is already plugged, add the BIO to the BIO plug list. */
	if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
		goto queue_bio;

	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		bio_io_error(bio);
		return true;
	}

	/* Otherwise, plug and let the caller submit the BIO. */
	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return false;

queue_bio:
	disk_zone_wplug_add_bio(disk, zwplug, bio, nr_segs);

	if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) {
		zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
		if (blk_queue_zoned_qd1_writes(disk->queue))
			wake_up_process(disk->zone_wplugs_worker);
		else
			disk_zone_wplug_schedule_work(disk, zwplug);
	}

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return true;
}

static void blk_zone_wplug_handle_native_zone_append(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	if (!test_bit(GD_ZONE_APPEND_USED, &disk->state))
		set_bit(GD_ZONE_APPEND_USED, &disk->state);

	/*
	 * We have native support for zone append operations, so we are not
	 * going to handle @bio through plugging. However, we may already have
	 * a zone write plug for the target zone if that zone was previously
	 * partially written using regular writes. In such case, we risk
	 * leaving the plug in the disk hash table if the zone is fully
	 * written using zone append operations. Avoid this by removing the
	 * zone write plug.
	 */
	zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	if (likely(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * We are about to remove the zone write plug. But if the user
	 * (mistakenly) has issued regular writes together with native zone
	 * append, we must abort these writes as otherwise the plugged BIOs
	 * would not be executed by the plug BIO work as disk_get_zone_wplug()
	 * will return NULL after the plug is removed. Aborting the plugged
	 * write BIOs is consistent with the fact that these writes will most
	 * likely fail anyway as there are no ordering guarantees between zone
	 * append operations and regular write operations.
	 */
	if (!bio_list_empty(&zwplug->bio_list)) {
		pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n",
				    disk->disk_name, zwplug->zone_no);
		disk_zone_wplug_abort(zwplug);
	}
	disk_mark_zone_wplug_dead(zwplug);
	spin_unlock_irqrestore(&zwplug->lock, flags);

	disk_put_zone_wplug(zwplug);
}

static bool blk_zone_wplug_handle_zone_mgmt(struct bio *bio)
{
	if (bio_op(bio) != REQ_OP_ZONE_RESET_ALL &&
	    !bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
		/*
		 * Zone reset and zone finish operations do not apply to
		 * conventional zones.
		 */
		bio_io_error(bio);
		return true;
	}

	/*
	 * No-wait zone management BIOs do not make much sense as the callers
	 * issue these as blocking operations in most cases. To avoid issues
	 * with the BIO execution potentially failing with BLK_STS_AGAIN, warn
	 * about REQ_NOWAIT being set and ignore that flag.
	 */
	if (WARN_ON_ONCE(bio->bi_opf & REQ_NOWAIT))
		bio->bi_opf &= ~REQ_NOWAIT;

	return false;
}

/**
 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
 * @bio: The BIO being submitted
 * @nr_segs: The number of physical segments of @bio
 *
 * Handle write, write zeroes and zone append operations requiring emulation
 * using zone write plugging.
 *
 * Return true whenever @bio execution needs to be delayed through the zone
 * write plug. Otherwise, return false to let the submission path process
 * @bio normally.
 */
bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
{
	struct block_device *bdev = bio->bi_bdev;

	if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash))
		return false;

	/*
	 * Regular writes and write zeroes need to be handled through the
	 * target zone write plug. This includes writes with REQ_FUA |
	 * REQ_PREFLUSH, which may need to go through the flush machinery
	 * depending on the target device capabilities. Plugging such writes
	 * is fine as the flush machinery operates at the request level, below
	 * the plug, and completion of the flush sequence will go through the
	 * regular BIO completion, which will handle zone write plugging.
	 * Zone append operations for devices that requested emulation must
	 * also be plugged so that these BIOs can be changed into regular
	 * write BIOs.
	 * Zone reset, reset all and finish commands need special treatment
	 * to correctly track the write pointer offset of zones. These commands
	 * are not plugged as we do not need serialization with write
	 * operations. It is the responsibility of the user to not issue reset
	 * and finish commands when write operations are in flight.
	 */
	switch (bio_op(bio)) {
	case REQ_OP_ZONE_APPEND:
		if (!bdev_emulates_zone_append(bdev)) {
			blk_zone_wplug_handle_native_zone_append(bio);
			return false;
		}
		fallthrough;
	case REQ_OP_WRITE:
	case REQ_OP_WRITE_ZEROES:
		return blk_zone_wplug_handle_write(bio, nr_segs);
	case REQ_OP_ZONE_RESET:
	case REQ_OP_ZONE_FINISH:
	case REQ_OP_ZONE_RESET_ALL:
		return blk_zone_wplug_handle_zone_mgmt(bio);
	default:
		return false;
	}

	return false;
}
EXPORT_SYMBOL_GPL(blk_zone_plug_bio);

1659 static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
1660 struct blk_zone_wplug *zwplug)
1661 {
1662 unsigned long flags;
1663
1664 spin_lock_irqsave(&zwplug->lock, flags);
1665
1666 /*
1667 * For devices using queue depth 1 zone writes (e.g. rotational
1668 * devices), signal the BIO completion to the zone write plug worker.
1669 * Otherwise, schedule submission of the next plugged BIO if we have one.
1670 */
1671 if (bio_list_empty(&zwplug->bio_list))
1672 zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
1673
1674 if (blk_queue_zoned_qd1_writes(disk->queue))
1675 complete(&disk->zone_wplugs_worker_bio_done);
1676 else if (!bio_list_empty(&zwplug->bio_list))
1677 disk_zone_wplug_schedule_work(disk, zwplug);
1678
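	/*
	 * If the zone was reset (write pointer back at the zone start) or has
	 * become full, the zone write plug is no longer needed: mark it dead
	 * so that it can be removed from the hash table and freed.
	 */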
1679 if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
1680 disk_mark_zone_wplug_dead(zwplug);
1681
1682 spin_unlock_irqrestore(&zwplug->lock, flags);
1683 }
1684
1685 void blk_zone_append_update_request_bio(struct request *rq, struct bio *bio)
1686 {
1687 /*
1688 * For zone append requests, the request sector indicates the location
1689 * at which the BIO data was written. Return this value to the BIO
1690 * issuer through the BIO iter sector.
1691 * For plugged zone writes, which include emulated zone append, the
1692 * request sector lies in the zone of the BIO, so this also allows
1693 * blk_zone_write_plug_bio_endio() to look up the zone write plug.
1694 */
1695 bio->bi_iter.bi_sector = rq->__sector;
1696 trace_blk_zone_append_update_request_bio(rq);
1697 }
1698
1699 void blk_zone_write_plug_bio_endio(struct bio *bio)
1700 {
1701 struct gendisk *disk = bio->bi_bdev->bd_disk;
1702 struct blk_zone_wplug *zwplug =
1703 disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
1704 unsigned long flags;
1705
1706 if (WARN_ON_ONCE(!zwplug))
1707 return;
1708
1709 /* Make sure we do not see this BIO again by clearing the plug flag. */
1710 bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
1711
1712 /*
1713 * If this is a regular write emulating a zone append operation, restore
1714 * the original operation code so the issuer sees a zone append completion.
1715 */
1716 if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
1717 bio->bi_opf &= ~REQ_OP_MASK;
1718 bio->bi_opf |= REQ_OP_ZONE_APPEND;
1719 bio_clear_flag(bio, BIO_EMULATES_ZONE_APPEND);
1720 }
1721
1722 /*
1723 * If the BIO failed, abort all plugged BIOs and mark the plug as
1724 * needing a write pointer update.
1725 */
1726 if (bio->bi_status != BLK_STS_OK) {
1727 spin_lock_irqsave(&zwplug->lock, flags);
1728 disk_zone_wplug_abort(zwplug);
1729 zwplug->flags |= BLK_ZONE_WPLUG_NEED_WP_UPDATE;
1730 spin_unlock_irqrestore(&zwplug->lock, flags);
1731 }
1732
1733 /* Drop the reference we took when the BIO was issued. */
1734 disk_put_zone_wplug(zwplug);
1735
1736 /*
1737 * For BIO-based devices, blk_zone_write_plug_finish_request()
1738 * is not called. So we need to schedule execution of the next
1739 * plugged BIO here.
1740 */
1741 if (bdev_test_flag(bio->bi_bdev, BD_HAS_SUBMIT_BIO))
1742 disk_zone_wplug_unplug_bio(disk, zwplug);
1743
1744 /* Drop the reference we took when entering this function. */
1745 disk_put_zone_wplug(zwplug);
1746 }
1747
1748 void blk_zone_write_plug_finish_request(struct request *req)
1749 {
1750 struct gendisk *disk = req->q->disk;
1751 struct blk_zone_wplug *zwplug;
1752
1753 zwplug = disk_get_zone_wplug(disk, req->__sector);
1754 if (WARN_ON_ONCE(!zwplug))
1755 return;
1756
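	/* The request is no longer plugged: clear the flag before unplugging. */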
1757 req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;
1758
1759 /*
1760 * Drop the reference we took when the request was initialized in
1761 * blk_zone_write_plug_init_request().
1762 */
1763 disk_put_zone_wplug(zwplug);
1764
1765 disk_zone_wplug_unplug_bio(disk, zwplug);
1766
1767 /* Drop the reference we took when entering this function. */
1768 disk_put_zone_wplug(zwplug);
1769 }
1770
1771 static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
1772 struct blk_zone_wplug *zwplug)
1773 {
1774 struct block_device *bdev;
1775 unsigned long flags;
1776 struct bio *bio;
1777 bool prepared;
1778
1779 /*
1780 * Submit the next plugged BIO. If we do not have any, clear
1781 * the plugged flag.
1782 */
1783 again:
1784 spin_lock_irqsave(&zwplug->lock, flags);
1785 bio = bio_list_pop(&zwplug->bio_list);
1786 if (!bio) {
1787 zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
1788 spin_unlock_irqrestore(&zwplug->lock, flags);
1789 return false;
1790 }
1791
1792 trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no,
1793 bio->bi_iter.bi_sector, bio_sectors(bio));
1794
1795 prepared = blk_zone_wplug_prepare_bio(zwplug, bio);
1796 spin_unlock_irqrestore(&zwplug->lock, flags);
1797
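	/*
	 * If the BIO could not be prepared for submission, fail it and retry
	 * with the next plugged BIO.
	 */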
1798 if (!prepared) {
1799 blk_zone_wplug_bio_io_error(zwplug, bio);
1800 goto again;
1801 }
1802
1803 /* For QD=1 zone writes, rearm the worker completion before submitting. */
1804 if (blk_queue_zoned_qd1_writes(disk->queue))
1805 reinit_completion(&disk->zone_wplugs_worker_bio_done);
1806 /*
1807 * blk-mq devices reuse the extra reference on the request queue usage
1808 * counter taken when the BIO was plugged, but the submission path for
1809 * BIO-based devices does not. So drop this extra reference here.
1810 */
1811 bdev = bio->bi_bdev;
1812 if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
1813 bdev->bd_disk->fops->submit_bio(bio);
1814 blk_queue_exit(bdev->bd_disk->queue);
1815 } else {
1816 blk_mq_submit_bio(bio);
1817 }
1818
1819 return true;
1820 }
1821
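/*
 * Get the next zone write plug from the disk active plug list, removing it
 * from the list. Return NULL if the list is empty.
 */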
1822 static struct blk_zone_wplug *disk_get_zone_wplugs_work(struct gendisk *disk)
1823 {
1824 struct blk_zone_wplug *zwplug;
1825
1826 spin_lock_irq(&disk->zone_wplugs_list_lock);
1827 zwplug = list_first_entry_or_null(&disk->zone_wplugs_list,
1828 struct blk_zone_wplug, entry);
1829 if (zwplug)
1830 list_del_init(&zwplug->entry);
1831 spin_unlock_irq(&disk->zone_wplugs_list_lock);
1832
1833 return zwplug;
1834 }
1835
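/*
 * Zone write plug worker thread: submit the plugged BIOs of the zone write
 * plugs queued on the disk active list, one BIO at a time.
 */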
1836 static int disk_zone_wplugs_worker(void *data)
1837 {
1838 struct gendisk *disk = data;
1839 struct blk_zone_wplug *zwplug;
1840 unsigned int noio_flag;
1841
1842 noio_flag = memalloc_noio_save();
1843 set_user_nice(current, MIN_NICE);
1844 set_freezable();
1845
1846 for (;;) {
1847 set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
1848
1849 zwplug = disk_get_zone_wplugs_work(disk);
1850 if (zwplug) {
1851 /*
1852 * Submit the plugged BIOs of this zone write plug one at a
1853 * time, waiting for each to complete, then drop the reference
1854 * taken when the plug was added to the active list.
1855 */
1856 set_current_state(TASK_RUNNING);
1857 while (disk_zone_wplug_submit_bio(disk, zwplug))
1858 blk_wait_io(&disk->zone_wplugs_worker_bio_done);
1859 disk_put_zone_wplug(zwplug);
1860 continue;
1861 }
1862
1863 /*
1864 * Only sleep if nothing set our state back to running. Otherwise,
1865 * check for zone write plug work again, as a newly submitted BIO
1866 * may have added a zone write plug to the active list.
1867 */
1868 if (get_current_state() == TASK_RUNNING) {
1869 try_to_freeze();
1870 } else {
1871 if (kthread_should_stop()) {
1872 set_current_state(TASK_RUNNING);
1873 break;
1874 }
1875 schedule();
1876 }
1877 }
1878
1879 WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));
1880 memalloc_noio_restore(noio_flag);
1881
1882 return 0;
1883 }
1884
1885 void disk_init_zone_resources(struct gendisk *disk)
1886 {
1887 spin_lock_init(&disk->zone_wplugs_hash_lock);
1888 spin_lock_init(&disk->zone_wplugs_list_lock);
1889 INIT_LIST_HEAD(&disk->zone_wplugs_list);
1890 init_completion(&disk->zone_wplugs_worker_bio_done);
1891 }
1892
1893 /*
1894 * For the size of a disk zone write plug hash table, use the size of the
1895 * zone write plug mempool, which is the maximum of the disk open zone and
1896 * active zone limits, but do not exceed 512 hlist head entries (9 bits,
1897 * 4KB on 64-bit). For a disk with no limits, the mempool size defaults to 128.
1898 */
1899 #define BLK_ZONE_WPLUG_MAX_HASH_BITS 9
1900 #define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE 128
1901
1902 static int disk_alloc_zone_resources(struct gendisk *disk,
1903 unsigned int pool_size)
1904 {
1905 unsigned int i;
1906 int ret = -ENOMEM;
1907
1908 atomic_set(&disk->nr_zone_wplugs, 0);
1909 disk->zone_wplugs_hash_bits =
1910 min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);
1911
1912 disk->zone_wplugs_hash =
1913 kcalloc(disk_zone_wplugs_hash_size(disk),
1914 sizeof(struct hlist_head), GFP_KERNEL);
1915 if (!disk->zone_wplugs_hash)
1916 return -ENOMEM;
1917
1918 for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
1919 INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);
1920
1921 disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
1922 sizeof(struct blk_zone_wplug));
1923 if (!disk->zone_wplugs_pool)
1924 goto free_hash;
1925
1926 disk->zone_wplugs_wq =
1927 alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
1928 pool_size, disk->disk_name);
1929 if (!disk->zone_wplugs_wq)
1930 goto destroy_pool;
1931
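	/*
	 * The worker kthread submits plugged BIOs one at a time for devices
	 * requiring queue depth 1 zone writes.
	 */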
1932 disk->zone_wplugs_worker =
1933 kthread_create(disk_zone_wplugs_worker, disk,
1934 "%s_zwplugs_worker", disk->disk_name);
1935 if (IS_ERR(disk->zone_wplugs_worker)) {
1936 ret = PTR_ERR(disk->zone_wplugs_worker);
1937 disk->zone_wplugs_worker = NULL;
1938 goto destroy_wq;
1939 }
1940 wake_up_process(disk->zone_wplugs_worker);
1941
1942 return 0;
1943
1944 destroy_wq:
1945 destroy_workqueue(disk->zone_wplugs_wq);
1946 disk->zone_wplugs_wq = NULL;
1947 destroy_pool:
1948 mempool_destroy(disk->zone_wplugs_pool);
1949 disk->zone_wplugs_pool = NULL;
1950 free_hash:
1951 kfree(disk->zone_wplugs_hash);
1952 disk->zone_wplugs_hash = NULL;
1953 disk->zone_wplugs_hash_bits = 0;
1954 return ret;
1955 }
1956
1957 static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
1958 {
1959 struct blk_zone_wplug *zwplug;
1960 unsigned int i;
1961
1962 if (!disk->zone_wplugs_hash)
1963 return;
1964
1965 /* Mark all remaining zone write plugs dead so that they are freed. */
1966 for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
1967 while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
1968 zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
1969 struct blk_zone_wplug, node);
1970 spin_lock_irq(&zwplug->lock);
1971 disk_mark_zone_wplug_dead(zwplug);
1972 spin_unlock_irq(&zwplug->lock);
1973 }
1974 }
1975
1976 WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs));
1977 kfree(disk->zone_wplugs_hash);
1978 disk->zone_wplugs_hash = NULL;
1979 disk->zone_wplugs_hash_bits = 0;
1980
1981 /*
1982 * Wait for the zone write plugs to be RCU-freed before destroying the
1983 * mempool.
1984 */
1985 rcu_barrier();
1986 mempool_destroy(disk->zone_wplugs_pool);
1987 disk->zone_wplugs_pool = NULL;
1988 }
1989
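/*
 * Install a new zone condition array for the disk and free the old array
 * after an RCU grace period has elapsed, as lockless readers may still be
 * referencing it.
 */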
1990 static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond)
1991 {
1992 unsigned long flags;
1993
1994 spin_lock_irqsave(&disk->zone_wplugs_hash_lock, flags);
1995 zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond,
1996 lockdep_is_held(&disk->zone_wplugs_hash_lock));
1997 spin_unlock_irqrestore(&disk->zone_wplugs_hash_lock, flags);
1998
1999 kfree_rcu_mightsleep(zones_cond);
2000 }
2001
2002 void disk_free_zone_resources(struct gendisk *disk)
2003 {
2004 if (disk->zone_wplugs_worker) {
2005 kthread_stop(disk->zone_wplugs_worker);
/* Clear the pointer so the worker is not stopped twice. */
disk->zone_wplugs_worker = NULL;
}
2006 WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));
2007
2008 if (disk->zone_wplugs_wq) {
2009 destroy_workqueue(disk->zone_wplugs_wq);
2010 disk->zone_wplugs_wq = NULL;
2011 }
2012
2013 disk_destroy_zone_wplugs_hash_table(disk);
2014
2015 disk_set_zones_cond_array(disk, NULL);
2016 disk->zone_capacity = 0;
2017 disk->last_zone_capacity = 0;
2018 disk->nr_zones = 0;
2019 }
2020
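/*
 * Zone revalidation context: accumulates the cached zone condition array,
 * zone counts and capacities while zones are checked, with @sector tracking
 * the expected start sector of the next reported zone.
 */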
2021 struct blk_revalidate_zone_args {
2022 struct gendisk *disk;
2023 u8 *zones_cond;
2024 unsigned int nr_zones;
2025 unsigned int nr_conv_zones;
2026 unsigned int zone_capacity;
2027 unsigned int last_zone_capacity;
2028 sector_t sector;
2029 };
2030
2031 static int disk_revalidate_zone_resources(struct gendisk *disk,
2032 struct blk_revalidate_zone_args *args)
2033 {
2034 struct queue_limits *lim = &disk->queue->limits;
2035 unsigned int pool_size;
2036 int ret = 0;
2037
2038 args->disk = disk;
2039 args->nr_zones =
2040 DIV_ROUND_UP_ULL(get_capacity(disk), lim->chunk_sectors);
2041
2042 /* Cached zone conditions: 1 byte per zone */
2043 args->zones_cond = kzalloc(args->nr_zones, GFP_NOIO);
2044 if (!args->zones_cond)
2045 return -ENOMEM;
2046
2047 if (!disk_need_zone_resources(disk))
2048 return 0;
2049
2050 /*
2051 * If the device has no limit on the maximum number of open and active
2052 * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
2053 */
2054 pool_size = max(lim->max_open_zones, lim->max_active_zones);
2055 if (!pool_size)
2056 pool_size =
2057 min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones);
2058
2059 if (!disk->zone_wplugs_hash) {
2060 ret = disk_alloc_zone_resources(disk, pool_size);
2061 if (ret)
2062 kfree(args->zones_cond);
2063 }
2064
2065 return ret;
2066 }
2067
2068 /*
2069 * Update the disk zone resources information and device queue limits.
2070 * The disk queue is frozen while this function commits the updates.
2071 */
2072 static int disk_update_zone_resources(struct gendisk *disk,
2073 struct blk_revalidate_zone_args *args)
2074 {
2075 struct request_queue *q = disk->queue;
2076 unsigned int nr_seq_zones;
2077 unsigned int pool_size, memflags;
2078 struct queue_limits lim;
2079 int ret = 0;
2080
2081 lim = queue_limits_start_update(q);
2082
2083 memflags = blk_mq_freeze_queue(q);
2084
2085 disk->nr_zones = args->nr_zones;
2086 if (args->nr_conv_zones >= disk->nr_zones) {
2087 queue_limits_cancel_update(q);
2088 pr_warn("%s: Invalid number of conventional zones %u / %u\n",
2089 disk->disk_name, args->nr_conv_zones, disk->nr_zones);
2090 ret = -ENODEV;
2091 goto unfreeze;
2092 }
2093
2094 disk->zone_capacity = args->zone_capacity;
2095 disk->last_zone_capacity = args->last_zone_capacity;
2096 disk_set_zones_cond_array(disk, args->zones_cond);
2097 args->zones_cond = NULL;
2098
2099 /*
2100 * Some devices can advertise zone resource limits that are larger than
2101 * the number of sequential zones of the zoned block device, e.g. a
2102 * small ZNS namespace. In that case, assume that the zoned device has
2103 * no zone resource limits.
2104 */
2105 nr_seq_zones = disk->nr_zones - args->nr_conv_zones;
2106 if (lim.max_open_zones >= nr_seq_zones)
2107 lim.max_open_zones = 0;
2108 if (lim.max_active_zones >= nr_seq_zones)
2109 lim.max_active_zones = 0;
2110
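	/*
	 * BIO-based devices that do not emulate zone append have no zone
	 * write plug mempool, so there is nothing more to resize here.
	 */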
2111 if (!disk->zone_wplugs_pool)
2112 goto commit;
2113
2114 /*
2115 * If the device has no limit on the maximum number of open and active
2116 * zones, set its max open zone limit to the mempool size to indicate
2117 * to the user that there is a potential performance impact due to
2118 * dynamic zone write plug allocation when simultaneously writing to
2119 * more zones than the size of the mempool.
2120 */
2121 pool_size = max(lim.max_open_zones, lim.max_active_zones);
2122 if (!pool_size)
2123 pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);
2124
2125 mempool_resize(disk->zone_wplugs_pool, pool_size);
2126
2127 if (!lim.max_open_zones && !lim.max_active_zones) {
2128 if (pool_size < nr_seq_zones)
2129 lim.max_open_zones = pool_size;
2130 else
2131 lim.max_open_zones = 0;
2132 }
2133
2134 commit:
2135 ret = queue_limits_commit_update(q, &lim);
2136
2137 unfreeze:
2138 if (ret)
2139 disk_free_zone_resources(disk);
2140
2141 blk_mq_unfreeze_queue(q, memflags);
2142
2143 return ret;
2144 }
2145
2146 static int blk_revalidate_zone_cond(struct blk_zone *zone, unsigned int idx,
2147 struct blk_revalidate_zone_args *args)
2148 {
2149 enum blk_zone_cond cond = zone->cond;
2150
2151 /* Check that the zone condition is consistent with the zone type. */
2152 switch (cond) {
2153 case BLK_ZONE_COND_NOT_WP:
2154 if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
2155 goto invalid_condition;
2156 break;
2157 case BLK_ZONE_COND_IMP_OPEN:
2158 case BLK_ZONE_COND_EXP_OPEN:
2159 case BLK_ZONE_COND_CLOSED:
2160 case BLK_ZONE_COND_EMPTY:
2161 case BLK_ZONE_COND_FULL:
2162 case BLK_ZONE_COND_OFFLINE:
2163 case BLK_ZONE_COND_READONLY:
2164 if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
2165 goto invalid_condition;
2166 break;
2167 default:
2168 pr_warn("%s: Invalid zone condition 0x%x\n",
2169 args->disk->disk_name, cond);
2170 return -ENODEV;
2171 }
2172
2173 blk_zone_set_cond(args->zones_cond, idx, cond);
2174
2175 return 0;
2176
2177 invalid_condition:
2178 pr_warn("%s: Invalid zone condition 0x%x for type 0x%x\n",
2179 args->disk->disk_name, cond, zone->type);
2180
2181 return -ENODEV;
2182 }
2183
2184 static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
2185 struct blk_revalidate_zone_args *args)
2186 {
2187 struct gendisk *disk = args->disk;
2188
2189 if (zone->capacity != zone->len) {
2190 pr_warn("%s: Invalid conventional zone capacity\n",
2191 disk->disk_name);
2192 return -ENODEV;
2193 }
2194
2195 if (disk_zone_is_last(disk, zone))
2196 args->last_zone_capacity = zone->capacity;
2197
2198 args->nr_conv_zones++;
2199
2200 return 0;
2201 }
2202
2203 static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
2204 struct blk_revalidate_zone_args *args)
2205 {
2206 struct gendisk *disk = args->disk;
2207 struct blk_zone_wplug *zwplug;
2208 unsigned int wp_offset;
2209
2210 /*
2211 * Remember the capacity of the first sequential zone and check
2212 * if it is constant for all zones, ignoring the last zone as it can be
2213 * smaller.
2214 */
2215 if (!args->zone_capacity)
2216 args->zone_capacity = zone->capacity;
2217 if (disk_zone_is_last(disk, zone)) {
2218 args->last_zone_capacity = zone->capacity;
2219 } else if (zone->capacity != args->zone_capacity) {
2220 pr_warn("%s: Invalid variable zone capacity\n",
2221 disk->disk_name);
2222 return -ENODEV;
2223 }
2224
2225 /*
2226 * If the device needs zone append emulation, we need to track the
2227 * write pointer of all zones that are neither empty nor full. So make
2228 * sure we have a zone write plug for such zones if the device has a
2229 * zone write plug hash table.
2230 */
2231 if (!disk->zone_wplugs_hash)
2232 return 0;
2233
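	/*
	 * Empty zones (write pointer at the zone start) and full zones do not
	 * need tracking: only allocate a plug for partially written zones.
	 */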
2234 wp_offset = disk_zone_wplug_sync_wp_offset(disk, zone);
2235 if (!wp_offset || wp_offset >= zone->capacity)
2236 return 0;
2237
2238 zwplug = disk_get_or_alloc_zone_wplug(disk, zone->wp, GFP_NOIO);
2239 if (!zwplug)
2240 return -ENOMEM;
2241 disk_put_zone_wplug(zwplug);
2242
2243 return 0;
2244 }
2245
2246 /*
2247 * Helper function to check the validity of zones of a zoned block device.
2248 */
2249 static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
2250 void *data)
2251 {
2252 struct blk_revalidate_zone_args *args = data;
2253 struct gendisk *disk = args->disk;
2254 sector_t zone_sectors = disk->queue->limits.chunk_sectors;
2255 int ret;
2256
2257 /* Check for bad zones and holes in the zone report */
2258 if (zone->start != args->sector) {
2259 pr_warn("%s: Zone gap at sectors %llu..%llu\n",
2260 disk->disk_name, args->sector, zone->start);
2261 return -ENODEV;
2262 }
2263
2264 if (zone->start >= get_capacity(disk) || !zone->len) {
2265 pr_warn("%s: Invalid zone start %llu, length %llu\n",
2266 disk->disk_name, zone->start, zone->len);
2267 return -ENODEV;
2268 }
2269
2270 /*
2271 * All zones must have the same size, with the exception of a possibly
2272 * smaller last zone.
2273 */
2274 if (!disk_zone_is_last(disk, zone)) {
2275 if (zone->len != zone_sectors) {
2276 pr_warn("%s: Invalid zoned device with non-constant zone size\n",
2277 disk->disk_name);
2278 return -ENODEV;
2279 }
2280 } else if (zone->len > zone_sectors) {
2281 pr_warn("%s: Invalid zoned device with larger last zone size\n",
2282 disk->disk_name);
2283 return -ENODEV;
2284 }
2285
2286 if (!zone->capacity || zone->capacity > zone->len) {
2287 pr_warn("%s: Invalid zone capacity\n",
2288 disk->disk_name);
2289 return -ENODEV;
2290 }
2291
2292 /* Check zone condition */
2293 ret = blk_revalidate_zone_cond(zone, idx, args);
2294 if (ret)
2295 return ret;
2296
2297 /* Check zone type */
2298 switch (zone->type) {
2299 case BLK_ZONE_TYPE_CONVENTIONAL:
2300 ret = blk_revalidate_conv_zone(zone, idx, args);
2301 break;
2302 case BLK_ZONE_TYPE_SEQWRITE_REQ:
2303 ret = blk_revalidate_seq_zone(zone, idx, args);
2304 break;
2305 case BLK_ZONE_TYPE_SEQWRITE_PREF:
2306 default:
2307 pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
2308 disk->disk_name, (int)zone->type, zone->start);
2309 ret = -ENODEV;
2310 }
2311
2312 if (!ret)
2313 args->sector += zone->len;
2314
2315 return ret;
2316 }
2317
2318 /**
2319 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
2320 * @disk: Target disk
2321 *
2322 * Helper function for low-level device drivers to check, (re)allocate and
2323 * initialize resources used for managing zoned disks. This function should
2324 * normally be called by blk-mq based drivers when a zoned gendisk is probed
2325 * and when the zone configuration of the gendisk changes (e.g. after a format).
2326 * Before calling this function, the device driver must already have set the
2327 * device zone size (chunk_sector limit) and the max zone append limit.
2328 * BIO based drivers can also use this function as long as the device queue
2329 * can be safely frozen.
2330 */
2331 int blk_revalidate_disk_zones(struct gendisk *disk)
2332 {
2333 struct request_queue *q = disk->queue;
2334 sector_t zone_sectors = q->limits.chunk_sectors;
2335 sector_t capacity = get_capacity(disk);
2336 struct blk_revalidate_zone_args args = { };
2337 unsigned int memflags, noio_flag;
2338 struct blk_report_zones_args rep_args = {
2339 .cb = blk_revalidate_zone_cb,
2340 .data = &args,
2341 };
2342 int ret = -ENOMEM;
2343
2344 if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
2345 return -EIO;
2346
2347 if (!capacity)
2348 return -ENODEV;
2349
2350 /*
2351 * Check that the device driver indicated a valid zone size (a non-zero
2352 * power of 2 number of sectors).
2353 */
2354 if (!zone_sectors || !is_power_of_2(zone_sectors)) {
2355 pr_warn("%s: Invalid non power of two zone size (%llu)\n",
2356 disk->disk_name, zone_sectors);
2357 return -ENODEV;
2358 }
2359
2360 /*
2361 * Ensure that all memory allocations in this context are done as if
2362 * GFP_NOIO was specified.
2363 */
2364 noio_flag = memalloc_noio_save();
2365 ret = disk_revalidate_zone_resources(disk, &args);
2366 if (ret) {
2367 memalloc_noio_restore(noio_flag);
2368 return ret;
2369 }
2370
2371 ret = disk->fops->report_zones(disk, 0, UINT_MAX, &rep_args);
2372 if (!ret) {
2373 pr_warn("%s: No zones reported\n", disk->disk_name);
2374 ret = -ENODEV;
2375 }
2376 memalloc_noio_restore(noio_flag);
2377
2378 if (ret <= 0)
2379 goto free_resources;
2380
2381 /*
2382 * If zones were reported, make sure that the entire disk capacity
2383 * has been checked.
2384 */
2385 if (args.sector != capacity) {
2386 pr_warn("%s: Missing zones from sector %llu\n",
2387 disk->disk_name, args.sector);
2388 ret = -ENODEV;
2389 goto free_resources;
2390 }
2391
2392 ret = disk_update_zone_resources(disk, &args);
2393 if (ret)
2394 goto free_resources;
2395
2396 return 0;
2397
2398 free_resources:
2399 pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
2400
2401 kfree(args.zones_cond);
2402 memflags = blk_mq_freeze_queue(q);
2403 disk_free_zone_resources(disk);
2404 blk_mq_unfreeze_queue(q, memflags);
2405
2406 return ret;
2407 }
2408 EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
2409
2410 /**
2411 * blk_zone_issue_zeroout - zero-fill a block range in a zone
2412 * @bdev: blockdev to write
2413 * @sector: start sector
2414 * @nr_sects: number of sectors to write
2415 * @gfp_mask: memory allocation flags (for bio_alloc)
2416 *
2417 * Description:
2418 * Zero-fill a block range in a zone (@sector must be equal to the zone write
2419 * pointer), handling potential errors due to the (initially unknown) lack of
2420 * hardware offload (See blkdev_issue_zeroout()).
2421 */
2422 int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
2423 sector_t nr_sects, gfp_t gfp_mask)
2424 {
2425 struct gendisk *disk = bdev->bd_disk;
2426 int ret;
2427
2428 if (WARN_ON_ONCE(!bdev_is_zoned(bdev)))
2429 return -EIO;
2430
2431 ret = blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask,
2432 BLKDEV_ZERO_NOFALLBACK);
2433 if (ret != -EOPNOTSUPP)
2434 return ret;
2435
2436 /*
2437 * The failed call to blkdev_issue_zeroout() advanced the zone write
2438 * pointer. Undo this using a zone report to refresh the cached zone
2439 * write pointer to its correct current value.
2440 */
2441 ret = disk->fops->report_zones(disk, sector, 1, NULL);
2442 if (ret != 1)
2443 return ret < 0 ? ret : -EIO;
2444
2445 /*
2446 * Retry without BLKDEV_ZERO_NOFALLBACK to force the fallback to a
2447 * regular write with zero-pages.
2448 */
2449 return blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask, 0);
2450 }
2451 EXPORT_SYMBOL_GPL(blk_zone_issue_zeroout);
2452
2453 #ifdef CONFIG_BLK_DEBUG_FS
2454 static void queue_zone_wplug_show(struct blk_zone_wplug *zwplug,
2455 struct seq_file *m)
2456 {
2457 unsigned int zwp_wp_offset, zwp_flags;
2458 unsigned int zwp_zone_no, zwp_ref;
2459 unsigned int zwp_bio_list_size;
2460 enum blk_zone_cond zwp_cond;
2461 unsigned long flags;
2462
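	/* Snapshot the plug state under its lock to print a consistent view. */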
2463 spin_lock_irqsave(&zwplug->lock, flags);
2464 zwp_zone_no = zwplug->zone_no;
2465 zwp_flags = zwplug->flags;
2466 zwp_ref = refcount_read(&zwplug->ref);
2467 zwp_cond = zwplug->cond;
2468 zwp_wp_offset = zwplug->wp_offset;
2469 zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
2470 spin_unlock_irqrestore(&zwplug->lock, flags);
2471
2472 seq_printf(m,
2473 "Zone no: %u, flags: 0x%x, ref: %u, cond: %s, wp ofst: %u, pending BIO: %u\n",
2474 zwp_zone_no, zwp_flags, zwp_ref, blk_zone_cond_str(zwp_cond),
2475 zwp_wp_offset, zwp_bio_list_size);
2476 }
2477
2478 int queue_zone_wplugs_show(void *data, struct seq_file *m)
2479 {
2480 struct request_queue *q = data;
2481 struct gendisk *disk = q->disk;
2482 struct blk_zone_wplug *zwplug;
2483 unsigned int i;
2484
2485 if (!disk->zone_wplugs_hash)
2486 return 0;
2487
2488 rcu_read_lock();
2489 for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
2490 hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[i],
2491 node)
2492 queue_zone_wplug_show(zwplug, m);
2493 rcu_read_unlock();
2494
2495 return 0;
2496 }
2497
2498 #endif
2499