xref: /linux/drivers/md/dm-zone.c (revision 16e5ac127d8d18adf85fe5ba847d77b58d1ed418)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (C) 2021 Western Digital Corporation or its affiliates.
4  */
5 
6 #include <linux/blkdev.h>
7 #include <linux/mm.h>
8 #include <linux/sched/mm.h>
9 #include <linux/slab.h>
10 #include <linux/bitmap.h>
11 
12 #include "dm-core.h"
13 
14 #define DM_MSG_PREFIX "zone"
15 
16 #define DM_ZONE_INVALID_WP_OFST		UINT_MAX
17 
18 /*
19  * For internal zone reports bypassing the top BIO submission path.
20  */
21 static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t,
22 				  sector_t sector, unsigned int nr_zones,
23 				  report_zones_cb cb, void *data)
24 {
25 	struct gendisk *disk = md->disk;
26 	int ret;
27 	struct dm_report_zones_args args = {
28 		.next_sector = sector,
29 		.orig_data = data,
30 		.orig_cb = cb,
31 	};
32 
33 	do {
34 		struct dm_target *tgt;
35 
36 		tgt = dm_table_find_target(t, args.next_sector);
37 		if (WARN_ON_ONCE(!tgt->type->report_zones))
38 			return -EIO;
39 
40 		args.tgt = tgt;
41 		ret = tgt->type->report_zones(tgt, &args,
42 					      nr_zones - args.zone_idx);
43 		if (ret < 0)
44 			return ret;
45 	} while (args.zone_idx < nr_zones &&
46 		 args.next_sector < get_capacity(disk));
47 
48 	return args.zone_idx;
49 }
50 
51 /*
52  * User facing dm device block device report zone operation. This calls the
53  * report_zones operation for each target of a device table. This operation is
54  * generally implemented by targets using dm_report_zones().
55  */
56 int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
57 			unsigned int nr_zones, report_zones_cb cb, void *data)
58 {
59 	struct mapped_device *md = disk->private_data;
60 	struct dm_table *map;
61 	int srcu_idx, ret;
62 
63 	if (dm_suspended_md(md))
64 		return -EAGAIN;
65 
66 	map = dm_get_live_table(md, &srcu_idx);
67 	if (!map)
68 		return -EIO;
69 
70 	ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data);
71 
72 	dm_put_live_table(md, srcu_idx);
73 
74 	return ret;
75 }
76 
77 static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx,
78 			      void *data)
79 {
80 	struct dm_report_zones_args *args = data;
81 	sector_t sector_diff = args->tgt->begin - args->start;
82 
83 	/*
84 	 * Ignore zones beyond the target range.
85 	 */
86 	if (zone->start >= args->start + args->tgt->len)
87 		return 0;
88 
89 	/*
90 	 * Remap the start sector and write pointer position of the zone
91 	 * to match its position in the target range.
92 	 */
93 	zone->start += sector_diff;
94 	if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
95 		if (zone->cond == BLK_ZONE_COND_FULL)
96 			zone->wp = zone->start + zone->len;
97 		else if (zone->cond == BLK_ZONE_COND_EMPTY)
98 			zone->wp = zone->start;
99 		else
100 			zone->wp += sector_diff;
101 	}
102 
103 	args->next_sector = zone->start + zone->len;
104 	return args->orig_cb(zone, args->zone_idx++, args->orig_data);
105 }
106 
107 /*
108  * Helper for drivers of zoned targets to implement struct target_type
109  * report_zones operation.
110  */
111 int dm_report_zones(struct block_device *bdev, sector_t start, sector_t sector,
112 		    struct dm_report_zones_args *args, unsigned int nr_zones)
113 {
114 	/*
115 	 * Set the target mapping start sector first so that
116 	 * dm_report_zones_cb() can correctly remap zone information.
117 	 */
118 	args->start = start;
119 
120 	return blkdev_report_zones(bdev, sector, nr_zones,
121 				   dm_report_zones_cb, args);
122 }
123 EXPORT_SYMBOL_GPL(dm_report_zones);
124 
125 bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
126 {
127 	struct request_queue *q = md->queue;
128 
129 	if (!blk_queue_is_zoned(q))
130 		return false;
131 
132 	switch (bio_op(bio)) {
133 	case REQ_OP_WRITE_ZEROES:
134 	case REQ_OP_WRITE:
135 		return !op_is_flush(bio->bi_opf) && bio_sectors(bio);
136 	default:
137 		return false;
138 	}
139 }
140 
141 void dm_cleanup_zoned_dev(struct mapped_device *md)
142 {
143 	if (md->disk) {
144 		bitmap_free(md->disk->conv_zones_bitmap);
145 		md->disk->conv_zones_bitmap = NULL;
146 		bitmap_free(md->disk->seq_zones_wlock);
147 		md->disk->seq_zones_wlock = NULL;
148 	}
149 
150 	kvfree(md->zwp_offset);
151 	md->zwp_offset = NULL;
152 	md->nr_zones = 0;
153 }
154 
155 static unsigned int dm_get_zone_wp_offset(struct blk_zone *zone)
156 {
157 	switch (zone->cond) {
158 	case BLK_ZONE_COND_IMP_OPEN:
159 	case BLK_ZONE_COND_EXP_OPEN:
160 	case BLK_ZONE_COND_CLOSED:
161 		return zone->wp - zone->start;
162 	case BLK_ZONE_COND_FULL:
163 		return zone->len;
164 	case BLK_ZONE_COND_EMPTY:
165 	case BLK_ZONE_COND_NOT_WP:
166 	case BLK_ZONE_COND_OFFLINE:
167 	case BLK_ZONE_COND_READONLY:
168 	default:
169 		/*
170 		 * Conventional, offline and read-only zones do not have a valid
171 		 * write pointer. Use 0 as for an empty zone.
172 		 */
173 		return 0;
174 	}
175 }
176 
177 static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx,
178 				 void *data)
179 {
180 	struct mapped_device *md = data;
181 	struct gendisk *disk = md->disk;
182 
183 	switch (zone->type) {
184 	case BLK_ZONE_TYPE_CONVENTIONAL:
185 		if (!disk->conv_zones_bitmap) {
186 			disk->conv_zones_bitmap = bitmap_zalloc(disk->nr_zones,
187 								GFP_NOIO);
188 			if (!disk->conv_zones_bitmap)
189 				return -ENOMEM;
190 		}
191 		set_bit(idx, disk->conv_zones_bitmap);
192 		break;
193 	case BLK_ZONE_TYPE_SEQWRITE_REQ:
194 	case BLK_ZONE_TYPE_SEQWRITE_PREF:
195 		if (!disk->seq_zones_wlock) {
196 			disk->seq_zones_wlock = bitmap_zalloc(disk->nr_zones,
197 							      GFP_NOIO);
198 			if (!disk->seq_zones_wlock)
199 				return -ENOMEM;
200 		}
201 		if (!md->zwp_offset) {
202 			md->zwp_offset =
203 				kvcalloc(disk->nr_zones, sizeof(unsigned int),
204 					 GFP_KERNEL);
205 			if (!md->zwp_offset)
206 				return -ENOMEM;
207 		}
208 		md->zwp_offset[idx] = dm_get_zone_wp_offset(zone);
209 
210 		break;
211 	default:
212 		DMERR("Invalid zone type 0x%x at sectors %llu",
213 		      (int)zone->type, zone->start);
214 		return -ENODEV;
215 	}
216 
217 	return 0;
218 }
219 
220 /*
221  * Revalidate the zones of a mapped device to initialize resource necessary
222  * for zone append emulation. Note that we cannot simply use the block layer
223  * blk_revalidate_disk_zones() function here as the mapped device is suspended
224  * (this is called from __bind() context).
225  */
226 static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t)
227 {
228 	struct gendisk *disk = md->disk;
229 	unsigned int noio_flag;
230 	int ret;
231 
232 	/*
233 	 * Check if something changed. If yes, cleanup the current resources
234 	 * and reallocate everything.
235 	 */
236 	if (!disk->nr_zones || disk->nr_zones != md->nr_zones)
237 		dm_cleanup_zoned_dev(md);
238 	if (md->nr_zones)
239 		return 0;
240 
241 	/*
242 	 * Scan all zones to initialize everything. Ensure that all vmalloc
243 	 * operations in this context are done as if GFP_NOIO was specified.
244 	 */
245 	noio_flag = memalloc_noio_save();
246 	ret = dm_blk_do_report_zones(md, t, 0, disk->nr_zones,
247 				     dm_zone_revalidate_cb, md);
248 	memalloc_noio_restore(noio_flag);
249 	if (ret < 0)
250 		goto err;
251 	if (ret != disk->nr_zones) {
252 		ret = -EIO;
253 		goto err;
254 	}
255 
256 	md->nr_zones = disk->nr_zones;
257 
258 	return 0;
259 
260 err:
261 	DMERR("Revalidate zones failed %d", ret);
262 	dm_cleanup_zoned_dev(md);
263 	return ret;
264 }
265 
266 static int device_not_zone_append_capable(struct dm_target *ti,
267 					  struct dm_dev *dev, sector_t start,
268 					  sector_t len, void *data)
269 {
270 	return !bdev_is_zoned(dev->bdev);
271 }
272 
273 static bool dm_table_supports_zone_append(struct dm_table *t)
274 {
275 	for (unsigned int i = 0; i < t->num_targets; i++) {
276 		struct dm_target *ti = dm_table_get_target(t, i);
277 
278 		if (ti->emulate_zone_append)
279 			return false;
280 
281 		if (!ti->type->iterate_devices ||
282 		    ti->type->iterate_devices(ti, device_not_zone_append_capable, NULL))
283 			return false;
284 	}
285 
286 	return true;
287 }
288 
289 int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
290 {
291 	struct mapped_device *md = t->md;
292 
293 	/*
294 	 * For a zoned target, the number of zones should be updated for the
295 	 * correct value to be exposed in sysfs queue/nr_zones.
296 	 */
297 	WARN_ON_ONCE(queue_is_mq(q));
298 	md->disk->nr_zones = bdev_nr_zones(md->disk->part0);
299 
300 	/* Check if zone append is natively supported */
301 	if (dm_table_supports_zone_append(t)) {
302 		clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
303 		dm_cleanup_zoned_dev(md);
304 		return 0;
305 	}
306 
307 	/*
308 	 * Mark the mapped device as needing zone append emulation and
309 	 * initialize the emulation resources once the capacity is set.
310 	 */
311 	set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
312 	if (!get_capacity(md->disk))
313 		return 0;
314 
315 	return dm_revalidate_zones(md, t);
316 }
317 
/*
 * Zone report callback used by dm_update_zone_wp_offset(): store the write
 * pointer offset of @zone in the unsigned int pointed to by @data.
 */
static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
				       void *data)
{
	*(unsigned int *)data = dm_get_zone_wp_offset(zone);

	return 0;
}
327 
328 static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno,
329 				    unsigned int *wp_ofst)
330 {
331 	sector_t sector = zno * bdev_zone_sectors(md->disk->part0);
332 	unsigned int noio_flag;
333 	struct dm_table *t;
334 	int srcu_idx, ret;
335 
336 	t = dm_get_live_table(md, &srcu_idx);
337 	if (!t)
338 		return -EIO;
339 
340 	/*
341 	 * Ensure that all memory allocations in this context are done as if
342 	 * GFP_NOIO was specified.
343 	 */
344 	noio_flag = memalloc_noio_save();
345 	ret = dm_blk_do_report_zones(md, t, sector, 1,
346 				     dm_update_zone_wp_offset_cb, wp_ofst);
347 	memalloc_noio_restore(noio_flag);
348 
349 	dm_put_live_table(md, srcu_idx);
350 
351 	if (ret != 1)
352 		return -EIO;
353 
354 	return 0;
355 }
356 
/*
 * Operation and size of a clone BIO, saved by dm_zone_map_bio() before the
 * BIO is handed to dm_zone_map_bio_begin() and to the target map function,
 * and later passed to dm_zone_map_bio_end().
 */
struct orig_bio_details {
	enum req_op op;			/* bio_op() of the clone */
	unsigned int nr_sectors;	/* bio_sectors() of the clone */
};
361 
362 /*
363  * First phase of BIO mapping for targets with zone append emulation:
364  * check all BIO that change a zone writer pointer and change zone
365  * append operations into regular write operations.
366  */
367 static bool dm_zone_map_bio_begin(struct mapped_device *md,
368 				  unsigned int zno, struct bio *clone)
369 {
370 	sector_t zsectors = bdev_zone_sectors(md->disk->part0);
371 	unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);
372 
373 	/*
374 	 * If the target zone is in an error state, recover by inspecting the
375 	 * zone to get its current write pointer position. Note that since the
376 	 * target zone is already locked, a BIO issuing context should never
377 	 * see the zone write in the DM_ZONE_UPDATING_WP_OFST state.
378 	 */
379 	if (zwp_offset == DM_ZONE_INVALID_WP_OFST) {
380 		if (dm_update_zone_wp_offset(md, zno, &zwp_offset))
381 			return false;
382 		WRITE_ONCE(md->zwp_offset[zno], zwp_offset);
383 	}
384 
385 	switch (bio_op(clone)) {
386 	case REQ_OP_ZONE_RESET:
387 	case REQ_OP_ZONE_FINISH:
388 		return true;
389 	case REQ_OP_WRITE_ZEROES:
390 	case REQ_OP_WRITE:
391 		/* Writes must be aligned to the zone write pointer */
392 		if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset)
393 			return false;
394 		break;
395 	case REQ_OP_ZONE_APPEND:
396 		/*
397 		 * Change zone append operations into a non-mergeable regular
398 		 * writes directed at the current write pointer position of the
399 		 * target zone.
400 		 */
401 		clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE |
402 			(clone->bi_opf & (~REQ_OP_MASK));
403 		clone->bi_iter.bi_sector += zwp_offset;
404 		break;
405 	default:
406 		DMWARN_LIMIT("Invalid BIO operation");
407 		return false;
408 	}
409 
410 	/* Cannot write to a full zone */
411 	if (zwp_offset >= zsectors)
412 		return false;
413 
414 	return true;
415 }
416 
417 /*
418  * Second phase of BIO mapping for targets with zone append emulation:
419  * update the zone write pointer offset array to account for the additional
420  * data written to a zone. Note that at this point, the remapped clone BIO
421  * may already have completed, so we do not touch it.
422  */
423 static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, unsigned int zno,
424 					struct orig_bio_details *orig_bio_details,
425 					unsigned int nr_sectors)
426 {
427 	unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);
428 
429 	/* The clone BIO may already have been completed and failed */
430 	if (zwp_offset == DM_ZONE_INVALID_WP_OFST)
431 		return BLK_STS_IOERR;
432 
433 	/* Update the zone wp offset */
434 	switch (orig_bio_details->op) {
435 	case REQ_OP_ZONE_RESET:
436 		WRITE_ONCE(md->zwp_offset[zno], 0);
437 		return BLK_STS_OK;
438 	case REQ_OP_ZONE_FINISH:
439 		WRITE_ONCE(md->zwp_offset[zno],
440 			   bdev_zone_sectors(md->disk->part0));
441 		return BLK_STS_OK;
442 	case REQ_OP_WRITE_ZEROES:
443 	case REQ_OP_WRITE:
444 		WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
445 		return BLK_STS_OK;
446 	case REQ_OP_ZONE_APPEND:
447 		/*
448 		 * Check that the target did not truncate the write operation
449 		 * emulating a zone append.
450 		 */
451 		if (nr_sectors != orig_bio_details->nr_sectors) {
452 			DMWARN_LIMIT("Truncated write for zone append");
453 			return BLK_STS_IOERR;
454 		}
455 		WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
456 		return BLK_STS_OK;
457 	default:
458 		DMWARN_LIMIT("Invalid BIO operation");
459 		return BLK_STS_IOERR;
460 	}
461 }
462 
463 static inline void dm_zone_lock(struct gendisk *disk, unsigned int zno,
464 				struct bio *clone)
465 {
466 	if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)))
467 		return;
468 
469 	wait_on_bit_lock_io(disk->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE);
470 	bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED);
471 }
472 
473 static inline void dm_zone_unlock(struct gendisk *disk, unsigned int zno,
474 				  struct bio *clone)
475 {
476 	if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
477 		return;
478 
479 	WARN_ON_ONCE(!test_bit(zno, disk->seq_zones_wlock));
480 	clear_bit_unlock(zno, disk->seq_zones_wlock);
481 	smp_mb__after_atomic();
482 	wake_up_bit(disk->seq_zones_wlock, zno);
483 
484 	bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED);
485 }
486 
487 static bool dm_need_zone_wp_tracking(struct bio *bio)
488 {
489 	/*
490 	 * Special processing is not needed for operations that do not need the
491 	 * zone write lock, that is, all operations that target conventional
492 	 * zones and all operations that do not modify directly a sequential
493 	 * zone write pointer.
494 	 */
495 	if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
496 		return false;
497 	switch (bio_op(bio)) {
498 	case REQ_OP_WRITE_ZEROES:
499 	case REQ_OP_WRITE:
500 	case REQ_OP_ZONE_RESET:
501 	case REQ_OP_ZONE_FINISH:
502 	case REQ_OP_ZONE_APPEND:
503 		return bio_zone_is_seq(bio);
504 	default:
505 		return false;
506 	}
507 }
508 
509 /*
510  * Special IO mapping for targets needing zone append emulation.
511  */
512 int dm_zone_map_bio(struct dm_target_io *tio)
513 {
514 	struct dm_io *io = tio->io;
515 	struct dm_target *ti = tio->ti;
516 	struct mapped_device *md = io->md;
517 	struct bio *clone = &tio->clone;
518 	struct orig_bio_details orig_bio_details;
519 	unsigned int zno;
520 	blk_status_t sts;
521 	int r;
522 
523 	/*
524 	 * IOs that do not change a zone write pointer do not need
525 	 * any additional special processing.
526 	 */
527 	if (!dm_need_zone_wp_tracking(clone))
528 		return ti->type->map(ti, clone);
529 
530 	/* Lock the target zone */
531 	zno = bio_zone_no(clone);
532 	dm_zone_lock(md->disk, zno, clone);
533 
534 	orig_bio_details.nr_sectors = bio_sectors(clone);
535 	orig_bio_details.op = bio_op(clone);
536 
537 	/*
538 	 * Check that the bio and the target zone write pointer offset are
539 	 * both valid, and if the bio is a zone append, remap it to a write.
540 	 */
541 	if (!dm_zone_map_bio_begin(md, zno, clone)) {
542 		dm_zone_unlock(md->disk, zno, clone);
543 		return DM_MAPIO_KILL;
544 	}
545 
546 	/* Let the target do its work */
547 	r = ti->type->map(ti, clone);
548 	switch (r) {
549 	case DM_MAPIO_SUBMITTED:
550 		/*
551 		 * The target submitted the clone BIO. The target zone will
552 		 * be unlocked on completion of the clone.
553 		 */
554 		sts = dm_zone_map_bio_end(md, zno, &orig_bio_details,
555 					  *tio->len_ptr);
556 		break;
557 	case DM_MAPIO_REMAPPED:
558 		/*
559 		 * The target only remapped the clone BIO. In case of error,
560 		 * unlock the target zone here as the clone will not be
561 		 * submitted.
562 		 */
563 		sts = dm_zone_map_bio_end(md, zno, &orig_bio_details,
564 					  *tio->len_ptr);
565 		if (sts != BLK_STS_OK)
566 			dm_zone_unlock(md->disk, zno, clone);
567 		break;
568 	case DM_MAPIO_REQUEUE:
569 	case DM_MAPIO_KILL:
570 	default:
571 		dm_zone_unlock(md->disk, zno, clone);
572 		sts = BLK_STS_IOERR;
573 		break;
574 	}
575 
576 	if (sts != BLK_STS_OK)
577 		return DM_MAPIO_KILL;
578 
579 	return r;
580 }
581 
582 /*
583  * IO completion callback called from clone_endio().
584  */
585 void dm_zone_endio(struct dm_io *io, struct bio *clone)
586 {
587 	struct mapped_device *md = io->md;
588 	struct gendisk *disk = md->disk;
589 	struct bio *orig_bio = io->orig_bio;
590 	unsigned int zwp_offset;
591 	unsigned int zno;
592 
593 	/*
594 	 * For targets that do not emulate zone append, we only need to
595 	 * handle native zone-append bios.
596 	 */
597 	if (!dm_emulate_zone_append(md)) {
598 		/*
599 		 * Get the offset within the zone of the written sector
600 		 * and add that to the original bio sector position.
601 		 */
602 		if (clone->bi_status == BLK_STS_OK &&
603 		    bio_op(clone) == REQ_OP_ZONE_APPEND) {
604 			sector_t mask =
605 				(sector_t)bdev_zone_sectors(disk->part0) - 1;
606 
607 			orig_bio->bi_iter.bi_sector +=
608 				clone->bi_iter.bi_sector & mask;
609 		}
610 
611 		return;
612 	}
613 
614 	/*
615 	 * For targets that do emulate zone append, if the clone BIO does not
616 	 * own the target zone write lock, we have nothing to do.
617 	 */
618 	if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
619 		return;
620 
621 	zno = bio_zone_no(orig_bio);
622 
623 	if (clone->bi_status != BLK_STS_OK) {
624 		/*
625 		 * BIOs that modify a zone write pointer may leave the zone
626 		 * in an unknown state in case of failure (e.g. the write
627 		 * pointer was only partially advanced). In this case, set
628 		 * the target zone write pointer as invalid unless it is
629 		 * already being updated.
630 		 */
631 		WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST);
632 	} else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
633 		/*
634 		 * Get the written sector for zone append operation that were
635 		 * emulated using regular write operations.
636 		 */
637 		zwp_offset = READ_ONCE(md->zwp_offset[zno]);
638 		if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio)))
639 			WRITE_ONCE(md->zwp_offset[zno],
640 				   DM_ZONE_INVALID_WP_OFST);
641 		else
642 			orig_bio->bi_iter.bi_sector +=
643 				zwp_offset - bio_sectors(orig_bio);
644 	}
645 
646 	dm_zone_unlock(disk, zno, clone);
647 }
648