xref: /linux/drivers/md/dm-zoned-target.c (revision 2b64b2ed277ff23e785fbdb65098ee7e1252d64f)
1 /*
2  * Copyright (C) 2017 Western Digital Corporation or its affiliates.
3  *
4  * This file is released under the GPL.
5  */
6 
7 #include "dm-zoned.h"
8 
9 #include <linux/module.h>
10 
/* Message prefix for this target ("zoned"); presumably used by the DM logging macros. */
#define	DM_MSG_PREFIX		"zoned"

/* Pool size passed to bioset_init() for the clone BIO set in dmz_ctr(). */
#define DMZ_MIN_BIOS		8192
14 
/*
 * Zone BIO context.
 *
 * Per-BIO data of the target, retrieved with dm_per_bio_data() and
 * initialized in dmz_map() for every BIO the target handles itself.
 */
struct dmz_bioctx {
	/* Target the BIO was mapped to */
	struct dmz_target	*target;
	/* Zone activated for the BIO in dmz_handle_bio(), NULL if none */
	struct dm_zone		*zone;
	/* The target BIO this context belongs to */
	struct bio		*bio;
	/*
	 * Set to 1 in dmz_map() and incremented for each clone issued by
	 * dmz_submit_bio(). The BIO is completed by dmz_bio_endio() when
	 * the count drops to 0.
	 */
	refcount_t		ref;
};
24 
/*
 * Chunk work descriptor.
 *
 * One descriptor exists per chunk with pending BIOs. Descriptors are
 * stored in the target chunk_rxtree, indexed by chunk number.
 */
struct dm_chunk_work {
	/* Work item, executed by dmz_chunk_work() */
	struct work_struct	work;
	/*
	 * One reference per BIO added to bio_list plus one reference taken
	 * when the work is successfully queued. The descriptor is freed by
	 * dmz_put_chunk_work() when the count drops to 0.
	 */
	refcount_t		refcount;
	/* Target owning this chunk work */
	struct dmz_target	*target;
	/* Chunk number, also the radix tree index */
	unsigned int		chunk;
	/* BIOs waiting to be processed for this chunk */
	struct bio_list		bio_list;
};
35 
/*
 * Target descriptor.
 *
 * Allocated in dmz_ctr(), stored in ti->private and freed in dmz_dtr().
 */
struct dmz_target {
	/* Backing device reference obtained with dm_get_device() */
	struct dm_dev		*ddev;

	unsigned long		flags;

	/* Zoned block device information */
	struct dmz_dev		*dev;

	/* For metadata handling */
	struct dmz_metadata     *metadata;

	/* For reclaim */
	struct dmz_reclaim	*reclaim;

	/*
	 * For chunk work: chunk_rxtree holds the active chunk works indexed
	 * by chunk number, chunk_wq runs them. chunk_lock is held when the
	 * tree and the chunk work BIO lists are accessed.
	 */
	struct radix_tree_root	chunk_rxtree;
	struct workqueue_struct *chunk_wq;
	struct mutex		chunk_lock;

	/* For cloned BIOs to zones */
	struct bio_set		bio_set;

	/*
	 * For flush: flush_list holds the pending flush BIOs (protected by
	 * flush_lock), completed by flush_work running on flush_wq.
	 */
	spinlock_t		flush_lock;
	struct bio_list		flush_list;
	struct delayed_work	flush_work;
	struct workqueue_struct *flush_wq;
};
67 
/*
 * Period of the metadata flush work: 10 seconds, expressed in jiffies
 * (used as the delay when queueing flush_work).
 */
#define DMZ_FLUSH_PERIOD	(10 * HZ)
72 
73 /*
74  * Target BIO completion.
75  */
76 static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
77 {
78 	struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
79 
80 	if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
81 		bio->bi_status = status;
82 
83 	if (refcount_dec_and_test(&bioctx->ref)) {
84 		struct dm_zone *zone = bioctx->zone;
85 
86 		if (zone) {
87 			if (bio->bi_status != BLK_STS_OK &&
88 			    bio_op(bio) == REQ_OP_WRITE &&
89 			    dmz_is_seq(zone))
90 				set_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
91 			dmz_deactivate_zone(zone);
92 		}
93 		bio_endio(bio);
94 	}
95 }
96 
97 /*
98  * Completion callback for an internally cloned target BIO. This terminates the
99  * target BIO when there are no more references to its context.
100  */
101 static void dmz_clone_endio(struct bio *clone)
102 {
103 	struct dmz_bioctx *bioctx = clone->bi_private;
104 	blk_status_t status = clone->bi_status;
105 
106 	bio_put(clone);
107 	dmz_bio_endio(bioctx->bio, status);
108 }
109 
110 /*
111  * Issue a clone of a target BIO. The clone may only partially process the
112  * original target BIO.
113  */
114 static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
115 			  struct bio *bio, sector_t chunk_block,
116 			  unsigned int nr_blocks)
117 {
118 	struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
119 	struct bio *clone;
120 
121 	clone = bio_clone_fast(bio, GFP_NOIO, &dmz->bio_set);
122 	if (!clone)
123 		return -ENOMEM;
124 
125 	bio_set_dev(clone, dmz->dev->bdev);
126 	clone->bi_iter.bi_sector =
127 		dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
128 	clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT;
129 	clone->bi_end_io = dmz_clone_endio;
130 	clone->bi_private = bioctx;
131 
132 	bio_advance(bio, clone->bi_iter.bi_size);
133 
134 	refcount_inc(&bioctx->ref);
135 	generic_make_request(clone);
136 
137 	if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
138 		zone->wp_block += nr_blocks;
139 
140 	return 0;
141 }
142 
143 /*
144  * Zero out pages of discarded blocks accessed by a read BIO.
145  */
146 static void dmz_handle_read_zero(struct dmz_target *dmz, struct bio *bio,
147 				 sector_t chunk_block, unsigned int nr_blocks)
148 {
149 	unsigned int size = nr_blocks << DMZ_BLOCK_SHIFT;
150 
151 	/* Clear nr_blocks */
152 	swap(bio->bi_iter.bi_size, size);
153 	zero_fill_bio(bio);
154 	swap(bio->bi_iter.bi_size, size);
155 
156 	bio_advance(bio, size);
157 }
158 
159 /*
160  * Process a read BIO.
161  */
162 static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
163 			   struct bio *bio)
164 {
165 	sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio));
166 	unsigned int nr_blocks = dmz_bio_blocks(bio);
167 	sector_t end_block = chunk_block + nr_blocks;
168 	struct dm_zone *rzone, *bzone;
169 	int ret;
170 
171 	/* Read into unmapped chunks need only zeroing the BIO buffer */
172 	if (!zone) {
173 		zero_fill_bio(bio);
174 		return 0;
175 	}
176 
177 	dmz_dev_debug(dmz->dev, "READ chunk %llu -> %s zone %u, block %llu, %u blocks",
178 		      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
179 		      (dmz_is_rnd(zone) ? "RND" : "SEQ"),
180 		      dmz_id(dmz->metadata, zone),
181 		      (unsigned long long)chunk_block, nr_blocks);
182 
183 	/* Check block validity to determine the read location */
184 	bzone = zone->bzone;
185 	while (chunk_block < end_block) {
186 		nr_blocks = 0;
187 		if (dmz_is_rnd(zone) || chunk_block < zone->wp_block) {
188 			/* Test block validity in the data zone */
189 			ret = dmz_block_valid(dmz->metadata, zone, chunk_block);
190 			if (ret < 0)
191 				return ret;
192 			if (ret > 0) {
193 				/* Read data zone blocks */
194 				nr_blocks = ret;
195 				rzone = zone;
196 			}
197 		}
198 
199 		/*
200 		 * No valid blocks found in the data zone.
201 		 * Check the buffer zone, if there is one.
202 		 */
203 		if (!nr_blocks && bzone) {
204 			ret = dmz_block_valid(dmz->metadata, bzone, chunk_block);
205 			if (ret < 0)
206 				return ret;
207 			if (ret > 0) {
208 				/* Read buffer zone blocks */
209 				nr_blocks = ret;
210 				rzone = bzone;
211 			}
212 		}
213 
214 		if (nr_blocks) {
215 			/* Valid blocks found: read them */
216 			nr_blocks = min_t(unsigned int, nr_blocks, end_block - chunk_block);
217 			ret = dmz_submit_bio(dmz, rzone, bio, chunk_block, nr_blocks);
218 			if (ret)
219 				return ret;
220 			chunk_block += nr_blocks;
221 		} else {
222 			/* No valid block: zeroout the current BIO block */
223 			dmz_handle_read_zero(dmz, bio, chunk_block, 1);
224 			chunk_block++;
225 		}
226 	}
227 
228 	return 0;
229 }
230 
231 /*
232  * Write blocks directly in a data zone, at the write pointer.
233  * If a buffer zone is assigned, invalidate the blocks written
234  * in place.
235  */
236 static int dmz_handle_direct_write(struct dmz_target *dmz,
237 				   struct dm_zone *zone, struct bio *bio,
238 				   sector_t chunk_block,
239 				   unsigned int nr_blocks)
240 {
241 	struct dmz_metadata *zmd = dmz->metadata;
242 	struct dm_zone *bzone = zone->bzone;
243 	int ret;
244 
245 	if (dmz_is_readonly(zone))
246 		return -EROFS;
247 
248 	/* Submit write */
249 	ret = dmz_submit_bio(dmz, zone, bio, chunk_block, nr_blocks);
250 	if (ret)
251 		return ret;
252 
253 	/*
254 	 * Validate the blocks in the data zone and invalidate
255 	 * in the buffer zone, if there is one.
256 	 */
257 	ret = dmz_validate_blocks(zmd, zone, chunk_block, nr_blocks);
258 	if (ret == 0 && bzone)
259 		ret = dmz_invalidate_blocks(zmd, bzone, chunk_block, nr_blocks);
260 
261 	return ret;
262 }
263 
264 /*
265  * Write blocks in the buffer zone of @zone.
266  * If no buffer zone is assigned yet, get one.
267  * Called with @zone write locked.
268  */
269 static int dmz_handle_buffered_write(struct dmz_target *dmz,
270 				     struct dm_zone *zone, struct bio *bio,
271 				     sector_t chunk_block,
272 				     unsigned int nr_blocks)
273 {
274 	struct dmz_metadata *zmd = dmz->metadata;
275 	struct dm_zone *bzone;
276 	int ret;
277 
278 	/* Get the buffer zone. One will be allocated if needed */
279 	bzone = dmz_get_chunk_buffer(zmd, zone);
280 	if (!bzone)
281 		return -ENOSPC;
282 
283 	if (dmz_is_readonly(bzone))
284 		return -EROFS;
285 
286 	/* Submit write */
287 	ret = dmz_submit_bio(dmz, bzone, bio, chunk_block, nr_blocks);
288 	if (ret)
289 		return ret;
290 
291 	/*
292 	 * Validate the blocks in the buffer zone
293 	 * and invalidate in the data zone.
294 	 */
295 	ret = dmz_validate_blocks(zmd, bzone, chunk_block, nr_blocks);
296 	if (ret == 0 && chunk_block < zone->wp_block)
297 		ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
298 
299 	return ret;
300 }
301 
302 /*
303  * Process a write BIO.
304  */
305 static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone,
306 			    struct bio *bio)
307 {
308 	sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio));
309 	unsigned int nr_blocks = dmz_bio_blocks(bio);
310 
311 	if (!zone)
312 		return -ENOSPC;
313 
314 	dmz_dev_debug(dmz->dev, "WRITE chunk %llu -> %s zone %u, block %llu, %u blocks",
315 		      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
316 		      (dmz_is_rnd(zone) ? "RND" : "SEQ"),
317 		      dmz_id(dmz->metadata, zone),
318 		      (unsigned long long)chunk_block, nr_blocks);
319 
320 	if (dmz_is_rnd(zone) || chunk_block == zone->wp_block) {
321 		/*
322 		 * zone is a random zone or it is a sequential zone
323 		 * and the BIO is aligned to the zone write pointer:
324 		 * direct write the zone.
325 		 */
326 		return dmz_handle_direct_write(dmz, zone, bio, chunk_block, nr_blocks);
327 	}
328 
329 	/*
330 	 * This is an unaligned write in a sequential zone:
331 	 * use buffered write.
332 	 */
333 	return dmz_handle_buffered_write(dmz, zone, bio, chunk_block, nr_blocks);
334 }
335 
336 /*
337  * Process a discard BIO.
338  */
339 static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
340 			      struct bio *bio)
341 {
342 	struct dmz_metadata *zmd = dmz->metadata;
343 	sector_t block = dmz_bio_block(bio);
344 	unsigned int nr_blocks = dmz_bio_blocks(bio);
345 	sector_t chunk_block = dmz_chunk_block(dmz->dev, block);
346 	int ret = 0;
347 
348 	/* For unmapped chunks, there is nothing to do */
349 	if (!zone)
350 		return 0;
351 
352 	if (dmz_is_readonly(zone))
353 		return -EROFS;
354 
355 	dmz_dev_debug(dmz->dev, "DISCARD chunk %llu -> zone %u, block %llu, %u blocks",
356 		      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
357 		      dmz_id(zmd, zone),
358 		      (unsigned long long)chunk_block, nr_blocks);
359 
360 	/*
361 	 * Invalidate blocks in the data zone and its
362 	 * buffer zone if one is mapped.
363 	 */
364 	if (dmz_is_rnd(zone) || chunk_block < zone->wp_block)
365 		ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
366 	if (ret == 0 && zone->bzone)
367 		ret = dmz_invalidate_blocks(zmd, zone->bzone,
368 					    chunk_block, nr_blocks);
369 	return ret;
370 }
371 
372 /*
373  * Process a BIO.
374  */
375 static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
376 			   struct bio *bio)
377 {
378 	struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
379 	struct dmz_metadata *zmd = dmz->metadata;
380 	struct dm_zone *zone;
381 	int ret;
382 
383 	/*
384 	 * Write may trigger a zone allocation. So make sure the
385 	 * allocation can succeed.
386 	 */
387 	if (bio_op(bio) == REQ_OP_WRITE)
388 		dmz_schedule_reclaim(dmz->reclaim);
389 
390 	dmz_lock_metadata(zmd);
391 
392 	/*
393 	 * Get the data zone mapping the chunk. There may be no
394 	 * mapping for read and discard. If a mapping is obtained,
395 	 + the zone returned will be set to active state.
396 	 */
397 	zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(dmz->dev, bio),
398 				     bio_op(bio));
399 	if (IS_ERR(zone)) {
400 		ret = PTR_ERR(zone);
401 		goto out;
402 	}
403 
404 	/* Process the BIO */
405 	if (zone) {
406 		dmz_activate_zone(zone);
407 		bioctx->zone = zone;
408 	}
409 
410 	switch (bio_op(bio)) {
411 	case REQ_OP_READ:
412 		ret = dmz_handle_read(dmz, zone, bio);
413 		break;
414 	case REQ_OP_WRITE:
415 		ret = dmz_handle_write(dmz, zone, bio);
416 		break;
417 	case REQ_OP_DISCARD:
418 	case REQ_OP_WRITE_ZEROES:
419 		ret = dmz_handle_discard(dmz, zone, bio);
420 		break;
421 	default:
422 		dmz_dev_err(dmz->dev, "Unsupported BIO operation 0x%x",
423 			    bio_op(bio));
424 		ret = -EIO;
425 	}
426 
427 	/*
428 	 * Release the chunk mapping. This will check that the mapping
429 	 * is still valid, that is, that the zone used still has valid blocks.
430 	 */
431 	if (zone)
432 		dmz_put_chunk_mapping(zmd, zone);
433 out:
434 	dmz_bio_endio(bio, errno_to_blk_status(ret));
435 
436 	dmz_unlock_metadata(zmd);
437 }
438 
/*
 * Increment a chunk reference counter.
 * Every reference taken here is dropped with dmz_put_chunk_work().
 */
static inline void dmz_get_chunk_work(struct dm_chunk_work *cw)
{
	refcount_inc(&cw->refcount);
}
446 
447 /*
448  * Decrement a chunk work reference count and
449  * free it if it becomes 0.
450  */
451 static void dmz_put_chunk_work(struct dm_chunk_work *cw)
452 {
453 	if (refcount_dec_and_test(&cw->refcount)) {
454 		WARN_ON(!bio_list_empty(&cw->bio_list));
455 		radix_tree_delete(&cw->target->chunk_rxtree, cw->chunk);
456 		kfree(cw);
457 	}
458 }
459 
460 /*
461  * Chunk BIO work function.
462  */
463 static void dmz_chunk_work(struct work_struct *work)
464 {
465 	struct dm_chunk_work *cw = container_of(work, struct dm_chunk_work, work);
466 	struct dmz_target *dmz = cw->target;
467 	struct bio *bio;
468 
469 	mutex_lock(&dmz->chunk_lock);
470 
471 	/* Process the chunk BIOs */
472 	while ((bio = bio_list_pop(&cw->bio_list))) {
473 		mutex_unlock(&dmz->chunk_lock);
474 		dmz_handle_bio(dmz, cw, bio);
475 		mutex_lock(&dmz->chunk_lock);
476 		dmz_put_chunk_work(cw);
477 	}
478 
479 	/* Queueing the work incremented the work refcount */
480 	dmz_put_chunk_work(cw);
481 
482 	mutex_unlock(&dmz->chunk_lock);
483 }
484 
485 /*
486  * Flush work.
487  */
488 static void dmz_flush_work(struct work_struct *work)
489 {
490 	struct dmz_target *dmz = container_of(work, struct dmz_target, flush_work.work);
491 	struct bio *bio;
492 	int ret;
493 
494 	/* Flush dirty metadata blocks */
495 	ret = dmz_flush_metadata(dmz->metadata);
496 
497 	/* Process queued flush requests */
498 	while (1) {
499 		spin_lock(&dmz->flush_lock);
500 		bio = bio_list_pop(&dmz->flush_list);
501 		spin_unlock(&dmz->flush_lock);
502 
503 		if (!bio)
504 			break;
505 
506 		dmz_bio_endio(bio, errno_to_blk_status(ret));
507 	}
508 
509 	queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
510 }
511 
512 /*
513  * Get a chunk work and start it to process a new BIO.
514  * If the BIO chunk has no work yet, create one.
515  */
516 static void dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
517 {
518 	unsigned int chunk = dmz_bio_chunk(dmz->dev, bio);
519 	struct dm_chunk_work *cw;
520 
521 	mutex_lock(&dmz->chunk_lock);
522 
523 	/* Get the BIO chunk work. If one is not active yet, create one */
524 	cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
525 	if (!cw) {
526 		int ret;
527 
528 		/* Create a new chunk work */
529 		cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
530 		if (!cw)
531 			goto out;
532 
533 		INIT_WORK(&cw->work, dmz_chunk_work);
534 		refcount_set(&cw->refcount, 0);
535 		cw->target = dmz;
536 		cw->chunk = chunk;
537 		bio_list_init(&cw->bio_list);
538 
539 		ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw);
540 		if (unlikely(ret)) {
541 			kfree(cw);
542 			cw = NULL;
543 			goto out;
544 		}
545 	}
546 
547 	bio_list_add(&cw->bio_list, bio);
548 	dmz_get_chunk_work(cw);
549 
550 	if (queue_work(dmz->chunk_wq, &cw->work))
551 		dmz_get_chunk_work(cw);
552 out:
553 	mutex_unlock(&dmz->chunk_lock);
554 }
555 
556 /*
557  * Process a new BIO.
558  */
559 static int dmz_map(struct dm_target *ti, struct bio *bio)
560 {
561 	struct dmz_target *dmz = ti->private;
562 	struct dmz_dev *dev = dmz->dev;
563 	struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
564 	sector_t sector = bio->bi_iter.bi_sector;
565 	unsigned int nr_sectors = bio_sectors(bio);
566 	sector_t chunk_sector;
567 
568 	dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
569 		      bio_op(bio), (unsigned long long)sector, nr_sectors,
570 		      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
571 		      (unsigned long long)dmz_chunk_block(dmz->dev, dmz_bio_block(bio)),
572 		      (unsigned int)dmz_bio_blocks(bio));
573 
574 	bio_set_dev(bio, dev->bdev);
575 
576 	if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE)
577 		return DM_MAPIO_REMAPPED;
578 
579 	/* The BIO should be block aligned */
580 	if ((nr_sectors & DMZ_BLOCK_SECTORS_MASK) || (sector & DMZ_BLOCK_SECTORS_MASK))
581 		return DM_MAPIO_KILL;
582 
583 	/* Initialize the BIO context */
584 	bioctx->target = dmz;
585 	bioctx->zone = NULL;
586 	bioctx->bio = bio;
587 	refcount_set(&bioctx->ref, 1);
588 
589 	/* Set the BIO pending in the flush list */
590 	if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) {
591 		spin_lock(&dmz->flush_lock);
592 		bio_list_add(&dmz->flush_list, bio);
593 		spin_unlock(&dmz->flush_lock);
594 		mod_delayed_work(dmz->flush_wq, &dmz->flush_work, 0);
595 		return DM_MAPIO_SUBMITTED;
596 	}
597 
598 	/* Split zone BIOs to fit entirely into a zone */
599 	chunk_sector = sector & (dev->zone_nr_sectors - 1);
600 	if (chunk_sector + nr_sectors > dev->zone_nr_sectors)
601 		dm_accept_partial_bio(bio, dev->zone_nr_sectors - chunk_sector);
602 
603 	/* Now ready to handle this BIO */
604 	dmz_reclaim_bio_acc(dmz->reclaim);
605 	dmz_queue_chunk_work(dmz, bio);
606 
607 	return DM_MAPIO_SUBMITTED;
608 }
609 
610 /*
611  * Get zoned device information.
612  */
613 static int dmz_get_zoned_device(struct dm_target *ti, char *path)
614 {
615 	struct dmz_target *dmz = ti->private;
616 	struct request_queue *q;
617 	struct dmz_dev *dev;
618 	sector_t aligned_capacity;
619 	int ret;
620 
621 	/* Get the target device */
622 	ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &dmz->ddev);
623 	if (ret) {
624 		ti->error = "Get target device failed";
625 		dmz->ddev = NULL;
626 		return ret;
627 	}
628 
629 	dev = kzalloc(sizeof(struct dmz_dev), GFP_KERNEL);
630 	if (!dev) {
631 		ret = -ENOMEM;
632 		goto err;
633 	}
634 
635 	dev->bdev = dmz->ddev->bdev;
636 	(void)bdevname(dev->bdev, dev->name);
637 
638 	if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE) {
639 		ti->error = "Not a zoned block device";
640 		ret = -EINVAL;
641 		goto err;
642 	}
643 
644 	q = bdev_get_queue(dev->bdev);
645 	dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
646 	aligned_capacity = dev->capacity & ~(blk_queue_zone_sectors(q) - 1);
647 	if (ti->begin ||
648 	    ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) {
649 		ti->error = "Partial mapping not supported";
650 		ret = -EINVAL;
651 		goto err;
652 	}
653 
654 	dev->zone_nr_sectors = blk_queue_zone_sectors(q);
655 	dev->zone_nr_sectors_shift = ilog2(dev->zone_nr_sectors);
656 
657 	dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors);
658 	dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks);
659 
660 	dev->nr_zones = blkdev_nr_zones(dev->bdev);
661 
662 	dmz->dev = dev;
663 
664 	return 0;
665 err:
666 	dm_put_device(ti, dmz->ddev);
667 	kfree(dev);
668 
669 	return ret;
670 }
671 
/*
 * Cleanup zoned device information: drop the reference on the backing
 * device obtained in dmz_get_zoned_device() and free the device
 * descriptor.
 */
static void dmz_put_zoned_device(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;

	dm_put_device(ti, dmz->ddev);
	kfree(dmz->dev);
	/* Make sure the freed descriptor cannot be used by mistake */
	dmz->dev = NULL;
}
683 
684 /*
685  * Setup target.
686  */
687 static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
688 {
689 	struct dmz_target *dmz;
690 	struct dmz_dev *dev;
691 	int ret;
692 
693 	/* Check arguments */
694 	if (argc != 1) {
695 		ti->error = "Invalid argument count";
696 		return -EINVAL;
697 	}
698 
699 	/* Allocate and initialize the target descriptor */
700 	dmz = kzalloc(sizeof(struct dmz_target), GFP_KERNEL);
701 	if (!dmz) {
702 		ti->error = "Unable to allocate the zoned target descriptor";
703 		return -ENOMEM;
704 	}
705 	ti->private = dmz;
706 
707 	/* Get the target zoned block device */
708 	ret = dmz_get_zoned_device(ti, argv[0]);
709 	if (ret) {
710 		dmz->ddev = NULL;
711 		goto err;
712 	}
713 
714 	/* Initialize metadata */
715 	dev = dmz->dev;
716 	ret = dmz_ctr_metadata(dev, &dmz->metadata);
717 	if (ret) {
718 		ti->error = "Metadata initialization failed";
719 		goto err_dev;
720 	}
721 
722 	/* Set target (no write same support) */
723 	ti->max_io_len = dev->zone_nr_sectors << 9;
724 	ti->num_flush_bios = 1;
725 	ti->num_discard_bios = 1;
726 	ti->num_write_zeroes_bios = 1;
727 	ti->per_io_data_size = sizeof(struct dmz_bioctx);
728 	ti->flush_supported = true;
729 	ti->discards_supported = true;
730 
731 	/* The exposed capacity is the number of chunks that can be mapped */
732 	ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift;
733 
734 	/* Zone BIO */
735 	ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0);
736 	if (ret) {
737 		ti->error = "Create BIO set failed";
738 		goto err_meta;
739 	}
740 
741 	/* Chunk BIO work */
742 	mutex_init(&dmz->chunk_lock);
743 	INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO);
744 	dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND,
745 					0, dev->name);
746 	if (!dmz->chunk_wq) {
747 		ti->error = "Create chunk workqueue failed";
748 		ret = -ENOMEM;
749 		goto err_bio;
750 	}
751 
752 	/* Flush work */
753 	spin_lock_init(&dmz->flush_lock);
754 	bio_list_init(&dmz->flush_list);
755 	INIT_DELAYED_WORK(&dmz->flush_work, dmz_flush_work);
756 	dmz->flush_wq = alloc_ordered_workqueue("dmz_fwq_%s", WQ_MEM_RECLAIM,
757 						dev->name);
758 	if (!dmz->flush_wq) {
759 		ti->error = "Create flush workqueue failed";
760 		ret = -ENOMEM;
761 		goto err_cwq;
762 	}
763 	mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
764 
765 	/* Initialize reclaim */
766 	ret = dmz_ctr_reclaim(dev, dmz->metadata, &dmz->reclaim);
767 	if (ret) {
768 		ti->error = "Zone reclaim initialization failed";
769 		goto err_fwq;
770 	}
771 
772 	dmz_dev_info(dev, "Target device: %llu 512-byte logical sectors (%llu blocks)",
773 		     (unsigned long long)ti->len,
774 		     (unsigned long long)dmz_sect2blk(ti->len));
775 
776 	return 0;
777 err_fwq:
778 	destroy_workqueue(dmz->flush_wq);
779 err_cwq:
780 	destroy_workqueue(dmz->chunk_wq);
781 err_bio:
782 	mutex_destroy(&dmz->chunk_lock);
783 	bioset_exit(&dmz->bio_set);
784 err_meta:
785 	dmz_dtr_metadata(dmz->metadata);
786 err_dev:
787 	dmz_put_zoned_device(ti);
788 err:
789 	kfree(dmz);
790 
791 	return ret;
792 }
793 
/*
 * Cleanup target: stop the chunk works, reclaim and the flush work, flush
 * the metadata one last time and release all the resources set up by
 * dmz_ctr().
 */
static void dmz_dtr(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;

	/* Wait for the queued chunk works before destroying their workqueue */
	flush_workqueue(dmz->chunk_wq);
	destroy_workqueue(dmz->chunk_wq);

	dmz_dtr_reclaim(dmz->reclaim);

	/* The flush work re-arms itself: cancel it before destroying its workqueue */
	cancel_delayed_work_sync(&dmz->flush_work);
	destroy_workqueue(dmz->flush_wq);

	/* Final metadata flush: the result is deliberately ignored */
	(void) dmz_flush_metadata(dmz->metadata);

	dmz_dtr_metadata(dmz->metadata);

	bioset_exit(&dmz->bio_set);

	dmz_put_zoned_device(ti);

	mutex_destroy(&dmz->chunk_lock);

	kfree(dmz);
}
821 
/*
 * Setup target request queue limits.
 *
 * Block sizes and minimum/optimal I/O sizes are all set to DMZ_BLOCK_SIZE.
 * Discard, write zeroes and regular I/Os are limited to the number of
 * sectors of one zone of the backing device.
 */
static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct dmz_target *dmz = ti->private;
	unsigned int chunk_sectors = dmz->dev->zone_nr_sectors;

	limits->logical_block_size = DMZ_BLOCK_SIZE;
	limits->physical_block_size = DMZ_BLOCK_SIZE;

	blk_limits_io_min(limits, DMZ_BLOCK_SIZE);
	blk_limits_io_opt(limits, DMZ_BLOCK_SIZE);

	/* Discards operate on whole blocks, at most one zone at a time */
	limits->discard_alignment = DMZ_BLOCK_SIZE;
	limits->discard_granularity = DMZ_BLOCK_SIZE;
	limits->max_discard_sectors = chunk_sectors;
	limits->max_hw_discard_sectors = chunk_sectors;
	limits->max_write_zeroes_sectors = chunk_sectors;

	/* FS hint to try to align to the device zone size */
	limits->chunk_sectors = chunk_sectors;
	limits->max_sectors = chunk_sectors;

	/* We are exposing a drive-managed zoned block device */
	limits->zoned = BLK_ZONED_NONE;
}
849 
/*
 * Pass on ioctl to the backend device: @bdev is set to the backing zoned
 * block device of the target. Always returns 0.
 */
static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
{
	struct dmz_target *dmz = ti->private;

	*bdev = dmz->dev->bdev;

	return 0;
}
861 
/*
 * Stop works on suspend: wait for the queued chunk works, suspend reclaim
 * and cancel the periodic flush work. Registered as the postsuspend
 * method of the target.
 */
static void dmz_suspend(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;

	flush_workqueue(dmz->chunk_wq);
	dmz_suspend_reclaim(dmz->reclaim);
	cancel_delayed_work_sync(&dmz->flush_work);
}
873 
/*
 * Restart works on resume or if suspend failed: re-arm the periodic flush
 * work cancelled by dmz_suspend() and resume reclaim.
 */
static void dmz_resume(struct dm_target *ti)
{
	struct dmz_target *dmz = ti->private;

	queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
	dmz_resume_reclaim(dmz->reclaim);
}
884 
/*
 * Call @fn on the backing device of the target, for the range starting at
 * sector 0 and spanning the device capacity rounded down to a multiple of
 * the zone size. Returns the value returned by @fn.
 */
static int dmz_iterate_devices(struct dm_target *ti,
			       iterate_devices_callout_fn fn, void *data)
{
	struct dmz_target *dmz = ti->private;
	struct dmz_dev *dev = dmz->dev;
	/* The mask assumes a power of 2 zone size (see the ilog2() use in dmz_get_zoned_device()) */
	sector_t capacity = dev->capacity & ~(dev->zone_nr_sectors - 1);

	return fn(ti, dmz->ddev, 0, capacity, data);
}
894 
/*
 * Target type descriptor of the "zoned" target, registered in dmz_init()
 * and unregistered in dmz_exit().
 */
static struct target_type dmz_type = {
	.name		 = "zoned",
	.version	 = {1, 0, 0},
	.features	 = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM,
	.module		 = THIS_MODULE,
	.ctr		 = dmz_ctr,
	.dtr		 = dmz_dtr,
	.map		 = dmz_map,
	.io_hints	 = dmz_io_hints,
	.prepare_ioctl	 = dmz_prepare_ioctl,
	.postsuspend	 = dmz_suspend,
	.resume		 = dmz_resume,
	.iterate_devices = dmz_iterate_devices,
};
909 
/*
 * Module initialization: register the target type with device-mapper.
 * Returns the result of dm_register_target().
 */
static int __init dmz_init(void)
{
	return dm_register_target(&dmz_type);
}
914 
/*
 * Module cleanup: unregister the target type from device-mapper.
 */
static void __exit dmz_exit(void)
{
	dm_unregister_target(&dmz_type);
}
919 
/* Module entry and exit points */
module_init(dmz_init);
module_exit(dmz_exit);

/* Module information */
MODULE_DESCRIPTION(DM_NAME " target for zoned block devices");
MODULE_AUTHOR("Damien Le Moal <damien.lemoal@wdc.com>");
MODULE_LICENSE("GPL");
926