xref: /linux/drivers/block/zloop.c (revision f3e3dbcea15e20f7413afd8c791a496f0b80e80b)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2025, Christoph Hellwig.
4  * Copyright (c) 2025, Western Digital Corporation or its affiliates.
5  *
6  * Zoned Loop Device driver - exports a zoned block device using one file per
7  * zone as backing storage.
8  */
9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10 
11 #include <linux/module.h>
12 #include <linux/blk-mq.h>
13 #include <linux/blkzoned.h>
14 #include <linux/pagemap.h>
15 #include <linux/miscdevice.h>
16 #include <linux/falloc.h>
17 #include <linux/mutex.h>
18 #include <linux/parser.h>
19 #include <linux/seq_file.h>
20 #include <linux/xattr.h>
21 
/*
 * Options for adding (and removing) a device.
 *
 * Each option is a distinct bit so that zloop_options.mask can record
 * which options the user explicitly specified.
 */
enum {
	ZLOOP_OPT_ERR			= 0,
	ZLOOP_OPT_ID			= (1 << 0),
	ZLOOP_OPT_CAPACITY		= (1 << 1),
	ZLOOP_OPT_ZONE_SIZE		= (1 << 2),
	ZLOOP_OPT_ZONE_CAPACITY		= (1 << 3),
	ZLOOP_OPT_NR_CONV_ZONES		= (1 << 4),
	ZLOOP_OPT_BASE_DIR		= (1 << 5),
	ZLOOP_OPT_NR_QUEUES		= (1 << 6),
	ZLOOP_OPT_QUEUE_DEPTH		= (1 << 7),
	ZLOOP_OPT_BUFFERED_IO		= (1 << 8),
	ZLOOP_OPT_ZONE_APPEND		= (1 << 9),
	ZLOOP_OPT_ORDERED_ZONE_APPEND	= (1 << 10),
	ZLOOP_OPT_DISCARD_WRITE_CACHE	= (1 << 11),
	ZLOOP_OPT_MAX_OPEN_ZONES	= (1 << 12),
};
41 
/* Token table used by match_token() to parse the option strings above. */
static const match_table_t zloop_opt_tokens = {
	{ ZLOOP_OPT_ID,			"id=%d"	},
	{ ZLOOP_OPT_CAPACITY,		"capacity_mb=%u"	},
	{ ZLOOP_OPT_ZONE_SIZE,		"zone_size_mb=%u"	},
	{ ZLOOP_OPT_ZONE_CAPACITY,	"zone_capacity_mb=%u"	},
	{ ZLOOP_OPT_NR_CONV_ZONES,	"conv_zones=%u"		},
	{ ZLOOP_OPT_BASE_DIR,		"base_dir=%s"		},
	{ ZLOOP_OPT_NR_QUEUES,		"nr_queues=%u"		},
	{ ZLOOP_OPT_QUEUE_DEPTH,	"queue_depth=%u"	},
	{ ZLOOP_OPT_BUFFERED_IO,	"buffered_io"		},
	{ ZLOOP_OPT_ZONE_APPEND,	"zone_append=%u"	},
	{ ZLOOP_OPT_ORDERED_ZONE_APPEND, "ordered_zone_append"	},
	{ ZLOOP_OPT_DISCARD_WRITE_CACHE, "discard_write_cache" },
	{ ZLOOP_OPT_MAX_OPEN_ZONES,	"max_open_zones=%u"	},
	{ ZLOOP_OPT_ERR,		NULL			}
};
58 
/* Default values for the "add" operation. */
#define ZLOOP_DEF_ID			-1	/* -1: no explicit ID requested */
#define ZLOOP_DEF_ZONE_SIZE		((256ULL * SZ_1M) >> SECTOR_SHIFT)	/* in 512B sectors */
#define ZLOOP_DEF_NR_ZONES		64
#define ZLOOP_DEF_NR_CONV_ZONES		8
#define ZLOOP_DEF_MAX_OPEN_ZONES	0	/* 0: no open zones limit */
#define ZLOOP_DEF_BASE_DIR		"/var/local/zloop"
#define ZLOOP_DEF_NR_QUEUES		1
#define ZLOOP_DEF_QUEUE_DEPTH		128
#define ZLOOP_DEF_BUFFERED_IO		false
#define ZLOOP_DEF_ZONE_APPEND		true
#define ZLOOP_DEF_ORDERED_ZONE_APPEND	false

/* Arbitrary limit on the zone size (16GB). */
#define ZLOOP_MAX_ZONE_SIZE_MB		16384
74 
/*
 * Parsed add/remove options. "mask" is the OR of the ZLOOP_OPT_* flags
 * that were present in the option string; the remaining fields hold the
 * corresponding values. Sizes are in 512B sectors (sector_t).
 */
struct zloop_options {
	unsigned int		mask;		/* ZLOOP_OPT_* flags seen */
	int			id;		/* requested device ID, or -1 */
	sector_t		capacity;
	sector_t		zone_size;
	sector_t		zone_capacity;
	unsigned int		nr_conv_zones;
	unsigned int		max_open_zones;
	char			*base_dir;	/* backing directory path */
	unsigned int		nr_queues;
	unsigned int		queue_depth;
	bool			buffered_io;
	bool			zone_append;
	bool			ordered_zone_append;
	bool			discard_write_cache;
};
91 
/*
 * Device states. Requests are rejected once the state reaches Zlo_deleting
 * (see zloop_queue_rq()), and opens succeed only while Zlo_live.
 */
enum {
	Zlo_creating = 0,
	Zlo_live,
	Zlo_deleting,
};
100 
/* Bit numbers for zloop_zone.flags. */
enum zloop_zone_flags {
	ZLOOP_ZONE_CONV = 0,	/* conventional (not write-pointer) zone */
	ZLOOP_ZONE_SEQ_ERROR,	/* a write failed; re-sync wp from file size */
};
105 
/*
 * Zone descriptor.
 * Locking order: z.lock -> z.wp_lock -> zlo.open_zones_lock
 */
struct zloop_zone {
	struct list_head	open_zone_entry; /* link in open_zones_lru_list */
	struct file		*file;		/* backing file for this zone */

	unsigned long		flags;		/* ZLOOP_ZONE_* bits */
	struct mutex		lock;		/* serializes zone state updates */
	spinlock_t		wp_lock;	/* protects cond/wp transitions */
	enum blk_zone_cond	cond;		/* current zone condition */
	sector_t		start;		/* first sector of the zone */
	sector_t		wp;		/* write pointer; ULLONG_MAX when full */

	gfp_t			old_gfp_mask;	/* mapping gfp mask restored on free */
};
123 
/*
 * Per-device state. The trailing zones[] array holds one descriptor per
 * zone, with conventional zones first (indices 0 .. nr_conv_zones - 1).
 */
struct zloop_device {
	unsigned int		id;		/* device index */
	unsigned int		state;		/* Zlo_creating/Zlo_live/Zlo_deleting */

	struct blk_mq_tag_set	tag_set;
	struct gendisk		*disk;

	struct workqueue_struct *workqueue;	/* runs zloop_cmd_workfn() */
	bool			buffered_io;	/* skip IOCB_DIRECT when true */
	bool			zone_append;	/* zone append supported */
	bool			ordered_zone_append; /* assign append sector at dispatch */
	bool			discard_write_cache;

	const char		*base_dir;	/* backing directory path */
	struct file		*data_dir;	/* device data directory file */

	unsigned int		zone_shift;	/* log2(zone_size): sector <-> zone no */
	sector_t		zone_size;	/* zone size in 512B sectors */
	sector_t		zone_capacity;	/* writable sectors per sequential zone */
	unsigned int		nr_zones;
	unsigned int		nr_conv_zones;	/* number of leading conventional zones */
	unsigned int		max_open_zones;	/* 0 == unlimited */
	unsigned int		block_size;	/* logical block size in bytes */

	/* LRU of open zones, used to enforce max_open_zones. */
	spinlock_t		open_zones_lock;
	struct list_head	open_zones_lru_list;
	unsigned int		nr_open_zones;

	struct zloop_zone	zones[] __counted_by(nr_zones);
};
154 
/* Per-request command state, stored in the blk-mq request PDU. */
struct zloop_cmd {
	struct work_struct	work;		/* queued on zlo->workqueue */
	atomic_t		ref;		/* completion refcount, see zloop_put_cmd() */
	sector_t		sector;		/* target start sector */
	sector_t		nr_sectors;	/* request size in sectors */
	long			ret;		/* bytes transferred or negative errno */
	struct kiocb		iocb;		/* async I/O control block */
	struct bio_vec		*bvec;		/* flattened bvec array for multi-bio requests */
};
164 
/* Devices by ID — presumably protected by zloop_ctl_mutex; confirm in control path. */
static DEFINE_IDR(zloop_index_idr);
/* Serializes control operations and the device state check in zloop_open(). */
static DEFINE_MUTEX(zloop_ctl_mutex);
167 
rq_zone_no(struct request * rq)168 static unsigned int rq_zone_no(struct request *rq)
169 {
170 	struct zloop_device *zlo = rq->q->queuedata;
171 
172 	return blk_rq_pos(rq) >> zlo->zone_shift;
173 }
174 
175 /*
176  * Open an already open zone. This is mostly a no-op, except for the imp open ->
177  * exp open condition change that may happen. We also move a zone at the tail of
178  * the list of open zones so that if we need to
179  * implicitly close one open zone, we can do so in LRU order.
180  */
zloop_lru_rotate_open_zone(struct zloop_device * zlo,struct zloop_zone * zone)181 static inline void zloop_lru_rotate_open_zone(struct zloop_device *zlo,
182 					      struct zloop_zone *zone)
183 {
184 	if (zlo->max_open_zones) {
185 		spin_lock(&zlo->open_zones_lock);
186 		list_move_tail(&zone->open_zone_entry,
187 			       &zlo->open_zones_lru_list);
188 		spin_unlock(&zlo->open_zones_lock);
189 	}
190 }
191 
zloop_lru_remove_open_zone(struct zloop_device * zlo,struct zloop_zone * zone)192 static inline void zloop_lru_remove_open_zone(struct zloop_device *zlo,
193 					      struct zloop_zone *zone)
194 {
195 	if (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
196 	    zone->cond == BLK_ZONE_COND_EXP_OPEN) {
197 		spin_lock(&zlo->open_zones_lock);
198 		list_del_init(&zone->open_zone_entry);
199 		zlo->nr_open_zones--;
200 		spin_unlock(&zlo->open_zones_lock);
201 	}
202 }
203 
zloop_can_open_zone(struct zloop_device * zlo)204 static inline bool zloop_can_open_zone(struct zloop_device *zlo)
205 {
206 	return !zlo->max_open_zones || zlo->nr_open_zones < zlo->max_open_zones;
207 }
208 
209 /*
210  * If we have reached the maximum open zones limit, attempt to close an
211  * implicitly open zone (if we have any) so that we can implicitly open another
212  * zone without exceeding the maximum number of open zones.
213  */
zloop_close_imp_open_zone(struct zloop_device * zlo)214 static bool zloop_close_imp_open_zone(struct zloop_device *zlo)
215 {
216 	struct zloop_zone *zone;
217 
218 	lockdep_assert_held(&zlo->open_zones_lock);
219 
220 	if (zloop_can_open_zone(zlo))
221 		return true;
222 
223 	list_for_each_entry(zone, &zlo->open_zones_lru_list, open_zone_entry) {
224 		if (zone->cond == BLK_ZONE_COND_IMP_OPEN) {
225 			zone->cond = BLK_ZONE_COND_CLOSED;
226 			list_del_init(&zone->open_zone_entry);
227 			zlo->nr_open_zones--;
228 			return true;
229 		}
230 	}
231 
232 	return false;
233 }
234 
/*
 * Transition an empty or closed zone to (implicitly or explicitly) open,
 * enforcing the max open zones limit. Returns false if the limit prevents
 * the transition.
 */
static bool zloop_open_closed_or_empty_zone(struct zloop_device *zlo,
					    struct zloop_zone *zone,
					    bool explicit)
{
	bool opened = false;

	spin_lock(&zlo->open_zones_lock);

	if (explicit) {
		/*
		 * Explicit open: we cannot allow this if we have reached the
		 * maximum open zones limit.
		 */
		if (zloop_can_open_zone(zlo)) {
			zone->cond = BLK_ZONE_COND_EXP_OPEN;
			opened = true;
		}
	} else if (zloop_close_imp_open_zone(zlo)) {
		/*
		 * Implicit open case: if we have reached the maximum open zones
		 * limit, an implicitly open zone was closed to make room.
		 */
		zone->cond = BLK_ZONE_COND_IMP_OPEN;
		opened = true;
	}

	if (opened) {
		zlo->nr_open_zones++;
		list_add_tail(&zone->open_zone_entry,
			      &zlo->open_zones_lru_list);
	}

	spin_unlock(&zlo->open_zones_lock);

	return opened;
}
272 
/*
 * Open a zone, either explicitly (REQ_OP_ZONE_OPEN) or implicitly (write to
 * a non-open sequential zone). Returns false if the zone condition does not
 * allow opening or the open zones limit is reached.
 */
static bool zloop_do_open_zone(struct zloop_device *zlo,
			       struct zloop_zone *zone, bool explicit)
{
	/* Already open: at most promote implicit -> explicit and touch LRU. */
	if (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
	    zone->cond == BLK_ZONE_COND_EXP_OPEN) {
		if (explicit)
			zone->cond = BLK_ZONE_COND_EXP_OPEN;
		zloop_lru_rotate_open_zone(zlo, zone);
		return true;
	}

	/* Empty or closed zones can be opened, subject to the limit. */
	if (zone->cond == BLK_ZONE_COND_EMPTY ||
	    zone->cond == BLK_ZONE_COND_CLOSED)
		return zloop_open_closed_or_empty_zone(zlo, zone, explicit);

	/* Full, offline, etc: not openable. */
	return false;
}
290 
/* Transition a zone to the full condition; its write pointer becomes invalid. */
static void zloop_mark_full(struct zloop_device *zlo, struct zloop_zone *zone)
{
	lockdep_assert_held(&zone->wp_lock);

	/* A full zone no longer counts against the open zones limit. */
	zloop_lru_remove_open_zone(zlo, zone);
	zone->cond = BLK_ZONE_COND_FULL;
	zone->wp = ULLONG_MAX;
}
299 
/* Transition a zone to the empty condition with its write pointer at the start. */
static void zloop_mark_empty(struct zloop_device *zlo, struct zloop_zone *zone)
{
	lockdep_assert_held(&zone->wp_lock);

	/* An empty zone no longer counts against the open zones limit. */
	zloop_lru_remove_open_zone(zlo, zone);
	zone->cond = BLK_ZONE_COND_EMPTY;
	zone->wp = zone->start;
}
308 
/*
 * Re-derive a sequential zone's condition and write pointer from the size of
 * its backing file, after an error left the in-memory state uncertain: an
 * empty file means an empty zone, a file at zone capacity means a full zone,
 * and anything in between places the write pointer at start + file size.
 *
 * Returns 0 on success or a negative errno. On stat failure the zone is
 * re-marked with ZLOOP_ZONE_SEQ_ERROR so a later I/O retries the recovery.
 */
static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	struct kstat stat;
	sector_t file_sectors;
	int ret;

	lockdep_assert_held(&zone->lock);

	ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
	if (ret < 0) {
		pr_err("Failed to get zone %u file stat (err=%d)\n",
		       zone_no, ret);
		set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
		return ret;
	}

	/* The backing file may never exceed the zone capacity. */
	file_sectors = stat.size >> SECTOR_SHIFT;
	if (file_sectors > zlo->zone_capacity) {
		pr_err("Zone %u file too large (%llu sectors > %llu)\n",
		       zone_no, file_sectors, zlo->zone_capacity);
		return -EINVAL;
	}

	if (!IS_ALIGNED(stat.size, zlo->block_size)) {
		pr_err("Zone %u file size (%llu) not aligned to block size %u\n",
		       zone_no, stat.size, zlo->block_size);
		return -EINVAL;
	}

	spin_lock(&zone->wp_lock);
	if (!file_sectors) {
		zloop_mark_empty(zlo, zone);
	} else if (file_sectors == zlo->zone_capacity) {
		zloop_mark_full(zlo, zone);
	} else {
		/* Partially written: keep an open condition, otherwise closed. */
		if (zone->cond != BLK_ZONE_COND_IMP_OPEN &&
		    zone->cond != BLK_ZONE_COND_EXP_OPEN)
			zone->cond = BLK_ZONE_COND_CLOSED;
		zone->wp = zone->start + file_sectors;
	}
	spin_unlock(&zone->wp_lock);

	return 0;
}
354 
/*
 * Handle REQ_OP_ZONE_OPEN (explicit open) for zone @zone_no.
 * Returns 0 on success, -EIO for conventional zones or when the zone cannot
 * be transitioned to explicitly open.
 */
static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	int ret = 0;

	/* Conventional zones have no open/close state. */
	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	/* Recover the zone state from the backing file after a write error. */
	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
		ret = zloop_update_seq_zone(zlo, zone_no);
		if (ret)
			goto unlock;
	}

	if (!zloop_do_open_zone(zlo, zone, true))
		ret = -EIO;

unlock:
	mutex_unlock(&zone->lock);

	return ret;
}
379 
/*
 * Handle REQ_OP_ZONE_CLOSE for zone @zone_no. Closing an open zone with an
 * unmodified write pointer yields an empty zone; closing an empty or full
 * zone is an error, matching zoned device semantics.
 */
static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	int ret = 0;

	/* Conventional zones have no open/close state. */
	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	/* Recover the zone state from the backing file after a write error. */
	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
		ret = zloop_update_seq_zone(zlo, zone_no);
		if (ret)
			goto unlock;
	}

	switch (zone->cond) {
	case BLK_ZONE_COND_CLOSED:
		/* Already closed: nothing to do. */
		break;
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
		spin_lock(&zone->wp_lock);
		zloop_lru_remove_open_zone(zlo, zone);
		/* A closed zone with nothing written becomes empty. */
		if (zone->wp == zone->start)
			zone->cond = BLK_ZONE_COND_EMPTY;
		else
			zone->cond = BLK_ZONE_COND_CLOSED;
		spin_unlock(&zone->wp_lock);
		break;
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_FULL:
	default:
		ret = -EIO;
		break;
	}

unlock:
	mutex_unlock(&zone->lock);

	return ret;
}
421 
/*
 * Handle REQ_OP_ZONE_RESET for zone @zone_no: truncate the backing file to
 * zero and mark the zone empty. A no-op for zones already empty with no
 * pending error.
 */
static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	int ret = 0;

	/* Conventional zones cannot be reset. */
	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
	    zone->cond == BLK_ZONE_COND_EMPTY)
		goto unlock;

	if (vfs_truncate(&zone->file->f_path, 0)) {
		set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
		ret = -EIO;
		goto unlock;
	}

	/* Truncation succeeded, so any prior error state is now resolved. */
	spin_lock(&zone->wp_lock);
	zloop_mark_empty(zlo, zone);
	clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
	spin_unlock(&zone->wp_lock);

unlock:
	mutex_unlock(&zone->lock);

	return ret;
}
452 
zloop_reset_all_zones(struct zloop_device * zlo)453 static int zloop_reset_all_zones(struct zloop_device *zlo)
454 {
455 	unsigned int i;
456 	int ret;
457 
458 	for (i = zlo->nr_conv_zones; i < zlo->nr_zones; i++) {
459 		ret = zloop_reset_zone(zlo, i);
460 		if (ret)
461 			return ret;
462 	}
463 
464 	return 0;
465 }
466 
/*
 * Handle REQ_OP_ZONE_FINISH for zone @zone_no: extend the backing file to
 * the full zone size and mark the zone full. A no-op for zones already full
 * with no pending error.
 */
static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	int ret = 0;

	/* Conventional zones cannot be finished. */
	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
	    zone->cond == BLK_ZONE_COND_FULL)
		goto unlock;

	if (vfs_truncate(&zone->file->f_path, zlo->zone_size << SECTOR_SHIFT)) {
		set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
		ret = -EIO;
		goto unlock;
	}

	/* Truncation succeeded, so any prior error state is now resolved. */
	spin_lock(&zone->wp_lock);
	zloop_mark_full(zlo, zone);
	clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
	spin_unlock(&zone->wp_lock);

 unlock:
	mutex_unlock(&zone->lock);

	return ret;
}
497 
/*
 * Drop a reference on a command. The final put frees the temporary bvec
 * array and completes the blk-mq request (unless fault-injected timeout
 * testing asks us to skip the completion).
 */
static void zloop_put_cmd(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);

	if (!atomic_dec_and_test(&cmd->ref))
		return;
	kfree(cmd->bvec);
	cmd->bvec = NULL;
	if (likely(!blk_should_fake_timeout(rq->q)))
		blk_mq_complete_request(rq);
}
509 
/*
 * kiocb completion callback for zone file reads/writes: record the result
 * and drop the I/O's reference on the command.
 */
static void zloop_rw_complete(struct kiocb *iocb, long ret)
{
	struct zloop_cmd *cmd = container_of(iocb, struct zloop_cmd, iocb);

	cmd->ret = ret;
	zloop_put_cmd(cmd);
}
517 
/*
 * Issue the data transfer of a read/write/append command to the zone's
 * backing file as an asynchronous kiocb. Returns -EIOCBQUEUED when the I/O
 * was queued (zloop_rw_complete() then runs on completion) or the synchronous
 * result otherwise.
 */
static int zloop_do_rw(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	int rw = req_op(rq) == REQ_OP_READ ? ITER_DEST : ITER_SOURCE;
	unsigned int nr_bvec = blk_rq_nr_bvec(rq);
	struct zloop_device *zlo = rq->q->queuedata;
	struct zloop_zone *zone = &zlo->zones[rq_zone_no(rq)];
	struct req_iterator rq_iter;
	struct iov_iter iter;

	if (rq->bio != rq->biotail) {
		/* Multi-bio request: flatten all segments into one bvec array. */
		struct bio_vec tmp, *bvec;

		cmd->bvec = kmalloc_objs(*cmd->bvec, nr_bvec, GFP_NOIO);
		if (!cmd->bvec)
			return -EIO;

		/*
		 * The bios of the request may be started from the middle of
		 * the 'bvec' because of bio splitting, so we can't directly
		 * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
		 * API will take care of all details for us.
		 */
		bvec = cmd->bvec;
		rq_for_each_bvec(tmp, rq, rq_iter) {
			*bvec = tmp;
			bvec++;
		}
		iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq));
	} else {
		/*
		 * Same here, this bio may be started from the middle of the
		 * 'bvec' because of bio splitting, so offset from the bvec
		 * must be passed to iov iterator
		 */
		iov_iter_bvec(&iter, rw,
			__bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter),
					nr_bvec, blk_rq_bytes(rq));
		iter.iov_offset = rq->bio->bi_iter.bi_bvec_done;
	}

	/* File offset is relative to the zone start (one file per zone). */
	cmd->iocb.ki_pos = (cmd->sector - zone->start) << SECTOR_SHIFT;
	cmd->iocb.ki_filp = zone->file;
	cmd->iocb.ki_complete = zloop_rw_complete;
	if (!zlo->buffered_io)
		cmd->iocb.ki_flags = IOCB_DIRECT;
	cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);

	if (rw == ITER_SOURCE)
		return zone->file->f_op->write_iter(&cmd->iocb, &iter);
	return zone->file->f_op->read_iter(&cmd->iocb, &iter);
}
570 
/*
 * Validate and account a write or zone append against a sequential zone's
 * write pointer, implicitly opening the zone and advancing the write pointer
 * under wp_lock. Called with zone->lock held (see zloop_rw()).
 * Returns 0 when the I/O may proceed, -EIO otherwise.
 */
static int zloop_seq_write_prep(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	struct zloop_device *zlo = rq->q->queuedata;
	unsigned int zone_no = rq_zone_no(rq);
	sector_t nr_sectors = blk_rq_sectors(rq);
	bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
	struct zloop_zone *zone = &zlo->zones[zone_no];
	sector_t zone_end = zone->start + zlo->zone_capacity;
	int ret = 0;

	spin_lock(&zone->wp_lock);

	/*
	 * Zone append operations always go at the current write pointer, but
	 * regular write operations must already be aligned to the write pointer
	 * when submitted.
	 */
	if (is_append) {
		/*
		 * If ordered zone append is in use, we already checked and set
		 * the target sector in zloop_queue_rq().
		 */
		if (!zlo->ordered_zone_append) {
			if (zone->cond == BLK_ZONE_COND_FULL ||
			    zone->wp + nr_sectors > zone_end) {
				ret = -EIO;
				goto out_unlock;
			}
			cmd->sector = zone->wp;
		}
	} else {
		if (cmd->sector != zone->wp) {
			pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
			       zone_no, cmd->sector, zone->wp);
			ret = -EIO;
			goto out_unlock;
		}
	}

	/* Implicitly open the target zone. */
	if (!zloop_do_open_zone(zlo, zone, false)) {
		ret = -EIO;
		goto out_unlock;
	}

	/*
	 * Advance the write pointer, unless ordered zone append is in use. If
	 * the write fails, the write pointer position will be corrected when
	 * the next I/O starts execution.
	 */
	if (!is_append || !zlo->ordered_zone_append) {
		zone->wp += nr_sectors;
		if (zone->wp == zone_end)
			zloop_mark_full(zlo, zone);
	}
out_unlock:
	spin_unlock(&zone->wp_lock);
	return ret;
}
631 
/*
 * Execute a read, write, or zone append command: validate the target zone,
 * recover its state after a previous error if needed, and submit the data
 * transfer. Always completes the request, either asynchronously through the
 * kiocb path or directly on error.
 */
static void zloop_rw(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	struct zloop_device *zlo = rq->q->queuedata;
	unsigned int zone_no = rq_zone_no(rq);
	sector_t nr_sectors = blk_rq_sectors(rq);
	bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
	bool is_write = req_op(rq) == REQ_OP_WRITE || is_append;
	struct zloop_zone *zone;
	int ret = -EIO;

	/* Two references: one for the in-flight iocb, one for this function. */
	atomic_set(&cmd->ref, 2);
	cmd->sector = blk_rq_pos(rq);
	cmd->nr_sectors = nr_sectors;
	cmd->ret = 0;

	if (WARN_ON_ONCE(is_append && !zlo->zone_append))
		goto out;

	/* We should never get an I/O beyond the device capacity. */
	if (WARN_ON_ONCE(zone_no >= zlo->nr_zones))
		goto out;

	zone = &zlo->zones[zone_no];

	/*
	 * The block layer should never send requests that are not fully
	 * contained within the zone.
	 */
	if (WARN_ON_ONCE(cmd->sector + nr_sectors >
			 zone->start + zlo->zone_size))
		goto out;

	/* Recover the zone state from the backing file after a write error. */
	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
		mutex_lock(&zone->lock);
		ret = zloop_update_seq_zone(zlo, zone_no);
		mutex_unlock(&zone->lock);
		if (ret)
			goto out;
	}

	/* Sequential zone writes need write pointer checks under zone->lock. */
	if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
		mutex_lock(&zone->lock);
		ret = zloop_seq_write_prep(cmd);
		if (!ret)
			ret = zloop_do_rw(cmd);
		mutex_unlock(&zone->lock);
	} else {
		ret = zloop_do_rw(cmd);
	}
out:
	/* -EIOCBQUEUED means the iocb will call zloop_rw_complete() itself. */
	if (ret != -EIOCBQUEUED)
		zloop_rw_complete(&cmd->iocb, ret);
	zloop_put_cmd(cmd);
}
687 
zloop_zone_is_active(struct zloop_zone * zone)688 static inline bool zloop_zone_is_active(struct zloop_zone *zone)
689 {
690 	switch (zone->cond) {
691 	case BLK_ZONE_COND_EXP_OPEN:
692 	case BLK_ZONE_COND_IMP_OPEN:
693 	case BLK_ZONE_COND_CLOSED:
694 		return true;
695 	default:
696 		return false;
697 	}
698 }
699 
/*
 * Persist the write pointer of every active zone in a "user.zloop.wp"
 * extended attribute of the zone's backing file, so the position can be
 * restored later. Returns 0 on success or the first vfs_setxattr() error.
 */
static int zloop_record_safe_wps(struct zloop_device *zlo)
{
	unsigned int i;
	int ret;

	for (i = 0; i < zlo->nr_zones; i++) {
		struct zloop_zone *zone = &zlo->zones[i];
		struct file *file = zone->file;

		/* Empty/full/conventional zones need no saved write pointer. */
		if (!zloop_zone_is_active(zone))
			continue;
		ret = vfs_setxattr(file_mnt_idmap(file), file_dentry(file),
				"user.zloop.wp", &zone->wp, sizeof(zone->wp), 0);
		if (ret) {
			pr_err("%pg: failed to record write pointer (%d)\n",
				zlo->disk->part0, ret);
			return ret;
		}
	}

	return 0;
}
722 
/*
 * Sync the entire FS containing the zone files instead of walking all files.
 * When the backing FS write cache is being discarded on flush, first record
 * the current write pointers so they survive.
 */
static int zloop_flush(struct zloop_device *zlo)
{
	struct super_block *sb = file_inode(zlo->data_dir)->i_sb;
	int ret;

	if (zlo->discard_write_cache) {
		ret = zloop_record_safe_wps(zlo);
		if (ret)
			return ret;
	}

	/* s_umount taken shared to keep the FS mounted across the sync. */
	down_read(&sb->s_umount);
	ret = sync_filesystem(sb);
	up_read(&sb->s_umount);

	return ret;
}
743 
/*
 * Workqueue-context dispatcher: execute one request according to its
 * operation and complete it. Data transfers (read/write/append) complete
 * themselves via zloop_rw(); all other operations complete here.
 */
static void zloop_handle_cmd(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	struct zloop_device *zlo = rq->q->queuedata;

	/* We can block in this context, so ignore REQ_NOWAIT. */
	if (rq->cmd_flags & REQ_NOWAIT)
		rq->cmd_flags &= ~REQ_NOWAIT;

	switch (req_op(rq)) {
	case REQ_OP_READ:
	case REQ_OP_WRITE:
	case REQ_OP_ZONE_APPEND:
		/*
		 * zloop_rw() always executes asynchronously or completes
		 * directly.
		 */
		zloop_rw(cmd);
		return;
	case REQ_OP_FLUSH:
		cmd->ret = zloop_flush(zlo);
		break;
	case REQ_OP_ZONE_RESET:
		cmd->ret = zloop_reset_zone(zlo, rq_zone_no(rq));
		break;
	case REQ_OP_ZONE_RESET_ALL:
		cmd->ret = zloop_reset_all_zones(zlo);
		break;
	case REQ_OP_ZONE_FINISH:
		cmd->ret = zloop_finish_zone(zlo, rq_zone_no(rq));
		break;
	case REQ_OP_ZONE_OPEN:
		cmd->ret = zloop_open_zone(zlo, rq_zone_no(rq));
		break;
	case REQ_OP_ZONE_CLOSE:
		cmd->ret = zloop_close_zone(zlo, rq_zone_no(rq));
		break;
	default:
		WARN_ON_ONCE(1);
		pr_err("Unsupported operation %d\n", req_op(rq));
		cmd->ret = -EOPNOTSUPP;
		break;
	}

	blk_mq_complete_request(rq);
}
790 
/*
 * Work item entry point: run one command with task flags set to avoid
 * recursive writeback throttling and GFP_IO allocation recursion while
 * doing backing-file I/O. The original flags are restored afterwards.
 */
static void zloop_cmd_workfn(struct work_struct *work)
{
	struct zloop_cmd *cmd = container_of(work, struct zloop_cmd, work);
	int orig_flags = current->flags;

	current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
	zloop_handle_cmd(cmd);
	current->flags = orig_flags;
}
800 
/*
 * blk-mq ->complete handler: translate the backing-file I/O result stored in
 * cmd->ret into a block layer status, zero-filling short reads, flagging
 * failed sequential-zone writes for recovery, and reporting the written
 * sector for zone appends. Note that the zone is derived from cmd->sector,
 * which for appends is the sector assigned at submission time.
 */
static void zloop_complete_rq(struct request *rq)
{
	struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
	struct zloop_device *zlo = rq->q->queuedata;
	unsigned int zone_no = cmd->sector >> zlo->zone_shift;
	struct zloop_zone *zone = &zlo->zones[zone_no];
	blk_status_t sts = BLK_STS_OK;

	switch (req_op(rq)) {
	case REQ_OP_READ:
		if (cmd->ret < 0)
			pr_err("Zone %u: failed read sector %llu, %llu sectors\n",
			       zone_no, cmd->sector, cmd->nr_sectors);

		if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
			/* short read */
			struct bio *bio;

			__rq_for_each_bio(bio, rq)
				zero_fill_bio(bio);
		}
		break;
	case REQ_OP_WRITE:
	case REQ_OP_ZONE_APPEND:
		if (cmd->ret < 0)
			pr_err("Zone %u: failed %swrite sector %llu, %llu sectors\n",
			       zone_no,
			       req_op(rq) == REQ_OP_WRITE ? "" : "append ",
			       cmd->sector, cmd->nr_sectors);

		/* Treat a partial write as an error. */
		if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
			pr_err("Zone %u: partial write %ld/%u B\n",
			       zone_no, cmd->ret, blk_rq_bytes(rq));
			cmd->ret = -EIO;
		}

		if (cmd->ret < 0 && !test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
			/*
			 * A write to a sequential zone file failed: mark the
			 * zone as having an error. This will be corrected and
			 * cleared when the next IO is submitted.
			 */
			set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
			break;
		}
		/* Report the sector the append data actually landed on. */
		if (req_op(rq) == REQ_OP_ZONE_APPEND)
			rq->__sector = cmd->sector;

		break;
	default:
		break;
	}

	if (cmd->ret < 0)
		sts = errno_to_blk_status(cmd->ret);
	blk_mq_end_request(rq, sts);
}
858 
zloop_set_zone_append_sector(struct request * rq)859 static bool zloop_set_zone_append_sector(struct request *rq)
860 {
861 	struct zloop_device *zlo = rq->q->queuedata;
862 	unsigned int zone_no = rq_zone_no(rq);
863 	struct zloop_zone *zone = &zlo->zones[zone_no];
864 	sector_t zone_end = zone->start + zlo->zone_capacity;
865 	sector_t nr_sectors = blk_rq_sectors(rq);
866 
867 	spin_lock(&zone->wp_lock);
868 
869 	if (zone->cond == BLK_ZONE_COND_FULL ||
870 	    zone->wp + nr_sectors > zone_end) {
871 		spin_unlock(&zone->wp_lock);
872 		return false;
873 	}
874 
875 	rq->__sector = zone->wp;
876 	zone->wp += blk_rq_sectors(rq);
877 	if (zone->wp >= zone_end)
878 		zloop_mark_full(zlo, zone);
879 
880 	spin_unlock(&zone->wp_lock);
881 
882 	return true;
883 }
884 
/*
 * blk-mq ->queue_rq handler: reject requests on a dying device, assign the
 * append sector up front when ordered zone append is enabled, and hand the
 * request to the command workqueue for execution.
 */
static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
				   const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
	struct zloop_device *zlo = rq->q->queuedata;

	/* Lockless state peek: a racing delete only needs an eventual error. */
	if (data_race(READ_ONCE(zlo->state)) == Zlo_deleting) {
		rq->rq_flags |= RQF_QUIET;
		return BLK_STS_IOERR;
	}

	/*
	 * If we need to strongly order zone append operations, set the request
	 * sector to the zone write pointer location now instead of when the
	 * command work runs.
	 */
	if (zlo->ordered_zone_append && req_op(rq) == REQ_OP_ZONE_APPEND) {
		if (!zloop_set_zone_append_sector(rq))
			return BLK_STS_IOERR;
	}

	blk_mq_start_request(rq);

	INIT_WORK(&cmd->work, zloop_cmd_workfn);
	queue_work(zlo->workqueue, &cmd->work);

	return BLK_STS_OK;
}
914 
/* blk-mq dispatch and completion callbacks. */
static const struct blk_mq_ops zloop_mq_ops = {
	.queue_rq       = zloop_queue_rq,
	.complete	= zloop_complete_rq,
};
919 
/*
 * Block device ->open: only allow opening a fully created, live device.
 * The state check is done under zloop_ctl_mutex to serialize against
 * device addition and removal.
 */
static int zloop_open(struct gendisk *disk, blk_mode_t mode)
{
	struct zloop_device *zlo = disk->private_data;
	int ret;

	ret = mutex_lock_killable(&zloop_ctl_mutex);
	if (ret)
		return ret;

	if (zlo->state != Zlo_live)
		ret = -ENXIO;
	mutex_unlock(&zloop_ctl_mutex);
	return ret;
}
934 
/*
 * Block device ->report_zones: report up to @nr_zones zone descriptors
 * starting from the zone containing @sector, refreshing any zone whose
 * state is stale after a write error. Returns the number of zones
 * reported, or a negative errno.
 */
static int zloop_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, struct blk_report_zones_args *args)
{
	struct zloop_device *zlo = disk->private_data;
	struct blk_zone blkz = {};
	unsigned int first, i;
	int ret;

	first = disk_zone_no(disk, sector);
	if (first >= zlo->nr_zones)
		return 0;
	nr_zones = min(nr_zones, zlo->nr_zones - first);

	for (i = 0; i < nr_zones; i++) {
		unsigned int zone_no = first + i;
		struct zloop_zone *zone = &zlo->zones[zone_no];

		mutex_lock(&zone->lock);

		/* Re-sync the zone from its backing file after a write error. */
		if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
			ret = zloop_update_seq_zone(zlo, zone_no);
			if (ret) {
				mutex_unlock(&zone->lock);
				return ret;
			}
		}

		blkz.start = zone->start;
		blkz.len = zlo->zone_size;
		spin_lock(&zone->wp_lock);
		blkz.wp = zone->wp;
		spin_unlock(&zone->wp_lock);
		blkz.cond = zone->cond;
		if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
			blkz.type = BLK_ZONE_TYPE_CONVENTIONAL;
			blkz.capacity = zlo->zone_size;
		} else {
			blkz.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
			blkz.capacity = zlo->zone_capacity;
		}

		mutex_unlock(&zone->lock);

		ret = disk_report_zone(disk, &blkz, i, args);
		if (ret)
			return ret;
	}

	return nr_zones;
}
985 
/*
 * Block device ->free_disk: release everything owned by the device — the
 * tag set, each zone's backing file (restoring its original mapping gfp
 * mask), the data directory, the workqueue, and the device itself.
 */
static void zloop_free_disk(struct gendisk *disk)
{
	struct zloop_device *zlo = disk->private_data;
	unsigned int i;

	blk_mq_free_tag_set(&zlo->tag_set);

	for (i = 0; i < zlo->nr_zones; i++) {
		struct zloop_zone *zone = &zlo->zones[i];

		mapping_set_gfp_mask(zone->file->f_mapping,
				zone->old_gfp_mask);
		fput(zone->file);
	}

	fput(zlo->data_dir);
	destroy_workqueue(zlo->workqueue);
	kfree(zlo->base_dir);
	kvfree(zlo);
}
1006 
/* Block device operations exposed by the zloop disk. */
static const struct block_device_operations zloop_fops = {
	.owner			= THIS_MODULE,
	.open			= zloop_open,
	.report_zones		= zloop_report_zones,
	.free_disk		= zloop_free_disk,
};
1013 
1014 __printf(3, 4)
zloop_filp_open_fmt(int oflags,umode_t mode,const char * fmt,...)1015 static struct file *zloop_filp_open_fmt(int oflags, umode_t mode,
1016 		const char *fmt, ...)
1017 {
1018 	struct file *file;
1019 	va_list ap;
1020 	char *p;
1021 
1022 	va_start(ap, fmt);
1023 	p = kvasprintf(GFP_KERNEL, fmt, ap);
1024 	va_end(ap);
1025 
1026 	if (!p)
1027 		return ERR_PTR(-ENOMEM);
1028 	file = filp_open(p, oflags, mode);
1029 	kfree(p);
1030 	return file;
1031 }
1032 
/*
 * Determine the device logical block size from one zone's backing file and
 * verify the configured zone capacity is aligned to it.
 * Returns 0 on success or -EINVAL on misalignment.
 */
static int zloop_get_block_size(struct zloop_device *zlo,
				struct zloop_zone *zone)
{
	struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev;
	struct kstat st;

	/*
	 * If the FS block size is lower than or equal to 4K, use that as the
	 * device block size. Otherwise, fallback to the FS direct IO alignment
	 * constraint if that is provided, and to the FS underlying device
	 * physical block size if the direct IO alignment is unknown.
	 */
	if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K)
		zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize;
	else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) &&
		 (st.result_mask & STATX_DIOALIGN))
		zlo->block_size = st.dio_offset_align;
	else if (sb_bdev)
		zlo->block_size = bdev_physical_block_size(sb_bdev);
	else
		zlo->block_size = SECTOR_SIZE;

	/* Zone capacity (sectors) must be a multiple of the block size. */
	if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
		pr_err("Zone capacity is not aligned to block size %u\n",
		       zlo->block_size);
		return -EINVAL;
	}

	return 0;
}
1063 
/*
 * Open and initialize the backing file of zone @zone_no. For a new device,
 * the zone file is created. When restoring an existing device (@restore is
 * true), the zone file size is checked for consistency with the given zone
 * configuration.
 */
static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
			   unsigned int zone_no, bool restore)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	int oflags = O_RDWR;
	struct kstat stat;
	sector_t file_sectors;
	int ret;

	mutex_init(&zone->lock);
	INIT_LIST_HEAD(&zone->open_zone_entry);
	spin_lock_init(&zone->wp_lock);
	zone->start = (sector_t)zone_no << zlo->zone_shift;

	if (!restore)
		oflags |= O_CREAT;

	if (!opts->buffered_io)
		oflags |= O_DIRECT;

	if (zone_no < zlo->nr_conv_zones) {
		/* Conventional zone file. */
		set_bit(ZLOOP_ZONE_CONV, &zone->flags);
		zone->cond = BLK_ZONE_COND_NOT_WP;
		zone->wp = U64_MAX;

		zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/cnv-%06u",
					zlo->base_dir, zlo->id, zone_no);
		if (IS_ERR(zone->file)) {
			pr_err("Failed to open zone %u file %s/%u/cnv-%06u (err=%ld)",
			       zone_no, zlo->base_dir, zlo->id, zone_no,
			       PTR_ERR(zone->file));
			return PTR_ERR(zone->file);
		}

		/* The first zone file opened determines the block size. */
		if (!zlo->block_size) {
			ret = zloop_get_block_size(zlo, zone);
			if (ret)
				return ret;
		}

		ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
		if (ret < 0) {
			pr_err("Failed to get zone %u file stat\n", zone_no);
			return ret;
		}
		file_sectors = stat.size >> SECTOR_SHIFT;

		/*
		 * A restored conventional zone file must be fully provisioned:
		 * fail with -EINVAL instead of leaking the (successful)
		 * vfs_getattr() return value, and report the zone size that
		 * was actually compared against.
		 */
		if (restore && file_sectors != zlo->zone_size) {
			pr_err("Invalid conventional zone %u file size (%llu sectors != %llu)\n",
			       zone_no, file_sectors, zlo->zone_size);
			return -EINVAL;
		}

		ret = vfs_truncate(&zone->file->f_path,
				   zlo->zone_size << SECTOR_SHIFT);
		if (ret < 0) {
			pr_err("Failed to truncate zone %u file (err=%d)\n",
			       zone_no, ret);
			return ret;
		}

		return 0;
	}

	/* Sequential zone file. */
	zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/seq-%06u",
					 zlo->base_dir, zlo->id, zone_no);
	if (IS_ERR(zone->file)) {
		pr_err("Failed to open zone %u file %s/%u/seq-%06u (err=%ld)",
		       zone_no, zlo->base_dir, zlo->id, zone_no,
		       PTR_ERR(zone->file));
		return PTR_ERR(zone->file);
	}

	if (!zlo->block_size) {
		ret = zloop_get_block_size(zlo, zone);
		if (ret)
			return ret;
	}

	/*
	 * The zone write pointer is derived from the zone file size, under
	 * the zone lock to serialize against I/O.
	 */
	mutex_lock(&zone->lock);
	ret = zloop_update_seq_zone(zlo, zone_no);
	mutex_unlock(&zone->lock);

	return ret;
}
1153 
zloop_dev_exists(struct zloop_device * zlo)1154 static bool zloop_dev_exists(struct zloop_device *zlo)
1155 {
1156 	struct file *cnv, *seq;
1157 	bool exists;
1158 
1159 	cnv = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/cnv-%06u",
1160 				  zlo->base_dir, zlo->id, 0);
1161 	seq = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/seq-%06u",
1162 				  zlo->base_dir, zlo->id, 0);
1163 	exists = !IS_ERR(cnv) || !IS_ERR(seq);
1164 
1165 	if (!IS_ERR(cnv))
1166 		fput(cnv);
1167 	if (!IS_ERR(seq))
1168 		fput(seq);
1169 
1170 	return exists;
1171 }
1172 
zloop_ctl_add(struct zloop_options * opts)1173 static int zloop_ctl_add(struct zloop_options *opts)
1174 {
1175 	struct queue_limits lim = {
1176 		.max_hw_sectors		= SZ_1M >> SECTOR_SHIFT,
1177 		.chunk_sectors		= opts->zone_size,
1178 		.features		= BLK_FEAT_ZONED | BLK_FEAT_WRITE_CACHE,
1179 
1180 	};
1181 	unsigned int nr_zones, i, j;
1182 	struct zloop_device *zlo;
1183 	int ret = -EINVAL;
1184 	bool restore;
1185 
1186 	__module_get(THIS_MODULE);
1187 
1188 	nr_zones = opts->capacity >> ilog2(opts->zone_size);
1189 	if (opts->nr_conv_zones >= nr_zones) {
1190 		pr_err("Invalid number of conventional zones %u\n",
1191 		       opts->nr_conv_zones);
1192 		goto out;
1193 	}
1194 
1195 	if (opts->max_open_zones > nr_zones - opts->nr_conv_zones) {
1196 		pr_err("Invalid maximum number of open zones %u\n",
1197 		       opts->max_open_zones);
1198 		goto out;
1199 	}
1200 
1201 	zlo = kvzalloc_flex(*zlo, zones, nr_zones);
1202 	if (!zlo) {
1203 		ret = -ENOMEM;
1204 		goto out;
1205 	}
1206 	WRITE_ONCE(zlo->state, Zlo_creating);
1207 	spin_lock_init(&zlo->open_zones_lock);
1208 	INIT_LIST_HEAD(&zlo->open_zones_lru_list);
1209 
1210 	ret = mutex_lock_killable(&zloop_ctl_mutex);
1211 	if (ret)
1212 		goto out_free_dev;
1213 
1214 	/* Allocate id, if @opts->id >= 0, we're requesting that specific id */
1215 	if (opts->id >= 0) {
1216 		ret = idr_alloc(&zloop_index_idr, zlo,
1217 				  opts->id, opts->id + 1, GFP_KERNEL);
1218 		if (ret == -ENOSPC)
1219 			ret = -EEXIST;
1220 	} else {
1221 		ret = idr_alloc(&zloop_index_idr, zlo, 0, 0, GFP_KERNEL);
1222 	}
1223 	mutex_unlock(&zloop_ctl_mutex);
1224 	if (ret < 0)
1225 		goto out_free_dev;
1226 
1227 	zlo->id = ret;
1228 	zlo->zone_shift = ilog2(opts->zone_size);
1229 	zlo->zone_size = opts->zone_size;
1230 	if (opts->zone_capacity)
1231 		zlo->zone_capacity = opts->zone_capacity;
1232 	else
1233 		zlo->zone_capacity = zlo->zone_size;
1234 	zlo->nr_zones = nr_zones;
1235 	zlo->nr_conv_zones = opts->nr_conv_zones;
1236 	zlo->max_open_zones = opts->max_open_zones;
1237 	zlo->buffered_io = opts->buffered_io;
1238 	zlo->zone_append = opts->zone_append;
1239 	if (zlo->zone_append)
1240 		zlo->ordered_zone_append = opts->ordered_zone_append;
1241 	zlo->discard_write_cache = opts->discard_write_cache;
1242 
1243 	zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
1244 				opts->nr_queues * opts->queue_depth, zlo->id);
1245 	if (!zlo->workqueue) {
1246 		ret = -ENOMEM;
1247 		goto out_free_idr;
1248 	}
1249 
1250 	if (opts->base_dir)
1251 		zlo->base_dir = kstrdup(opts->base_dir, GFP_KERNEL);
1252 	else
1253 		zlo->base_dir = kstrdup(ZLOOP_DEF_BASE_DIR, GFP_KERNEL);
1254 	if (!zlo->base_dir) {
1255 		ret = -ENOMEM;
1256 		goto out_destroy_workqueue;
1257 	}
1258 
1259 	zlo->data_dir = zloop_filp_open_fmt(O_RDONLY | O_DIRECTORY, 0, "%s/%u",
1260 					    zlo->base_dir, zlo->id);
1261 	if (IS_ERR(zlo->data_dir)) {
1262 		ret = PTR_ERR(zlo->data_dir);
1263 		pr_warn("Failed to open directory %s/%u (err=%d)\n",
1264 			zlo->base_dir, zlo->id, ret);
1265 		goto out_free_base_dir;
1266 	}
1267 
1268 	/*
1269 	 * If we already have zone files, we are restoring a device created by a
1270 	 * previous add operation. In this case, zloop_init_zone() will check
1271 	 * that the zone files are consistent with the zone configuration given.
1272 	 */
1273 	restore = zloop_dev_exists(zlo);
1274 	for (i = 0; i < nr_zones; i++) {
1275 		ret = zloop_init_zone(zlo, opts, i, restore);
1276 		if (ret)
1277 			goto out_close_files;
1278 	}
1279 
1280 	lim.physical_block_size = zlo->block_size;
1281 	lim.logical_block_size = zlo->block_size;
1282 	if (zlo->zone_append)
1283 		lim.max_hw_zone_append_sectors = lim.max_hw_sectors;
1284 	lim.max_open_zones = zlo->max_open_zones;
1285 
1286 	zlo->tag_set.ops = &zloop_mq_ops;
1287 	zlo->tag_set.nr_hw_queues = opts->nr_queues;
1288 	zlo->tag_set.queue_depth = opts->queue_depth;
1289 	zlo->tag_set.numa_node = NUMA_NO_NODE;
1290 	zlo->tag_set.cmd_size = sizeof(struct zloop_cmd);
1291 	zlo->tag_set.driver_data = zlo;
1292 
1293 	ret = blk_mq_alloc_tag_set(&zlo->tag_set);
1294 	if (ret) {
1295 		pr_err("blk_mq_alloc_tag_set failed (err=%d)\n", ret);
1296 		goto out_close_files;
1297 	}
1298 
1299 	zlo->disk = blk_mq_alloc_disk(&zlo->tag_set, &lim, zlo);
1300 	if (IS_ERR(zlo->disk)) {
1301 		pr_err("blk_mq_alloc_disk failed (err=%d)\n", ret);
1302 		ret = PTR_ERR(zlo->disk);
1303 		goto out_cleanup_tags;
1304 	}
1305 	zlo->disk->flags = GENHD_FL_NO_PART;
1306 	zlo->disk->fops = &zloop_fops;
1307 	zlo->disk->private_data = zlo;
1308 	sprintf(zlo->disk->disk_name, "zloop%d", zlo->id);
1309 	set_capacity(zlo->disk, (u64)lim.chunk_sectors * zlo->nr_zones);
1310 
1311 	ret = blk_revalidate_disk_zones(zlo->disk);
1312 	if (ret)
1313 		goto out_cleanup_disk;
1314 
1315 	ret = add_disk(zlo->disk);
1316 	if (ret) {
1317 		pr_err("add_disk failed (err=%d)\n", ret);
1318 		goto out_cleanup_disk;
1319 	}
1320 
1321 	mutex_lock(&zloop_ctl_mutex);
1322 	WRITE_ONCE(zlo->state, Zlo_live);
1323 	mutex_unlock(&zloop_ctl_mutex);
1324 
1325 	pr_info("zloop: device %d, %u zones of %llu MiB, %u B block size\n",
1326 		zlo->id, zlo->nr_zones,
1327 		((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20,
1328 		zlo->block_size);
1329 	pr_info("zloop%d: using %s%s zone append\n",
1330 		zlo->id,
1331 		zlo->ordered_zone_append ? "ordered " : "",
1332 		zlo->zone_append ? "native" : "emulated");
1333 
1334 	return 0;
1335 
1336 out_cleanup_disk:
1337 	put_disk(zlo->disk);
1338 out_cleanup_tags:
1339 	blk_mq_free_tag_set(&zlo->tag_set);
1340 out_close_files:
1341 	for (j = 0; j < i; j++) {
1342 		struct zloop_zone *zone = &zlo->zones[j];
1343 
1344 		if (!IS_ERR_OR_NULL(zone->file))
1345 			fput(zone->file);
1346 	}
1347 	fput(zlo->data_dir);
1348 out_free_base_dir:
1349 	kfree(zlo->base_dir);
1350 out_destroy_workqueue:
1351 	destroy_workqueue(zlo->workqueue);
1352 out_free_idr:
1353 	mutex_lock(&zloop_ctl_mutex);
1354 	idr_remove(&zloop_index_idr, zlo->id);
1355 	mutex_unlock(&zloop_ctl_mutex);
1356 out_free_dev:
1357 	kvfree(zlo);
1358 out:
1359 	module_put(THIS_MODULE);
1360 	if (ret == -ENOENT)
1361 		ret = -EINVAL;
1362 	return ret;
1363 }
1364 
/*
 * Discard the volatile write cache: roll every active zone backing file back
 * to the write pointer position recorded in its "user.zloop.wp" extended
 * attribute, dropping any data written past that point.
 */
static void zloop_forget_cache(struct zloop_device *zlo)
{
	unsigned int i;
	int ret;

	pr_info("%pg: discarding volatile write cache\n", zlo->disk->part0);

	for (i = 0; i < zlo->nr_zones; i++) {
		struct zloop_zone *zone = &zlo->zones[i];
		struct file *file = zone->file;
		sector_t old_wp;

		/* Only active zones can have unstable (cached) data. */
		if (!zloop_zone_is_active(zone))
			continue;

		ret = vfs_getxattr(file_mnt_idmap(file), file_dentry(file),
				"user.zloop.wp", &old_wp, sizeof(old_wp));
		if (ret == -ENODATA) {
			/*
			 * No write pointer recorded for this zone.
			 * NOTE(review): old_wp = 0 makes the truncation below
			 * compute (0 - zone->start), which wraps for any zone
			 * but the first; presumably the recorded value is an
			 * absolute sector — confirm the intended file offset
			 * for this case.
			 */
			old_wp = 0;
		} else if (ret != sizeof(old_wp)) {
			pr_err("%pg: failed to retrieve write pointer (%d)\n",
				zlo->disk->part0, ret);
			continue;
		}

		/* Nothing written past the recorded wp: keep the file as is. */
		if (old_wp > zone->wp)
			continue;
		/*
		 * This should not happen, if we recorded a full zone, it can't
		 * be active.
		 */
		if (WARN_ON_ONCE(old_wp == ULLONG_MAX))
			continue;

		/* Truncate to the in-file offset of the recorded wp. */
		vfs_truncate(&file->f_path,
			(old_wp - zone->start) << SECTOR_SHIFT);
	}
}
1403 
zloop_ctl_remove(struct zloop_options * opts)1404 static int zloop_ctl_remove(struct zloop_options *opts)
1405 {
1406 	struct zloop_device *zlo;
1407 	int ret;
1408 
1409 	if (!(opts->mask & ZLOOP_OPT_ID)) {
1410 		pr_err("No ID specified for remove\n");
1411 		return -EINVAL;
1412 	}
1413 
1414 	if (opts->mask & ~ZLOOP_OPT_ID) {
1415 		pr_err("Invalid option specified for remove\n");
1416 		return -EINVAL;
1417 	}
1418 
1419 	ret = mutex_lock_killable(&zloop_ctl_mutex);
1420 	if (ret)
1421 		return ret;
1422 
1423 	zlo = idr_find(&zloop_index_idr, opts->id);
1424 	if (!zlo || zlo->state == Zlo_creating) {
1425 		ret = -ENODEV;
1426 	} else if (zlo->state == Zlo_deleting) {
1427 		ret = -EINVAL;
1428 	} else {
1429 		idr_remove(&zloop_index_idr, zlo->id);
1430 		WRITE_ONCE(zlo->state, Zlo_deleting);
1431 	}
1432 
1433 	mutex_unlock(&zloop_ctl_mutex);
1434 	if (ret)
1435 		return ret;
1436 
1437 	del_gendisk(zlo->disk);
1438 
1439 	if (zlo->discard_write_cache)
1440 		zloop_forget_cache(zlo);
1441 
1442 	put_disk(zlo->disk);
1443 
1444 	pr_info("Removed device %d\n", opts->id);
1445 
1446 	module_put(THIS_MODULE);
1447 
1448 	return 0;
1449 }
1450 
/*
 * Parse the comma-separated option string @buf into @opts, after first
 * setting all default values. Only light per-option validity checks are done
 * here; the full configuration validation happens in zloop_ctl_add().
 *
 * Returns 0 on success and a negative error code on invalid or unknown
 * options or on memory allocation failure.
 */
static int zloop_parse_options(struct zloop_options *opts, const char *buf)
{
	substring_t args[MAX_OPT_ARGS];
	char *options, *o, *p;
	/* Holds the matched token id, then is reused for parsed uint values. */
	unsigned int token;
	int ret = 0;

	/* Set defaults. */
	opts->mask = 0;
	opts->id = ZLOOP_DEF_ID;
	opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES;
	opts->zone_size = ZLOOP_DEF_ZONE_SIZE;
	opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES;
	opts->max_open_zones = ZLOOP_DEF_MAX_OPEN_ZONES;
	opts->nr_queues = ZLOOP_DEF_NR_QUEUES;
	opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
	opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;
	opts->zone_append = ZLOOP_DEF_ZONE_APPEND;
	opts->ordered_zone_append = ZLOOP_DEF_ORDERED_ZONE_APPEND;

	/* No option string: keep the defaults. */
	if (!buf)
		return 0;

	/* Skip leading spaces before the options. */
	while (isspace(*buf))
		buf++;

	/* strsep() modifies its argument: work on a copy. */
	options = o = kstrdup(buf, GFP_KERNEL);
	if (!options)
		return -ENOMEM;

	/* Parse the options, doing only some light invalid value checks. */
	while ((p = strsep(&o, ",\n")) != NULL) {
		if (!*p)
			continue;

		token = match_token(p, zloop_opt_tokens, args);
		opts->mask |= token;
		switch (token) {
		case ZLOOP_OPT_ID:
			if (match_int(args, &opts->id)) {
				ret = -EINVAL;
				goto out;
			}
			break;
		case ZLOOP_OPT_CAPACITY:
			if (match_uint(args, &token)) {
				ret = -EINVAL;
				goto out;
			}
			if (!token) {
				pr_err("Invalid capacity\n");
				ret = -EINVAL;
				goto out;
			}
			/* Convert from MiB to 512 B sectors. */
			opts->capacity =
				((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
			break;
		case ZLOOP_OPT_ZONE_SIZE:
			if (match_uint(args, &token)) {
				ret = -EINVAL;
				goto out;
			}
			/* Zoned block devices require a power-of-2 zone size. */
			if (!token || token > ZLOOP_MAX_ZONE_SIZE_MB ||
			    !is_power_of_2(token)) {
				pr_err("Invalid zone size %u\n", token);
				ret = -EINVAL;
				goto out;
			}
			opts->zone_size =
				((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
			break;
		case ZLOOP_OPT_ZONE_CAPACITY:
			if (match_uint(args, &token)) {
				ret = -EINVAL;
				goto out;
			}
			if (!token) {
				pr_err("Invalid zone capacity\n");
				ret = -EINVAL;
				goto out;
			}
			opts->zone_capacity =
				((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
			break;
		case ZLOOP_OPT_NR_CONV_ZONES:
			if (match_uint(args, &token)) {
				ret = -EINVAL;
				goto out;
			}
			opts->nr_conv_zones = token;
			break;
		case ZLOOP_OPT_MAX_OPEN_ZONES:
			if (match_uint(args, &token)) {
				ret = -EINVAL;
				goto out;
			}
			opts->max_open_zones = token;
			break;
		case ZLOOP_OPT_BASE_DIR:
			p = match_strdup(args);
			if (!p) {
				ret = -ENOMEM;
				goto out;
			}
			/* A repeated base_dir option overrides earlier ones. */
			kfree(opts->base_dir);
			opts->base_dir = p;
			break;
		case ZLOOP_OPT_NR_QUEUES:
			if (match_uint(args, &token)) {
				ret = -EINVAL;
				goto out;
			}
			if (!token) {
				pr_err("Invalid number of queues\n");
				ret = -EINVAL;
				goto out;
			}
			/* No point in having more queues than online CPUs. */
			opts->nr_queues = min(token, num_online_cpus());
			break;
		case ZLOOP_OPT_QUEUE_DEPTH:
			if (match_uint(args, &token)) {
				ret = -EINVAL;
				goto out;
			}
			if (!token) {
				pr_err("Invalid queue depth\n");
				ret = -EINVAL;
				goto out;
			}
			opts->queue_depth = token;
			break;
		case ZLOOP_OPT_BUFFERED_IO:
			opts->buffered_io = true;
			break;
		case ZLOOP_OPT_ZONE_APPEND:
			if (match_uint(args, &token)) {
				ret = -EINVAL;
				goto out;
			}
			if (token != 0 && token != 1) {
				pr_err("Invalid zone_append value\n");
				ret = -EINVAL;
				goto out;
			}
			opts->zone_append = token;
			break;
		case ZLOOP_OPT_ORDERED_ZONE_APPEND:
			opts->ordered_zone_append = true;
			break;
		case ZLOOP_OPT_DISCARD_WRITE_CACHE:
			opts->discard_write_cache = true;
			break;
		case ZLOOP_OPT_ERR:
		default:
			pr_warn("unknown parameter or missing value '%s'\n", p);
			ret = -EINVAL;
			goto out;
		}
	}

	/* Cross-option sanity checks. */
	ret = -EINVAL;
	if (opts->capacity <= opts->zone_size) {
		pr_err("Invalid capacity\n");
		goto out;
	}

	if (opts->zone_capacity > opts->zone_size) {
		pr_err("Invalid zone capacity\n");
		goto out;
	}

	ret = 0;
out:
	kfree(options);
	return ret;
}
1628 
/* Control operation codes, mapped to command names by zloop_ctl_ops[]. */
enum {
	ZLOOP_CTL_ADD,
	ZLOOP_CTL_REMOVE,
};
1633 
/* Table of supported control commands, terminated by a NULL name entry. */
static struct zloop_ctl_op {
	int		code;
	const char	*name;
} zloop_ctl_ops[] = {
	{ ZLOOP_CTL_ADD,	"add" },
	{ ZLOOP_CTL_REMOVE,	"remove" },
	{ -1,	NULL },
};
1642 
zloop_ctl_write(struct file * file,const char __user * ubuf,size_t count,loff_t * pos)1643 static ssize_t zloop_ctl_write(struct file *file, const char __user *ubuf,
1644 			       size_t count, loff_t *pos)
1645 {
1646 	struct zloop_options opts = { };
1647 	struct zloop_ctl_op *op;
1648 	const char *buf, *opts_buf;
1649 	int i, ret;
1650 
1651 	if (count > PAGE_SIZE)
1652 		return -ENOMEM;
1653 
1654 	buf = memdup_user_nul(ubuf, count);
1655 	if (IS_ERR(buf))
1656 		return PTR_ERR(buf);
1657 
1658 	for (i = 0; i < ARRAY_SIZE(zloop_ctl_ops); i++) {
1659 		op = &zloop_ctl_ops[i];
1660 		if (!op->name) {
1661 			pr_err("Invalid operation\n");
1662 			ret = -EINVAL;
1663 			goto out;
1664 		}
1665 		if (!strncmp(buf, op->name, strlen(op->name)))
1666 			break;
1667 	}
1668 
1669 	if (count <= strlen(op->name))
1670 		opts_buf = NULL;
1671 	else
1672 		opts_buf = buf + strlen(op->name);
1673 
1674 	ret = zloop_parse_options(&opts, opts_buf);
1675 	if (ret) {
1676 		pr_err("Failed to parse options\n");
1677 		goto out;
1678 	}
1679 
1680 	switch (op->code) {
1681 	case ZLOOP_CTL_ADD:
1682 		ret = zloop_ctl_add(&opts);
1683 		break;
1684 	case ZLOOP_CTL_REMOVE:
1685 		ret = zloop_ctl_remove(&opts);
1686 		break;
1687 	default:
1688 		pr_err("Invalid operation\n");
1689 		ret = -EINVAL;
1690 		goto out;
1691 	}
1692 
1693 out:
1694 	kfree(opts.base_dir);
1695 	kfree(buf);
1696 	return ret ? ret : count;
1697 }
1698 
zloop_ctl_show(struct seq_file * seq_file,void * private)1699 static int zloop_ctl_show(struct seq_file *seq_file, void *private)
1700 {
1701 	const struct match_token *tok;
1702 	int i;
1703 
1704 	/* Add operation */
1705 	seq_printf(seq_file, "%s ", zloop_ctl_ops[0].name);
1706 	for (i = 0; i < ARRAY_SIZE(zloop_opt_tokens); i++) {
1707 		tok = &zloop_opt_tokens[i];
1708 		if (!tok->pattern)
1709 			break;
1710 		if (i)
1711 			seq_putc(seq_file, ',');
1712 		seq_puts(seq_file, tok->pattern);
1713 	}
1714 	seq_putc(seq_file, '\n');
1715 
1716 	/* Remove operation */
1717 	seq_puts(seq_file, zloop_ctl_ops[1].name);
1718 	seq_puts(seq_file, " id=%d\n");
1719 
1720 	return 0;
1721 }
1722 
/* Open /dev/zloop-control for reading the command syntax via seq_file. */
static int zloop_ctl_open(struct inode *inode, struct file *file)
{
	/* Cleared before single_open() stores its seq_file pointer here. */
	file->private_data = NULL;
	return single_open(file, zloop_ctl_show, NULL);
}
1728 
/* Release counterpart of zloop_ctl_open(). */
static int zloop_ctl_release(struct inode *inode, struct file *file)
{
	return single_release(inode, file);
}
1733 
/* File operations of the zloop-control character device. */
static const struct file_operations zloop_ctl_fops = {
	.owner		= THIS_MODULE,
	.open		= zloop_ctl_open,
	.release	= zloop_ctl_release,
	.write		= zloop_ctl_write,
	.read		= seq_read,
};
1741 
/* Misc device providing the /dev/zloop-control interface. */
static struct miscdevice zloop_misc = {
	.minor		= MISC_DYNAMIC_MINOR,
	.name		= "zloop-control",
	.fops		= &zloop_ctl_fops,
};
1747 
zloop_init(void)1748 static int __init zloop_init(void)
1749 {
1750 	int ret;
1751 
1752 	ret = misc_register(&zloop_misc);
1753 	if (ret) {
1754 		pr_err("Failed to register misc device: %d\n", ret);
1755 		return ret;
1756 	}
1757 	pr_info("Module loaded\n");
1758 
1759 	return 0;
1760 }
1761 
/* Module unload: unregister the control device and free the ID index. */
static void __exit zloop_exit(void)
{
	misc_deregister(&zloop_misc);
	idr_destroy(&zloop_index_idr);
}
1767 
/* Module registration and metadata. */
module_init(zloop_init);
module_exit(zloop_exit);

MODULE_DESCRIPTION("Zoned loopback device");
MODULE_LICENSE("GPL");
1773