xref: /linux/drivers/block/zloop.c (revision 8457669db968c98edb781892d73fa559e1efcbd4)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright (c) 2025, Christoph Hellwig.
4  * Copyright (c) 2025, Western Digital Corporation or its affiliates.
5  *
6  * Zoned Loop Device driver - exports a zoned block device using one file per
7  * zone as backing storage.
8  */
9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10 
11 #include <linux/module.h>
12 #include <linux/blk-mq.h>
13 #include <linux/blkzoned.h>
14 #include <linux/pagemap.h>
15 #include <linux/miscdevice.h>
16 #include <linux/falloc.h>
17 #include <linux/mutex.h>
18 #include <linux/parser.h>
19 #include <linux/seq_file.h>
20 
21 /*
22  * Options for adding (and removing) a device.
23  */
/*
 * Option flag bits, combined in zloop_options->mask to record which
 * options were specified for an "add" or "remove" operation.
 */
enum {
	ZLOOP_OPT_ERR			= 0,
	ZLOOP_OPT_ID			= (1 << 0),
	ZLOOP_OPT_CAPACITY		= (1 << 1),
	ZLOOP_OPT_ZONE_SIZE		= (1 << 2),
	ZLOOP_OPT_ZONE_CAPACITY		= (1 << 3),
	ZLOOP_OPT_NR_CONV_ZONES		= (1 << 4),
	ZLOOP_OPT_BASE_DIR		= (1 << 5),
	ZLOOP_OPT_NR_QUEUES		= (1 << 6),
	ZLOOP_OPT_QUEUE_DEPTH		= (1 << 7),
	ZLOOP_OPT_BUFFERED_IO		= (1 << 8),
	ZLOOP_OPT_ZONE_APPEND		= (1 << 9),
	ZLOOP_OPT_ORDERED_ZONE_APPEND	= (1 << 10),
};
38 
/* match_token() patterns mapping option strings to the flags above. */
static const match_table_t zloop_opt_tokens = {
	{ ZLOOP_OPT_ID,			"id=%d"	},
	{ ZLOOP_OPT_CAPACITY,		"capacity_mb=%u"	},
	{ ZLOOP_OPT_ZONE_SIZE,		"zone_size_mb=%u"	},
	{ ZLOOP_OPT_ZONE_CAPACITY,	"zone_capacity_mb=%u"	},
	{ ZLOOP_OPT_NR_CONV_ZONES,	"conv_zones=%u"		},
	{ ZLOOP_OPT_BASE_DIR,		"base_dir=%s"		},
	{ ZLOOP_OPT_NR_QUEUES,		"nr_queues=%u"		},
	{ ZLOOP_OPT_QUEUE_DEPTH,	"queue_depth=%u"	},
	{ ZLOOP_OPT_BUFFERED_IO,	"buffered_io"		},
	{ ZLOOP_OPT_ZONE_APPEND,	"zone_append=%u"	},
	{ ZLOOP_OPT_ORDERED_ZONE_APPEND, "ordered_zone_append"	},
	{ ZLOOP_OPT_ERR,		NULL			}
};
53 
/* Default values for the "add" operation. */
#define ZLOOP_DEF_ID			-1
/* Default zone size: 256 MB, expressed in 512B sectors. */
#define ZLOOP_DEF_ZONE_SIZE		((256ULL * SZ_1M) >> SECTOR_SHIFT)
#define ZLOOP_DEF_NR_ZONES		64
#define ZLOOP_DEF_NR_CONV_ZONES		8
#define ZLOOP_DEF_BASE_DIR		"/var/local/zloop"
#define ZLOOP_DEF_NR_QUEUES		1
#define ZLOOP_DEF_QUEUE_DEPTH		128
#define ZLOOP_DEF_BUFFERED_IO		false
#define ZLOOP_DEF_ZONE_APPEND		true
#define ZLOOP_DEF_ORDERED_ZONE_APPEND	false

/* Arbitrary limit on the zone size (16GB). */
#define ZLOOP_MAX_ZONE_SIZE_MB		16384
68 
/*
 * Parsed options for an "add" (or "remove") control operation.
 * All sector_t fields are in units of 512B sectors.
 */
struct zloop_options {
	unsigned int		mask;		/* ZLOOP_OPT_* flags of options seen */
	int			id;		/* requested device id, or ZLOOP_DEF_ID */
	sector_t		capacity;	/* total device capacity */
	sector_t		zone_size;	/* zone size */
	sector_t		zone_capacity;	/* writable capacity per zone */
	unsigned int		nr_conv_zones;	/* number of conventional zones */
	char			*base_dir;	/* backing directory path */
	unsigned int		nr_queues;	/* number of hardware queues */
	unsigned int		queue_depth;	/* per-queue depth */
	bool			buffered_io;	/* use buffered instead of direct I/O */
	bool			zone_append;	/* support native zone append */
	bool			ordered_zone_append;	/* assign append sectors at queue time */
};
83 
/*
 * Device states.
 */
enum {
	Zlo_creating = 0,	/* "add" in progress, device not yet usable */
	Zlo_live,		/* disk added, I/O accepted */
	Zlo_deleting,		/* removal in progress, new I/O rejected */
};
92 
/* Bits stored in zloop_zone->flags. */
enum zloop_zone_flags {
	ZLOOP_ZONE_CONV = 0,	/* conventional zone (no write pointer) */
	ZLOOP_ZONE_SEQ_ERROR,	/* cached zone state stale after a file I/O error */
};
97 
/*
 * Per-zone state. Each zone is backed by its own file.
 */
struct zloop_zone {
	struct file		*file;		/* zone backing file */

	unsigned long		flags;		/* ZLOOP_ZONE_* bits */
	struct mutex		lock;		/* serializes zone state changes and writes */
	spinlock_t		wp_lock;	/* protects wp and cond updates done with it */
	enum blk_zone_cond	cond;		/* current zone condition */
	sector_t		start;		/* first sector of the zone */
	sector_t		wp;		/* write pointer; ULLONG_MAX when full */

	gfp_t			old_gfp_mask;	/* mapping gfp mask, restored on free */
};
110 
/*
 * Per-device instance state.
 */
struct zloop_device {
	unsigned int		id;		/* idr-allocated device id */
	unsigned int		state;		/* Zlo_creating / Zlo_live / Zlo_deleting */

	struct blk_mq_tag_set	tag_set;
	struct gendisk		*disk;

	struct workqueue_struct *workqueue;	/* executes zloop_cmd work items */
	bool			buffered_io;	/* zone files opened without O_DIRECT */
	bool			zone_append;	/* native zone append enabled */
	bool			ordered_zone_append;	/* append sector set in queue_rq */

	const char		*base_dir;	/* parent directory of zone files */
	struct file		*data_dir;	/* "<base_dir>/<id>" directory */

	unsigned int		zone_shift;	/* ilog2(zone_size) */
	sector_t		zone_size;	/* zone size in 512B sectors */
	sector_t		zone_capacity;	/* writable sectors per sequential zone */
	unsigned int		nr_zones;	/* total number of zones */
	unsigned int		nr_conv_zones;	/* first nr_conv_zones are conventional */
	unsigned int		block_size;	/* logical/physical block size in bytes */

	struct zloop_zone	zones[] __counted_by(nr_zones);
};
135 
/*
 * Per-request driver data (blk-mq PDU, see tag_set.cmd_size).
 */
struct zloop_cmd {
	struct work_struct	work;		/* deferred execution context */
	atomic_t		ref;		/* submission + completion references */
	sector_t		sector;		/* target sector (append: actual wp used) */
	sector_t		nr_sectors;	/* request size in sectors */
	long			ret;		/* bytes transferred, or negative errno */
	struct kiocb		iocb;		/* async file I/O control block */
	struct bio_vec		*bvec;		/* bvec array for multi-bio requests */
};
145 
/* Registered devices indexed by id; allocations done under zloop_ctl_mutex. */
static DEFINE_IDR(zloop_index_idr);
static DEFINE_MUTEX(zloop_ctl_mutex);
148 
rq_zone_no(struct request * rq)149 static unsigned int rq_zone_no(struct request *rq)
150 {
151 	struct zloop_device *zlo = rq->q->queuedata;
152 
153 	return blk_rq_pos(rq) >> zlo->zone_shift;
154 }
155 
/*
 * Refresh the cached condition and write pointer of sequential zone
 * @zone_no from the size of its backing file. Used to recover the zone
 * state after a file I/O error (ZLOOP_ZONE_SEQ_ERROR). Must be called
 * with the zone mutex held.
 *
 * Returns 0 on success, or a negative error code if the file cannot be
 * inspected or its size is inconsistent with the zone configuration.
 */
static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	struct kstat stat;
	sector_t file_sectors;
	unsigned long flags;
	int ret;

	lockdep_assert_held(&zone->lock);

	ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
	if (ret < 0) {
		pr_err("Failed to get zone %u file stat (err=%d)\n",
		       zone_no, ret);
		/* Keep the zone in error until the state can be refreshed. */
		set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
		return ret;
	}

	/* The file size must not exceed the zone capacity. */
	file_sectors = stat.size >> SECTOR_SHIFT;
	if (file_sectors > zlo->zone_capacity) {
		pr_err("Zone %u file too large (%llu sectors > %llu)\n",
		       zone_no, file_sectors, zlo->zone_capacity);
		return -EINVAL;
	}

	if (file_sectors & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
		pr_err("Zone %u file size not aligned to block size %u\n",
		       zone_no, zlo->block_size);
		return -EINVAL;
	}

	/* Derive the zone condition and write pointer from the file size. */
	spin_lock_irqsave(&zone->wp_lock, flags);
	if (!file_sectors) {
		zone->cond = BLK_ZONE_COND_EMPTY;
		zone->wp = zone->start;
	} else if (file_sectors == zlo->zone_capacity) {
		zone->cond = BLK_ZONE_COND_FULL;
		zone->wp = ULLONG_MAX;
	} else {
		zone->cond = BLK_ZONE_COND_CLOSED;
		zone->wp = zone->start + file_sectors;
	}
	spin_unlock_irqrestore(&zone->wp_lock, flags);

	return 0;
}
202 
/*
 * Handle a REQ_OP_ZONE_OPEN: transition sequential zone @zone_no to the
 * explicitly open condition. Fails with -EIO for conventional zones and
 * for full zones.
 */
static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	int ret = 0;

	/* Conventional zones have no condition to change. */
	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	/* Re-sync the zone state from the backing file after an error. */
	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
		ret = zloop_update_seq_zone(zlo, zone_no);
		if (ret)
			goto unlock;
	}

	switch (zone->cond) {
	case BLK_ZONE_COND_EXP_OPEN:
		/* Already explicitly open: nothing to do. */
		break;
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_CLOSED:
	case BLK_ZONE_COND_IMP_OPEN:
		zone->cond = BLK_ZONE_COND_EXP_OPEN;
		break;
	case BLK_ZONE_COND_FULL:
	default:
		ret = -EIO;
		break;
	}

unlock:
	mutex_unlock(&zone->lock);

	return ret;
}
238 
/*
 * Handle a REQ_OP_ZONE_CLOSE: transition an open sequential zone to the
 * closed condition (or back to empty if nothing was written). Fails with
 * -EIO for conventional, empty and full zones.
 */
static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	unsigned long flags;
	int ret = 0;

	/* Conventional zones have no condition to change. */
	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	/* Re-sync the zone state from the backing file after an error. */
	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
		ret = zloop_update_seq_zone(zlo, zone_no);
		if (ret)
			goto unlock;
	}

	switch (zone->cond) {
	case BLK_ZONE_COND_CLOSED:
		/* Already closed: nothing to do. */
		break;
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
		/* An open zone with an untouched write pointer becomes empty. */
		spin_lock_irqsave(&zone->wp_lock, flags);
		if (zone->wp == zone->start)
			zone->cond = BLK_ZONE_COND_EMPTY;
		else
			zone->cond = BLK_ZONE_COND_CLOSED;
		spin_unlock_irqrestore(&zone->wp_lock, flags);
		break;
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_FULL:
	default:
		ret = -EIO;
		break;
	}

unlock:
	mutex_unlock(&zone->lock);

	return ret;
}
280 
/*
 * Handle a REQ_OP_ZONE_RESET: truncate the zone backing file to zero and
 * return the zone to the empty condition. Fails with -EIO for
 * conventional zones.
 */
static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	unsigned long flags;
	int ret = 0;

	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	/*
	 * An already-empty zone needs no work, unless it is flagged in
	 * error, in which case the truncate below also clears the error.
	 */
	if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
	    zone->cond == BLK_ZONE_COND_EMPTY)
		goto unlock;

	if (vfs_truncate(&zone->file->f_path, 0)) {
		set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
		ret = -EIO;
		goto unlock;
	}

	spin_lock_irqsave(&zone->wp_lock, flags);
	zone->cond = BLK_ZONE_COND_EMPTY;
	zone->wp = zone->start;
	clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
	spin_unlock_irqrestore(&zone->wp_lock, flags);

unlock:
	mutex_unlock(&zone->lock);

	return ret;
}
313 
zloop_reset_all_zones(struct zloop_device * zlo)314 static int zloop_reset_all_zones(struct zloop_device *zlo)
315 {
316 	unsigned int i;
317 	int ret;
318 
319 	for (i = zlo->nr_conv_zones; i < zlo->nr_zones; i++) {
320 		ret = zloop_reset_zone(zlo, i);
321 		if (ret)
322 			return ret;
323 	}
324 
325 	return 0;
326 }
327 
/*
 * Handle a REQ_OP_ZONE_FINISH: extend the zone backing file to the full
 * zone size and mark the zone full. Fails with -EIO for conventional
 * zones.
 */
static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	unsigned long flags;
	int ret = 0;

	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	/*
	 * An already-full zone needs no work, unless it is flagged in
	 * error, in which case the truncate below also clears the error.
	 */
	if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
	    zone->cond == BLK_ZONE_COND_FULL)
		goto unlock;

	if (vfs_truncate(&zone->file->f_path, zlo->zone_size << SECTOR_SHIFT)) {
		set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
		ret = -EIO;
		goto unlock;
	}

	spin_lock_irqsave(&zone->wp_lock, flags);
	zone->cond = BLK_ZONE_COND_FULL;
	zone->wp = ULLONG_MAX;
	clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
	spin_unlock_irqrestore(&zone->wp_lock, flags);

 unlock:
	mutex_unlock(&zone->lock);

	return ret;
}
360 
zloop_put_cmd(struct zloop_cmd * cmd)361 static void zloop_put_cmd(struct zloop_cmd *cmd)
362 {
363 	struct request *rq = blk_mq_rq_from_pdu(cmd);
364 
365 	if (!atomic_dec_and_test(&cmd->ref))
366 		return;
367 	kfree(cmd->bvec);
368 	cmd->bvec = NULL;
369 	if (likely(!blk_should_fake_timeout(rq->q)))
370 		blk_mq_complete_request(rq);
371 }
372 
/*
 * kiocb completion callback: record the I/O result and drop one command
 * reference, completing the request if this was the last reference.
 */
static void zloop_rw_complete(struct kiocb *iocb, long ret)
{
	struct zloop_cmd *cmd = container_of(iocb, struct zloop_cmd, iocb);

	cmd->ret = ret;
	zloop_put_cmd(cmd);
}
380 
/*
 * Execute a read, write or zone append request using async kiocb I/O on
 * the target zone backing file. For writes to sequential zones, the zone
 * write pointer is checked and advanced under wp_lock and the zone mutex
 * is held across submission to serialize writes to the file.
 *
 * Completion always goes through zloop_rw_complete(): either called
 * directly here for synchronous results/errors, or from the async
 * completion path when the I/O returned -EIOCBQUEUED.
 */
static void zloop_rw(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	struct zloop_device *zlo = rq->q->queuedata;
	unsigned int zone_no = rq_zone_no(rq);
	sector_t sector = blk_rq_pos(rq);
	sector_t nr_sectors = blk_rq_sectors(rq);
	bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
	bool is_write = req_op(rq) == REQ_OP_WRITE || is_append;
	int rw = is_write ? ITER_SOURCE : ITER_DEST;
	struct req_iterator rq_iter;
	struct zloop_zone *zone;
	struct iov_iter iter;
	struct bio_vec tmp;
	unsigned long flags;
	sector_t zone_end;
	unsigned int nr_bvec;
	int ret;

	/* One reference for this function, one for the completion callback. */
	atomic_set(&cmd->ref, 2);
	cmd->sector = sector;
	cmd->nr_sectors = nr_sectors;
	cmd->ret = 0;

	/* Zone append requests are only valid if zone append is enabled. */
	if (WARN_ON_ONCE(is_append && !zlo->zone_append)) {
		ret = -EIO;
		goto out;
	}

	/* We should never get an I/O beyond the device capacity. */
	if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) {
		ret = -EIO;
		goto out;
	}
	zone = &zlo->zones[zone_no];
	zone_end = zone->start + zlo->zone_capacity;

	/*
	 * The block layer should never send requests that are not fully
	 * contained within the zone.
	 */
	if (WARN_ON_ONCE(sector + nr_sectors > zone->start + zlo->zone_size)) {
		ret = -EIO;
		goto out;
	}

	/* Re-sync the zone state from the backing file after an error. */
	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
		mutex_lock(&zone->lock);
		ret = zloop_update_seq_zone(zlo, zone_no);
		mutex_unlock(&zone->lock);
		if (ret)
			goto out;
	}

	if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
		/* Held until after I/O submission, see the unlock label. */
		mutex_lock(&zone->lock);

		spin_lock_irqsave(&zone->wp_lock, flags);

		/*
		 * Zone append operations always go at the current write
		 * pointer, but regular write operations must already be
		 * aligned to the write pointer when submitted.
		 */
		if (is_append) {
			/*
			 * If ordered zone append is in use, we already checked
			 * and set the target sector in zloop_queue_rq().
			 */
			if (!zlo->ordered_zone_append) {
				if (zone->cond == BLK_ZONE_COND_FULL ||
				    zone->wp + nr_sectors > zone_end) {
					spin_unlock_irqrestore(&zone->wp_lock,
							       flags);
					ret = -EIO;
					goto unlock;
				}
				sector = zone->wp;
			}
			cmd->sector = sector;
		} else if (sector != zone->wp) {
			spin_unlock_irqrestore(&zone->wp_lock, flags);
			pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
			       zone_no, sector, zone->wp);
			ret = -EIO;
			goto unlock;
		}

		/* Implicitly open the target zone. */
		if (zone->cond == BLK_ZONE_COND_CLOSED ||
		    zone->cond == BLK_ZONE_COND_EMPTY)
			zone->cond = BLK_ZONE_COND_IMP_OPEN;

		/*
		 * Advance the write pointer, unless ordered zone append is in
		 * use. If the write fails, the write pointer position will be
		 * corrected when the next I/O starts execution.
		 */
		if (!is_append || !zlo->ordered_zone_append) {
			zone->wp += nr_sectors;
			if (zone->wp == zone_end) {
				zone->cond = BLK_ZONE_COND_FULL;
				zone->wp = ULLONG_MAX;
			}
		}

		spin_unlock_irqrestore(&zone->wp_lock, flags);
	}

	nr_bvec = blk_rq_nr_bvec(rq);

	if (rq->bio != rq->biotail) {
		struct bio_vec *bvec;

		cmd->bvec = kmalloc_objs(*cmd->bvec, nr_bvec, GFP_NOIO);
		if (!cmd->bvec) {
			ret = -EIO;
			goto unlock;
		}

		/*
		 * The bios of the request may be started from the middle of
		 * the 'bvec' because of bio splitting, so we can't directly
		 * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
		 * API will take care of all details for us.
		 */
		bvec = cmd->bvec;
		rq_for_each_bvec(tmp, rq, rq_iter) {
			*bvec = tmp;
			bvec++;
		}
		iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq));
	} else {
		/*
		 * Same here, this bio may be started from the middle of the
		 * 'bvec' because of bio splitting, so offset from the bvec
		 * must be passed to iov iterator
		 */
		iov_iter_bvec(&iter, rw,
			__bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter),
					nr_bvec, blk_rq_bytes(rq));
		iter.iov_offset = rq->bio->bi_iter.bi_bvec_done;
	}

	/* The file offset is relative to the start of the zone. */
	cmd->iocb.ki_pos = (sector - zone->start) << SECTOR_SHIFT;
	cmd->iocb.ki_filp = zone->file;
	cmd->iocb.ki_complete = zloop_rw_complete;
	if (!zlo->buffered_io)
		cmd->iocb.ki_flags = IOCB_DIRECT;
	cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);

	if (rw == ITER_SOURCE)
		ret = zone->file->f_op->write_iter(&cmd->iocb, &iter);
	else
		ret = zone->file->f_op->read_iter(&cmd->iocb, &iter);
unlock:
	if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write)
		mutex_unlock(&zone->lock);
out:
	/* Synchronous completion or submission error: complete here. */
	if (ret != -EIOCBQUEUED)
		zloop_rw_complete(&cmd->iocb, ret);
	zloop_put_cmd(cmd);
}
544 
/*
 * Handle a REQ_OP_FLUSH: sync the entire FS containing the zone files
 * instead of walking all files.
 */
static int zloop_flush(struct zloop_device *zlo)
{
	struct super_block *sb = file_inode(zlo->data_dir)->i_sb;
	int ret;

	/* Hold s_umount across the sync, as sync_filesystem() requires. */
	down_read(&sb->s_umount);
	ret = sync_filesystem(sb);
	up_read(&sb->s_umount);

	return ret;
}
559 
zloop_handle_cmd(struct zloop_cmd * cmd)560 static void zloop_handle_cmd(struct zloop_cmd *cmd)
561 {
562 	struct request *rq = blk_mq_rq_from_pdu(cmd);
563 	struct zloop_device *zlo = rq->q->queuedata;
564 
565 	/* We can block in this context, so ignore REQ_NOWAIT. */
566 	if (rq->cmd_flags & REQ_NOWAIT)
567 		rq->cmd_flags &= ~REQ_NOWAIT;
568 
569 	switch (req_op(rq)) {
570 	case REQ_OP_READ:
571 	case REQ_OP_WRITE:
572 	case REQ_OP_ZONE_APPEND:
573 		/*
574 		 * zloop_rw() always executes asynchronously or completes
575 		 * directly.
576 		 */
577 		zloop_rw(cmd);
578 		return;
579 	case REQ_OP_FLUSH:
580 		cmd->ret = zloop_flush(zlo);
581 		break;
582 	case REQ_OP_ZONE_RESET:
583 		cmd->ret = zloop_reset_zone(zlo, rq_zone_no(rq));
584 		break;
585 	case REQ_OP_ZONE_RESET_ALL:
586 		cmd->ret = zloop_reset_all_zones(zlo);
587 		break;
588 	case REQ_OP_ZONE_FINISH:
589 		cmd->ret = zloop_finish_zone(zlo, rq_zone_no(rq));
590 		break;
591 	case REQ_OP_ZONE_OPEN:
592 		cmd->ret = zloop_open_zone(zlo, rq_zone_no(rq));
593 		break;
594 	case REQ_OP_ZONE_CLOSE:
595 		cmd->ret = zloop_close_zone(zlo, rq_zone_no(rq));
596 		break;
597 	default:
598 		WARN_ON_ONCE(1);
599 		pr_err("Unsupported operation %d\n", req_op(rq));
600 		cmd->ret = -EOPNOTSUPP;
601 		break;
602 	}
603 
604 	blk_mq_complete_request(rq);
605 }
606 
zloop_cmd_workfn(struct work_struct * work)607 static void zloop_cmd_workfn(struct work_struct *work)
608 {
609 	struct zloop_cmd *cmd = container_of(work, struct zloop_cmd, work);
610 	int orig_flags = current->flags;
611 
612 	current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
613 	zloop_handle_cmd(cmd);
614 	current->flags = orig_flags;
615 }
616 
/*
 * blk-mq ->complete handler: translate the command result (cmd->ret) to
 * a block status and end the request. Short reads are zero-filled,
 * failed or partial writes mark the zone in error, and zone append
 * reports the sector actually written back to the caller.
 */
static void zloop_complete_rq(struct request *rq)
{
	struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
	struct zloop_device *zlo = rq->q->queuedata;
	unsigned int zone_no = cmd->sector >> zlo->zone_shift;
	struct zloop_zone *zone = &zlo->zones[zone_no];
	blk_status_t sts = BLK_STS_OK;

	switch (req_op(rq)) {
	case REQ_OP_READ:
		if (cmd->ret < 0)
			pr_err("Zone %u: failed read sector %llu, %llu sectors\n",
			       zone_no, cmd->sector, cmd->nr_sectors);

		if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
			/* short read */
			struct bio *bio;

			__rq_for_each_bio(bio, rq)
				zero_fill_bio(bio);
		}
		break;
	case REQ_OP_WRITE:
	case REQ_OP_ZONE_APPEND:
		if (cmd->ret < 0)
			pr_err("Zone %u: failed %swrite sector %llu, %llu sectors\n",
			       zone_no,
			       req_op(rq) == REQ_OP_WRITE ? "" : "append ",
			       cmd->sector, cmd->nr_sectors);

		/* A partial write is treated as an I/O error. */
		if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
			pr_err("Zone %u: partial write %ld/%u B\n",
			       zone_no, cmd->ret, blk_rq_bytes(rq));
			cmd->ret = -EIO;
		}

		if (cmd->ret < 0 && !test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
			/*
			 * A write to a sequential zone file failed: mark the
			 * zone as having an error. This will be corrected and
			 * cleared when the next IO is submitted.
			 */
			set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
			break;
		}
		/* Report the sector actually written for zone append. */
		if (req_op(rq) == REQ_OP_ZONE_APPEND)
			rq->__sector = cmd->sector;

		break;
	default:
		break;
	}

	if (cmd->ret < 0)
		sts = errno_to_blk_status(cmd->ret);
	blk_mq_end_request(rq, sts);
}
674 
zloop_set_zone_append_sector(struct request * rq)675 static bool zloop_set_zone_append_sector(struct request *rq)
676 {
677 	struct zloop_device *zlo = rq->q->queuedata;
678 	unsigned int zone_no = rq_zone_no(rq);
679 	struct zloop_zone *zone = &zlo->zones[zone_no];
680 	sector_t zone_end = zone->start + zlo->zone_capacity;
681 	sector_t nr_sectors = blk_rq_sectors(rq);
682 	unsigned long flags;
683 
684 	spin_lock_irqsave(&zone->wp_lock, flags);
685 
686 	if (zone->cond == BLK_ZONE_COND_FULL ||
687 	    zone->wp + nr_sectors > zone_end) {
688 		spin_unlock_irqrestore(&zone->wp_lock, flags);
689 		return false;
690 	}
691 
692 	rq->__sector = zone->wp;
693 	zone->wp += blk_rq_sectors(rq);
694 	if (zone->wp >= zone_end) {
695 		zone->cond = BLK_ZONE_COND_FULL;
696 		zone->wp = ULLONG_MAX;
697 	}
698 
699 	spin_unlock_irqrestore(&zone->wp_lock, flags);
700 
701 	return true;
702 }
703 
zloop_queue_rq(struct blk_mq_hw_ctx * hctx,const struct blk_mq_queue_data * bd)704 static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
705 				   const struct blk_mq_queue_data *bd)
706 {
707 	struct request *rq = bd->rq;
708 	struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
709 	struct zloop_device *zlo = rq->q->queuedata;
710 
711 	if (data_race(READ_ONCE(zlo->state)) == Zlo_deleting)
712 		return BLK_STS_IOERR;
713 
714 	/*
715 	 * If we need to strongly order zone append operations, set the request
716 	 * sector to the zone write pointer location now instead of when the
717 	 * command work runs.
718 	 */
719 	if (zlo->ordered_zone_append && req_op(rq) == REQ_OP_ZONE_APPEND) {
720 		if (!zloop_set_zone_append_sector(rq))
721 			return BLK_STS_IOERR;
722 	}
723 
724 	blk_mq_start_request(rq);
725 
726 	INIT_WORK(&cmd->work, zloop_cmd_workfn);
727 	queue_work(zlo->workqueue, &cmd->work);
728 
729 	return BLK_STS_OK;
730 }
731 
/* blk-mq dispatch and completion callbacks. */
static const struct blk_mq_ops zloop_mq_ops = {
	.queue_rq       = zloop_queue_rq,
	.complete	= zloop_complete_rq,
};
736 
/*
 * Gendisk ->open handler: only allow opening a device that is fully set
 * up (Zlo_live), checked under zloop_ctl_mutex.
 */
static int zloop_open(struct gendisk *disk, blk_mode_t mode)
{
	struct zloop_device *zlo = disk->private_data;
	int ret = mutex_lock_killable(&zloop_ctl_mutex);

	if (ret)
		return ret;

	if (zlo->state != Zlo_live)
		ret = -ENXIO;
	mutex_unlock(&zloop_ctl_mutex);

	return ret;
}
751 
/*
 * Gendisk ->report_zones handler: report up to @nr_zones zones starting
 * with the zone containing @sector. Stale sequential zone state is
 * re-synced from the backing files first.
 */
static int zloop_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, struct blk_report_zones_args *args)
{
	struct zloop_device *zlo = disk->private_data;
	struct blk_zone blkz = {};
	unsigned int first, i;
	unsigned long flags;
	int ret;

	first = disk_zone_no(disk, sector);
	if (first >= zlo->nr_zones)
		return 0;
	nr_zones = min(nr_zones, zlo->nr_zones - first);

	for (i = 0; i < nr_zones; i++) {
		unsigned int zone_no = first + i;
		struct zloop_zone *zone = &zlo->zones[zone_no];

		mutex_lock(&zone->lock);

		/* Re-sync the zone state from the backing file after an error. */
		if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
			ret = zloop_update_seq_zone(zlo, zone_no);
			if (ret) {
				mutex_unlock(&zone->lock);
				return ret;
			}
		}

		blkz.start = zone->start;
		blkz.len = zlo->zone_size;
		spin_lock_irqsave(&zone->wp_lock, flags);
		blkz.wp = zone->wp;
		spin_unlock_irqrestore(&zone->wp_lock, flags);
		blkz.cond = zone->cond;
		if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
			blkz.type = BLK_ZONE_TYPE_CONVENTIONAL;
			blkz.capacity = zlo->zone_size;
		} else {
			blkz.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
			blkz.capacity = zlo->zone_capacity;
		}

		mutex_unlock(&zone->lock);

		ret = disk_report_zone(disk, &blkz, i, args);
		if (ret)
			return ret;
	}

	return nr_zones;
}
803 
/*
 * Gendisk ->free_disk handler: release all device resources once the
 * last disk reference is dropped. The original gfp mask of each zone
 * file mapping is restored before the file is released.
 */
static void zloop_free_disk(struct gendisk *disk)
{
	struct zloop_device *zlo = disk->private_data;
	unsigned int i;

	blk_mq_free_tag_set(&zlo->tag_set);

	for (i = 0; i < zlo->nr_zones; i++) {
		struct zloop_zone *zone = &zlo->zones[i];

		mapping_set_gfp_mask(zone->file->f_mapping,
				zone->old_gfp_mask);
		fput(zone->file);
	}

	fput(zlo->data_dir);
	destroy_workqueue(zlo->workqueue);
	kfree(zlo->base_dir);
	kvfree(zlo);
}
824 
/* Block device operations for a zloop disk. */
static const struct block_device_operations zloop_fops = {
	.owner			= THIS_MODULE,
	.open			= zloop_open,
	.report_zones		= zloop_report_zones,
	.free_disk		= zloop_free_disk,
};
831 
832 __printf(3, 4)
zloop_filp_open_fmt(int oflags,umode_t mode,const char * fmt,...)833 static struct file *zloop_filp_open_fmt(int oflags, umode_t mode,
834 		const char *fmt, ...)
835 {
836 	struct file *file;
837 	va_list ap;
838 	char *p;
839 
840 	va_start(ap, fmt);
841 	p = kvasprintf(GFP_KERNEL, fmt, ap);
842 	va_end(ap);
843 
844 	if (!p)
845 		return ERR_PTR(-ENOMEM);
846 	file = filp_open(p, oflags, mode);
847 	kfree(p);
848 	return file;
849 }
850 
/*
 * Determine the device block size from the first opened zone file and
 * check that the configured zone capacity is aligned to it.
 */
static int zloop_get_block_size(struct zloop_device *zlo,
				struct zloop_zone *zone)
{
	struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev;
	struct kstat st;

	/*
	 * If the FS block size is lower than or equal to 4K, use that as the
	 * device block size. Otherwise, fallback to the FS direct IO alignment
	 * constraint if that is provided, and to the FS underlying device
	 * physical block size if the direct IO alignment is unknown.
	 */
	if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K)
		zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize;
	else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) &&
		 (st.result_mask & STATX_DIOALIGN))
		zlo->block_size = st.dio_offset_align;
	else if (sb_bdev)
		zlo->block_size = bdev_physical_block_size(sb_bdev);
	else
		zlo->block_size = SECTOR_SIZE;

	/* The zone capacity must be a multiple of the block size. */
	if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
		pr_err("Zone capacity is not aligned to block size %u\n",
		       zlo->block_size);
		return -EINVAL;
	}

	return 0;
}
881 
/*
 * Open (creating it if needed) the backing file of zone @zone_no and
 * initialize the in-memory zone state. When @restore is true, the zone
 * files are expected to already exist and their sizes are validated
 * against the device zone configuration.
 *
 * Fixes over the previous version:
 *  - The restore size-mismatch path returned "ret", which is 0 after the
 *    successful vfs_getattr() call, silently reporting success for an
 *    invalid zone file. Return -EINVAL instead.
 *  - The mismatch message printed zlo->zone_capacity although the
 *    comparison is against zlo->zone_size.
 *  - Removed a stray duplicate zloop_get_block_size() call (return value
 *    ignored) after the properly guarded call for sequential zones.
 */
static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
			   unsigned int zone_no, bool restore)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	int oflags = O_RDWR;
	struct kstat stat;
	sector_t file_sectors;
	int ret;

	mutex_init(&zone->lock);
	spin_lock_init(&zone->wp_lock);
	zone->start = (sector_t)zone_no << zlo->zone_shift;

	/* When restoring, the zone files must already exist. */
	if (!restore)
		oflags |= O_CREAT;

	if (!opts->buffered_io)
		oflags |= O_DIRECT;

	if (zone_no < zlo->nr_conv_zones) {
		/* Conventional zone file. */
		set_bit(ZLOOP_ZONE_CONV, &zone->flags);
		zone->cond = BLK_ZONE_COND_NOT_WP;
		zone->wp = U64_MAX;

		zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/cnv-%06u",
					zlo->base_dir, zlo->id, zone_no);
		if (IS_ERR(zone->file)) {
			pr_err("Failed to open zone %u file %s/%u/cnv-%06u (err=%ld)",
			       zone_no, zlo->base_dir, zlo->id, zone_no,
			       PTR_ERR(zone->file));
			return PTR_ERR(zone->file);
		}

		/* The first opened zone file determines the block size. */
		if (!zlo->block_size) {
			ret = zloop_get_block_size(zlo, zone);
			if (ret)
				return ret;
		}

		ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
		if (ret < 0) {
			pr_err("Failed to get zone %u file stat\n", zone_no);
			return ret;
		}
		file_sectors = stat.size >> SECTOR_SHIFT;

		/*
		 * A restored conventional zone file must already be fully
		 * sized.
		 */
		if (restore && file_sectors != zlo->zone_size) {
			pr_err("Invalid conventional zone %u file size (%llu sectors != %llu)\n",
			       zone_no, file_sectors, zlo->zone_size);
			return -EINVAL;
		}

		/* Size (or re-size) the file to the full zone size. */
		ret = vfs_truncate(&zone->file->f_path,
				   zlo->zone_size << SECTOR_SHIFT);
		if (ret < 0) {
			pr_err("Failed to truncate zone %u file (err=%d)\n",
			       zone_no, ret);
			return ret;
		}

		return 0;
	}

	/* Sequential zone file. */
	zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/seq-%06u",
					 zlo->base_dir, zlo->id, zone_no);
	if (IS_ERR(zone->file)) {
		pr_err("Failed to open zone %u file %s/%u/seq-%06u (err=%ld)",
		       zone_no, zlo->base_dir, zlo->id, zone_no,
		       PTR_ERR(zone->file));
		return PTR_ERR(zone->file);
	}

	/* The first opened zone file determines the block size. */
	if (!zlo->block_size) {
		ret = zloop_get_block_size(zlo, zone);
		if (ret)
			return ret;
	}

	/* Derive the zone condition and write pointer from the file size. */
	mutex_lock(&zone->lock);
	ret = zloop_update_seq_zone(zlo, zone_no);
	mutex_unlock(&zone->lock);

	return ret;
}
970 
zloop_dev_exists(struct zloop_device * zlo)971 static bool zloop_dev_exists(struct zloop_device *zlo)
972 {
973 	struct file *cnv, *seq;
974 	bool exists;
975 
976 	cnv = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/cnv-%06u",
977 				  zlo->base_dir, zlo->id, 0);
978 	seq = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/seq-%06u",
979 				  zlo->base_dir, zlo->id, 0);
980 	exists = !IS_ERR(cnv) || !IS_ERR(seq);
981 
982 	if (!IS_ERR(cnv))
983 		fput(cnv);
984 	if (!IS_ERR(seq))
985 		fput(seq);
986 
987 	return exists;
988 }
989 
zloop_ctl_add(struct zloop_options * opts)990 static int zloop_ctl_add(struct zloop_options *opts)
991 {
992 	struct queue_limits lim = {
993 		.max_hw_sectors		= SZ_1M >> SECTOR_SHIFT,
994 		.chunk_sectors		= opts->zone_size,
995 		.features		= BLK_FEAT_ZONED | BLK_FEAT_WRITE_CACHE,
996 
997 	};
998 	unsigned int nr_zones, i, j;
999 	struct zloop_device *zlo;
1000 	int ret = -EINVAL;
1001 	bool restore;
1002 
1003 	__module_get(THIS_MODULE);
1004 
1005 	nr_zones = opts->capacity >> ilog2(opts->zone_size);
1006 	if (opts->nr_conv_zones >= nr_zones) {
1007 		pr_err("Invalid number of conventional zones %u\n",
1008 		       opts->nr_conv_zones);
1009 		goto out;
1010 	}
1011 
1012 	zlo = kvzalloc_flex(*zlo, zones, nr_zones);
1013 	if (!zlo) {
1014 		ret = -ENOMEM;
1015 		goto out;
1016 	}
1017 	WRITE_ONCE(zlo->state, Zlo_creating);
1018 
1019 	ret = mutex_lock_killable(&zloop_ctl_mutex);
1020 	if (ret)
1021 		goto out_free_dev;
1022 
1023 	/* Allocate id, if @opts->id >= 0, we're requesting that specific id */
1024 	if (opts->id >= 0) {
1025 		ret = idr_alloc(&zloop_index_idr, zlo,
1026 				  opts->id, opts->id + 1, GFP_KERNEL);
1027 		if (ret == -ENOSPC)
1028 			ret = -EEXIST;
1029 	} else {
1030 		ret = idr_alloc(&zloop_index_idr, zlo, 0, 0, GFP_KERNEL);
1031 	}
1032 	mutex_unlock(&zloop_ctl_mutex);
1033 	if (ret < 0)
1034 		goto out_free_dev;
1035 
1036 	zlo->id = ret;
1037 	zlo->zone_shift = ilog2(opts->zone_size);
1038 	zlo->zone_size = opts->zone_size;
1039 	if (opts->zone_capacity)
1040 		zlo->zone_capacity = opts->zone_capacity;
1041 	else
1042 		zlo->zone_capacity = zlo->zone_size;
1043 	zlo->nr_zones = nr_zones;
1044 	zlo->nr_conv_zones = opts->nr_conv_zones;
1045 	zlo->buffered_io = opts->buffered_io;
1046 	zlo->zone_append = opts->zone_append;
1047 	if (zlo->zone_append)
1048 		zlo->ordered_zone_append = opts->ordered_zone_append;
1049 
1050 	zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
1051 				opts->nr_queues * opts->queue_depth, zlo->id);
1052 	if (!zlo->workqueue) {
1053 		ret = -ENOMEM;
1054 		goto out_free_idr;
1055 	}
1056 
1057 	if (opts->base_dir)
1058 		zlo->base_dir = kstrdup(opts->base_dir, GFP_KERNEL);
1059 	else
1060 		zlo->base_dir = kstrdup(ZLOOP_DEF_BASE_DIR, GFP_KERNEL);
1061 	if (!zlo->base_dir) {
1062 		ret = -ENOMEM;
1063 		goto out_destroy_workqueue;
1064 	}
1065 
1066 	zlo->data_dir = zloop_filp_open_fmt(O_RDONLY | O_DIRECTORY, 0, "%s/%u",
1067 					    zlo->base_dir, zlo->id);
1068 	if (IS_ERR(zlo->data_dir)) {
1069 		ret = PTR_ERR(zlo->data_dir);
1070 		pr_warn("Failed to open directory %s/%u (err=%d)\n",
1071 			zlo->base_dir, zlo->id, ret);
1072 		goto out_free_base_dir;
1073 	}
1074 
1075 	/*
1076 	 * If we already have zone files, we are restoring a device created by a
1077 	 * previous add operation. In this case, zloop_init_zone() will check
1078 	 * that the zone files are consistent with the zone configuration given.
1079 	 */
1080 	restore = zloop_dev_exists(zlo);
1081 	for (i = 0; i < nr_zones; i++) {
1082 		ret = zloop_init_zone(zlo, opts, i, restore);
1083 		if (ret)
1084 			goto out_close_files;
1085 	}
1086 
1087 	lim.physical_block_size = zlo->block_size;
1088 	lim.logical_block_size = zlo->block_size;
1089 	if (zlo->zone_append)
1090 		lim.max_hw_zone_append_sectors = lim.max_hw_sectors;
1091 
1092 	zlo->tag_set.ops = &zloop_mq_ops;
1093 	zlo->tag_set.nr_hw_queues = opts->nr_queues;
1094 	zlo->tag_set.queue_depth = opts->queue_depth;
1095 	zlo->tag_set.numa_node = NUMA_NO_NODE;
1096 	zlo->tag_set.cmd_size = sizeof(struct zloop_cmd);
1097 	zlo->tag_set.driver_data = zlo;
1098 
1099 	ret = blk_mq_alloc_tag_set(&zlo->tag_set);
1100 	if (ret) {
1101 		pr_err("blk_mq_alloc_tag_set failed (err=%d)\n", ret);
1102 		goto out_close_files;
1103 	}
1104 
1105 	zlo->disk = blk_mq_alloc_disk(&zlo->tag_set, &lim, zlo);
1106 	if (IS_ERR(zlo->disk)) {
1107 		pr_err("blk_mq_alloc_disk failed (err=%d)\n", ret);
1108 		ret = PTR_ERR(zlo->disk);
1109 		goto out_cleanup_tags;
1110 	}
1111 	zlo->disk->flags = GENHD_FL_NO_PART;
1112 	zlo->disk->fops = &zloop_fops;
1113 	zlo->disk->private_data = zlo;
1114 	sprintf(zlo->disk->disk_name, "zloop%d", zlo->id);
1115 	set_capacity(zlo->disk, (u64)lim.chunk_sectors * zlo->nr_zones);
1116 
1117 	ret = blk_revalidate_disk_zones(zlo->disk);
1118 	if (ret)
1119 		goto out_cleanup_disk;
1120 
1121 	ret = add_disk(zlo->disk);
1122 	if (ret) {
1123 		pr_err("add_disk failed (err=%d)\n", ret);
1124 		goto out_cleanup_disk;
1125 	}
1126 
1127 	mutex_lock(&zloop_ctl_mutex);
1128 	WRITE_ONCE(zlo->state, Zlo_live);
1129 	mutex_unlock(&zloop_ctl_mutex);
1130 
1131 	pr_info("zloop: device %d, %u zones of %llu MiB, %u B block size\n",
1132 		zlo->id, zlo->nr_zones,
1133 		((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20,
1134 		zlo->block_size);
1135 	pr_info("zloop%d: using %s%s zone append\n",
1136 		zlo->id,
1137 		zlo->ordered_zone_append ? "ordered " : "",
1138 		zlo->zone_append ? "native" : "emulated");
1139 
1140 	return 0;
1141 
1142 out_cleanup_disk:
1143 	put_disk(zlo->disk);
1144 out_cleanup_tags:
1145 	blk_mq_free_tag_set(&zlo->tag_set);
1146 out_close_files:
1147 	for (j = 0; j < i; j++) {
1148 		struct zloop_zone *zone = &zlo->zones[j];
1149 
1150 		if (!IS_ERR_OR_NULL(zone->file))
1151 			fput(zone->file);
1152 	}
1153 	fput(zlo->data_dir);
1154 out_free_base_dir:
1155 	kfree(zlo->base_dir);
1156 out_destroy_workqueue:
1157 	destroy_workqueue(zlo->workqueue);
1158 out_free_idr:
1159 	mutex_lock(&zloop_ctl_mutex);
1160 	idr_remove(&zloop_index_idr, zlo->id);
1161 	mutex_unlock(&zloop_ctl_mutex);
1162 out_free_dev:
1163 	kvfree(zlo);
1164 out:
1165 	module_put(THIS_MODULE);
1166 	if (ret == -ENOENT)
1167 		ret = -EINVAL;
1168 	return ret;
1169 }
1170 
zloop_ctl_remove(struct zloop_options * opts)1171 static int zloop_ctl_remove(struct zloop_options *opts)
1172 {
1173 	struct zloop_device *zlo;
1174 	int ret;
1175 
1176 	if (!(opts->mask & ZLOOP_OPT_ID)) {
1177 		pr_err("No ID specified for remove\n");
1178 		return -EINVAL;
1179 	}
1180 
1181 	if (opts->mask & ~ZLOOP_OPT_ID) {
1182 		pr_err("Invalid option specified for remove\n");
1183 		return -EINVAL;
1184 	}
1185 
1186 	ret = mutex_lock_killable(&zloop_ctl_mutex);
1187 	if (ret)
1188 		return ret;
1189 
1190 	zlo = idr_find(&zloop_index_idr, opts->id);
1191 	if (!zlo || zlo->state == Zlo_creating) {
1192 		ret = -ENODEV;
1193 	} else if (zlo->state == Zlo_deleting) {
1194 		ret = -EINVAL;
1195 	} else {
1196 		idr_remove(&zloop_index_idr, zlo->id);
1197 		WRITE_ONCE(zlo->state, Zlo_deleting);
1198 	}
1199 
1200 	mutex_unlock(&zloop_ctl_mutex);
1201 	if (ret)
1202 		return ret;
1203 
1204 	del_gendisk(zlo->disk);
1205 	put_disk(zlo->disk);
1206 
1207 	pr_info("Removed device %d\n", opts->id);
1208 
1209 	module_put(THIS_MODULE);
1210 
1211 	return 0;
1212 }
1213 
/*
 * Parse the option string of a control command into @opts.
 *
 * Defaults are applied first, so a NULL @buf (command with no options) is
 * valid. Only light per-option validation is done here; cross-option
 * consistency (capacity vs zone size, zone capacity vs zone size) is
 * checked at the end. Returns 0 on success or a negative error code.
 *
 * NOTE(review): fields without an explicit default here (zone_capacity,
 * base_dir) rely on the caller zero-initializing @opts — confirm against
 * zloop_ctl_write().
 */
static int zloop_parse_options(struct zloop_options *opts, const char *buf)
{
	substring_t args[MAX_OPT_ARGS];
	char *options, *o, *p;
	unsigned int token;	/* option ID from match_token(), reused for %u values */
	int ret = 0;

	/* Set defaults. */
	opts->mask = 0;
	opts->id = ZLOOP_DEF_ID;
	opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES;
	opts->zone_size = ZLOOP_DEF_ZONE_SIZE;
	opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES;
	opts->nr_queues = ZLOOP_DEF_NR_QUEUES;
	opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
	opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;
	opts->zone_append = ZLOOP_DEF_ZONE_APPEND;
	opts->ordered_zone_append = ZLOOP_DEF_ORDERED_ZONE_APPEND;

	if (!buf)
		return 0;

	/* Skip leading spaces before the options. */
	while (isspace(*buf))
		buf++;

	/* strsep() modifies its argument, so work on a copy. */
	options = o = kstrdup(buf, GFP_KERNEL);
	if (!options)
		return -ENOMEM;

	/* Parse the options, doing only some light invalid value checks. */
	while ((p = strsep(&o, ",\n")) != NULL) {
		if (!*p)
			continue;

		token = match_token(p, zloop_opt_tokens, args);
		opts->mask |= token;
		switch (token) {
		case ZLOOP_OPT_ID:
			if (match_int(args, &opts->id)) {
				ret = -EINVAL;
				goto out;
			}
			break;
		case ZLOOP_OPT_CAPACITY:
			if (match_uint(args, &token)) {
				ret = -EINVAL;
				goto out;
			}
			if (!token) {
				pr_err("Invalid capacity\n");
				ret = -EINVAL;
				goto out;
			}
			/* MiB to 512 B sectors. */
			opts->capacity =
				((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
			break;
		case ZLOOP_OPT_ZONE_SIZE:
			if (match_uint(args, &token)) {
				ret = -EINVAL;
				goto out;
			}
			/* Zoned block devices require a power-of-2 zone size. */
			if (!token || token > ZLOOP_MAX_ZONE_SIZE_MB ||
			    !is_power_of_2(token)) {
				pr_err("Invalid zone size %u\n", token);
				ret = -EINVAL;
				goto out;
			}
			opts->zone_size =
				((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
			break;
		case ZLOOP_OPT_ZONE_CAPACITY:
			if (match_uint(args, &token)) {
				ret = -EINVAL;
				goto out;
			}
			if (!token) {
				pr_err("Invalid zone capacity\n");
				ret = -EINVAL;
				goto out;
			}
			opts->zone_capacity =
				((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
			break;
		case ZLOOP_OPT_NR_CONV_ZONES:
			if (match_uint(args, &token)) {
				ret = -EINVAL;
				goto out;
			}
			opts->nr_conv_zones = token;
			break;
		case ZLOOP_OPT_BASE_DIR:
			p = match_strdup(args);
			if (!p) {
				ret = -ENOMEM;
				goto out;
			}
			/* A repeated base_dir= option replaces the earlier value. */
			kfree(opts->base_dir);
			opts->base_dir = p;
			break;
		case ZLOOP_OPT_NR_QUEUES:
			if (match_uint(args, &token)) {
				ret = -EINVAL;
				goto out;
			}
			if (!token) {
				pr_err("Invalid number of queues\n");
				ret = -EINVAL;
				goto out;
			}
			/* No point in more hardware queues than online CPUs. */
			opts->nr_queues = min(token, num_online_cpus());
			break;
		case ZLOOP_OPT_QUEUE_DEPTH:
			if (match_uint(args, &token)) {
				ret = -EINVAL;
				goto out;
			}
			if (!token) {
				pr_err("Invalid queue depth\n");
				ret = -EINVAL;
				goto out;
			}
			opts->queue_depth = token;
			break;
		case ZLOOP_OPT_BUFFERED_IO:
			opts->buffered_io = true;
			break;
		case ZLOOP_OPT_ZONE_APPEND:
			if (match_uint(args, &token)) {
				ret = -EINVAL;
				goto out;
			}
			if (token != 0 && token != 1) {
				pr_err("Invalid zone_append value\n");
				ret = -EINVAL;
				goto out;
			}
			opts->zone_append = token;
			break;
		case ZLOOP_OPT_ORDERED_ZONE_APPEND:
			opts->ordered_zone_append = true;
			break;
		case ZLOOP_OPT_ERR:
		default:
			pr_warn("unknown parameter or missing value '%s'\n", p);
			ret = -EINVAL;
			goto out;
		}
	}

	/* Cross-option sanity checks. */
	ret = -EINVAL;
	if (opts->capacity <= opts->zone_size) {
		pr_err("Invalid capacity\n");
		goto out;
	}

	if (opts->zone_capacity > opts->zone_size) {
		pr_err("Invalid zone capacity\n");
		goto out;
	}

	ret = 0;
out:
	kfree(options);
	return ret;
}
1380 
/* Control operations accepted through the zloop-control device. */
enum {
	ZLOOP_CTL_ADD,
	ZLOOP_CTL_REMOVE,
};
1385 
/*
 * Table mapping the operation keyword written to the control device to its
 * operation code. Terminated by a NULL-name sentinel entry.
 */
static struct zloop_ctl_op {
	int		code;
	const char	*name;
} zloop_ctl_ops[] = {
	{ ZLOOP_CTL_ADD,	"add" },
	{ ZLOOP_CTL_REMOVE,	"remove" },
	{ -1,	NULL },
};
1394 
zloop_ctl_write(struct file * file,const char __user * ubuf,size_t count,loff_t * pos)1395 static ssize_t zloop_ctl_write(struct file *file, const char __user *ubuf,
1396 			       size_t count, loff_t *pos)
1397 {
1398 	struct zloop_options opts = { };
1399 	struct zloop_ctl_op *op;
1400 	const char *buf, *opts_buf;
1401 	int i, ret;
1402 
1403 	if (count > PAGE_SIZE)
1404 		return -ENOMEM;
1405 
1406 	buf = memdup_user_nul(ubuf, count);
1407 	if (IS_ERR(buf))
1408 		return PTR_ERR(buf);
1409 
1410 	for (i = 0; i < ARRAY_SIZE(zloop_ctl_ops); i++) {
1411 		op = &zloop_ctl_ops[i];
1412 		if (!op->name) {
1413 			pr_err("Invalid operation\n");
1414 			ret = -EINVAL;
1415 			goto out;
1416 		}
1417 		if (!strncmp(buf, op->name, strlen(op->name)))
1418 			break;
1419 	}
1420 
1421 	if (count <= strlen(op->name))
1422 		opts_buf = NULL;
1423 	else
1424 		opts_buf = buf + strlen(op->name);
1425 
1426 	ret = zloop_parse_options(&opts, opts_buf);
1427 	if (ret) {
1428 		pr_err("Failed to parse options\n");
1429 		goto out;
1430 	}
1431 
1432 	switch (op->code) {
1433 	case ZLOOP_CTL_ADD:
1434 		ret = zloop_ctl_add(&opts);
1435 		break;
1436 	case ZLOOP_CTL_REMOVE:
1437 		ret = zloop_ctl_remove(&opts);
1438 		break;
1439 	default:
1440 		pr_err("Invalid operation\n");
1441 		ret = -EINVAL;
1442 		goto out;
1443 	}
1444 
1445 out:
1446 	kfree(opts.base_dir);
1447 	kfree(buf);
1448 	return ret ? ret : count;
1449 }
1450 
zloop_ctl_show(struct seq_file * seq_file,void * private)1451 static int zloop_ctl_show(struct seq_file *seq_file, void *private)
1452 {
1453 	const struct match_token *tok;
1454 	int i;
1455 
1456 	/* Add operation */
1457 	seq_printf(seq_file, "%s ", zloop_ctl_ops[0].name);
1458 	for (i = 0; i < ARRAY_SIZE(zloop_opt_tokens); i++) {
1459 		tok = &zloop_opt_tokens[i];
1460 		if (!tok->pattern)
1461 			break;
1462 		if (i)
1463 			seq_putc(seq_file, ',');
1464 		seq_puts(seq_file, tok->pattern);
1465 	}
1466 	seq_putc(seq_file, '\n');
1467 
1468 	/* Remove operation */
1469 	seq_puts(seq_file, zloop_ctl_ops[1].name);
1470 	seq_puts(seq_file, " id=%d\n");
1471 
1472 	return 0;
1473 }
1474 
/* Open the control device: attach the seq_file showing the command syntax. */
static int zloop_ctl_open(struct inode *inode, struct file *file)
{
	/* Clear any stale pointer before single_open() installs its own. */
	file->private_data = NULL;
	return single_open(file, zloop_ctl_show, NULL);
}
1480 
/* Release the seq_file state allocated by single_open(). */
static int zloop_ctl_release(struct inode *inode, struct file *file)
{
	return single_release(inode, file);
}
1485 
/*
 * File operations for the control device: writes issue add/remove commands,
 * reads (through seq_file) show the supported command syntax.
 */
static const struct file_operations zloop_ctl_fops = {
	.owner		= THIS_MODULE,
	.open		= zloop_ctl_open,
	.release	= zloop_ctl_release,
	.write		= zloop_ctl_write,
	.read		= seq_read,
};
1493 
/* Dynamic-minor misc character device exposing the control interface. */
static struct miscdevice zloop_misc = {
	.minor		= MISC_DYNAMIC_MINOR,
	.name		= "zloop-control",
	.fops		= &zloop_ctl_fops,
};
1499 
zloop_init(void)1500 static int __init zloop_init(void)
1501 {
1502 	int ret;
1503 
1504 	ret = misc_register(&zloop_misc);
1505 	if (ret) {
1506 		pr_err("Failed to register misc device: %d\n", ret);
1507 		return ret;
1508 	}
1509 	pr_info("Module loaded\n");
1510 
1511 	return 0;
1512 }
1513 
/*
 * Module exit: unregister the control device first so no new commands can
 * arrive, then release the (now empty) device index.
 */
static void __exit zloop_exit(void)
{
	misc_deregister(&zloop_misc);
	idr_destroy(&zloop_index_idr);
}
1519 
/* Module entry/exit registration and metadata. */
module_init(zloop_init);
module_exit(zloop_exit);

MODULE_DESCRIPTION("Zoned loopback device");
MODULE_LICENSE("GPL");
1525