1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * Copyright (c) 2025, Christoph Hellwig.
4 * Copyright (c) 2025, Western Digital Corporation or its affiliates.
5 *
6 * Zoned Loop Device driver - exports a zoned block device using one file per
7 * zone as backing storage.
8 */
9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10
11 #include <linux/module.h>
12 #include <linux/blk-mq.h>
13 #include <linux/blkzoned.h>
14 #include <linux/pagemap.h>
15 #include <linux/miscdevice.h>
16 #include <linux/falloc.h>
17 #include <linux/mutex.h>
18 #include <linux/parser.h>
19 #include <linux/seq_file.h>
20 #include <linux/xattr.h>
21
22 /*
23 * Options for adding (and removing) a device.
24 */
/*
 * Options for adding (and removing) a device.
 * Each value is a distinct bit so that a set of supplied options can be
 * recorded in zloop_options.mask.
 */
enum {
	ZLOOP_OPT_ERR			= 0,
	ZLOOP_OPT_ID			= (1 << 0),
	ZLOOP_OPT_CAPACITY		= (1 << 1),
	ZLOOP_OPT_ZONE_SIZE		= (1 << 2),
	ZLOOP_OPT_ZONE_CAPACITY		= (1 << 3),
	ZLOOP_OPT_NR_CONV_ZONES		= (1 << 4),
	ZLOOP_OPT_BASE_DIR		= (1 << 5),
	ZLOOP_OPT_NR_QUEUES		= (1 << 6),
	ZLOOP_OPT_QUEUE_DEPTH		= (1 << 7),
	ZLOOP_OPT_BUFFERED_IO		= (1 << 8),
	ZLOOP_OPT_ZONE_APPEND		= (1 << 9),
	ZLOOP_OPT_ORDERED_ZONE_APPEND	= (1 << 10),
	ZLOOP_OPT_DISCARD_WRITE_CACHE	= (1 << 11),
	ZLOOP_OPT_MAX_OPEN_ZONES	= (1 << 12),
};
41
/* Token table for parsing the "key=value" option strings (see match_token()). */
static const match_table_t zloop_opt_tokens = {
	{ ZLOOP_OPT_ID,			"id=%d"			},
	{ ZLOOP_OPT_CAPACITY,		"capacity_mb=%u"	},
	{ ZLOOP_OPT_ZONE_SIZE,		"zone_size_mb=%u"	},
	{ ZLOOP_OPT_ZONE_CAPACITY,	"zone_capacity_mb=%u"	},
	{ ZLOOP_OPT_NR_CONV_ZONES,	"conv_zones=%u"		},
	{ ZLOOP_OPT_BASE_DIR,		"base_dir=%s"		},
	{ ZLOOP_OPT_NR_QUEUES,		"nr_queues=%u"		},
	{ ZLOOP_OPT_QUEUE_DEPTH,	"queue_depth=%u"	},
	{ ZLOOP_OPT_BUFFERED_IO,	"buffered_io"		},
	{ ZLOOP_OPT_ZONE_APPEND,	"zone_append=%u"	},
	{ ZLOOP_OPT_ORDERED_ZONE_APPEND, "ordered_zone_append"	},
	{ ZLOOP_OPT_DISCARD_WRITE_CACHE, "discard_write_cache"	},
	{ ZLOOP_OPT_MAX_OPEN_ZONES,	"max_open_zones=%u"	},
	{ ZLOOP_OPT_ERR,		NULL			}
};
58
/* Default values for the "add" operation. */
#define ZLOOP_DEF_ID			-1	/* -1: auto-allocate an ID */
#define ZLOOP_DEF_ZONE_SIZE		((256ULL * SZ_1M) >> SECTOR_SHIFT) /* in 512B sectors */
#define ZLOOP_DEF_NR_ZONES		64
#define ZLOOP_DEF_NR_CONV_ZONES		8
#define ZLOOP_DEF_MAX_OPEN_ZONES	0	/* 0: no open zones limit */
#define ZLOOP_DEF_BASE_DIR		"/var/local/zloop"
#define ZLOOP_DEF_NR_QUEUES		1
#define ZLOOP_DEF_QUEUE_DEPTH		128
#define ZLOOP_DEF_BUFFERED_IO		false
#define ZLOOP_DEF_ZONE_APPEND		true
#define ZLOOP_DEF_ORDERED_ZONE_APPEND	false

/* Arbitrary limit on the zone size (16GB). */
#define ZLOOP_MAX_ZONE_SIZE_MB		16384
74
/*
 * Parsed "add"/"remove" options. Sector-valued fields are in units of
 * 512B sectors.
 */
struct zloop_options {
	unsigned int	mask;		/* ZLOOP_OPT_* bits of options supplied */
	int		id;		/* device ID, or ZLOOP_DEF_ID for auto */
	sector_t	capacity;
	sector_t	zone_size;
	sector_t	zone_capacity;	/* usable part of a sequential zone */
	unsigned int	nr_conv_zones;
	unsigned int	max_open_zones;	/* 0 means unlimited */
	char		*base_dir;	/* directory holding the zone files */
	unsigned int	nr_queues;
	unsigned int	queue_depth;
	bool		buffered_io;	/* use page cache instead of O_DIRECT */
	bool		zone_append;	/* support REQ_OP_ZONE_APPEND natively */
	bool		ordered_zone_append;
	bool		discard_write_cache;
};
91
/*
 * Device states.
 */
enum {
	Zlo_creating = 0,	/* device being set up, not yet usable */
	Zlo_live,		/* device usable (zloop_open() requires this) */
	Zlo_deleting,		/* being torn down: new requests fail */
};

/* Per-zone flag bits (zloop_zone.flags). */
enum zloop_zone_flags {
	ZLOOP_ZONE_CONV = 0,	/* conventional (not write-pointer) zone */
	ZLOOP_ZONE_SEQ_ERROR,	/* seq zone state stale after a failed I/O */
};
105
106 /*
107 * Zone descriptor.
108 * Locking order: z.lock -> z.wp_lock -> zlo.open_zones_lock
109 */
/*
 * Zone descriptor. Each zone is backed by one regular file.
 * Locking order: z.lock -> z.wp_lock -> zlo.open_zones_lock
 */
struct zloop_zone {
	struct list_head	open_zone_entry; /* entry in open_zones_lru_list */
	struct file		*file;		/* backing file for this zone */

	unsigned long		flags;		/* ZLOOP_ZONE_* bits */
	struct mutex		lock;		/* serializes zone operations */
	spinlock_t		wp_lock;	/* protects cond and wp */
	enum blk_zone_cond	cond;
	sector_t		start;		/* first sector of the zone */
	sector_t		wp;		/* write pointer; ULLONG_MAX when full */

	gfp_t			old_gfp_mask;	/* mapping mask restored on teardown */
};
123
/* Per-device state. Zone geometry fields are in 512B sectors. */
struct zloop_device {
	unsigned int		id;		/* device index */
	unsigned int		state;		/* Zlo_* device state */

	struct blk_mq_tag_set	tag_set;
	struct gendisk		*disk;

	struct workqueue_struct	*workqueue;	/* executes zloop_cmd work items */
	bool			buffered_io;
	bool			zone_append;
	bool			ordered_zone_append;
	bool			discard_write_cache;

	const char		*base_dir;	/* directory of the zone files */
	struct file		*data_dir;	/* the per-device directory itself */

	unsigned int		zone_shift;	/* log2 of zone_size, for sector->zone */
	sector_t		zone_size;
	sector_t		zone_capacity;
	unsigned int		nr_zones;
	unsigned int		nr_conv_zones;	/* first nr_conv_zones are conventional */
	unsigned int		max_open_zones;	/* 0 means unlimited */
	unsigned int		block_size;	/* logical block size in bytes */

	spinlock_t		open_zones_lock;
	struct list_head	open_zones_lru_list; /* open zones, LRU at head */
	unsigned int		nr_open_zones;

	struct zloop_zone	zones[] __counted_by(nr_zones);
};
154
/* Per-request command context, stored in the request PDU. */
struct zloop_cmd {
	struct work_struct	work;	/* queued on zlo->workqueue */
	atomic_t		ref;	/* completes request when it drops to 0 */
	sector_t		sector;	/* target sector (set to wp for appends) */
	sector_t		nr_sectors;
	long			ret;	/* byte count or negative errno */
	struct kiocb		iocb;	/* async I/O control block */
	struct bio_vec		*bvec;	/* flattened bvec array for multi-bio rqs */
};
164
/* Device index; control-path operations are serialized by zloop_ctl_mutex. */
static DEFINE_IDR(zloop_index_idr);
static DEFINE_MUTEX(zloop_ctl_mutex);
167
rq_zone_no(struct request * rq)168 static unsigned int rq_zone_no(struct request *rq)
169 {
170 struct zloop_device *zlo = rq->q->queuedata;
171
172 return blk_rq_pos(rq) >> zlo->zone_shift;
173 }
174
175 /*
176 * Open an already open zone. This is mostly a no-op, except for the imp open ->
177 * exp open condition change that may happen. We also move a zone at the tail of
178 * the list of open zones so that if we need to
179 * implicitly close one open zone, we can do so in LRU order.
180 */
zloop_lru_rotate_open_zone(struct zloop_device * zlo,struct zloop_zone * zone)181 static inline void zloop_lru_rotate_open_zone(struct zloop_device *zlo,
182 struct zloop_zone *zone)
183 {
184 if (zlo->max_open_zones) {
185 spin_lock(&zlo->open_zones_lock);
186 list_move_tail(&zone->open_zone_entry,
187 &zlo->open_zones_lru_list);
188 spin_unlock(&zlo->open_zones_lock);
189 }
190 }
191
zloop_lru_remove_open_zone(struct zloop_device * zlo,struct zloop_zone * zone)192 static inline void zloop_lru_remove_open_zone(struct zloop_device *zlo,
193 struct zloop_zone *zone)
194 {
195 if (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
196 zone->cond == BLK_ZONE_COND_EXP_OPEN) {
197 spin_lock(&zlo->open_zones_lock);
198 list_del_init(&zone->open_zone_entry);
199 zlo->nr_open_zones--;
200 spin_unlock(&zlo->open_zones_lock);
201 }
202 }
203
zloop_can_open_zone(struct zloop_device * zlo)204 static inline bool zloop_can_open_zone(struct zloop_device *zlo)
205 {
206 return !zlo->max_open_zones || zlo->nr_open_zones < zlo->max_open_zones;
207 }
208
/*
 * If we have reached the maximum open zones limit, attempt to close an
 * implicitly open zone (if we have any) so that we can implicitly open another
 * zone without exceeding the maximum number of open zones.
 *
 * Returns true if a zone can now be opened, false if the limit is reached and
 * no implicitly open zone could be closed. Caller must hold open_zones_lock.
 */
static bool zloop_close_imp_open_zone(struct zloop_device *zlo)
{
	struct zloop_zone *zone;

	lockdep_assert_held(&zlo->open_zones_lock);

	if (zloop_can_open_zone(zlo))
		return true;

	/* Walk from the LRU head: the first IMP_OPEN zone found is closed. */
	list_for_each_entry(zone, &zlo->open_zones_lru_list, open_zone_entry) {
		if (zone->cond == BLK_ZONE_COND_IMP_OPEN) {
			zone->cond = BLK_ZONE_COND_CLOSED;
			list_del_init(&zone->open_zone_entry);
			zlo->nr_open_zones--;
			return true;
		}
	}

	/* All open zones are explicitly open: nothing we can close. */
	return false;
}
234
/*
 * Transition an empty or closed zone to the open condition, enforcing the
 * max_open_zones limit. Returns false if the limit prevents the open.
 */
static bool zloop_open_closed_or_empty_zone(struct zloop_device *zlo,
					    struct zloop_zone *zone,
					    bool explicit)
{
	spin_lock(&zlo->open_zones_lock);

	if (explicit) {
		/*
		 * Explicit open: we cannot allow this if we have reached the
		 * maximum open zones limit.
		 */
		if (!zloop_can_open_zone(zlo))
			goto fail;
		zone->cond = BLK_ZONE_COND_EXP_OPEN;
	} else {
		/*
		 * Implicit open case: if we have reached the maximum open zones
		 * limit, try to close an implicitly open zone first.
		 */
		if (!zloop_close_imp_open_zone(zlo))
			goto fail;
		zone->cond = BLK_ZONE_COND_IMP_OPEN;
	}

	/* Newly opened zones go to the tail (most recently used) of the LRU. */
	zlo->nr_open_zones++;
	list_add_tail(&zone->open_zone_entry,
		      &zlo->open_zones_lru_list);

	spin_unlock(&zlo->open_zones_lock);

	return true;

fail:
	spin_unlock(&zlo->open_zones_lock);

	return false;
}
272
/*
 * Open a zone, explicitly (REQ_OP_ZONE_OPEN) or implicitly (first write to a
 * non-open zone). Returns false if the zone condition does not allow opening
 * or the open zones limit is reached.
 */
static bool zloop_do_open_zone(struct zloop_device *zlo,
			       struct zloop_zone *zone, bool explicit)
{
	switch (zone->cond) {
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
		/* Already open: at most promote imp -> exp and refresh LRU. */
		if (explicit)
			zone->cond = BLK_ZONE_COND_EXP_OPEN;
		zloop_lru_rotate_open_zone(zlo, zone);
		return true;
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_CLOSED:
		return zloop_open_closed_or_empty_zone(zlo, zone, explicit);
	default:
		/* Full, conventional or offline zones cannot be opened. */
		return false;
	}
}
290
/* Transition a zone to the full condition. Caller must hold zone->wp_lock. */
static void zloop_mark_full(struct zloop_device *zlo, struct zloop_zone *zone)
{
	lockdep_assert_held(&zone->wp_lock);

	zloop_lru_remove_open_zone(zlo, zone);
	zone->cond = BLK_ZONE_COND_FULL;
	/* A full zone has no valid write pointer. */
	zone->wp = ULLONG_MAX;
}
299
/* Transition a zone to the empty condition. Caller must hold zone->wp_lock. */
static void zloop_mark_empty(struct zloop_device *zlo, struct zloop_zone *zone)
{
	lockdep_assert_held(&zone->wp_lock);

	zloop_lru_remove_open_zone(zlo, zone);
	zone->cond = BLK_ZONE_COND_EMPTY;
	/* An empty zone's write pointer is its start sector. */
	zone->wp = zone->start;
}
308
/*
 * Re-derive a sequential zone's condition and write pointer from the size of
 * its backing file. Used at init time and to recover after an I/O error
 * (ZLOOP_ZONE_SEQ_ERROR). Caller must hold zone->lock.
 */
static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	struct kstat stat;
	sector_t file_sectors;
	int ret;

	lockdep_assert_held(&zone->lock);

	ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
	if (ret < 0) {
		pr_err("Failed to get zone %u file stat (err=%d)\n",
		       zone_no, ret);
		/* Keep the zone marked in error so a later I/O retries this. */
		set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
		return ret;
	}

	file_sectors = stat.size >> SECTOR_SHIFT;
	if (file_sectors > zlo->zone_capacity) {
		pr_err("Zone %u file too large (%llu sectors > %llu)\n",
		       zone_no, file_sectors, zlo->zone_capacity);
		return -EINVAL;
	}

	if (!IS_ALIGNED(stat.size, zlo->block_size)) {
		pr_err("Zone %u file size (%llu) not aligned to block size %u\n",
		       zone_no, stat.size, zlo->block_size);
		return -EINVAL;
	}

	/* File size maps directly to the zone condition and write pointer. */
	spin_lock(&zone->wp_lock);
	if (!file_sectors) {
		zloop_mark_empty(zlo, zone);
	} else if (file_sectors == zlo->zone_capacity) {
		zloop_mark_full(zlo, zone);
	} else {
		/* Partially written: keep an open condition if already open. */
		if (zone->cond != BLK_ZONE_COND_IMP_OPEN &&
		    zone->cond != BLK_ZONE_COND_EXP_OPEN)
			zone->cond = BLK_ZONE_COND_CLOSED;
		zone->wp = zone->start + file_sectors;
	}
	spin_unlock(&zone->wp_lock);

	return 0;
}
354
/* Handle REQ_OP_ZONE_OPEN: explicitly open a sequential zone. */
static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	int ret = 0;

	/* Conventional zones have no open/close state. */
	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	/* Recover the zone state first if a previous write failed. */
	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
		ret = zloop_update_seq_zone(zlo, zone_no);
		if (ret)
			goto unlock;
	}

	if (!zloop_do_open_zone(zlo, zone, true))
		ret = -EIO;

unlock:
	mutex_unlock(&zone->lock);

	return ret;
}
379
/* Handle REQ_OP_ZONE_CLOSE: close an open sequential zone. */
static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	int ret = 0;

	/* Conventional zones have no open/close state. */
	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	/* Recover the zone state first if a previous write failed. */
	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
		ret = zloop_update_seq_zone(zlo, zone_no);
		if (ret)
			goto unlock;
	}

	switch (zone->cond) {
	case BLK_ZONE_COND_CLOSED:
		/* Closing an already closed zone is a no-op. */
		break;
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
		spin_lock(&zone->wp_lock);
		zloop_lru_remove_open_zone(zlo, zone);
		/* An open zone with nothing written goes back to empty. */
		if (zone->wp == zone->start)
			zone->cond = BLK_ZONE_COND_EMPTY;
		else
			zone->cond = BLK_ZONE_COND_CLOSED;
		spin_unlock(&zone->wp_lock);
		break;
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_FULL:
	default:
		/* Empty and full zones cannot be closed. */
		ret = -EIO;
		break;
	}

unlock:
	mutex_unlock(&zone->lock);

	return ret;
}
421
/*
 * Handle REQ_OP_ZONE_RESET: truncate the backing file to 0 and mark the zone
 * empty.
 */
static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	int ret = 0;

	/* Conventional zones cannot be reset. */
	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	/*
	 * Resetting an already empty zone is a no-op, unless the zone is in
	 * error, in which case the truncate below re-establishes a known state.
	 */
	if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
	    zone->cond == BLK_ZONE_COND_EMPTY)
		goto unlock;

	if (vfs_truncate(&zone->file->f_path, 0)) {
		set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
		ret = -EIO;
		goto unlock;
	}

	spin_lock(&zone->wp_lock);
	zloop_mark_empty(zlo, zone);
	/* The truncate succeeded, so any previous error state is resolved. */
	clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
	spin_unlock(&zone->wp_lock);

unlock:
	mutex_unlock(&zone->lock);

	return ret;
}
452
zloop_reset_all_zones(struct zloop_device * zlo)453 static int zloop_reset_all_zones(struct zloop_device *zlo)
454 {
455 unsigned int i;
456 int ret;
457
458 for (i = zlo->nr_conv_zones; i < zlo->nr_zones; i++) {
459 ret = zloop_reset_zone(zlo, i);
460 if (ret)
461 return ret;
462 }
463
464 return 0;
465 }
466
/*
 * Handle REQ_OP_ZONE_FINISH: extend the backing file to the full zone size and
 * mark the zone full.
 */
static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	int ret = 0;

	/* Conventional zones cannot be finished. */
	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	/*
	 * Finishing an already full zone is a no-op, unless the zone is in
	 * error, in which case the truncate below re-establishes a known state.
	 */
	if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
	    zone->cond == BLK_ZONE_COND_FULL)
		goto unlock;

	if (vfs_truncate(&zone->file->f_path, zlo->zone_size << SECTOR_SHIFT)) {
		set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
		ret = -EIO;
		goto unlock;
	}

	spin_lock(&zone->wp_lock);
	zloop_mark_full(zlo, zone);
	/* The truncate succeeded, so any previous error state is resolved. */
	clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
	spin_unlock(&zone->wp_lock);

unlock:
	mutex_unlock(&zone->lock);

	return ret;
}
497
/*
 * Drop a reference on a command and complete the request when the last
 * reference is gone (references are held by the submitter and the async I/O
 * completion, see zloop_rw()).
 */
static void zloop_put_cmd(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);

	if (!atomic_dec_and_test(&cmd->ref))
		return;
	kfree(cmd->bvec);
	cmd->bvec = NULL;
	if (likely(!blk_should_fake_timeout(rq->q)))
		blk_mq_complete_request(rq);
}
509
/* Async I/O completion callback: record the result and drop a reference. */
static void zloop_rw_complete(struct kiocb *iocb, long ret)
{
	struct zloop_cmd *cmd = container_of(iocb, struct zloop_cmd, iocb);

	cmd->ret = ret;
	zloop_put_cmd(cmd);
}
517
/*
 * Issue an asynchronous read or write of the request data to the target zone's
 * backing file. Returns the result of ->read_iter()/->write_iter(), typically
 * -EIOCBQUEUED for async submission; zloop_rw_complete() handles completion.
 */
static int zloop_do_rw(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	int rw = req_op(rq) == REQ_OP_READ ? ITER_DEST : ITER_SOURCE;
	unsigned int nr_bvec = blk_rq_nr_bvec(rq);
	struct zloop_device *zlo = rq->q->queuedata;
	struct zloop_zone *zone = &zlo->zones[rq_zone_no(rq)];
	struct req_iterator rq_iter;
	struct iov_iter iter;

	if (rq->bio != rq->biotail) {
		struct bio_vec tmp, *bvec;

		/* Multi-bio request: flatten all bvecs into one array. */
		cmd->bvec = kmalloc_objs(*cmd->bvec, nr_bvec, GFP_NOIO);
		if (!cmd->bvec)
			return -EIO;

		/*
		 * The bios of the request may be started from the middle of
		 * the 'bvec' because of bio splitting, so we can't directly
		 * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
		 * API will take care of all details for us.
		 */
		bvec = cmd->bvec;
		rq_for_each_bvec(tmp, rq, rq_iter) {
			*bvec = tmp;
			bvec++;
		}
		iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq));
	} else {
		/*
		 * Same here, this bio may be started from the middle of the
		 * 'bvec' because of bio splitting, so offset from the bvec
		 * must be passed to iov iterator
		 */
		iov_iter_bvec(&iter, rw,
			__bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter),
			nr_bvec, blk_rq_bytes(rq));
		iter.iov_offset = rq->bio->bi_iter.bi_bvec_done;
	}

	/* File offset is relative to the zone start. */
	cmd->iocb.ki_pos = (cmd->sector - zone->start) << SECTOR_SHIFT;
	cmd->iocb.ki_filp = zone->file;
	cmd->iocb.ki_complete = zloop_rw_complete;
	if (!zlo->buffered_io)
		cmd->iocb.ki_flags = IOCB_DIRECT;
	cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);

	if (rw == ITER_SOURCE)
		return zone->file->f_op->write_iter(&cmd->iocb, &iter);
	return zone->file->f_op->read_iter(&cmd->iocb, &iter);
}
570
/*
 * Prepare a write or zone append to a sequential zone: validate the target
 * sector against the write pointer, implicitly open the zone if needed, and
 * advance the write pointer. Called with zone->lock held (see zloop_rw()).
 */
static int zloop_seq_write_prep(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	struct zloop_device *zlo = rq->q->queuedata;
	unsigned int zone_no = rq_zone_no(rq);
	sector_t nr_sectors = blk_rq_sectors(rq);
	bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
	struct zloop_zone *zone = &zlo->zones[zone_no];
	sector_t zone_end = zone->start + zlo->zone_capacity;
	int ret = 0;

	spin_lock(&zone->wp_lock);

	/*
	 * Zone append operations always go at the current write pointer, but
	 * regular write operations must already be aligned to the write pointer
	 * when submitted.
	 */
	if (is_append) {
		/*
		 * If ordered zone append is in use, we already checked and set
		 * the target sector in zloop_queue_rq().
		 */
		if (!zlo->ordered_zone_append) {
			if (zone->cond == BLK_ZONE_COND_FULL ||
			    zone->wp + nr_sectors > zone_end) {
				ret = -EIO;
				goto out_unlock;
			}
			cmd->sector = zone->wp;
		}
	} else {
		if (cmd->sector != zone->wp) {
			pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
			       zone_no, cmd->sector, zone->wp);
			ret = -EIO;
			goto out_unlock;
		}
	}

	/* Implicitly open the target zone. */
	if (!zloop_do_open_zone(zlo, zone, false)) {
		ret = -EIO;
		goto out_unlock;
	}

	/*
	 * Advance the write pointer, unless ordered zone append is in use. If
	 * the write fails, the write pointer position will be corrected when
	 * the next I/O starts execution.
	 */
	if (!is_append || !zlo->ordered_zone_append) {
		zone->wp += nr_sectors;
		if (zone->wp == zone_end)
			zloop_mark_full(zlo, zone);
	}
out_unlock:
	spin_unlock(&zone->wp_lock);
	return ret;
}
631
/*
 * Execute a read, write or zone append request. Takes two command references:
 * one dropped here after submission, the other by the async I/O completion
 * (zloop_rw_complete()), so the request completes only when both are done.
 */
static void zloop_rw(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	struct zloop_device *zlo = rq->q->queuedata;
	unsigned int zone_no = rq_zone_no(rq);
	sector_t nr_sectors = blk_rq_sectors(rq);
	bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
	bool is_write = req_op(rq) == REQ_OP_WRITE || is_append;
	struct zloop_zone *zone;
	int ret = -EIO;

	atomic_set(&cmd->ref, 2);
	cmd->sector = blk_rq_pos(rq);
	cmd->nr_sectors = nr_sectors;
	cmd->ret = 0;

	/* Zone append requests should not reach us when not advertised. */
	if (WARN_ON_ONCE(is_append && !zlo->zone_append))
		goto out;

	/* We should never get an I/O beyond the device capacity. */
	if (WARN_ON_ONCE(zone_no >= zlo->nr_zones))
		goto out;

	zone = &zlo->zones[zone_no];

	/*
	 * The block layer should never send requests that are not fully
	 * contained within the zone.
	 */
	if (WARN_ON_ONCE(cmd->sector + nr_sectors >
			 zone->start + zlo->zone_size))
		goto out;

	/* Recover the zone state if a previous sequential write failed. */
	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
		mutex_lock(&zone->lock);
		ret = zloop_update_seq_zone(zlo, zone_no);
		mutex_unlock(&zone->lock);
		if (ret)
			goto out;
	}

	if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
		/* Sequential writes: check/advance the wp under zone->lock. */
		mutex_lock(&zone->lock);
		ret = zloop_seq_write_prep(cmd);
		if (!ret)
			ret = zloop_do_rw(cmd);
		mutex_unlock(&zone->lock);
	} else {
		/* Reads and conventional zone writes need no wp handling. */
		ret = zloop_do_rw(cmd);
	}
out:
	/* -EIOCBQUEUED means the async I/O owns its reference; else complete. */
	if (ret != -EIOCBQUEUED)
		zloop_rw_complete(&cmd->iocb, ret);
	zloop_put_cmd(cmd);
}
687
zloop_zone_is_active(struct zloop_zone * zone)688 static inline bool zloop_zone_is_active(struct zloop_zone *zone)
689 {
690 switch (zone->cond) {
691 case BLK_ZONE_COND_EXP_OPEN:
692 case BLK_ZONE_COND_IMP_OPEN:
693 case BLK_ZONE_COND_CLOSED:
694 return true;
695 default:
696 return false;
697 }
698 }
699
/*
 * Persist the write pointer of every active zone in a "user.zloop.wp" xattr on
 * the zone file, so the write pointers can be recovered later (used when
 * discard_write_cache is enabled, see zloop_flush()).
 */
static int zloop_record_safe_wps(struct zloop_device *zlo)
{
	unsigned int i;
	int ret;

	for (i = 0; i < zlo->nr_zones; i++) {
		struct zloop_zone *zone = &zlo->zones[i];
		struct file *file = zone->file;

		/* Empty, full and conventional zones need no wp record. */
		if (!zloop_zone_is_active(zone))
			continue;
		ret = vfs_setxattr(file_mnt_idmap(file), file_dentry(file),
				"user.zloop.wp", &zone->wp, sizeof(zone->wp), 0);
		if (ret) {
			pr_err("%pg: failed to record write pointer (%d)\n",
			       zlo->disk->part0, ret);
			return ret;
		}
	}

	return 0;
}
722
/*
 * Handle REQ_OP_FLUSH.
 * Sync the entire FS containing the zone files instead of walking all files.
 */
static int zloop_flush(struct zloop_device *zlo)
{
	struct super_block *sb = file_inode(zlo->data_dir)->i_sb;
	int ret;

	/* Record current write pointers first so they survive the flush. */
	if (zlo->discard_write_cache) {
		ret = zloop_record_safe_wps(zlo);
		if (ret)
			return ret;
	}

	/* s_umount is needed to safely call sync_filesystem(). */
	down_read(&sb->s_umount);
	ret = sync_filesystem(sb);
	up_read(&sb->s_umount);

	return ret;
}
743
/*
 * Workqueue handler: dispatch one request to the appropriate operation
 * handler and complete it (except reads/writes, which complete through the
 * async I/O path in zloop_rw()).
 */
static void zloop_handle_cmd(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	struct zloop_device *zlo = rq->q->queuedata;

	/* We can block in this context, so ignore REQ_NOWAIT. */
	if (rq->cmd_flags & REQ_NOWAIT)
		rq->cmd_flags &= ~REQ_NOWAIT;

	switch (req_op(rq)) {
	case REQ_OP_READ:
	case REQ_OP_WRITE:
	case REQ_OP_ZONE_APPEND:
		/*
		 * zloop_rw() always executes asynchronously or completes
		 * directly.
		 */
		zloop_rw(cmd);
		return;
	case REQ_OP_FLUSH:
		cmd->ret = zloop_flush(zlo);
		break;
	case REQ_OP_ZONE_RESET:
		cmd->ret = zloop_reset_zone(zlo, rq_zone_no(rq));
		break;
	case REQ_OP_ZONE_RESET_ALL:
		cmd->ret = zloop_reset_all_zones(zlo);
		break;
	case REQ_OP_ZONE_FINISH:
		cmd->ret = zloop_finish_zone(zlo, rq_zone_no(rq));
		break;
	case REQ_OP_ZONE_OPEN:
		cmd->ret = zloop_open_zone(zlo, rq_zone_no(rq));
		break;
	case REQ_OP_ZONE_CLOSE:
		cmd->ret = zloop_close_zone(zlo, rq_zone_no(rq));
		break;
	default:
		WARN_ON_ONCE(1);
		pr_err("Unsupported operation %d\n", req_op(rq));
		cmd->ret = -EOPNOTSUPP;
		break;
	}

	blk_mq_complete_request(rq);
}
790
/*
 * Work item entry point. Run the command with PF_LOCAL_THROTTLE and
 * PF_MEMALLOC_NOIO set to avoid writeback throttling and recursion into
 * I/O-triggering memory reclaim while we do file I/O, restoring the original
 * task flags afterwards.
 */
static void zloop_cmd_workfn(struct work_struct *work)
{
	struct zloop_cmd *cmd = container_of(work, struct zloop_cmd, work);
	int orig_flags = current->flags;

	current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
	zloop_handle_cmd(cmd);
	current->flags = orig_flags;
}
800
/*
 * blk-mq ->complete handler: translate the command result into a block layer
 * status and finish the request. Also handles short reads, failed/partial
 * writes, and returning the written sector for zone appends.
 */
static void zloop_complete_rq(struct request *rq)
{
	struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
	struct zloop_device *zlo = rq->q->queuedata;
	unsigned int zone_no = cmd->sector >> zlo->zone_shift;
	struct zloop_zone *zone = &zlo->zones[zone_no];
	blk_status_t sts = BLK_STS_OK;

	switch (req_op(rq)) {
	case REQ_OP_READ:
		if (cmd->ret < 0)
			pr_err("Zone %u: failed read sector %llu, %llu sectors\n",
			       zone_no, cmd->sector, cmd->nr_sectors);

		if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
			/* short read */
			struct bio *bio;

			/* Zero-fill the unread tail rather than failing. */
			__rq_for_each_bio(bio, rq)
				zero_fill_bio(bio);
		}
		break;
	case REQ_OP_WRITE:
	case REQ_OP_ZONE_APPEND:
		if (cmd->ret < 0)
			pr_err("Zone %u: failed %swrite sector %llu, %llu sectors\n",
			       zone_no,
			       req_op(rq) == REQ_OP_WRITE ? "" : "append ",
			       cmd->sector, cmd->nr_sectors);

		/* A partial write is treated as a failure. */
		if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) {
			pr_err("Zone %u: partial write %ld/%u B\n",
			       zone_no, cmd->ret, blk_rq_bytes(rq));
			cmd->ret = -EIO;
		}

		if (cmd->ret < 0 && !test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
			/*
			 * A write to a sequential zone file failed: mark the
			 * zone as having an error. This will be corrected and
			 * cleared when the next IO is submitted.
			 */
			set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
			break;
		}
		/* Report the actual write location back for zone appends. */
		if (req_op(rq) == REQ_OP_ZONE_APPEND)
			rq->__sector = cmd->sector;

		break;
	default:
		break;
	}

	if (cmd->ret < 0)
		sts = errno_to_blk_status(cmd->ret);
	blk_mq_end_request(rq, sts);
}
858
/*
 * For ordered zone append: reserve the write pointer location for the request
 * at submission time (zloop_queue_rq()) rather than at execution time, so
 * appends land in submission order. Returns false if the zone cannot accept
 * the append.
 */
static bool zloop_set_zone_append_sector(struct request *rq)
{
	struct zloop_device *zlo = rq->q->queuedata;
	unsigned int zone_no = rq_zone_no(rq);
	struct zloop_zone *zone = &zlo->zones[zone_no];
	sector_t zone_end = zone->start + zlo->zone_capacity;
	sector_t nr_sectors = blk_rq_sectors(rq);

	spin_lock(&zone->wp_lock);

	/* Reject appends to full zones or ones that would cross zone_end. */
	if (zone->cond == BLK_ZONE_COND_FULL ||
	    zone->wp + nr_sectors > zone_end) {
		spin_unlock(&zone->wp_lock);
		return false;
	}

	/* Claim the current wp for this request and advance it. */
	rq->__sector = zone->wp;
	zone->wp += blk_rq_sectors(rq);
	if (zone->wp >= zone_end)
		zloop_mark_full(zlo, zone);

	spin_unlock(&zone->wp_lock);

	return true;
}
884
/*
 * blk-mq ->queue_rq handler: fail requests on a deleting device, reserve the
 * append sector when ordered zone append is enabled, and defer all actual work
 * to the device workqueue.
 */
static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx,
				   const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq);
	struct zloop_device *zlo = rq->q->queuedata;

	/* state is written from the control path; a racy read is acceptable. */
	if (data_race(READ_ONCE(zlo->state)) == Zlo_deleting) {
		rq->rq_flags |= RQF_QUIET;
		return BLK_STS_IOERR;
	}

	/*
	 * If we need to strongly order zone append operations, set the request
	 * sector to the zone write pointer location now instead of when the
	 * command work runs.
	 */
	if (zlo->ordered_zone_append && req_op(rq) == REQ_OP_ZONE_APPEND) {
		if (!zloop_set_zone_append_sector(rq))
			return BLK_STS_IOERR;
	}

	blk_mq_start_request(rq);

	INIT_WORK(&cmd->work, zloop_cmd_workfn);
	queue_work(zlo->workqueue, &cmd->work);

	return BLK_STS_OK;
}
914
/* blk-mq operations: submission and completion handlers. */
static const struct blk_mq_ops zloop_mq_ops = {
	.queue_rq	= zloop_queue_rq,
	.complete	= zloop_complete_rq,
};
919
/*
 * Block device ->open handler: only allow opening a fully created, live
 * device. Serialized against device creation/deletion by zloop_ctl_mutex.
 */
static int zloop_open(struct gendisk *disk, blk_mode_t mode)
{
	struct zloop_device *zlo = disk->private_data;
	int ret;

	ret = mutex_lock_killable(&zloop_ctl_mutex);
	if (ret)
		return ret;

	if (zlo->state != Zlo_live)
		ret = -ENXIO;
	mutex_unlock(&zloop_ctl_mutex);
	return ret;
}
934
/*
 * Block device ->report_zones handler: report up to nr_zones zone descriptors
 * starting at the zone containing @sector. Returns the number of zones
 * reported, or a negative error code.
 */
static int zloop_report_zones(struct gendisk *disk, sector_t sector,
		unsigned int nr_zones, struct blk_report_zones_args *args)
{
	struct zloop_device *zlo = disk->private_data;
	struct blk_zone blkz = {};
	unsigned int first, i;
	int ret;

	first = disk_zone_no(disk, sector);
	if (first >= zlo->nr_zones)
		return 0;
	nr_zones = min(nr_zones, zlo->nr_zones - first);

	for (i = 0; i < nr_zones; i++) {
		unsigned int zone_no = first + i;
		struct zloop_zone *zone = &zlo->zones[zone_no];

		mutex_lock(&zone->lock);

		/* Refresh zone state if a previous sequential write failed. */
		if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
			ret = zloop_update_seq_zone(zlo, zone_no);
			if (ret) {
				mutex_unlock(&zone->lock);
				return ret;
			}
		}

		blkz.start = zone->start;
		blkz.len = zlo->zone_size;
		/* wp_lock ensures a consistent write pointer snapshot. */
		spin_lock(&zone->wp_lock);
		blkz.wp = zone->wp;
		spin_unlock(&zone->wp_lock);
		blkz.cond = zone->cond;
		if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) {
			blkz.type = BLK_ZONE_TYPE_CONVENTIONAL;
			blkz.capacity = zlo->zone_size;
		} else {
			blkz.type = BLK_ZONE_TYPE_SEQWRITE_REQ;
			blkz.capacity = zlo->zone_capacity;
		}

		mutex_unlock(&zone->lock);

		ret = disk_report_zone(disk, &blkz, i, args);
		if (ret)
			return ret;
	}

	return nr_zones;
}
985
/*
 * Block device ->free_disk handler: release all device resources once the
 * last reference to the disk is dropped.
 */
static void zloop_free_disk(struct gendisk *disk)
{
	struct zloop_device *zlo = disk->private_data;
	unsigned int i;

	blk_mq_free_tag_set(&zlo->tag_set);

	for (i = 0; i < zlo->nr_zones; i++) {
		struct zloop_zone *zone = &zlo->zones[i];

		/* Restore the gfp mask saved when the file was set up. */
		mapping_set_gfp_mask(zone->file->f_mapping,
				zone->old_gfp_mask);
		fput(zone->file);
	}

	fput(zlo->data_dir);
	destroy_workqueue(zlo->workqueue);
	kfree(zlo->base_dir);
	kvfree(zlo);
}
1006
/* Block device operations for the zloop gendisk. */
static const struct block_device_operations zloop_fops = {
	.owner		= THIS_MODULE,
	.open		= zloop_open,
	.report_zones	= zloop_report_zones,
	.free_disk	= zloop_free_disk,
};
1013
/*
 * Open a file whose path is built from a printf-style format string.
 * Returns the opened file or an ERR_PTR() on failure (including -ENOMEM if
 * the path string cannot be allocated).
 */
__printf(3, 4)
static struct file *zloop_filp_open_fmt(int oflags, umode_t mode,
		const char *fmt, ...)
{
	struct file *file;
	va_list ap;
	char *p;

	va_start(ap, fmt);
	p = kvasprintf(GFP_KERNEL, fmt, ap);
	va_end(ap);

	if (!p)
		return ERR_PTR(-ENOMEM);
	file = filp_open(p, oflags, mode);
	kfree(p);
	return file;
}
1032
/*
 * Determine the device logical block size from the backing filesystem of the
 * given zone file, and check that the zone capacity is aligned to it.
 */
static int zloop_get_block_size(struct zloop_device *zlo,
				struct zloop_zone *zone)
{
	struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev;
	struct kstat st;

	/*
	 * If the FS block size is lower than or equal to 4K, use that as the
	 * device block size. Otherwise, fallback to the FS direct IO alignment
	 * constraint if that is provided, and to the FS underlying device
	 * physical block size if the direct IO alignment is unknown.
	 */
	if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K)
		zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize;
	else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) &&
		 (st.result_mask & STATX_DIOALIGN))
		zlo->block_size = st.dio_offset_align;
	else if (sb_bdev)
		zlo->block_size = bdev_physical_block_size(sb_bdev);
	else
		zlo->block_size = SECTOR_SIZE;

	/* block_size is a power of two, so this is an alignment check. */
	if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
		pr_err("Zone capacity is not aligned to block size %u\n",
		       zlo->block_size);
		return -EINVAL;
	}

	return 0;
}
1063
/*
 * Initialize one zone: set up its locks and start sector, open (or create)
 * its backing file, and derive its initial condition. The first time a zone
 * file is opened, the device block size is also determined from it.
 *
 * When @restore is true, the backing files must already exist and have a
 * consistent size; otherwise they are created.
 *
 * Returns 0 on success or a negative error code.
 */
static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts,
			   unsigned int zone_no, bool restore)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	int oflags = O_RDWR;
	struct kstat stat;
	sector_t file_sectors;
	int ret;

	mutex_init(&zone->lock);
	INIT_LIST_HEAD(&zone->open_zone_entry);
	spin_lock_init(&zone->wp_lock);
	zone->start = (sector_t)zone_no << zlo->zone_shift;

	/* When restoring an existing device, the zone files must exist. */
	if (!restore)
		oflags |= O_CREAT;

	if (!opts->buffered_io)
		oflags |= O_DIRECT;

	if (zone_no < zlo->nr_conv_zones) {
		/* Conventional zone file. */
		set_bit(ZLOOP_ZONE_CONV, &zone->flags);
		zone->cond = BLK_ZONE_COND_NOT_WP;
		zone->wp = U64_MAX;

		zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/cnv-%06u",
					zlo->base_dir, zlo->id, zone_no);
		if (IS_ERR(zone->file)) {
			pr_err("Failed to open zone %u file %s/%u/cnv-%06u (err=%ld)",
			       zone_no, zlo->base_dir, zlo->id, zone_no,
			       PTR_ERR(zone->file));
			return PTR_ERR(zone->file);
		}

		/* The first opened zone file defines the device block size. */
		if (!zlo->block_size) {
			ret = zloop_get_block_size(zlo, zone);
			if (ret)
				return ret;
		}

		ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
		if (ret < 0) {
			pr_err("Failed to get zone %u file stat\n", zone_no);
			return ret;
		}
		file_sectors = stat.size >> SECTOR_SHIFT;

		/*
		 * A restored conventional zone file must already have the full
		 * zone size. Note: the expected value is zone_size (not
		 * zone_capacity) and the failure must be reported as an error,
		 * as ret is 0 at this point.
		 */
		if (restore && file_sectors != zlo->zone_size) {
			pr_err("Invalid conventional zone %u file size (%llu sectors != %llu)\n",
			       zone_no, file_sectors, zlo->zone_size);
			return -EINVAL;
		}

		/* Fully allocate new conventional zone files. */
		ret = vfs_truncate(&zone->file->f_path,
				   zlo->zone_size << SECTOR_SHIFT);
		if (ret < 0) {
			pr_err("Failed to truncate zone %u file (err=%d)\n",
			       zone_no, ret);
			return ret;
		}

		return 0;
	}

	/* Sequential zone file. */
	zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/seq-%06u",
					 zlo->base_dir, zlo->id, zone_no);
	if (IS_ERR(zone->file)) {
		pr_err("Failed to open zone %u file %s/%u/seq-%06u (err=%ld)",
		       zone_no, zlo->base_dir, zlo->id, zone_no,
		       PTR_ERR(zone->file));
		return PTR_ERR(zone->file);
	}

	/* The first opened zone file defines the device block size. */
	if (!zlo->block_size) {
		ret = zloop_get_block_size(zlo, zone);
		if (ret)
			return ret;
	}

	/* Derive the zone condition and write pointer from the file size. */
	mutex_lock(&zone->lock);
	ret = zloop_update_seq_zone(zlo, zone_no);
	mutex_unlock(&zone->lock);

	return ret;
}
1153
zloop_dev_exists(struct zloop_device * zlo)1154 static bool zloop_dev_exists(struct zloop_device *zlo)
1155 {
1156 struct file *cnv, *seq;
1157 bool exists;
1158
1159 cnv = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/cnv-%06u",
1160 zlo->base_dir, zlo->id, 0);
1161 seq = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/seq-%06u",
1162 zlo->base_dir, zlo->id, 0);
1163 exists = !IS_ERR(cnv) || !IS_ERR(seq);
1164
1165 if (!IS_ERR(cnv))
1166 fput(cnv);
1167 if (!IS_ERR(seq))
1168 fput(seq);
1169
1170 return exists;
1171 }
1172
zloop_ctl_add(struct zloop_options * opts)1173 static int zloop_ctl_add(struct zloop_options *opts)
1174 {
1175 struct queue_limits lim = {
1176 .max_hw_sectors = SZ_1M >> SECTOR_SHIFT,
1177 .chunk_sectors = opts->zone_size,
1178 .features = BLK_FEAT_ZONED | BLK_FEAT_WRITE_CACHE,
1179
1180 };
1181 unsigned int nr_zones, i, j;
1182 struct zloop_device *zlo;
1183 int ret = -EINVAL;
1184 bool restore;
1185
1186 __module_get(THIS_MODULE);
1187
1188 nr_zones = opts->capacity >> ilog2(opts->zone_size);
1189 if (opts->nr_conv_zones >= nr_zones) {
1190 pr_err("Invalid number of conventional zones %u\n",
1191 opts->nr_conv_zones);
1192 goto out;
1193 }
1194
1195 if (opts->max_open_zones > nr_zones - opts->nr_conv_zones) {
1196 pr_err("Invalid maximum number of open zones %u\n",
1197 opts->max_open_zones);
1198 goto out;
1199 }
1200
1201 zlo = kvzalloc_flex(*zlo, zones, nr_zones);
1202 if (!zlo) {
1203 ret = -ENOMEM;
1204 goto out;
1205 }
1206 WRITE_ONCE(zlo->state, Zlo_creating);
1207 spin_lock_init(&zlo->open_zones_lock);
1208 INIT_LIST_HEAD(&zlo->open_zones_lru_list);
1209
1210 ret = mutex_lock_killable(&zloop_ctl_mutex);
1211 if (ret)
1212 goto out_free_dev;
1213
1214 /* Allocate id, if @opts->id >= 0, we're requesting that specific id */
1215 if (opts->id >= 0) {
1216 ret = idr_alloc(&zloop_index_idr, zlo,
1217 opts->id, opts->id + 1, GFP_KERNEL);
1218 if (ret == -ENOSPC)
1219 ret = -EEXIST;
1220 } else {
1221 ret = idr_alloc(&zloop_index_idr, zlo, 0, 0, GFP_KERNEL);
1222 }
1223 mutex_unlock(&zloop_ctl_mutex);
1224 if (ret < 0)
1225 goto out_free_dev;
1226
1227 zlo->id = ret;
1228 zlo->zone_shift = ilog2(opts->zone_size);
1229 zlo->zone_size = opts->zone_size;
1230 if (opts->zone_capacity)
1231 zlo->zone_capacity = opts->zone_capacity;
1232 else
1233 zlo->zone_capacity = zlo->zone_size;
1234 zlo->nr_zones = nr_zones;
1235 zlo->nr_conv_zones = opts->nr_conv_zones;
1236 zlo->max_open_zones = opts->max_open_zones;
1237 zlo->buffered_io = opts->buffered_io;
1238 zlo->zone_append = opts->zone_append;
1239 if (zlo->zone_append)
1240 zlo->ordered_zone_append = opts->ordered_zone_append;
1241 zlo->discard_write_cache = opts->discard_write_cache;
1242
1243 zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE,
1244 opts->nr_queues * opts->queue_depth, zlo->id);
1245 if (!zlo->workqueue) {
1246 ret = -ENOMEM;
1247 goto out_free_idr;
1248 }
1249
1250 if (opts->base_dir)
1251 zlo->base_dir = kstrdup(opts->base_dir, GFP_KERNEL);
1252 else
1253 zlo->base_dir = kstrdup(ZLOOP_DEF_BASE_DIR, GFP_KERNEL);
1254 if (!zlo->base_dir) {
1255 ret = -ENOMEM;
1256 goto out_destroy_workqueue;
1257 }
1258
1259 zlo->data_dir = zloop_filp_open_fmt(O_RDONLY | O_DIRECTORY, 0, "%s/%u",
1260 zlo->base_dir, zlo->id);
1261 if (IS_ERR(zlo->data_dir)) {
1262 ret = PTR_ERR(zlo->data_dir);
1263 pr_warn("Failed to open directory %s/%u (err=%d)\n",
1264 zlo->base_dir, zlo->id, ret);
1265 goto out_free_base_dir;
1266 }
1267
1268 /*
1269 * If we already have zone files, we are restoring a device created by a
1270 * previous add operation. In this case, zloop_init_zone() will check
1271 * that the zone files are consistent with the zone configuration given.
1272 */
1273 restore = zloop_dev_exists(zlo);
1274 for (i = 0; i < nr_zones; i++) {
1275 ret = zloop_init_zone(zlo, opts, i, restore);
1276 if (ret)
1277 goto out_close_files;
1278 }
1279
1280 lim.physical_block_size = zlo->block_size;
1281 lim.logical_block_size = zlo->block_size;
1282 if (zlo->zone_append)
1283 lim.max_hw_zone_append_sectors = lim.max_hw_sectors;
1284 lim.max_open_zones = zlo->max_open_zones;
1285
1286 zlo->tag_set.ops = &zloop_mq_ops;
1287 zlo->tag_set.nr_hw_queues = opts->nr_queues;
1288 zlo->tag_set.queue_depth = opts->queue_depth;
1289 zlo->tag_set.numa_node = NUMA_NO_NODE;
1290 zlo->tag_set.cmd_size = sizeof(struct zloop_cmd);
1291 zlo->tag_set.driver_data = zlo;
1292
1293 ret = blk_mq_alloc_tag_set(&zlo->tag_set);
1294 if (ret) {
1295 pr_err("blk_mq_alloc_tag_set failed (err=%d)\n", ret);
1296 goto out_close_files;
1297 }
1298
1299 zlo->disk = blk_mq_alloc_disk(&zlo->tag_set, &lim, zlo);
1300 if (IS_ERR(zlo->disk)) {
1301 pr_err("blk_mq_alloc_disk failed (err=%d)\n", ret);
1302 ret = PTR_ERR(zlo->disk);
1303 goto out_cleanup_tags;
1304 }
1305 zlo->disk->flags = GENHD_FL_NO_PART;
1306 zlo->disk->fops = &zloop_fops;
1307 zlo->disk->private_data = zlo;
1308 sprintf(zlo->disk->disk_name, "zloop%d", zlo->id);
1309 set_capacity(zlo->disk, (u64)lim.chunk_sectors * zlo->nr_zones);
1310
1311 ret = blk_revalidate_disk_zones(zlo->disk);
1312 if (ret)
1313 goto out_cleanup_disk;
1314
1315 ret = add_disk(zlo->disk);
1316 if (ret) {
1317 pr_err("add_disk failed (err=%d)\n", ret);
1318 goto out_cleanup_disk;
1319 }
1320
1321 mutex_lock(&zloop_ctl_mutex);
1322 WRITE_ONCE(zlo->state, Zlo_live);
1323 mutex_unlock(&zloop_ctl_mutex);
1324
1325 pr_info("zloop: device %d, %u zones of %llu MiB, %u B block size\n",
1326 zlo->id, zlo->nr_zones,
1327 ((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20,
1328 zlo->block_size);
1329 pr_info("zloop%d: using %s%s zone append\n",
1330 zlo->id,
1331 zlo->ordered_zone_append ? "ordered " : "",
1332 zlo->zone_append ? "native" : "emulated");
1333
1334 return 0;
1335
1336 out_cleanup_disk:
1337 put_disk(zlo->disk);
1338 out_cleanup_tags:
1339 blk_mq_free_tag_set(&zlo->tag_set);
1340 out_close_files:
1341 for (j = 0; j < i; j++) {
1342 struct zloop_zone *zone = &zlo->zones[j];
1343
1344 if (!IS_ERR_OR_NULL(zone->file))
1345 fput(zone->file);
1346 }
1347 fput(zlo->data_dir);
1348 out_free_base_dir:
1349 kfree(zlo->base_dir);
1350 out_destroy_workqueue:
1351 destroy_workqueue(zlo->workqueue);
1352 out_free_idr:
1353 mutex_lock(&zloop_ctl_mutex);
1354 idr_remove(&zloop_index_idr, zlo->id);
1355 mutex_unlock(&zloop_ctl_mutex);
1356 out_free_dev:
1357 kvfree(zlo);
1358 out:
1359 module_put(THIS_MODULE);
1360 if (ret == -ENOENT)
1361 ret = -EINVAL;
1362 return ret;
1363 }
1364
zloop_forget_cache(struct zloop_device * zlo)1365 static void zloop_forget_cache(struct zloop_device *zlo)
1366 {
1367 unsigned int i;
1368 int ret;
1369
1370 pr_info("%pg: discarding volatile write cache\n", zlo->disk->part0);
1371
1372 for (i = 0; i < zlo->nr_zones; i++) {
1373 struct zloop_zone *zone = &zlo->zones[i];
1374 struct file *file = zone->file;
1375 sector_t old_wp;
1376
1377 if (!zloop_zone_is_active(zone))
1378 continue;
1379
1380 ret = vfs_getxattr(file_mnt_idmap(file), file_dentry(file),
1381 "user.zloop.wp", &old_wp, sizeof(old_wp));
1382 if (ret == -ENODATA) {
1383 old_wp = 0;
1384 } else if (ret != sizeof(old_wp)) {
1385 pr_err("%pg: failed to retrieve write pointer (%d)\n",
1386 zlo->disk->part0, ret);
1387 continue;
1388 }
1389
1390 if (old_wp > zone->wp)
1391 continue;
1392 /*
1393 * This should not happen, if we recored a full zone, it can't
1394 * be active.
1395 */
1396 if (WARN_ON_ONCE(old_wp == ULLONG_MAX))
1397 continue;
1398
1399 vfs_truncate(&file->f_path,
1400 (old_wp - zone->start) << SECTOR_SHIFT);
1401 }
1402 }
1403
zloop_ctl_remove(struct zloop_options * opts)1404 static int zloop_ctl_remove(struct zloop_options *opts)
1405 {
1406 struct zloop_device *zlo;
1407 int ret;
1408
1409 if (!(opts->mask & ZLOOP_OPT_ID)) {
1410 pr_err("No ID specified for remove\n");
1411 return -EINVAL;
1412 }
1413
1414 if (opts->mask & ~ZLOOP_OPT_ID) {
1415 pr_err("Invalid option specified for remove\n");
1416 return -EINVAL;
1417 }
1418
1419 ret = mutex_lock_killable(&zloop_ctl_mutex);
1420 if (ret)
1421 return ret;
1422
1423 zlo = idr_find(&zloop_index_idr, opts->id);
1424 if (!zlo || zlo->state == Zlo_creating) {
1425 ret = -ENODEV;
1426 } else if (zlo->state == Zlo_deleting) {
1427 ret = -EINVAL;
1428 } else {
1429 idr_remove(&zloop_index_idr, zlo->id);
1430 WRITE_ONCE(zlo->state, Zlo_deleting);
1431 }
1432
1433 mutex_unlock(&zloop_ctl_mutex);
1434 if (ret)
1435 return ret;
1436
1437 del_gendisk(zlo->disk);
1438
1439 if (zlo->discard_write_cache)
1440 zloop_forget_cache(zlo);
1441
1442 put_disk(zlo->disk);
1443
1444 pr_info("Removed device %d\n", opts->id);
1445
1446 module_put(THIS_MODULE);
1447
1448 return 0;
1449 }
1450
zloop_parse_options(struct zloop_options * opts,const char * buf)1451 static int zloop_parse_options(struct zloop_options *opts, const char *buf)
1452 {
1453 substring_t args[MAX_OPT_ARGS];
1454 char *options, *o, *p;
1455 unsigned int token;
1456 int ret = 0;
1457
1458 /* Set defaults. */
1459 opts->mask = 0;
1460 opts->id = ZLOOP_DEF_ID;
1461 opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES;
1462 opts->zone_size = ZLOOP_DEF_ZONE_SIZE;
1463 opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES;
1464 opts->max_open_zones = ZLOOP_DEF_MAX_OPEN_ZONES;
1465 opts->nr_queues = ZLOOP_DEF_NR_QUEUES;
1466 opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH;
1467 opts->buffered_io = ZLOOP_DEF_BUFFERED_IO;
1468 opts->zone_append = ZLOOP_DEF_ZONE_APPEND;
1469 opts->ordered_zone_append = ZLOOP_DEF_ORDERED_ZONE_APPEND;
1470
1471 if (!buf)
1472 return 0;
1473
1474 /* Skip leading spaces before the options. */
1475 while (isspace(*buf))
1476 buf++;
1477
1478 options = o = kstrdup(buf, GFP_KERNEL);
1479 if (!options)
1480 return -ENOMEM;
1481
1482 /* Parse the options, doing only some light invalid value checks. */
1483 while ((p = strsep(&o, ",\n")) != NULL) {
1484 if (!*p)
1485 continue;
1486
1487 token = match_token(p, zloop_opt_tokens, args);
1488 opts->mask |= token;
1489 switch (token) {
1490 case ZLOOP_OPT_ID:
1491 if (match_int(args, &opts->id)) {
1492 ret = -EINVAL;
1493 goto out;
1494 }
1495 break;
1496 case ZLOOP_OPT_CAPACITY:
1497 if (match_uint(args, &token)) {
1498 ret = -EINVAL;
1499 goto out;
1500 }
1501 if (!token) {
1502 pr_err("Invalid capacity\n");
1503 ret = -EINVAL;
1504 goto out;
1505 }
1506 opts->capacity =
1507 ((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
1508 break;
1509 case ZLOOP_OPT_ZONE_SIZE:
1510 if (match_uint(args, &token)) {
1511 ret = -EINVAL;
1512 goto out;
1513 }
1514 if (!token || token > ZLOOP_MAX_ZONE_SIZE_MB ||
1515 !is_power_of_2(token)) {
1516 pr_err("Invalid zone size %u\n", token);
1517 ret = -EINVAL;
1518 goto out;
1519 }
1520 opts->zone_size =
1521 ((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
1522 break;
1523 case ZLOOP_OPT_ZONE_CAPACITY:
1524 if (match_uint(args, &token)) {
1525 ret = -EINVAL;
1526 goto out;
1527 }
1528 if (!token) {
1529 pr_err("Invalid zone capacity\n");
1530 ret = -EINVAL;
1531 goto out;
1532 }
1533 opts->zone_capacity =
1534 ((sector_t)token * SZ_1M) >> SECTOR_SHIFT;
1535 break;
1536 case ZLOOP_OPT_NR_CONV_ZONES:
1537 if (match_uint(args, &token)) {
1538 ret = -EINVAL;
1539 goto out;
1540 }
1541 opts->nr_conv_zones = token;
1542 break;
1543 case ZLOOP_OPT_MAX_OPEN_ZONES:
1544 if (match_uint(args, &token)) {
1545 ret = -EINVAL;
1546 goto out;
1547 }
1548 opts->max_open_zones = token;
1549 break;
1550 case ZLOOP_OPT_BASE_DIR:
1551 p = match_strdup(args);
1552 if (!p) {
1553 ret = -ENOMEM;
1554 goto out;
1555 }
1556 kfree(opts->base_dir);
1557 opts->base_dir = p;
1558 break;
1559 case ZLOOP_OPT_NR_QUEUES:
1560 if (match_uint(args, &token)) {
1561 ret = -EINVAL;
1562 goto out;
1563 }
1564 if (!token) {
1565 pr_err("Invalid number of queues\n");
1566 ret = -EINVAL;
1567 goto out;
1568 }
1569 opts->nr_queues = min(token, num_online_cpus());
1570 break;
1571 case ZLOOP_OPT_QUEUE_DEPTH:
1572 if (match_uint(args, &token)) {
1573 ret = -EINVAL;
1574 goto out;
1575 }
1576 if (!token) {
1577 pr_err("Invalid queue depth\n");
1578 ret = -EINVAL;
1579 goto out;
1580 }
1581 opts->queue_depth = token;
1582 break;
1583 case ZLOOP_OPT_BUFFERED_IO:
1584 opts->buffered_io = true;
1585 break;
1586 case ZLOOP_OPT_ZONE_APPEND:
1587 if (match_uint(args, &token)) {
1588 ret = -EINVAL;
1589 goto out;
1590 }
1591 if (token != 0 && token != 1) {
1592 pr_err("Invalid zone_append value\n");
1593 ret = -EINVAL;
1594 goto out;
1595 }
1596 opts->zone_append = token;
1597 break;
1598 case ZLOOP_OPT_ORDERED_ZONE_APPEND:
1599 opts->ordered_zone_append = true;
1600 break;
1601 case ZLOOP_OPT_DISCARD_WRITE_CACHE:
1602 opts->discard_write_cache = true;
1603 break;
1604 case ZLOOP_OPT_ERR:
1605 default:
1606 pr_warn("unknown parameter or missing value '%s'\n", p);
1607 ret = -EINVAL;
1608 goto out;
1609 }
1610 }
1611
1612 ret = -EINVAL;
1613 if (opts->capacity <= opts->zone_size) {
1614 pr_err("Invalid capacity\n");
1615 goto out;
1616 }
1617
1618 if (opts->zone_capacity > opts->zone_size) {
1619 pr_err("Invalid zone capacity\n");
1620 goto out;
1621 }
1622
1623 ret = 0;
1624 out:
1625 kfree(options);
1626 return ret;
1627 }
1628
/* Control operation codes; also used as indexes into zloop_ctl_ops[]. */
enum {
	ZLOOP_CTL_ADD,
	ZLOOP_CTL_REMOVE,
};

/* Mapping of control command names to operation codes, NULL-terminated. */
static struct zloop_ctl_op {
	int code;
	const char *name;
} zloop_ctl_ops[] = {
	{ ZLOOP_CTL_ADD,	"add" },
	{ ZLOOP_CTL_REMOVE,	"remove" },
	{ -1,	NULL },
};
1642
zloop_ctl_write(struct file * file,const char __user * ubuf,size_t count,loff_t * pos)1643 static ssize_t zloop_ctl_write(struct file *file, const char __user *ubuf,
1644 size_t count, loff_t *pos)
1645 {
1646 struct zloop_options opts = { };
1647 struct zloop_ctl_op *op;
1648 const char *buf, *opts_buf;
1649 int i, ret;
1650
1651 if (count > PAGE_SIZE)
1652 return -ENOMEM;
1653
1654 buf = memdup_user_nul(ubuf, count);
1655 if (IS_ERR(buf))
1656 return PTR_ERR(buf);
1657
1658 for (i = 0; i < ARRAY_SIZE(zloop_ctl_ops); i++) {
1659 op = &zloop_ctl_ops[i];
1660 if (!op->name) {
1661 pr_err("Invalid operation\n");
1662 ret = -EINVAL;
1663 goto out;
1664 }
1665 if (!strncmp(buf, op->name, strlen(op->name)))
1666 break;
1667 }
1668
1669 if (count <= strlen(op->name))
1670 opts_buf = NULL;
1671 else
1672 opts_buf = buf + strlen(op->name);
1673
1674 ret = zloop_parse_options(&opts, opts_buf);
1675 if (ret) {
1676 pr_err("Failed to parse options\n");
1677 goto out;
1678 }
1679
1680 switch (op->code) {
1681 case ZLOOP_CTL_ADD:
1682 ret = zloop_ctl_add(&opts);
1683 break;
1684 case ZLOOP_CTL_REMOVE:
1685 ret = zloop_ctl_remove(&opts);
1686 break;
1687 default:
1688 pr_err("Invalid operation\n");
1689 ret = -EINVAL;
1690 goto out;
1691 }
1692
1693 out:
1694 kfree(opts.base_dir);
1695 kfree(buf);
1696 return ret ? ret : count;
1697 }
1698
zloop_ctl_show(struct seq_file * seq_file,void * private)1699 static int zloop_ctl_show(struct seq_file *seq_file, void *private)
1700 {
1701 const struct match_token *tok;
1702 int i;
1703
1704 /* Add operation */
1705 seq_printf(seq_file, "%s ", zloop_ctl_ops[0].name);
1706 for (i = 0; i < ARRAY_SIZE(zloop_opt_tokens); i++) {
1707 tok = &zloop_opt_tokens[i];
1708 if (!tok->pattern)
1709 break;
1710 if (i)
1711 seq_putc(seq_file, ',');
1712 seq_puts(seq_file, tok->pattern);
1713 }
1714 seq_putc(seq_file, '\n');
1715
1716 /* Remove operation */
1717 seq_puts(seq_file, zloop_ctl_ops[1].name);
1718 seq_puts(seq_file, " id=%d\n");
1719
1720 return 0;
1721 }
1722
/*
 * Open /dev/zloop-control: reset private_data before handing the file to
 * single_open(), which installs the seq_file used by zloop_ctl_show().
 */
static int zloop_ctl_open(struct inode *inode, struct file *file)
{
	file->private_data = NULL;
	return single_open(file, zloop_ctl_show, NULL);
}
1728
/* Release /dev/zloop-control: tear down the seq_file set up on open. */
static int zloop_ctl_release(struct inode *inode, struct file *file)
{
	return single_release(inode, file);
}
1733
/* File operations for the zloop-control misc device. */
static const struct file_operations zloop_ctl_fops = {
	.owner		= THIS_MODULE,
	.open		= zloop_ctl_open,
	.release	= zloop_ctl_release,
	.write		= zloop_ctl_write,
	.read		= seq_read,
};

/* The /dev/zloop-control device, with a dynamically allocated minor. */
static struct miscdevice zloop_misc = {
	.minor		= MISC_DYNAMIC_MINOR,
	.name		= "zloop-control",
	.fops		= &zloop_ctl_fops,
};
1747
zloop_init(void)1748 static int __init zloop_init(void)
1749 {
1750 int ret;
1751
1752 ret = misc_register(&zloop_misc);
1753 if (ret) {
1754 pr_err("Failed to register misc device: %d\n", ret);
1755 return ret;
1756 }
1757 pr_info("Module loaded\n");
1758
1759 return 0;
1760 }
1761
/* Module exit: unregister the control device and free the device ID IDR. */
static void __exit zloop_exit(void)
{
	misc_deregister(&zloop_misc);
	idr_destroy(&zloop_index_idr);
}
1767
/* Module entry/exit points and metadata. */
module_init(zloop_init);
module_exit(zloop_exit);

MODULE_DESCRIPTION("Zoned loopback device");
MODULE_LICENSE("GPL");
1773