1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * Copyright (c) 2025, Christoph Hellwig. 4 * Copyright (c) 2025, Western Digital Corporation or its affiliates. 5 * 6 * Zoned Loop Device driver - exports a zoned block device using one file per 7 * zone as backing storage. 8 */ 9 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 10 11 #include <linux/module.h> 12 #include <linux/blk-mq.h> 13 #include <linux/blkzoned.h> 14 #include <linux/pagemap.h> 15 #include <linux/miscdevice.h> 16 #include <linux/falloc.h> 17 #include <linux/mutex.h> 18 #include <linux/parser.h> 19 #include <linux/seq_file.h> 20 #include <linux/xattr.h> 21 22 /* 23 * Options for adding (and removing) a device. 24 */ 25 enum { 26 ZLOOP_OPT_ERR = 0, 27 ZLOOP_OPT_ID = (1 << 0), 28 ZLOOP_OPT_CAPACITY = (1 << 1), 29 ZLOOP_OPT_ZONE_SIZE = (1 << 2), 30 ZLOOP_OPT_ZONE_CAPACITY = (1 << 3), 31 ZLOOP_OPT_NR_CONV_ZONES = (1 << 4), 32 ZLOOP_OPT_BASE_DIR = (1 << 5), 33 ZLOOP_OPT_NR_QUEUES = (1 << 6), 34 ZLOOP_OPT_QUEUE_DEPTH = (1 << 7), 35 ZLOOP_OPT_BUFFERED_IO = (1 << 8), 36 ZLOOP_OPT_ZONE_APPEND = (1 << 9), 37 ZLOOP_OPT_ORDERED_ZONE_APPEND = (1 << 10), 38 ZLOOP_OPT_DISCARD_WRITE_CACHE = (1 << 11), 39 ZLOOP_OPT_MAX_OPEN_ZONES = (1 << 12), 40 }; 41 42 static const match_table_t zloop_opt_tokens = { 43 { ZLOOP_OPT_ID, "id=%d" }, 44 { ZLOOP_OPT_CAPACITY, "capacity_mb=%u" }, 45 { ZLOOP_OPT_ZONE_SIZE, "zone_size_mb=%u" }, 46 { ZLOOP_OPT_ZONE_CAPACITY, "zone_capacity_mb=%u" }, 47 { ZLOOP_OPT_NR_CONV_ZONES, "conv_zones=%u" }, 48 { ZLOOP_OPT_BASE_DIR, "base_dir=%s" }, 49 { ZLOOP_OPT_NR_QUEUES, "nr_queues=%u" }, 50 { ZLOOP_OPT_QUEUE_DEPTH, "queue_depth=%u" }, 51 { ZLOOP_OPT_BUFFERED_IO, "buffered_io" }, 52 { ZLOOP_OPT_ZONE_APPEND, "zone_append=%u" }, 53 { ZLOOP_OPT_ORDERED_ZONE_APPEND, "ordered_zone_append" }, 54 { ZLOOP_OPT_DISCARD_WRITE_CACHE, "discard_write_cache" }, 55 { ZLOOP_OPT_MAX_OPEN_ZONES, "max_open_zones=%u" }, 56 { ZLOOP_OPT_ERR, NULL } 57 }; 58 59 /* Default values for the "add" operation. */ 60 #define ZLOOP_DEF_ID -1 61 #define ZLOOP_DEF_ZONE_SIZE ((256ULL * SZ_1M) >> SECTOR_SHIFT) 62 #define ZLOOP_DEF_NR_ZONES 64 63 #define ZLOOP_DEF_NR_CONV_ZONES 8 64 #define ZLOOP_DEF_MAX_OPEN_ZONES 0 65 #define ZLOOP_DEF_BASE_DIR "/var/local/zloop" 66 #define ZLOOP_DEF_NR_QUEUES 1 67 #define ZLOOP_DEF_QUEUE_DEPTH 128 68 #define ZLOOP_DEF_BUFFERED_IO false 69 #define ZLOOP_DEF_ZONE_APPEND true 70 #define ZLOOP_DEF_ORDERED_ZONE_APPEND false 71 72 /* Arbitrary limit on the zone size (16GB). */ 73 #define ZLOOP_MAX_ZONE_SIZE_MB 16384 74 75 struct zloop_options { 76 unsigned int mask; 77 int id; 78 sector_t capacity; 79 sector_t zone_size; 80 sector_t zone_capacity; 81 unsigned int nr_conv_zones; 82 unsigned int max_open_zones; 83 char *base_dir; 84 unsigned int nr_queues; 85 unsigned int queue_depth; 86 bool buffered_io; 87 bool zone_append; 88 bool ordered_zone_append; 89 bool discard_write_cache; 90 }; 91 92 /* 93 * Device states. 94 */ 95 enum { 96 Zlo_creating = 0, 97 Zlo_live, 98 Zlo_deleting, 99 }; 100 101 enum zloop_zone_flags { 102 ZLOOP_ZONE_CONV = 0, 103 ZLOOP_ZONE_SEQ_ERROR, 104 }; 105 106 /* 107 * Zone descriptor. 108 * Locking order: z.lock -> z.wp_lock -> zlo.open_zones_lock 109 */ 110 struct zloop_zone { 111 struct list_head open_zone_entry; 112 struct file *file; 113 114 unsigned long flags; 115 struct mutex lock; 116 spinlock_t wp_lock; 117 enum blk_zone_cond cond; 118 sector_t start; 119 sector_t wp; 120 121 gfp_t old_gfp_mask; 122 }; 123 124 struct zloop_device { 125 unsigned int id; 126 unsigned int state; 127 128 struct blk_mq_tag_set tag_set; 129 struct gendisk *disk; 130 131 struct workqueue_struct *workqueue; 132 bool buffered_io; 133 bool zone_append; 134 bool ordered_zone_append; 135 bool discard_write_cache; 136 137 const char *base_dir; 138 struct file *data_dir; 139 140 unsigned int zone_shift; 141 sector_t zone_size; 142 sector_t zone_capacity; 143 unsigned int nr_zones; 144 unsigned int nr_conv_zones; 145 unsigned int max_open_zones; 146 unsigned int block_size; 147 148 spinlock_t open_zones_lock; 149 struct list_head open_zones_lru_list; 150 unsigned int nr_open_zones; 151 152 struct zloop_zone zones[] __counted_by(nr_zones); 153 }; 154 155 struct zloop_cmd { 156 struct work_struct work; 157 atomic_t ref; 158 sector_t sector; 159 sector_t nr_sectors; 160 long ret; 161 struct kiocb iocb; 162 struct bio_vec *bvec; 163 }; 164 165 static DEFINE_IDR(zloop_index_idr); 166 static DEFINE_MUTEX(zloop_ctl_mutex); 167 168 static unsigned int rq_zone_no(struct request *rq) 169 { 170 struct zloop_device *zlo = rq->q->queuedata; 171 172 return blk_rq_pos(rq) >> zlo->zone_shift; 173 } 174 175 /* 176 * Open an already open zone. This is mostly a no-op, except for the imp open -> 177 * exp open condition change that may happen. We also move a zone at the tail of 178 * the list of open zones so that if we need to 179 * implicitly close one open zone, we can do so in LRU order. 180 */ 181 static inline void zloop_lru_rotate_open_zone(struct zloop_device *zlo, 182 struct zloop_zone *zone) 183 { 184 if (zlo->max_open_zones) { 185 spin_lock(&zlo->open_zones_lock); 186 list_move_tail(&zone->open_zone_entry, 187 &zlo->open_zones_lru_list); 188 spin_unlock(&zlo->open_zones_lock); 189 } 190 } 191 192 static inline void zloop_lru_remove_open_zone(struct zloop_device *zlo, 193 struct zloop_zone *zone) 194 { 195 if (zone->cond == BLK_ZONE_COND_IMP_OPEN || 196 zone->cond == BLK_ZONE_COND_EXP_OPEN) { 197 spin_lock(&zlo->open_zones_lock); 198 list_del_init(&zone->open_zone_entry); 199 zlo->nr_open_zones--; 200 spin_unlock(&zlo->open_zones_lock); 201 } 202 } 203 204 static inline bool zloop_can_open_zone(struct zloop_device *zlo) 205 { 206 return !zlo->max_open_zones || zlo->nr_open_zones < zlo->max_open_zones; 207 } 208 209 /* 210 * If we have reached the maximum open zones limit, attempt to close an 211 * implicitly open zone (if we have any) so that we can implicitly open another 212 * zone without exceeding the maximum number of open zones. 213 */ 214 static bool zloop_close_imp_open_zone(struct zloop_device *zlo) 215 { 216 struct zloop_zone *zone; 217 218 lockdep_assert_held(&zlo->open_zones_lock); 219 220 if (zloop_can_open_zone(zlo)) 221 return true; 222 223 list_for_each_entry(zone, &zlo->open_zones_lru_list, open_zone_entry) { 224 if (zone->cond == BLK_ZONE_COND_IMP_OPEN) { 225 zone->cond = BLK_ZONE_COND_CLOSED; 226 list_del_init(&zone->open_zone_entry); 227 zlo->nr_open_zones--; 228 return true; 229 } 230 } 231 232 return false; 233 } 234 235 static bool zloop_open_closed_or_empty_zone(struct zloop_device *zlo, 236 struct zloop_zone *zone, 237 bool explicit) 238 { 239 spin_lock(&zlo->open_zones_lock); 240 241 if (explicit) { 242 /* 243 * Explicit open: we cannot allow this if we have reached the 244 * maximum open zones limit. 245 */ 246 if (!zloop_can_open_zone(zlo)) 247 goto fail; 248 zone->cond = BLK_ZONE_COND_EXP_OPEN; 249 } else { 250 /* 251 * Implicit open case: if we have reached the maximum open zones 252 * limit, try to close an implicitly open zone first. 253 */ 254 if (!zloop_close_imp_open_zone(zlo)) 255 goto fail; 256 zone->cond = BLK_ZONE_COND_IMP_OPEN; 257 } 258 259 zlo->nr_open_zones++; 260 list_add_tail(&zone->open_zone_entry, 261 &zlo->open_zones_lru_list); 262 263 spin_unlock(&zlo->open_zones_lock); 264 265 return true; 266 267 fail: 268 spin_unlock(&zlo->open_zones_lock); 269 270 return false; 271 } 272 273 static bool zloop_do_open_zone(struct zloop_device *zlo, 274 struct zloop_zone *zone, bool explicit) 275 { 276 switch (zone->cond) { 277 case BLK_ZONE_COND_IMP_OPEN: 278 case BLK_ZONE_COND_EXP_OPEN: 279 if (explicit) 280 zone->cond = BLK_ZONE_COND_EXP_OPEN; 281 zloop_lru_rotate_open_zone(zlo, zone); 282 return true; 283 case BLK_ZONE_COND_EMPTY: 284 case BLK_ZONE_COND_CLOSED: 285 return zloop_open_closed_or_empty_zone(zlo, zone, explicit); 286 default: 287 return false; 288 } 289 } 290 291 static void zloop_mark_full(struct zloop_device *zlo, struct zloop_zone *zone) 292 { 293 lockdep_assert_held(&zone->wp_lock); 294 295 zloop_lru_remove_open_zone(zlo, zone); 296 zone->cond = BLK_ZONE_COND_FULL; 297 zone->wp = ULLONG_MAX; 298 } 299 300 static void zloop_mark_empty(struct zloop_device *zlo, struct zloop_zone *zone) 301 { 302 lockdep_assert_held(&zone->wp_lock); 303 304 zloop_lru_remove_open_zone(zlo, zone); 305 zone->cond = BLK_ZONE_COND_EMPTY; 306 zone->wp = zone->start; 307 } 308 309 static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no) 310 { 311 struct zloop_zone *zone = &zlo->zones[zone_no]; 312 struct kstat stat; 313 sector_t file_sectors; 314 int ret; 315 316 lockdep_assert_held(&zone->lock); 317 318 ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0); 319 if (ret < 0) { 320 pr_err("Failed to get zone %u file stat (err=%d)\n", 321 zone_no, ret); 322 set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); 323 return ret; 324 } 325 326 file_sectors = stat.size >> SECTOR_SHIFT; 327 if (file_sectors > zlo->zone_capacity) { 328 pr_err("Zone %u file too large (%llu sectors > %llu)\n", 329 zone_no, file_sectors, zlo->zone_capacity); 330 return -EINVAL; 331 } 332 333 if (!IS_ALIGNED(stat.size, zlo->block_size)) { 334 pr_err("Zone %u file size (%llu) not aligned to block size %u\n", 335 zone_no, stat.size, zlo->block_size); 336 return -EINVAL; 337 } 338 339 spin_lock(&zone->wp_lock); 340 if (!file_sectors) { 341 zloop_mark_empty(zlo, zone); 342 } else if (file_sectors == zlo->zone_capacity) { 343 zloop_mark_full(zlo, zone); 344 } else { 345 if (zone->cond != BLK_ZONE_COND_IMP_OPEN && 346 zone->cond != BLK_ZONE_COND_EXP_OPEN) 347 zone->cond = BLK_ZONE_COND_CLOSED; 348 zone->wp = zone->start + file_sectors; 349 } 350 spin_unlock(&zone->wp_lock); 351 352 return 0; 353 } 354 355 static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no) 356 { 357 struct zloop_zone *zone = &zlo->zones[zone_no]; 358 int ret = 0; 359 360 if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) 361 return -EIO; 362 363 mutex_lock(&zone->lock); 364 365 if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { 366 ret = zloop_update_seq_zone(zlo, zone_no); 367 if (ret) 368 goto unlock; 369 } 370 371 if (!zloop_do_open_zone(zlo, zone, true)) 372 ret = -EIO; 373 374 unlock: 375 mutex_unlock(&zone->lock); 376 377 return ret; 378 } 379 380 static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no) 381 { 382 struct zloop_zone *zone = &zlo->zones[zone_no]; 383 int ret = 0; 384 385 if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) 386 return -EIO; 387 388 mutex_lock(&zone->lock); 389 390 if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { 391 ret = zloop_update_seq_zone(zlo, zone_no); 392 if (ret) 393 goto unlock; 394 } 395 396 switch (zone->cond) { 397 case BLK_ZONE_COND_CLOSED: 398 break; 399 case BLK_ZONE_COND_IMP_OPEN: 400 case BLK_ZONE_COND_EXP_OPEN: 401 spin_lock(&zone->wp_lock); 402 zloop_lru_remove_open_zone(zlo, zone); 403 if (zone->wp == zone->start) 404 zone->cond = BLK_ZONE_COND_EMPTY; 405 else 406 zone->cond = BLK_ZONE_COND_CLOSED; 407 spin_unlock(&zone->wp_lock); 408 break; 409 case BLK_ZONE_COND_EMPTY: 410 case BLK_ZONE_COND_FULL: 411 default: 412 ret = -EIO; 413 break; 414 } 415 416 unlock: 417 mutex_unlock(&zone->lock); 418 419 return ret; 420 } 421 422 static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no) 423 { 424 struct zloop_zone *zone = &zlo->zones[zone_no]; 425 int ret = 0; 426 427 if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) 428 return -EIO; 429 430 mutex_lock(&zone->lock); 431 432 if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) && 433 zone->cond == BLK_ZONE_COND_EMPTY) 434 goto unlock; 435 436 if (vfs_truncate(&zone->file->f_path, 0)) { 437 set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); 438 ret = -EIO; 439 goto unlock; 440 } 441 442 spin_lock(&zone->wp_lock); 443 zloop_mark_empty(zlo, zone); 444 clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); 445 spin_unlock(&zone->wp_lock); 446 447 unlock: 448 mutex_unlock(&zone->lock); 449 450 return ret; 451 } 452 453 static int zloop_reset_all_zones(struct zloop_device *zlo) 454 { 455 unsigned int i; 456 int ret; 457 458 for (i = zlo->nr_conv_zones; i < zlo->nr_zones; i++) { 459 ret = zloop_reset_zone(zlo, i); 460 if (ret) 461 return ret; 462 } 463 464 return 0; 465 } 466 467 static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no) 468 { 469 struct zloop_zone *zone = &zlo->zones[zone_no]; 470 int ret = 0; 471 472 if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) 473 return -EIO; 474 475 mutex_lock(&zone->lock); 476 477 if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) && 478 zone->cond == BLK_ZONE_COND_FULL) 479 goto unlock; 480 481 if (vfs_truncate(&zone->file->f_path, zlo->zone_size << SECTOR_SHIFT)) { 482 set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); 483 ret = -EIO; 484 goto unlock; 485 } 486 487 spin_lock(&zone->wp_lock); 488 zloop_mark_full(zlo, zone); 489 clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); 490 spin_unlock(&zone->wp_lock); 491 492 unlock: 493 mutex_unlock(&zone->lock); 494 495 return ret; 496 } 497 498 static void zloop_put_cmd(struct zloop_cmd *cmd) 499 { 500 struct request *rq = blk_mq_rq_from_pdu(cmd); 501 502 if (!atomic_dec_and_test(&cmd->ref)) 503 return; 504 kfree(cmd->bvec); 505 cmd->bvec = NULL; 506 if (likely(!blk_should_fake_timeout(rq->q))) 507 blk_mq_complete_request(rq); 508 } 509 510 static void zloop_rw_complete(struct kiocb *iocb, long ret) 511 { 512 struct zloop_cmd *cmd = container_of(iocb, struct zloop_cmd, iocb); 513 514 cmd->ret = ret; 515 zloop_put_cmd(cmd); 516 } 517 518 static int zloop_do_rw(struct zloop_cmd *cmd) 519 { 520 struct request *rq = blk_mq_rq_from_pdu(cmd); 521 int rw = req_op(rq) == REQ_OP_READ ? ITER_DEST : ITER_SOURCE; 522 unsigned int nr_bvec = blk_rq_nr_bvec(rq); 523 struct zloop_device *zlo = rq->q->queuedata; 524 struct zloop_zone *zone = &zlo->zones[rq_zone_no(rq)]; 525 struct req_iterator rq_iter; 526 struct iov_iter iter; 527 528 if (rq->bio != rq->biotail) { 529 struct bio_vec tmp, *bvec; 530 531 cmd->bvec = kmalloc_objs(*cmd->bvec, nr_bvec, GFP_NOIO); 532 if (!cmd->bvec) 533 return -EIO; 534 535 /* 536 * The bios of the request may be started from the middle of 537 * the 'bvec' because of bio splitting, so we can't directly 538 * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec 539 * API will take care of all details for us. 540 */ 541 bvec = cmd->bvec; 542 rq_for_each_bvec(tmp, rq, rq_iter) { 543 *bvec = tmp; 544 bvec++; 545 } 546 iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq)); 547 } else { 548 /* 549 * Same here, this bio may be started from the middle of the 550 * 'bvec' because of bio splitting, so offset from the bvec 551 * must be passed to iov iterator 552 */ 553 iov_iter_bvec(&iter, rw, 554 __bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter), 555 nr_bvec, blk_rq_bytes(rq)); 556 iter.iov_offset = rq->bio->bi_iter.bi_bvec_done; 557 } 558 559 cmd->iocb.ki_pos = (cmd->sector - zone->start) << SECTOR_SHIFT; 560 cmd->iocb.ki_filp = zone->file; 561 cmd->iocb.ki_complete = zloop_rw_complete; 562 if (!zlo->buffered_io) 563 cmd->iocb.ki_flags = IOCB_DIRECT; 564 cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); 565 566 if (rw == ITER_SOURCE) 567 return zone->file->f_op->write_iter(&cmd->iocb, &iter); 568 return zone->file->f_op->read_iter(&cmd->iocb, &iter); 569 } 570 571 static int zloop_seq_write_prep(struct zloop_cmd *cmd) 572 { 573 struct request *rq = blk_mq_rq_from_pdu(cmd); 574 struct zloop_device *zlo = rq->q->queuedata; 575 unsigned int zone_no = rq_zone_no(rq); 576 sector_t nr_sectors = blk_rq_sectors(rq); 577 bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; 578 struct zloop_zone *zone = &zlo->zones[zone_no]; 579 sector_t zone_end = zone->start + zlo->zone_capacity; 580 int ret = 0; 581 582 spin_lock(&zone->wp_lock); 583 584 /* 585 * Zone append operations always go at the current write pointer, but 586 * regular write operations must already be aligned to the write pointer 587 * when submitted. 588 */ 589 if (is_append) { 590 /* 591 * If ordered zone append is in use, we already checked and set 592 * the target sector in zloop_queue_rq(). 593 */ 594 if (!zlo->ordered_zone_append) { 595 if (zone->cond == BLK_ZONE_COND_FULL || 596 zone->wp + nr_sectors > zone_end) { 597 ret = -EIO; 598 goto out_unlock; 599 } 600 cmd->sector = zone->wp; 601 } 602 } else { 603 if (cmd->sector != zone->wp) { 604 pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n", 605 zone_no, cmd->sector, zone->wp); 606 ret = -EIO; 607 goto out_unlock; 608 } 609 } 610 611 /* Implicitly open the target zone. */ 612 if (!zloop_do_open_zone(zlo, zone, false)) { 613 ret = -EIO; 614 goto out_unlock; 615 } 616 617 /* 618 * Advance the write pointer, unless ordered zone append is in use. If 619 * the write fails, the write pointer position will be corrected when 620 * the next I/O starts execution. 621 */ 622 if (!is_append || !zlo->ordered_zone_append) { 623 zone->wp += nr_sectors; 624 if (zone->wp == zone_end) 625 zloop_mark_full(zlo, zone); 626 } 627 out_unlock: 628 spin_unlock(&zone->wp_lock); 629 return ret; 630 } 631 632 static void zloop_rw(struct zloop_cmd *cmd) 633 { 634 struct request *rq = blk_mq_rq_from_pdu(cmd); 635 struct zloop_device *zlo = rq->q->queuedata; 636 unsigned int zone_no = rq_zone_no(rq); 637 sector_t nr_sectors = blk_rq_sectors(rq); 638 bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND; 639 bool is_write = req_op(rq) == REQ_OP_WRITE || is_append; 640 struct zloop_zone *zone; 641 int ret = -EIO; 642 643 atomic_set(&cmd->ref, 2); 644 cmd->sector = blk_rq_pos(rq); 645 cmd->nr_sectors = nr_sectors; 646 cmd->ret = 0; 647 648 if (WARN_ON_ONCE(is_append && !zlo->zone_append)) 649 goto out; 650 651 /* We should never get an I/O beyond the device capacity. */ 652 if (WARN_ON_ONCE(zone_no >= zlo->nr_zones)) 653 goto out; 654 655 zone = &zlo->zones[zone_no]; 656 657 /* 658 * The block layer should never send requests that are not fully 659 * contained within the zone. 660 */ 661 if (WARN_ON_ONCE(cmd->sector + nr_sectors > 662 zone->start + zlo->zone_size)) 663 goto out; 664 665 if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { 666 mutex_lock(&zone->lock); 667 ret = zloop_update_seq_zone(zlo, zone_no); 668 mutex_unlock(&zone->lock); 669 if (ret) 670 goto out; 671 } 672 673 if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) { 674 mutex_lock(&zone->lock); 675 ret = zloop_seq_write_prep(cmd); 676 if (!ret) 677 ret = zloop_do_rw(cmd); 678 mutex_unlock(&zone->lock); 679 } else { 680 ret = zloop_do_rw(cmd); 681 } 682 out: 683 if (ret != -EIOCBQUEUED) 684 zloop_rw_complete(&cmd->iocb, ret); 685 zloop_put_cmd(cmd); 686 } 687 688 static inline bool zloop_zone_is_active(struct zloop_zone *zone) 689 { 690 switch (zone->cond) { 691 case BLK_ZONE_COND_EXP_OPEN: 692 case BLK_ZONE_COND_IMP_OPEN: 693 case BLK_ZONE_COND_CLOSED: 694 return true; 695 default: 696 return false; 697 } 698 } 699 700 static int zloop_record_safe_wps(struct zloop_device *zlo) 701 { 702 unsigned int i; 703 int ret; 704 705 for (i = 0; i < zlo->nr_zones; i++) { 706 struct zloop_zone *zone = &zlo->zones[i]; 707 struct file *file = zone->file; 708 709 if (!zloop_zone_is_active(zone)) 710 continue; 711 ret = vfs_setxattr(file_mnt_idmap(file), file_dentry(file), 712 "user.zloop.wp", &zone->wp, sizeof(zone->wp), 0); 713 if (ret) { 714 pr_err("%pg: failed to record write pointer (%d)\n", 715 zlo->disk->part0, ret); 716 return ret; 717 } 718 } 719 720 return 0; 721 } 722 723 /* 724 * Sync the entire FS containing the zone files instead of walking all files. 725 */ 726 static int zloop_flush(struct zloop_device *zlo) 727 { 728 struct super_block *sb = file_inode(zlo->data_dir)->i_sb; 729 int ret; 730 731 if (zlo->discard_write_cache) { 732 ret = zloop_record_safe_wps(zlo); 733 if (ret) 734 return ret; 735 } 736 737 down_read(&sb->s_umount); 738 ret = sync_filesystem(sb); 739 up_read(&sb->s_umount); 740 741 return ret; 742 } 743 744 static void zloop_handle_cmd(struct zloop_cmd *cmd) 745 { 746 struct request *rq = blk_mq_rq_from_pdu(cmd); 747 struct zloop_device *zlo = rq->q->queuedata; 748 749 /* We can block in this context, so ignore REQ_NOWAIT. */ 750 if (rq->cmd_flags & REQ_NOWAIT) 751 rq->cmd_flags &= ~REQ_NOWAIT; 752 753 switch (req_op(rq)) { 754 case REQ_OP_READ: 755 case REQ_OP_WRITE: 756 case REQ_OP_ZONE_APPEND: 757 /* 758 * zloop_rw() always executes asynchronously or completes 759 * directly. 760 */ 761 zloop_rw(cmd); 762 return; 763 case REQ_OP_FLUSH: 764 cmd->ret = zloop_flush(zlo); 765 break; 766 case REQ_OP_ZONE_RESET: 767 cmd->ret = zloop_reset_zone(zlo, rq_zone_no(rq)); 768 break; 769 case REQ_OP_ZONE_RESET_ALL: 770 cmd->ret = zloop_reset_all_zones(zlo); 771 break; 772 case REQ_OP_ZONE_FINISH: 773 cmd->ret = zloop_finish_zone(zlo, rq_zone_no(rq)); 774 break; 775 case REQ_OP_ZONE_OPEN: 776 cmd->ret = zloop_open_zone(zlo, rq_zone_no(rq)); 777 break; 778 case REQ_OP_ZONE_CLOSE: 779 cmd->ret = zloop_close_zone(zlo, rq_zone_no(rq)); 780 break; 781 default: 782 WARN_ON_ONCE(1); 783 pr_err("Unsupported operation %d\n", req_op(rq)); 784 cmd->ret = -EOPNOTSUPP; 785 break; 786 } 787 788 blk_mq_complete_request(rq); 789 } 790 791 static void zloop_cmd_workfn(struct work_struct *work) 792 { 793 struct zloop_cmd *cmd = container_of(work, struct zloop_cmd, work); 794 int orig_flags = current->flags; 795 796 current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; 797 zloop_handle_cmd(cmd); 798 current->flags = orig_flags; 799 } 800 801 static void zloop_complete_rq(struct request *rq) 802 { 803 struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq); 804 struct zloop_device *zlo = rq->q->queuedata; 805 unsigned int zone_no = cmd->sector >> zlo->zone_shift; 806 struct zloop_zone *zone = &zlo->zones[zone_no]; 807 blk_status_t sts = BLK_STS_OK; 808 809 switch (req_op(rq)) { 810 case REQ_OP_READ: 811 if (cmd->ret < 0) 812 pr_err("Zone %u: failed read sector %llu, %llu sectors\n", 813 zone_no, cmd->sector, cmd->nr_sectors); 814 815 if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) { 816 /* short read */ 817 struct bio *bio; 818 819 __rq_for_each_bio(bio, rq) 820 zero_fill_bio(bio); 821 } 822 break; 823 case REQ_OP_WRITE: 824 case REQ_OP_ZONE_APPEND: 825 if (cmd->ret < 0) 826 pr_err("Zone %u: failed %swrite sector %llu, %llu sectors\n", 827 zone_no, 828 req_op(rq) == REQ_OP_WRITE ? "" : "append ", 829 cmd->sector, cmd->nr_sectors); 830 831 if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) { 832 pr_err("Zone %u: partial write %ld/%u B\n", 833 zone_no, cmd->ret, blk_rq_bytes(rq)); 834 cmd->ret = -EIO; 835 } 836 837 if (cmd->ret < 0 && !test_bit(ZLOOP_ZONE_CONV, &zone->flags)) { 838 /* 839 * A write to a sequential zone file failed: mark the 840 * zone as having an error. This will be corrected and 841 * cleared when the next IO is submitted. 842 */ 843 set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); 844 break; 845 } 846 if (req_op(rq) == REQ_OP_ZONE_APPEND) 847 rq->__sector = cmd->sector; 848 849 break; 850 default: 851 break; 852 } 853 854 if (cmd->ret < 0) 855 sts = errno_to_blk_status(cmd->ret); 856 blk_mq_end_request(rq, sts); 857 } 858 859 static bool zloop_set_zone_append_sector(struct request *rq) 860 { 861 struct zloop_device *zlo = rq->q->queuedata; 862 unsigned int zone_no = rq_zone_no(rq); 863 struct zloop_zone *zone = &zlo->zones[zone_no]; 864 sector_t zone_end = zone->start + zlo->zone_capacity; 865 sector_t nr_sectors = blk_rq_sectors(rq); 866 867 spin_lock(&zone->wp_lock); 868 869 if (zone->cond == BLK_ZONE_COND_FULL || 870 zone->wp + nr_sectors > zone_end) { 871 spin_unlock(&zone->wp_lock); 872 return false; 873 } 874 875 rq->__sector = zone->wp; 876 zone->wp += blk_rq_sectors(rq); 877 if (zone->wp >= zone_end) 878 zloop_mark_full(zlo, zone); 879 880 spin_unlock(&zone->wp_lock); 881 882 return true; 883 } 884 885 static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx, 886 const struct blk_mq_queue_data *bd) 887 { 888 struct request *rq = bd->rq; 889 struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq); 890 struct zloop_device *zlo = rq->q->queuedata; 891 892 if (data_race(READ_ONCE(zlo->state)) == Zlo_deleting) { 893 rq->rq_flags |= RQF_QUIET; 894 return BLK_STS_IOERR; 895 } 896 897 /* 898 * If we need to strongly order zone append operations, set the request 899 * sector to the zone write pointer location now instead of when the 900 * command work runs. 901 */ 902 if (zlo->ordered_zone_append && req_op(rq) == REQ_OP_ZONE_APPEND) { 903 if (!zloop_set_zone_append_sector(rq)) 904 return BLK_STS_IOERR; 905 } 906 907 blk_mq_start_request(rq); 908 909 INIT_WORK(&cmd->work, zloop_cmd_workfn); 910 queue_work(zlo->workqueue, &cmd->work); 911 912 return BLK_STS_OK; 913 } 914 915 static const struct blk_mq_ops zloop_mq_ops = { 916 .queue_rq = zloop_queue_rq, 917 .complete = zloop_complete_rq, 918 }; 919 920 static int zloop_open(struct gendisk *disk, blk_mode_t mode) 921 { 922 struct zloop_device *zlo = disk->private_data; 923 int ret; 924 925 ret = mutex_lock_killable(&zloop_ctl_mutex); 926 if (ret) 927 return ret; 928 929 if (zlo->state != Zlo_live) 930 ret = -ENXIO; 931 mutex_unlock(&zloop_ctl_mutex); 932 return ret; 933 } 934 935 static int zloop_report_zones(struct gendisk *disk, sector_t sector, 936 unsigned int nr_zones, struct blk_report_zones_args *args) 937 { 938 struct zloop_device *zlo = disk->private_data; 939 struct blk_zone blkz = {}; 940 unsigned int first, i; 941 int ret; 942 943 first = disk_zone_no(disk, sector); 944 if (first >= zlo->nr_zones) 945 return 0; 946 nr_zones = min(nr_zones, zlo->nr_zones - first); 947 948 for (i = 0; i < nr_zones; i++) { 949 unsigned int zone_no = first + i; 950 struct zloop_zone *zone = &zlo->zones[zone_no]; 951 952 mutex_lock(&zone->lock); 953 954 if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { 955 ret = zloop_update_seq_zone(zlo, zone_no); 956 if (ret) { 957 mutex_unlock(&zone->lock); 958 return ret; 959 } 960 } 961 962 blkz.start = zone->start; 963 blkz.len = zlo->zone_size; 964 spin_lock(&zone->wp_lock); 965 blkz.wp = zone->wp; 966 spin_unlock(&zone->wp_lock); 967 blkz.cond = zone->cond; 968 if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) { 969 blkz.type = BLK_ZONE_TYPE_CONVENTIONAL; 970 blkz.capacity = zlo->zone_size; 971 } else { 972 blkz.type = BLK_ZONE_TYPE_SEQWRITE_REQ; 973 blkz.capacity = zlo->zone_capacity; 974 } 975 976 mutex_unlock(&zone->lock); 977 978 ret = disk_report_zone(disk, &blkz, i, args); 979 if (ret) 980 return ret; 981 } 982 983 return nr_zones; 984 } 985 986 static void zloop_free_disk(struct gendisk *disk) 987 { 988 struct zloop_device *zlo = disk->private_data; 989 unsigned int i; 990 991 blk_mq_free_tag_set(&zlo->tag_set); 992 993 for (i = 0; i < zlo->nr_zones; i++) { 994 struct zloop_zone *zone = &zlo->zones[i]; 995 996 mapping_set_gfp_mask(zone->file->f_mapping, 997 zone->old_gfp_mask); 998 fput(zone->file); 999 } 1000 1001 fput(zlo->data_dir); 1002 destroy_workqueue(zlo->workqueue); 1003 kfree(zlo->base_dir); 1004 kvfree(zlo); 1005 } 1006 1007 static const struct block_device_operations zloop_fops = { 1008 .owner = THIS_MODULE, 1009 .open = zloop_open, 1010 .report_zones = zloop_report_zones, 1011 .free_disk = zloop_free_disk, 1012 }; 1013 1014 __printf(3, 4) 1015 static struct file *zloop_filp_open_fmt(int oflags, umode_t mode, 1016 const char *fmt, ...) 1017 { 1018 struct file *file; 1019 va_list ap; 1020 char *p; 1021 1022 va_start(ap, fmt); 1023 p = kvasprintf(GFP_KERNEL, fmt, ap); 1024 va_end(ap); 1025 1026 if (!p) 1027 return ERR_PTR(-ENOMEM); 1028 file = filp_open(p, oflags, mode); 1029 kfree(p); 1030 return file; 1031 } 1032 1033 static int zloop_get_block_size(struct zloop_device *zlo, 1034 struct zloop_zone *zone) 1035 { 1036 struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev; 1037 struct kstat st; 1038 1039 /* 1040 * If the FS block size is lower than or equal to 4K, use that as the 1041 * device block size. Otherwise, fallback to the FS direct IO alignment 1042 * constraint if that is provided, and to the FS underlying device 1043 * physical block size if the direct IO alignment is unknown. 1044 */ 1045 if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K) 1046 zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize; 1047 else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) && 1048 (st.result_mask & STATX_DIOALIGN)) 1049 zlo->block_size = st.dio_offset_align; 1050 else if (sb_bdev) 1051 zlo->block_size = bdev_physical_block_size(sb_bdev); 1052 else 1053 zlo->block_size = SECTOR_SIZE; 1054 1055 if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) { 1056 pr_err("Zone capacity is not aligned to block size %u\n", 1057 zlo->block_size); 1058 return -EINVAL; 1059 } 1060 1061 return 0; 1062 } 1063 1064 static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts, 1065 unsigned int zone_no, bool restore) 1066 { 1067 struct zloop_zone *zone = &zlo->zones[zone_no]; 1068 int oflags = O_RDWR; 1069 struct kstat stat; 1070 sector_t file_sectors; 1071 int ret; 1072 1073 mutex_init(&zone->lock); 1074 INIT_LIST_HEAD(&zone->open_zone_entry); 1075 spin_lock_init(&zone->wp_lock); 1076 zone->start = (sector_t)zone_no << zlo->zone_shift; 1077 1078 if (!restore) 1079 oflags |= O_CREAT; 1080 1081 if (!opts->buffered_io) 1082 oflags |= O_DIRECT; 1083 1084 if (zone_no < zlo->nr_conv_zones) { 1085 /* Conventional zone file. */ 1086 set_bit(ZLOOP_ZONE_CONV, &zone->flags); 1087 zone->cond = BLK_ZONE_COND_NOT_WP; 1088 zone->wp = U64_MAX; 1089 1090 zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/cnv-%06u", 1091 zlo->base_dir, zlo->id, zone_no); 1092 if (IS_ERR(zone->file)) { 1093 pr_err("Failed to open zone %u file %s/%u/cnv-%06u (err=%ld)", 1094 zone_no, zlo->base_dir, zlo->id, zone_no, 1095 PTR_ERR(zone->file)); 1096 return PTR_ERR(zone->file); 1097 } 1098 1099 if (!zlo->block_size) { 1100 ret = zloop_get_block_size(zlo, zone); 1101 if (ret) 1102 return ret; 1103 } 1104 1105 ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0); 1106 if (ret < 0) { 1107 pr_err("Failed to get zone %u file stat\n", zone_no); 1108 return ret; 1109 } 1110 file_sectors = stat.size >> SECTOR_SHIFT; 1111 1112 if (restore && file_sectors != zlo->zone_size) { 1113 pr_err("Invalid conventional zone %u file size (%llu sectors != %llu)\n", 1114 zone_no, file_sectors, zlo->zone_capacity); 1115 return ret; 1116 } 1117 1118 ret = vfs_truncate(&zone->file->f_path, 1119 zlo->zone_size << SECTOR_SHIFT); 1120 if (ret < 0) { 1121 pr_err("Failed to truncate zone %u file (err=%d)\n", 1122 zone_no, ret); 1123 return ret; 1124 } 1125 1126 return 0; 1127 } 1128 1129 /* Sequential zone file. */ 1130 zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/seq-%06u", 1131 zlo->base_dir, zlo->id, zone_no); 1132 if (IS_ERR(zone->file)) { 1133 pr_err("Failed to open zone %u file %s/%u/seq-%06u (err=%ld)", 1134 zone_no, zlo->base_dir, zlo->id, zone_no, 1135 PTR_ERR(zone->file)); 1136 return PTR_ERR(zone->file); 1137 } 1138 1139 if (!zlo->block_size) { 1140 ret = zloop_get_block_size(zlo, zone); 1141 if (ret) 1142 return ret; 1143 } 1144 1145 zloop_get_block_size(zlo, zone); 1146 1147 mutex_lock(&zone->lock); 1148 ret = zloop_update_seq_zone(zlo, zone_no); 1149 mutex_unlock(&zone->lock); 1150 1151 return ret; 1152 } 1153 1154 static bool zloop_dev_exists(struct zloop_device *zlo) 1155 { 1156 struct file *cnv, *seq; 1157 bool exists; 1158 1159 cnv = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/cnv-%06u", 1160 zlo->base_dir, zlo->id, 0); 1161 seq = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/seq-%06u", 1162 zlo->base_dir, zlo->id, 0); 1163 exists = !IS_ERR(cnv) || !IS_ERR(seq); 1164 1165 if (!IS_ERR(cnv)) 1166 fput(cnv); 1167 if (!IS_ERR(seq)) 1168 fput(seq); 1169 1170 return exists; 1171 } 1172 1173 static int zloop_ctl_add(struct zloop_options *opts) 1174 { 1175 struct queue_limits lim = { 1176 .max_hw_sectors = SZ_1M >> SECTOR_SHIFT, 1177 .chunk_sectors = opts->zone_size, 1178 .features = BLK_FEAT_ZONED | BLK_FEAT_WRITE_CACHE, 1179 1180 }; 1181 unsigned int nr_zones, i, j; 1182 struct zloop_device *zlo; 1183 int ret = -EINVAL; 1184 bool restore; 1185 1186 __module_get(THIS_MODULE); 1187 1188 nr_zones = opts->capacity >> ilog2(opts->zone_size); 1189 if (opts->nr_conv_zones >= nr_zones) { 1190 pr_err("Invalid number of conventional zones %u\n", 1191 opts->nr_conv_zones); 1192 goto out; 1193 } 1194 1195 if (opts->max_open_zones > nr_zones - opts->nr_conv_zones) { 1196 pr_err("Invalid maximum number of open zones %u\n", 1197 opts->max_open_zones); 1198 goto out; 1199 } 1200 1201 zlo = kvzalloc_flex(*zlo, zones, nr_zones); 1202 if (!zlo) { 1203 ret = -ENOMEM; 1204 goto out; 1205 } 1206 WRITE_ONCE(zlo->state, Zlo_creating); 1207 spin_lock_init(&zlo->open_zones_lock); 1208 INIT_LIST_HEAD(&zlo->open_zones_lru_list); 1209 1210 ret = mutex_lock_killable(&zloop_ctl_mutex); 1211 if (ret) 1212 goto out_free_dev; 1213 1214 /* Allocate id, if @opts->id >= 0, we're requesting that specific id */ 1215 if (opts->id >= 0) { 1216 ret = idr_alloc(&zloop_index_idr, zlo, 1217 opts->id, opts->id + 1, GFP_KERNEL); 1218 if (ret == -ENOSPC) 1219 ret = -EEXIST; 1220 } else { 1221 ret = idr_alloc(&zloop_index_idr, zlo, 0, 0, GFP_KERNEL); 1222 } 1223 mutex_unlock(&zloop_ctl_mutex); 1224 if (ret < 0) 1225 goto out_free_dev; 1226 1227 zlo->id = ret; 1228 zlo->zone_shift = ilog2(opts->zone_size); 1229 zlo->zone_size = opts->zone_size; 1230 if (opts->zone_capacity) 1231 zlo->zone_capacity = opts->zone_capacity; 1232 else 1233 zlo->zone_capacity = zlo->zone_size; 1234 zlo->nr_zones = nr_zones; 1235 zlo->nr_conv_zones = opts->nr_conv_zones; 1236 zlo->max_open_zones = opts->max_open_zones; 1237 zlo->buffered_io = opts->buffered_io; 1238 zlo->zone_append = opts->zone_append; 1239 if (zlo->zone_append) 1240 zlo->ordered_zone_append = opts->ordered_zone_append; 1241 zlo->discard_write_cache = opts->discard_write_cache; 1242 1243 zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE, 1244 opts->nr_queues * opts->queue_depth, zlo->id); 1245 if (!zlo->workqueue) { 1246 ret = -ENOMEM; 1247 goto out_free_idr; 1248 } 1249 1250 if (opts->base_dir) 1251 zlo->base_dir = kstrdup(opts->base_dir, GFP_KERNEL); 1252 else 1253 zlo->base_dir = kstrdup(ZLOOP_DEF_BASE_DIR, GFP_KERNEL); 1254 if (!zlo->base_dir) { 1255 ret = -ENOMEM; 1256 goto out_destroy_workqueue; 1257 } 1258 1259 zlo->data_dir = zloop_filp_open_fmt(O_RDONLY | O_DIRECTORY, 0, "%s/%u", 1260 zlo->base_dir, zlo->id); 1261 if (IS_ERR(zlo->data_dir)) { 1262 ret = PTR_ERR(zlo->data_dir); 1263 pr_warn("Failed to open directory %s/%u (err=%d)\n", 1264 zlo->base_dir, zlo->id, ret); 1265 goto out_free_base_dir; 1266 } 1267 1268 /* 1269 * If we already have zone files, we are restoring a device created by a 1270 * previous add operation. In this case, zloop_init_zone() will check 1271 * that the zone files are consistent with the zone configuration given. 1272 */ 1273 restore = zloop_dev_exists(zlo); 1274 for (i = 0; i < nr_zones; i++) { 1275 ret = zloop_init_zone(zlo, opts, i, restore); 1276 if (ret) 1277 goto out_close_files; 1278 } 1279 1280 lim.physical_block_size = zlo->block_size; 1281 lim.logical_block_size = zlo->block_size; 1282 if (zlo->zone_append) 1283 lim.max_hw_zone_append_sectors = lim.max_hw_sectors; 1284 lim.max_open_zones = zlo->max_open_zones; 1285 1286 zlo->tag_set.ops = &zloop_mq_ops; 1287 zlo->tag_set.nr_hw_queues = opts->nr_queues; 1288 zlo->tag_set.queue_depth = opts->queue_depth; 1289 zlo->tag_set.numa_node = NUMA_NO_NODE; 1290 zlo->tag_set.cmd_size = sizeof(struct zloop_cmd); 1291 zlo->tag_set.driver_data = zlo; 1292 1293 ret = blk_mq_alloc_tag_set(&zlo->tag_set); 1294 if (ret) { 1295 pr_err("blk_mq_alloc_tag_set failed (err=%d)\n", ret); 1296 goto out_close_files; 1297 } 1298 1299 zlo->disk = blk_mq_alloc_disk(&zlo->tag_set, &lim, zlo); 1300 if (IS_ERR(zlo->disk)) { 1301 pr_err("blk_mq_alloc_disk failed (err=%d)\n", ret); 1302 ret = PTR_ERR(zlo->disk); 1303 goto out_cleanup_tags; 1304 } 1305 zlo->disk->flags = GENHD_FL_NO_PART; 1306 zlo->disk->fops = &zloop_fops; 1307 zlo->disk->private_data = zlo; 1308 sprintf(zlo->disk->disk_name, "zloop%d", zlo->id); 1309 set_capacity(zlo->disk, (u64)lim.chunk_sectors * zlo->nr_zones); 1310 1311 ret = blk_revalidate_disk_zones(zlo->disk); 1312 if (ret) 1313 goto out_cleanup_disk; 1314 1315 ret = add_disk(zlo->disk); 1316 if (ret) { 1317 pr_err("add_disk failed (err=%d)\n", ret); 1318 goto out_cleanup_disk; 1319 } 1320 1321 mutex_lock(&zloop_ctl_mutex); 1322 WRITE_ONCE(zlo->state, Zlo_live); 1323 mutex_unlock(&zloop_ctl_mutex); 1324 1325 pr_info("zloop: device %d, %u zones of %llu MiB, %u B block size\n", 1326 zlo->id, zlo->nr_zones, 1327 ((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20, 1328 zlo->block_size); 1329 pr_info("zloop%d: using %s%s zone append\n", 1330 zlo->id, 1331 zlo->ordered_zone_append ? "ordered " : "", 1332 zlo->zone_append ? "native" : "emulated"); 1333 1334 return 0; 1335 1336 out_cleanup_disk: 1337 put_disk(zlo->disk); 1338 out_cleanup_tags: 1339 blk_mq_free_tag_set(&zlo->tag_set); 1340 out_close_files: 1341 for (j = 0; j < i; j++) { 1342 struct zloop_zone *zone = &zlo->zones[j]; 1343 1344 if (!IS_ERR_OR_NULL(zone->file)) 1345 fput(zone->file); 1346 } 1347 fput(zlo->data_dir); 1348 out_free_base_dir: 1349 kfree(zlo->base_dir); 1350 out_destroy_workqueue: 1351 destroy_workqueue(zlo->workqueue); 1352 out_free_idr: 1353 mutex_lock(&zloop_ctl_mutex); 1354 idr_remove(&zloop_index_idr, zlo->id); 1355 mutex_unlock(&zloop_ctl_mutex); 1356 out_free_dev: 1357 kvfree(zlo); 1358 out: 1359 module_put(THIS_MODULE); 1360 if (ret == -ENOENT) 1361 ret = -EINVAL; 1362 return ret; 1363 } 1364 1365 static void zloop_forget_cache(struct zloop_device *zlo) 1366 { 1367 unsigned int i; 1368 int ret; 1369 1370 pr_info("%pg: discarding volatile write cache\n", zlo->disk->part0); 1371 1372 for (i = 0; i < zlo->nr_zones; i++) { 1373 struct zloop_zone *zone = &zlo->zones[i]; 1374 struct file *file = zone->file; 1375 sector_t old_wp; 1376 1377 if (!zloop_zone_is_active(zone)) 1378 continue; 1379 1380 ret = vfs_getxattr(file_mnt_idmap(file), file_dentry(file), 1381 "user.zloop.wp", &old_wp, sizeof(old_wp)); 1382 if (ret == -ENODATA) { 1383 old_wp = 0; 1384 } else if (ret != sizeof(old_wp)) { 1385 pr_err("%pg: failed to retrieve write pointer (%d)\n", 1386 zlo->disk->part0, ret); 1387 continue; 1388 } 1389 1390 if (old_wp > zone->wp) 1391 continue; 1392 /* 1393 * This should not happen, if we recored a full zone, it can't 1394 * be active. 1395 */ 1396 if (WARN_ON_ONCE(old_wp == ULLONG_MAX)) 1397 continue; 1398 1399 vfs_truncate(&file->f_path, 1400 (old_wp - zone->start) << SECTOR_SHIFT); 1401 } 1402 } 1403 1404 static int zloop_ctl_remove(struct zloop_options *opts) 1405 { 1406 struct zloop_device *zlo; 1407 int ret; 1408 1409 if (!(opts->mask & ZLOOP_OPT_ID)) { 1410 pr_err("No ID specified for remove\n"); 1411 return -EINVAL; 1412 } 1413 1414 if (opts->mask & ~ZLOOP_OPT_ID) { 1415 pr_err("Invalid option specified for remove\n"); 1416 return -EINVAL; 1417 } 1418 1419 ret = mutex_lock_killable(&zloop_ctl_mutex); 1420 if (ret) 1421 return ret; 1422 1423 zlo = idr_find(&zloop_index_idr, opts->id); 1424 if (!zlo || zlo->state == Zlo_creating) { 1425 ret = -ENODEV; 1426 } else if (zlo->state == Zlo_deleting) { 1427 ret = -EINVAL; 1428 } else { 1429 idr_remove(&zloop_index_idr, zlo->id); 1430 WRITE_ONCE(zlo->state, Zlo_deleting); 1431 } 1432 1433 mutex_unlock(&zloop_ctl_mutex); 1434 if (ret) 1435 return ret; 1436 1437 del_gendisk(zlo->disk); 1438 1439 if (zlo->discard_write_cache) 1440 zloop_forget_cache(zlo); 1441 1442 put_disk(zlo->disk); 1443 1444 pr_info("Removed device %d\n", opts->id); 1445 1446 module_put(THIS_MODULE); 1447 1448 return 0; 1449 } 1450 1451 static int zloop_parse_options(struct zloop_options *opts, const char *buf) 1452 { 1453 substring_t args[MAX_OPT_ARGS]; 1454 char *options, *o, *p; 1455 unsigned int token; 1456 int ret = 0; 1457 1458 /* Set defaults. */ 1459 opts->mask = 0; 1460 opts->id = ZLOOP_DEF_ID; 1461 opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES; 1462 opts->zone_size = ZLOOP_DEF_ZONE_SIZE; 1463 opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES; 1464 opts->max_open_zones = ZLOOP_DEF_MAX_OPEN_ZONES; 1465 opts->nr_queues = ZLOOP_DEF_NR_QUEUES; 1466 opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH; 1467 opts->buffered_io = ZLOOP_DEF_BUFFERED_IO; 1468 opts->zone_append = ZLOOP_DEF_ZONE_APPEND; 1469 opts->ordered_zone_append = ZLOOP_DEF_ORDERED_ZONE_APPEND; 1470 1471 if (!buf) 1472 return 0; 1473 1474 /* Skip leading spaces before the options. */ 1475 while (isspace(*buf)) 1476 buf++; 1477 1478 options = o = kstrdup(buf, GFP_KERNEL); 1479 if (!options) 1480 return -ENOMEM; 1481 1482 /* Parse the options, doing only some light invalid value checks. */ 1483 while ((p = strsep(&o, ",\n")) != NULL) { 1484 if (!*p) 1485 continue; 1486 1487 token = match_token(p, zloop_opt_tokens, args); 1488 opts->mask |= token; 1489 switch (token) { 1490 case ZLOOP_OPT_ID: 1491 if (match_int(args, &opts->id)) { 1492 ret = -EINVAL; 1493 goto out; 1494 } 1495 break; 1496 case ZLOOP_OPT_CAPACITY: 1497 if (match_uint(args, &token)) { 1498 ret = -EINVAL; 1499 goto out; 1500 } 1501 if (!token) { 1502 pr_err("Invalid capacity\n"); 1503 ret = -EINVAL; 1504 goto out; 1505 } 1506 opts->capacity = 1507 ((sector_t)token * SZ_1M) >> SECTOR_SHIFT; 1508 break; 1509 case ZLOOP_OPT_ZONE_SIZE: 1510 if (match_uint(args, &token)) { 1511 ret = -EINVAL; 1512 goto out; 1513 } 1514 if (!token || token > ZLOOP_MAX_ZONE_SIZE_MB || 1515 !is_power_of_2(token)) { 1516 pr_err("Invalid zone size %u\n", token); 1517 ret = -EINVAL; 1518 goto out; 1519 } 1520 opts->zone_size = 1521 ((sector_t)token * SZ_1M) >> SECTOR_SHIFT; 1522 break; 1523 case ZLOOP_OPT_ZONE_CAPACITY: 1524 if (match_uint(args, &token)) { 1525 ret = -EINVAL; 1526 goto out; 1527 } 1528 if (!token) { 1529 pr_err("Invalid zone capacity\n"); 1530 ret = -EINVAL; 1531 goto out; 1532 } 1533 opts->zone_capacity = 1534 ((sector_t)token * SZ_1M) >> SECTOR_SHIFT; 1535 break; 1536 case ZLOOP_OPT_NR_CONV_ZONES: 1537 if (match_uint(args, &token)) { 1538 ret = -EINVAL; 1539 goto out; 1540 } 1541 opts->nr_conv_zones = token; 1542 break; 1543 case ZLOOP_OPT_MAX_OPEN_ZONES: 1544 if (match_uint(args, &token)) { 1545 ret = -EINVAL; 1546 goto out; 1547 } 1548 opts->max_open_zones = token; 1549 break; 1550 case ZLOOP_OPT_BASE_DIR: 1551 p = match_strdup(args); 1552 if (!p) { 1553 ret = -ENOMEM; 1554 goto out; 1555 } 1556 kfree(opts->base_dir); 1557 opts->base_dir = p; 1558 break; 1559 case ZLOOP_OPT_NR_QUEUES: 1560 if (match_uint(args, &token)) { 1561 ret = -EINVAL; 1562 goto out; 1563 } 1564 if (!token) { 1565 pr_err("Invalid number of queues\n"); 1566 ret = -EINVAL; 1567 goto out; 1568 } 1569 opts->nr_queues = min(token, num_online_cpus()); 1570 break; 1571 case ZLOOP_OPT_QUEUE_DEPTH: 1572 if (match_uint(args, &token)) { 1573 ret = -EINVAL; 1574 goto out; 1575 } 1576 if (!token) { 1577 pr_err("Invalid queue depth\n"); 1578 ret = -EINVAL; 1579 goto out; 1580 } 1581 opts->queue_depth = token; 1582 break; 1583 case ZLOOP_OPT_BUFFERED_IO: 1584 opts->buffered_io = true; 1585 break; 1586 case ZLOOP_OPT_ZONE_APPEND: 1587 if (match_uint(args, &token)) { 1588 ret = -EINVAL; 1589 goto out; 1590 } 1591 if (token != 0 && token != 1) { 1592 pr_err("Invalid zone_append value\n"); 1593 ret = -EINVAL; 1594 goto out; 1595 } 1596 opts->zone_append = token; 1597 break; 1598 case ZLOOP_OPT_ORDERED_ZONE_APPEND: 1599 opts->ordered_zone_append = true; 1600 break; 1601 case ZLOOP_OPT_DISCARD_WRITE_CACHE: 1602 opts->discard_write_cache = true; 1603 break; 1604 case ZLOOP_OPT_ERR: 1605 default: 1606 pr_warn("unknown parameter or missing value '%s'\n", p); 1607 ret = -EINVAL; 1608 goto out; 1609 } 1610 } 1611 1612 ret = -EINVAL; 1613 if (opts->capacity <= opts->zone_size) { 1614 pr_err("Invalid capacity\n"); 1615 goto out; 1616 } 1617 1618 if (opts->zone_capacity > opts->zone_size) { 1619 pr_err("Invalid zone capacity\n"); 1620 goto out; 1621 } 1622 1623 ret = 0; 1624 out: 1625 kfree(options); 1626 return ret; 1627 } 1628 1629 enum { 1630 ZLOOP_CTL_ADD, 1631 ZLOOP_CTL_REMOVE, 1632 }; 1633 1634 static struct zloop_ctl_op { 1635 int code; 1636 const char *name; 1637 } zloop_ctl_ops[] = { 1638 { ZLOOP_CTL_ADD, "add" }, 1639 { ZLOOP_CTL_REMOVE, "remove" }, 1640 { -1, NULL }, 1641 }; 1642 1643 static ssize_t zloop_ctl_write(struct file *file, const char __user *ubuf, 1644 size_t count, loff_t *pos) 1645 { 1646 struct zloop_options opts = { }; 1647 struct zloop_ctl_op *op; 1648 const char *buf, *opts_buf; 1649 int i, ret; 1650 1651 if (count > PAGE_SIZE) 1652 return -ENOMEM; 1653 1654 buf = memdup_user_nul(ubuf, count); 1655 if (IS_ERR(buf)) 1656 return PTR_ERR(buf); 1657 1658 for (i = 0; i < ARRAY_SIZE(zloop_ctl_ops); i++) { 1659 op = &zloop_ctl_ops[i]; 1660 if (!op->name) { 1661 pr_err("Invalid operation\n"); 1662 ret = -EINVAL; 1663 goto out; 1664 } 1665 if (!strncmp(buf, op->name, strlen(op->name))) 1666 break; 1667 } 1668 1669 if (count <= strlen(op->name)) 1670 opts_buf = NULL; 1671 else 1672 opts_buf = buf + strlen(op->name); 1673 1674 ret = zloop_parse_options(&opts, opts_buf); 1675 if (ret) { 1676 pr_err("Failed to parse options\n"); 1677 goto out; 1678 } 1679 1680 switch (op->code) { 1681 case ZLOOP_CTL_ADD: 1682 ret = zloop_ctl_add(&opts); 1683 break; 1684 case ZLOOP_CTL_REMOVE: 1685 ret = zloop_ctl_remove(&opts); 1686 break; 1687 default: 1688 pr_err("Invalid operation\n"); 1689 ret = -EINVAL; 1690 goto out; 1691 } 1692 1693 out: 1694 kfree(opts.base_dir); 1695 kfree(buf); 1696 return ret ? ret : count; 1697 } 1698 1699 static int zloop_ctl_show(struct seq_file *seq_file, void *private) 1700 { 1701 const struct match_token *tok; 1702 int i; 1703 1704 /* Add operation */ 1705 seq_printf(seq_file, "%s ", zloop_ctl_ops[0].name); 1706 for (i = 0; i < ARRAY_SIZE(zloop_opt_tokens); i++) { 1707 tok = &zloop_opt_tokens[i]; 1708 if (!tok->pattern) 1709 break; 1710 if (i) 1711 seq_putc(seq_file, ','); 1712 seq_puts(seq_file, tok->pattern); 1713 } 1714 seq_putc(seq_file, '\n'); 1715 1716 /* Remove operation */ 1717 seq_puts(seq_file, zloop_ctl_ops[1].name); 1718 seq_puts(seq_file, " id=%d\n"); 1719 1720 return 0; 1721 } 1722 1723 static int zloop_ctl_open(struct inode *inode, struct file *file) 1724 { 1725 file->private_data = NULL; 1726 return single_open(file, zloop_ctl_show, NULL); 1727 } 1728 1729 static int zloop_ctl_release(struct inode *inode, struct file *file) 1730 { 1731 return single_release(inode, file); 1732 } 1733 1734 static const struct file_operations zloop_ctl_fops = { 1735 .owner = THIS_MODULE, 1736 .open = zloop_ctl_open, 1737 .release = zloop_ctl_release, 1738 .write = zloop_ctl_write, 1739 .read = seq_read, 1740 }; 1741 1742 static struct miscdevice zloop_misc = { 1743 .minor = MISC_DYNAMIC_MINOR, 1744 .name = "zloop-control", 1745 .fops = &zloop_ctl_fops, 1746 }; 1747 1748 static int __init zloop_init(void) 1749 { 1750 int ret; 1751 1752 ret = misc_register(&zloop_misc); 1753 if (ret) { 1754 pr_err("Failed to register misc device: %d\n", ret); 1755 return ret; 1756 } 1757 pr_info("Module loaded\n"); 1758 1759 return 0; 1760 } 1761 1762 static void __exit zloop_exit(void) 1763 { 1764 misc_deregister(&zloop_misc); 1765 idr_destroy(&zloop_index_idr); 1766 } 1767 1768 module_init(zloop_init); 1769 module_exit(zloop_exit); 1770 1771 MODULE_DESCRIPTION("Zoned loopback device"); 1772 MODULE_LICENSE("GPL"); 1773