// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2025, Christoph Hellwig.
 * Copyright (c) 2025, Western Digital Corporation or its affiliates.
 *
 * Zoned Loop Device driver - exports a zoned block device using one file per
 * zone as backing storage.
 */
#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/blkzoned.h>
#include <linux/pagemap.h>
#include <linux/miscdevice.h>
#include <linux/falloc.h>
#include <linux/mutex.h>
#include <linux/parser.h>
#include <linux/seq_file.h>
#include <linux/xattr.h>

/*
 * Options for adding (and removing) a device.
 * Each value is a single bit so that a mask of the options seen while parsing
 * can be accumulated (see struct zloop_options.mask).
 */
enum {
	ZLOOP_OPT_ERR			= 0,
	ZLOOP_OPT_ID			= (1 << 0),
	ZLOOP_OPT_CAPACITY		= (1 << 1),
	ZLOOP_OPT_ZONE_SIZE		= (1 << 2),
	ZLOOP_OPT_ZONE_CAPACITY		= (1 << 3),
	ZLOOP_OPT_NR_CONV_ZONES		= (1 << 4),
	ZLOOP_OPT_BASE_DIR		= (1 << 5),
	ZLOOP_OPT_NR_QUEUES		= (1 << 6),
	ZLOOP_OPT_QUEUE_DEPTH		= (1 << 7),
	ZLOOP_OPT_BUFFERED_IO		= (1 << 8),
	ZLOOP_OPT_ZONE_APPEND		= (1 << 9),
	ZLOOP_OPT_ORDERED_ZONE_APPEND	= (1 << 10),
	ZLOOP_OPT_DISCARD_WRITE_CACHE	= (1 << 11),
	ZLOOP_OPT_MAX_OPEN_ZONES	= (1 << 12),
};

/* Token table for match_token() parsing of the control-interface options. */
static const match_table_t zloop_opt_tokens = {
	{ ZLOOP_OPT_ID,			"id=%d"			},
	{ ZLOOP_OPT_CAPACITY,		"capacity_mb=%u"	},
	{ ZLOOP_OPT_ZONE_SIZE,		"zone_size_mb=%u"	},
	{ ZLOOP_OPT_ZONE_CAPACITY,	"zone_capacity_mb=%u"	},
	{ ZLOOP_OPT_NR_CONV_ZONES,	"conv_zones=%u"		},
	{ ZLOOP_OPT_BASE_DIR,		"base_dir=%s"		},
	{ ZLOOP_OPT_NR_QUEUES,		"nr_queues=%u"		},
	{ ZLOOP_OPT_QUEUE_DEPTH,	"queue_depth=%u"	},
	{ ZLOOP_OPT_BUFFERED_IO,	"buffered_io"		},
	{ ZLOOP_OPT_ZONE_APPEND,	"zone_append=%u"	},
	{ ZLOOP_OPT_ORDERED_ZONE_APPEND, "ordered_zone_append"	},
	{ ZLOOP_OPT_DISCARD_WRITE_CACHE, "discard_write_cache"	},
	{ ZLOOP_OPT_MAX_OPEN_ZONES,	"max_open_zones=%u"	},
	{ ZLOOP_OPT_ERR,		NULL			}
};

/* Default values for the "add" operation. */
#define ZLOOP_DEF_ID			-1
#define ZLOOP_DEF_ZONE_SIZE		((256ULL * SZ_1M) >> SECTOR_SHIFT)
#define ZLOOP_DEF_NR_ZONES		64
#define ZLOOP_DEF_NR_CONV_ZONES		8
#define ZLOOP_DEF_MAX_OPEN_ZONES	0
#define ZLOOP_DEF_BASE_DIR		"/var/local/zloop"
#define ZLOOP_DEF_NR_QUEUES		1
#define ZLOOP_DEF_QUEUE_DEPTH		128
#define ZLOOP_DEF_BUFFERED_IO		false
#define ZLOOP_DEF_ZONE_APPEND		true
#define ZLOOP_DEF_ORDERED_ZONE_APPEND	false

/* Arbitrary limit on the zone size (16GB). */
#define ZLOOP_MAX_ZONE_SIZE_MB	16384

/*
 * Parsed "add"/"remove" options. @mask records which ZLOOP_OPT_* values were
 * explicitly given; the sector_t fields are in units of 512 B sectors.
 */
struct zloop_options {
	unsigned int		mask;
	int			id;
	sector_t		capacity;
	sector_t		zone_size;
	sector_t		zone_capacity;
	unsigned int		nr_conv_zones;
	unsigned int		max_open_zones;
	char			*base_dir;
	unsigned int		nr_queues;
	unsigned int		queue_depth;
	bool			buffered_io;
	bool			zone_append;
	bool			ordered_zone_append;
	bool			discard_write_cache;
};

/*
 * Device states.
 */
enum {
	Zlo_creating = 0,
	Zlo_live,
	Zlo_deleting,
};

/* Per-zone flag bits (used with set_bit()/test_bit() on zloop_zone.flags). */
enum zloop_zone_flags {
	ZLOOP_ZONE_CONV = 0,		/* Conventional (not write-pointer) zone */
	ZLOOP_ZONE_SEQ_ERROR,		/* A write failed; wp must be re-read from the file */
};

/*
 * Zone descriptor.
 * Locking order: z.lock -> z.wp_lock -> zlo.open_zones_lock
 */
struct zloop_zone {
	struct list_head	open_zone_entry;	/* On zlo->open_zones_lru_list when open */
	struct file		*file;			/* Backing file for this zone */

	unsigned long		flags;			/* ZLOOP_ZONE_* bits */
	struct mutex		lock;			/* Serializes zone state operations */
	spinlock_t		wp_lock;		/* Protects cond and wp */
	enum blk_zone_cond	cond;			/* Current zone condition */
	sector_t		start;			/* First sector of the zone */
	sector_t		wp;			/* Write pointer; ULLONG_MAX when full */

	gfp_t			old_gfp_mask;		/* Saved f_mapping gfp mask, restored on teardown */
};

/* One instance per zloop block device. */
struct zloop_device {
	unsigned int		id;			/* IDR-allocated device index */
	unsigned int		state;			/* Zlo_* lifecycle state */

	struct blk_mq_tag_set	tag_set;
	struct gendisk		*disk;

	struct workqueue_struct *workqueue;		/* Executes zloop_cmd work items */
	bool			buffered_io;		/* Use page cache instead of O_DIRECT */
	bool			zone_append;		/* Native REQ_OP_ZONE_APPEND support */
	bool			ordered_zone_append;	/* Resolve append sector at queue_rq time */
	bool			discard_write_cache;
	/* Backing storage: one file per zone under base_dir/id. */
	const char		*base_dir;
	struct file		*data_dir;

	unsigned int		zone_shift;		/* log2(zone_size) for sector->zone mapping */
	sector_t		zone_size;
	sector_t		zone_capacity;		/* Writable sectors per seq zone (<= zone_size) */
	unsigned int		nr_zones;
	unsigned int		nr_conv_zones;		/* Conventional zones come first */
	unsigned int		max_open_zones;		/* 0 means unlimited */
	unsigned int		block_size;

	/* Accounting of implicitly/explicitly open zones, LRU ordered. */
	spinlock_t		open_zones_lock;
	struct list_head	open_zones_lru_list;
	unsigned int		nr_open_zones;

	struct zloop_zone	zones[] __counted_by(nr_zones);
};

/* Per-request state, allocated as blk-mq PDU. */
struct zloop_cmd {
	struct work_struct	work;		/* Queued on zlo->workqueue */
	atomic_t		ref;		/* Completion refs: submitter + AIO completion */
	sector_t		sector;		/* Target sector (resolved wp for appends) */
	sector_t		nr_sectors;
	long			ret;		/* Byte count or negative errno */
	struct kiocb		iocb;		/* Async I/O control block for the zone file */
	struct bio_vec		*bvec;		/* Flattened bvec array for multi-bio requests */
};

static DEFINE_IDR(zloop_index_idr);
static DEFINE_MUTEX(zloop_ctl_mutex);

/* Zone number containing the first sector of @rq. */
static unsigned int rq_zone_no(struct request *rq)
{
	struct zloop_device *zlo = rq->q->queuedata;

	return blk_rq_pos(rq) >> zlo->zone_shift;
}

/*
 * Open an already open zone. This is mostly a no-op, except for the imp open ->
 * exp open condition change that may happen. We also move a zone at the tail of
 * the list of open zones so that if we need to
 * implicitly close one open zone, we can do so in LRU order.
 */
static inline void zloop_lru_rotate_open_zone(struct zloop_device *zlo,
					      struct zloop_zone *zone)
{
	/* LRU tracking is only needed when an open-zones limit is enforced. */
	if (zlo->max_open_zones) {
		spin_lock(&zlo->open_zones_lock);
		list_move_tail(&zone->open_zone_entry,
			       &zlo->open_zones_lru_list);
		spin_unlock(&zlo->open_zones_lock);
	}
}

/*
 * Remove @zone from the open-zones LRU list and drop it from the open-zone
 * count, but only if it currently is in an open condition. Callers are
 * expected to update zone->cond afterwards.
 */
static inline void zloop_lru_remove_open_zone(struct zloop_device *zlo,
					      struct zloop_zone *zone)
{
	if (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
	    zone->cond == BLK_ZONE_COND_EXP_OPEN) {
		spin_lock(&zlo->open_zones_lock);
		list_del_init(&zone->open_zone_entry);
		zlo->nr_open_zones--;
		spin_unlock(&zlo->open_zones_lock);
	}
}

/* True if one more zone may be opened (max_open_zones == 0 means no limit). */
static inline bool zloop_can_open_zone(struct zloop_device *zlo)
{
	return !zlo->max_open_zones || zlo->nr_open_zones < zlo->max_open_zones;
}

/*
 * If we have reached the maximum open zones limit, attempt to close an
 * implicitly open zone (if we have any) so that we can implicitly open another
 * zone without exceeding the maximum number of open zones.
 */
static bool zloop_close_imp_open_zone(struct zloop_device *zlo)
{
	struct zloop_zone *zone;

	lockdep_assert_held(&zlo->open_zones_lock);

	if (zloop_can_open_zone(zlo))
		return true;

	/* Scan in LRU order: the least recently used open zone is closed first. */
	list_for_each_entry(zone, &zlo->open_zones_lru_list, open_zone_entry) {
		if (zone->cond == BLK_ZONE_COND_IMP_OPEN) {
			zone->cond = BLK_ZONE_COND_CLOSED;
			list_del_init(&zone->open_zone_entry);
			zlo->nr_open_zones--;
			return true;
		}
	}

	/* All open zones are explicitly open: nothing we may close on our own. */
	return false;
}

/*
 * Transition an empty or closed zone to the (implicitly or explicitly) open
 * condition, enforcing the max_open_zones limit. Returns false if the limit
 * would be exceeded and could not be resolved.
 */
static bool zloop_open_closed_or_empty_zone(struct zloop_device *zlo,
					    struct zloop_zone *zone,
					    bool explicit)
{
	spin_lock(&zlo->open_zones_lock);

	if (explicit) {
		/*
		 * Explicit open: we cannot allow this if we have reached the
		 * maximum open zones limit.
		 */
		if (!zloop_can_open_zone(zlo))
			goto fail;
		zone->cond = BLK_ZONE_COND_EXP_OPEN;
	} else {
		/*
		 * Implicit open case: if we have reached the maximum open zones
		 * limit, try to close an implicitly open zone first.
		 */
		if (!zloop_close_imp_open_zone(zlo))
			goto fail;
		zone->cond = BLK_ZONE_COND_IMP_OPEN;
	}

	zlo->nr_open_zones++;
	/* Newly opened zones are the most recently used: append at the tail. */
	list_add_tail(&zone->open_zone_entry,
		      &zlo->open_zones_lru_list);

	spin_unlock(&zlo->open_zones_lock);

	return true;

fail:
	spin_unlock(&zlo->open_zones_lock);

	return false;
}

/*
 * Open @zone, either explicitly (REQ_OP_ZONE_OPEN) or implicitly (first write
 * to a non-open zone). Returns false if the zone condition does not permit
 * opening or the open-zones limit is exceeded.
 */
static bool zloop_do_open_zone(struct zloop_device *zlo,
			       struct zloop_zone *zone, bool explicit)
{
	switch (zone->cond) {
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
		/* Already open: at most promote implicit -> explicit. */
		if (explicit)
			zone->cond = BLK_ZONE_COND_EXP_OPEN;
		zloop_lru_rotate_open_zone(zlo, zone);
		return true;
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_CLOSED:
		return zloop_open_closed_or_empty_zone(zlo, zone, explicit);
	default:
		/* Full, read-only, offline, ...: cannot be opened. */
		return false;
	}
}

/*
 * Re-derive the condition and write pointer of a sequential zone from the
 * size of its backing file. Used after a failed write (ZLOOP_ZONE_SEQ_ERROR)
 * and at init/restore time. Caller must hold zone->lock.
 */
static int zloop_update_seq_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	struct kstat stat;
	sector_t file_sectors;
	unsigned long flags;
	int ret;

	lockdep_assert_held(&zone->lock);

	ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0);
	if (ret < 0) {
		pr_err("Failed to get zone %u file stat (err=%d)\n",
		       zone_no, ret);
		/* Keep the zone in error state until a later attempt succeeds. */
		set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
		return ret;
	}

	file_sectors = stat.size >> SECTOR_SHIFT;
	if (file_sectors > zlo->zone_capacity) {
		pr_err("Zone %u file too large (%llu sectors > %llu)\n",
		       zone_no, file_sectors, zlo->zone_capacity);
		return -EINVAL;
	}

	if (file_sectors & ((zlo->block_size >> SECTOR_SHIFT) - 1)) {
		pr_err("Zone %u file size not aligned to block size %u\n",
		       zone_no, zlo->block_size);
		return -EINVAL;
	}

	/* The file size maps 1:1 to the amount of data written to the zone. */
	spin_lock_irqsave(&zone->wp_lock, flags);
	if (!file_sectors) {
		zloop_lru_remove_open_zone(zlo, zone);
		zone->cond = BLK_ZONE_COND_EMPTY;
		zone->wp = zone->start;
	} else if (file_sectors == zlo->zone_capacity) {
		zloop_lru_remove_open_zone(zlo, zone);
		zone->cond = BLK_ZONE_COND_FULL;
		zone->wp = ULLONG_MAX;
	} else {
		/* Partially written: keep an open condition, otherwise close. */
		if (zone->cond != BLK_ZONE_COND_IMP_OPEN &&
		    zone->cond != BLK_ZONE_COND_EXP_OPEN)
			zone->cond = BLK_ZONE_COND_CLOSED;
		zone->wp = zone->start + file_sectors;
	}
	spin_unlock_irqrestore(&zone->wp_lock, flags);

	return 0;
}

/* Handle REQ_OP_ZONE_OPEN: explicitly open a sequential zone. */
static int zloop_open_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	int ret = 0;

	/* Zone management operations are invalid on conventional zones. */
	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	/* Recover the write pointer first if a previous write failed. */
	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
		ret = zloop_update_seq_zone(zlo, zone_no);
		if (ret)
			goto unlock;
	}

	if (!zloop_do_open_zone(zlo, zone, true))
		ret = -EIO;

unlock:
	mutex_unlock(&zone->lock);

	return ret;
}

/* Handle REQ_OP_ZONE_CLOSE: close an open sequential zone. */
static int zloop_close_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	unsigned long flags;
	int ret = 0;

	/* Zone management operations are invalid on conventional zones. */
	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	/* Recover the write pointer first if a previous write failed. */
	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
		ret = zloop_update_seq_zone(zlo, zone_no);
		if (ret)
			goto unlock;
	}

	switch (zone->cond) {
	case BLK_ZONE_COND_CLOSED:
		/* Closing a closed zone is a no-op. */
		break;
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
		spin_lock_irqsave(&zone->wp_lock, flags);
		zloop_lru_remove_open_zone(zlo, zone);
		/* An open zone with nothing written goes back to empty. */
		if (zone->wp == zone->start)
			zone->cond = BLK_ZONE_COND_EMPTY;
		else
			zone->cond = BLK_ZONE_COND_CLOSED;
		spin_unlock_irqrestore(&zone->wp_lock, flags);
		break;
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_FULL:
	default:
		ret = -EIO;
		break;
	}

unlock:
	mutex_unlock(&zone->lock);

	return ret;
}

/*
 * Handle REQ_OP_ZONE_RESET: truncate the backing file to 0 and return the
 * zone to the empty condition.
 */
static int zloop_reset_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	unsigned long flags;
	int ret = 0;

	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	/* Already empty and not in error: nothing to do. */
	if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
	    zone->cond == BLK_ZONE_COND_EMPTY)
		goto unlock;

	if (vfs_truncate(&zone->file->f_path, 0)) {
		set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
		ret = -EIO;
		goto unlock;
	}

	spin_lock_irqsave(&zone->wp_lock, flags);
	zloop_lru_remove_open_zone(zlo, zone);
	zone->cond = BLK_ZONE_COND_EMPTY;
	zone->wp = zone->start;
	/* The truncate succeeded, so any previous error state is resolved. */
	clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
	spin_unlock_irqrestore(&zone->wp_lock, flags);

unlock:
	mutex_unlock(&zone->lock);

	return ret;
}

/* Handle REQ_OP_ZONE_RESET_ALL: reset every sequential zone, stop on error. */
static int zloop_reset_all_zones(struct zloop_device *zlo)
{
	unsigned int i;
	int ret;

	/* Conventional zones come first and cannot be reset. */
	for (i = zlo->nr_conv_zones; i < zlo->nr_zones; i++) {
		ret = zloop_reset_zone(zlo, i);
		if (ret)
			return ret;
	}

	return 0;
}

/*
 * Handle REQ_OP_ZONE_FINISH: extend the backing file to the full zone size
 * and transition the zone to the full condition.
 */
static int zloop_finish_zone(struct zloop_device *zlo, unsigned int zone_no)
{
	struct zloop_zone *zone = &zlo->zones[zone_no];
	unsigned long flags;
	int ret = 0;

	if (test_bit(ZLOOP_ZONE_CONV, &zone->flags))
		return -EIO;

	mutex_lock(&zone->lock);

	/* Already full and not in error: nothing to do. */
	if (!test_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags) &&
	    zone->cond == BLK_ZONE_COND_FULL)
		goto unlock;

	if (vfs_truncate(&zone->file->f_path, zlo->zone_size << SECTOR_SHIFT)) {
		set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
		ret = -EIO;
		goto unlock;
	}

	spin_lock_irqsave(&zone->wp_lock, flags);
	zloop_lru_remove_open_zone(zlo, zone);
	zone->cond = BLK_ZONE_COND_FULL;
	zone->wp = ULLONG_MAX;
	/* The truncate succeeded, so any previous error state is resolved. */
	clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags);
	spin_unlock_irqrestore(&zone->wp_lock, flags);

unlock:
	mutex_unlock(&zone->lock);

	return ret;
}

/*
 * Drop one reference on @cmd and complete the request when the last reference
 * (submission path vs. AIO completion) is gone.
 */
static void zloop_put_cmd(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);

	if (!atomic_dec_and_test(&cmd->ref))
		return;
	kfree(cmd->bvec);
	cmd->bvec = NULL;
	if (likely(!blk_should_fake_timeout(rq->q)))
		blk_mq_complete_request(rq);
}

/* kiocb completion callback: record the result and drop the AIO reference. */
static void zloop_rw_complete(struct kiocb *iocb, long ret)
{
	struct zloop_cmd *cmd = container_of(iocb, struct zloop_cmd, iocb);

	cmd->ret = ret;
	zloop_put_cmd(cmd);
}

/*
 * Issue the data transfer for @cmd against the zone backing file using
 * async read_iter/write_iter. Returns the synchronous byte count, or
 * -EIOCBQUEUED when the I/O completes asynchronously via zloop_rw_complete().
 */
static int zloop_do_rw(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	int rw = req_op(rq) == REQ_OP_READ ? ITER_DEST : ITER_SOURCE;
	unsigned int nr_bvec = blk_rq_nr_bvec(rq);
	struct zloop_device *zlo = rq->q->queuedata;
	struct zloop_zone *zone = &zlo->zones[rq_zone_no(rq)];
	struct req_iterator rq_iter;
	struct iov_iter iter;

	if (rq->bio != rq->biotail) {
		struct bio_vec tmp, *bvec;

		cmd->bvec = kmalloc_objs(*cmd->bvec, nr_bvec, GFP_NOIO);
		if (!cmd->bvec)
			return -EIO;

		/*
		 * The bios of the request may be started from the middle of
		 * the 'bvec' because of bio splitting, so we can't directly
		 * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
		 * API will take care of all details for us.
		 */
		bvec = cmd->bvec;
		rq_for_each_bvec(tmp, rq, rq_iter) {
			*bvec = tmp;
			bvec++;
		}
		iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq));
	} else {
		/*
		 * Same here, this bio may be started from the middle of the
		 * 'bvec' because of bio splitting, so offset from the bvec
		 * must be passed to iov iterator
		 */
		iov_iter_bvec(&iter, rw,
			__bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter),
			nr_bvec, blk_rq_bytes(rq));
		iter.iov_offset = rq->bio->bi_iter.bi_bvec_done;
	}

	/* File offset is relative to the start of the zone. */
	cmd->iocb.ki_pos = (cmd->sector - zone->start) << SECTOR_SHIFT;
	cmd->iocb.ki_filp = zone->file;
	cmd->iocb.ki_complete = zloop_rw_complete;
	if (!zlo->buffered_io)
		cmd->iocb.ki_flags = IOCB_DIRECT;
	cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);

	if (rw == ITER_SOURCE)
		return zone->file->f_op->write_iter(&cmd->iocb, &iter);
	return zone->file->f_op->read_iter(&cmd->iocb, &iter);
}

/*
 * Validate a write to a sequential zone against the write pointer, implicitly
 * open the zone and advance the write pointer. For zone append, also resolve
 * the actual target sector (unless ordered_zone_append resolved it earlier).
 * Called with zone->lock held.
 */
static int zloop_seq_write_prep(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	struct zloop_device *zlo = rq->q->queuedata;
	unsigned int zone_no = rq_zone_no(rq);
	sector_t nr_sectors = blk_rq_sectors(rq);
	bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
	struct zloop_zone *zone = &zlo->zones[zone_no];
	sector_t zone_end = zone->start + zlo->zone_capacity;
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&zone->wp_lock, flags);

	/*
	 * Zone append operations always go at the current write pointer, but
	 * regular write operations must already be aligned to the write pointer
	 * when submitted.
	 */
	if (is_append) {
		/*
		 * If ordered zone append is in use, we already checked and set
		 * the target sector in zloop_queue_rq().
		 */
		if (!zlo->ordered_zone_append) {
			if (zone->cond == BLK_ZONE_COND_FULL ||
			    zone->wp + nr_sectors > zone_end) {
				ret = -EIO;
				goto out_unlock;
			}
			cmd->sector = zone->wp;
		}
	} else {
		if (cmd->sector != zone->wp) {
			pr_err("Zone %u: unaligned write: sect %llu, wp %llu\n",
			       zone_no, cmd->sector, zone->wp);
			ret = -EIO;
			goto out_unlock;
		}
	}

	/* Implicitly open the target zone. */
	if (!zloop_do_open_zone(zlo, zone, false)) {
		ret = -EIO;
		goto out_unlock;
	}

	/*
	 * Advance the write pointer, unless ordered zone append is in use. If
	 * the write fails, the write pointer position will be corrected when
	 * the next I/O starts execution.
	 */
	if (!is_append || !zlo->ordered_zone_append) {
		zone->wp += nr_sectors;
		if (zone->wp == zone_end) {
			zloop_lru_remove_open_zone(zlo, zone);
			zone->cond = BLK_ZONE_COND_FULL;
			zone->wp = ULLONG_MAX;
		}
	}
out_unlock:
	spin_unlock_irqrestore(&zone->wp_lock, flags);
	return ret;
}

/*
 * Execute a read, write or zone append request: validate it against the zone
 * layout, recover an errored zone if needed, and submit the file I/O. Runs
 * from the command work function; completion happens via zloop_put_cmd().
 */
static void zloop_rw(struct zloop_cmd *cmd)
{
	struct request *rq = blk_mq_rq_from_pdu(cmd);
	struct zloop_device *zlo = rq->q->queuedata;
	unsigned int zone_no = rq_zone_no(rq);
	sector_t nr_sectors = blk_rq_sectors(rq);
	bool is_append = req_op(rq) == REQ_OP_ZONE_APPEND;
	bool is_write = req_op(rq) == REQ_OP_WRITE || is_append;
	struct zloop_zone *zone;
	int ret = -EIO;

	/* One ref for this submission path, one for the AIO completion. */
	atomic_set(&cmd->ref, 2);
	cmd->sector = blk_rq_pos(rq);
	cmd->nr_sectors = nr_sectors;
	cmd->ret = 0;

	if (WARN_ON_ONCE(is_append && !zlo->zone_append))
		goto out;

	/* We should never get an I/O beyond the device capacity. */
	if (WARN_ON_ONCE(zone_no >= zlo->nr_zones))
		goto out;

	zone = &zlo->zones[zone_no];

	/*
	 * The block layer should never send requests that are not fully
	 * contained within the zone.
	 */
	if (WARN_ON_ONCE(cmd->sector + nr_sectors >
			 zone->start + zlo->zone_size))
		goto out;

	/* Recover the write pointer first if a previous write failed. */
	if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) {
		mutex_lock(&zone->lock);
		ret = zloop_update_seq_zone(zlo, zone_no);
		mutex_unlock(&zone->lock);
		if (ret)
			goto out;
	}

	if (!test_bit(ZLOOP_ZONE_CONV, &zone->flags) && is_write) {
		/* Sequential zone writes are serialized by zone->lock. */
		mutex_lock(&zone->lock);
		ret = zloop_seq_write_prep(cmd);
		if (!ret)
			ret = zloop_do_rw(cmd);
		mutex_unlock(&zone->lock);
	} else {
		ret = zloop_do_rw(cmd);
	}
out:
	/* -EIOCBQUEUED means the AIO completion callback will set cmd->ret. */
	if (ret != -EIOCBQUEUED)
		zloop_rw_complete(&cmd->iocb, ret);
	zloop_put_cmd(cmd);
}

/* A zone is "active" when it holds data that is not yet persisted as full. */
static inline bool zloop_zone_is_active(struct zloop_zone *zone)
{
	switch (zone->cond) {
	case BLK_ZONE_COND_EXP_OPEN:
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_CLOSED:
		return true;
	default:
		return false;
	}
}

/*
 * Persist the write pointer of every active zone as a "user.zloop.wp" xattr
 * on the zone file, so that the position can be trusted across a crash when
 * discard_write_cache is enabled.
 */
static int zloop_record_safe_wps(struct zloop_device *zlo)
{
	unsigned int i;
	int ret;

	for (i = 0; i < zlo->nr_zones; i++) {
		struct zloop_zone *zone = &zlo->zones[i];
		struct file *file = zone->file;

		if (!zloop_zone_is_active(zone))
			continue;
		/* NOTE(review): zone->wp is read without wp_lock here — confirm
		 * callers quiesce writes first. */
		ret = vfs_setxattr(file_mnt_idmap(file), file_dentry(file),
				"user.zloop.wp", &zone->wp, sizeof(zone->wp), 0);
		if (ret) {
			pr_err("%pg: failed to record write pointer (%d)\n",
			       zlo->disk->part0, ret);
			return ret;
		}
	}

	return 0;
}

/*
 * Sync the entire FS containing the zone files instead of walking all files.
723 */ 724 static int zloop_flush(struct zloop_device *zlo) 725 { 726 struct super_block *sb = file_inode(zlo->data_dir)->i_sb; 727 int ret; 728 729 if (zlo->discard_write_cache) { 730 ret = zloop_record_safe_wps(zlo); 731 if (ret) 732 return ret; 733 } 734 735 down_read(&sb->s_umount); 736 ret = sync_filesystem(sb); 737 up_read(&sb->s_umount); 738 739 return ret; 740 } 741 742 static void zloop_handle_cmd(struct zloop_cmd *cmd) 743 { 744 struct request *rq = blk_mq_rq_from_pdu(cmd); 745 struct zloop_device *zlo = rq->q->queuedata; 746 747 /* We can block in this context, so ignore REQ_NOWAIT. */ 748 if (rq->cmd_flags & REQ_NOWAIT) 749 rq->cmd_flags &= ~REQ_NOWAIT; 750 751 switch (req_op(rq)) { 752 case REQ_OP_READ: 753 case REQ_OP_WRITE: 754 case REQ_OP_ZONE_APPEND: 755 /* 756 * zloop_rw() always executes asynchronously or completes 757 * directly. 758 */ 759 zloop_rw(cmd); 760 return; 761 case REQ_OP_FLUSH: 762 cmd->ret = zloop_flush(zlo); 763 break; 764 case REQ_OP_ZONE_RESET: 765 cmd->ret = zloop_reset_zone(zlo, rq_zone_no(rq)); 766 break; 767 case REQ_OP_ZONE_RESET_ALL: 768 cmd->ret = zloop_reset_all_zones(zlo); 769 break; 770 case REQ_OP_ZONE_FINISH: 771 cmd->ret = zloop_finish_zone(zlo, rq_zone_no(rq)); 772 break; 773 case REQ_OP_ZONE_OPEN: 774 cmd->ret = zloop_open_zone(zlo, rq_zone_no(rq)); 775 break; 776 case REQ_OP_ZONE_CLOSE: 777 cmd->ret = zloop_close_zone(zlo, rq_zone_no(rq)); 778 break; 779 default: 780 WARN_ON_ONCE(1); 781 pr_err("Unsupported operation %d\n", req_op(rq)); 782 cmd->ret = -EOPNOTSUPP; 783 break; 784 } 785 786 blk_mq_complete_request(rq); 787 } 788 789 static void zloop_cmd_workfn(struct work_struct *work) 790 { 791 struct zloop_cmd *cmd = container_of(work, struct zloop_cmd, work); 792 int orig_flags = current->flags; 793 794 current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; 795 zloop_handle_cmd(cmd); 796 current->flags = orig_flags; 797 } 798 799 static void zloop_complete_rq(struct request *rq) 800 { 801 struct zloop_cmd *cmd 
= blk_mq_rq_to_pdu(rq); 802 struct zloop_device *zlo = rq->q->queuedata; 803 unsigned int zone_no = cmd->sector >> zlo->zone_shift; 804 struct zloop_zone *zone = &zlo->zones[zone_no]; 805 blk_status_t sts = BLK_STS_OK; 806 807 switch (req_op(rq)) { 808 case REQ_OP_READ: 809 if (cmd->ret < 0) 810 pr_err("Zone %u: failed read sector %llu, %llu sectors\n", 811 zone_no, cmd->sector, cmd->nr_sectors); 812 813 if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) { 814 /* short read */ 815 struct bio *bio; 816 817 __rq_for_each_bio(bio, rq) 818 zero_fill_bio(bio); 819 } 820 break; 821 case REQ_OP_WRITE: 822 case REQ_OP_ZONE_APPEND: 823 if (cmd->ret < 0) 824 pr_err("Zone %u: failed %swrite sector %llu, %llu sectors\n", 825 zone_no, 826 req_op(rq) == REQ_OP_WRITE ? "" : "append ", 827 cmd->sector, cmd->nr_sectors); 828 829 if (cmd->ret >= 0 && cmd->ret != blk_rq_bytes(rq)) { 830 pr_err("Zone %u: partial write %ld/%u B\n", 831 zone_no, cmd->ret, blk_rq_bytes(rq)); 832 cmd->ret = -EIO; 833 } 834 835 if (cmd->ret < 0 && !test_bit(ZLOOP_ZONE_CONV, &zone->flags)) { 836 /* 837 * A write to a sequential zone file failed: mark the 838 * zone as having an error. This will be corrected and 839 * cleared when the next IO is submitted. 
840 */ 841 set_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags); 842 break; 843 } 844 if (req_op(rq) == REQ_OP_ZONE_APPEND) 845 rq->__sector = cmd->sector; 846 847 break; 848 default: 849 break; 850 } 851 852 if (cmd->ret < 0) 853 sts = errno_to_blk_status(cmd->ret); 854 blk_mq_end_request(rq, sts); 855 } 856 857 static bool zloop_set_zone_append_sector(struct request *rq) 858 { 859 struct zloop_device *zlo = rq->q->queuedata; 860 unsigned int zone_no = rq_zone_no(rq); 861 struct zloop_zone *zone = &zlo->zones[zone_no]; 862 sector_t zone_end = zone->start + zlo->zone_capacity; 863 sector_t nr_sectors = blk_rq_sectors(rq); 864 unsigned long flags; 865 866 spin_lock_irqsave(&zone->wp_lock, flags); 867 868 if (zone->cond == BLK_ZONE_COND_FULL || 869 zone->wp + nr_sectors > zone_end) { 870 spin_unlock_irqrestore(&zone->wp_lock, flags); 871 return false; 872 } 873 874 rq->__sector = zone->wp; 875 zone->wp += blk_rq_sectors(rq); 876 if (zone->wp >= zone_end) { 877 zloop_lru_remove_open_zone(zlo, zone); 878 zone->cond = BLK_ZONE_COND_FULL; 879 zone->wp = ULLONG_MAX; 880 } 881 882 spin_unlock_irqrestore(&zone->wp_lock, flags); 883 884 return true; 885 } 886 887 static blk_status_t zloop_queue_rq(struct blk_mq_hw_ctx *hctx, 888 const struct blk_mq_queue_data *bd) 889 { 890 struct request *rq = bd->rq; 891 struct zloop_cmd *cmd = blk_mq_rq_to_pdu(rq); 892 struct zloop_device *zlo = rq->q->queuedata; 893 894 if (data_race(READ_ONCE(zlo->state)) == Zlo_deleting) 895 return BLK_STS_IOERR; 896 897 /* 898 * If we need to strongly order zone append operations, set the request 899 * sector to the zone write pointer location now instead of when the 900 * command work runs. 
901 */ 902 if (zlo->ordered_zone_append && req_op(rq) == REQ_OP_ZONE_APPEND) { 903 if (!zloop_set_zone_append_sector(rq)) 904 return BLK_STS_IOERR; 905 } 906 907 blk_mq_start_request(rq); 908 909 INIT_WORK(&cmd->work, zloop_cmd_workfn); 910 queue_work(zlo->workqueue, &cmd->work); 911 912 return BLK_STS_OK; 913 } 914 915 static const struct blk_mq_ops zloop_mq_ops = { 916 .queue_rq = zloop_queue_rq, 917 .complete = zloop_complete_rq, 918 }; 919 920 static int zloop_open(struct gendisk *disk, blk_mode_t mode) 921 { 922 struct zloop_device *zlo = disk->private_data; 923 int ret; 924 925 ret = mutex_lock_killable(&zloop_ctl_mutex); 926 if (ret) 927 return ret; 928 929 if (zlo->state != Zlo_live) 930 ret = -ENXIO; 931 mutex_unlock(&zloop_ctl_mutex); 932 return ret; 933 } 934 935 static int zloop_report_zones(struct gendisk *disk, sector_t sector, 936 unsigned int nr_zones, struct blk_report_zones_args *args) 937 { 938 struct zloop_device *zlo = disk->private_data; 939 struct blk_zone blkz = {}; 940 unsigned int first, i; 941 unsigned long flags; 942 int ret; 943 944 first = disk_zone_no(disk, sector); 945 if (first >= zlo->nr_zones) 946 return 0; 947 nr_zones = min(nr_zones, zlo->nr_zones - first); 948 949 for (i = 0; i < nr_zones; i++) { 950 unsigned int zone_no = first + i; 951 struct zloop_zone *zone = &zlo->zones[zone_no]; 952 953 mutex_lock(&zone->lock); 954 955 if (test_and_clear_bit(ZLOOP_ZONE_SEQ_ERROR, &zone->flags)) { 956 ret = zloop_update_seq_zone(zlo, zone_no); 957 if (ret) { 958 mutex_unlock(&zone->lock); 959 return ret; 960 } 961 } 962 963 blkz.start = zone->start; 964 blkz.len = zlo->zone_size; 965 spin_lock_irqsave(&zone->wp_lock, flags); 966 blkz.wp = zone->wp; 967 spin_unlock_irqrestore(&zone->wp_lock, flags); 968 blkz.cond = zone->cond; 969 if (test_bit(ZLOOP_ZONE_CONV, &zone->flags)) { 970 blkz.type = BLK_ZONE_TYPE_CONVENTIONAL; 971 blkz.capacity = zlo->zone_size; 972 } else { 973 blkz.type = BLK_ZONE_TYPE_SEQWRITE_REQ; 974 blkz.capacity = 
zlo->zone_capacity; 975 } 976 977 mutex_unlock(&zone->lock); 978 979 ret = disk_report_zone(disk, &blkz, i, args); 980 if (ret) 981 return ret; 982 } 983 984 return nr_zones; 985 } 986 987 static void zloop_free_disk(struct gendisk *disk) 988 { 989 struct zloop_device *zlo = disk->private_data; 990 unsigned int i; 991 992 blk_mq_free_tag_set(&zlo->tag_set); 993 994 for (i = 0; i < zlo->nr_zones; i++) { 995 struct zloop_zone *zone = &zlo->zones[i]; 996 997 mapping_set_gfp_mask(zone->file->f_mapping, 998 zone->old_gfp_mask); 999 fput(zone->file); 1000 } 1001 1002 fput(zlo->data_dir); 1003 destroy_workqueue(zlo->workqueue); 1004 kfree(zlo->base_dir); 1005 kvfree(zlo); 1006 } 1007 1008 static const struct block_device_operations zloop_fops = { 1009 .owner = THIS_MODULE, 1010 .open = zloop_open, 1011 .report_zones = zloop_report_zones, 1012 .free_disk = zloop_free_disk, 1013 }; 1014 1015 __printf(3, 4) 1016 static struct file *zloop_filp_open_fmt(int oflags, umode_t mode, 1017 const char *fmt, ...) 1018 { 1019 struct file *file; 1020 va_list ap; 1021 char *p; 1022 1023 va_start(ap, fmt); 1024 p = kvasprintf(GFP_KERNEL, fmt, ap); 1025 va_end(ap); 1026 1027 if (!p) 1028 return ERR_PTR(-ENOMEM); 1029 file = filp_open(p, oflags, mode); 1030 kfree(p); 1031 return file; 1032 } 1033 1034 static int zloop_get_block_size(struct zloop_device *zlo, 1035 struct zloop_zone *zone) 1036 { 1037 struct block_device *sb_bdev = zone->file->f_mapping->host->i_sb->s_bdev; 1038 struct kstat st; 1039 1040 /* 1041 * If the FS block size is lower than or equal to 4K, use that as the 1042 * device block size. Otherwise, fallback to the FS direct IO alignment 1043 * constraint if that is provided, and to the FS underlying device 1044 * physical block size if the direct IO alignment is unknown. 
1045 */ 1046 if (file_inode(zone->file)->i_sb->s_blocksize <= SZ_4K) 1047 zlo->block_size = file_inode(zone->file)->i_sb->s_blocksize; 1048 else if (!vfs_getattr(&zone->file->f_path, &st, STATX_DIOALIGN, 0) && 1049 (st.result_mask & STATX_DIOALIGN)) 1050 zlo->block_size = st.dio_offset_align; 1051 else if (sb_bdev) 1052 zlo->block_size = bdev_physical_block_size(sb_bdev); 1053 else 1054 zlo->block_size = SECTOR_SIZE; 1055 1056 if (zlo->zone_capacity & ((zlo->block_size >> SECTOR_SHIFT) - 1)) { 1057 pr_err("Zone capacity is not aligned to block size %u\n", 1058 zlo->block_size); 1059 return -EINVAL; 1060 } 1061 1062 return 0; 1063 } 1064 1065 static int zloop_init_zone(struct zloop_device *zlo, struct zloop_options *opts, 1066 unsigned int zone_no, bool restore) 1067 { 1068 struct zloop_zone *zone = &zlo->zones[zone_no]; 1069 int oflags = O_RDWR; 1070 struct kstat stat; 1071 sector_t file_sectors; 1072 int ret; 1073 1074 mutex_init(&zone->lock); 1075 INIT_LIST_HEAD(&zone->open_zone_entry); 1076 spin_lock_init(&zone->wp_lock); 1077 zone->start = (sector_t)zone_no << zlo->zone_shift; 1078 1079 if (!restore) 1080 oflags |= O_CREAT; 1081 1082 if (!opts->buffered_io) 1083 oflags |= O_DIRECT; 1084 1085 if (zone_no < zlo->nr_conv_zones) { 1086 /* Conventional zone file. 
*/ 1087 set_bit(ZLOOP_ZONE_CONV, &zone->flags); 1088 zone->cond = BLK_ZONE_COND_NOT_WP; 1089 zone->wp = U64_MAX; 1090 1091 zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/cnv-%06u", 1092 zlo->base_dir, zlo->id, zone_no); 1093 if (IS_ERR(zone->file)) { 1094 pr_err("Failed to open zone %u file %s/%u/cnv-%06u (err=%ld)", 1095 zone_no, zlo->base_dir, zlo->id, zone_no, 1096 PTR_ERR(zone->file)); 1097 return PTR_ERR(zone->file); 1098 } 1099 1100 if (!zlo->block_size) { 1101 ret = zloop_get_block_size(zlo, zone); 1102 if (ret) 1103 return ret; 1104 } 1105 1106 ret = vfs_getattr(&zone->file->f_path, &stat, STATX_SIZE, 0); 1107 if (ret < 0) { 1108 pr_err("Failed to get zone %u file stat\n", zone_no); 1109 return ret; 1110 } 1111 file_sectors = stat.size >> SECTOR_SHIFT; 1112 1113 if (restore && file_sectors != zlo->zone_size) { 1114 pr_err("Invalid conventional zone %u file size (%llu sectors != %llu)\n", 1115 zone_no, file_sectors, zlo->zone_capacity); 1116 return ret; 1117 } 1118 1119 ret = vfs_truncate(&zone->file->f_path, 1120 zlo->zone_size << SECTOR_SHIFT); 1121 if (ret < 0) { 1122 pr_err("Failed to truncate zone %u file (err=%d)\n", 1123 zone_no, ret); 1124 return ret; 1125 } 1126 1127 return 0; 1128 } 1129 1130 /* Sequential zone file. 
*/ 1131 zone->file = zloop_filp_open_fmt(oflags, 0600, "%s/%u/seq-%06u", 1132 zlo->base_dir, zlo->id, zone_no); 1133 if (IS_ERR(zone->file)) { 1134 pr_err("Failed to open zone %u file %s/%u/seq-%06u (err=%ld)", 1135 zone_no, zlo->base_dir, zlo->id, zone_no, 1136 PTR_ERR(zone->file)); 1137 return PTR_ERR(zone->file); 1138 } 1139 1140 if (!zlo->block_size) { 1141 ret = zloop_get_block_size(zlo, zone); 1142 if (ret) 1143 return ret; 1144 } 1145 1146 zloop_get_block_size(zlo, zone); 1147 1148 mutex_lock(&zone->lock); 1149 ret = zloop_update_seq_zone(zlo, zone_no); 1150 mutex_unlock(&zone->lock); 1151 1152 return ret; 1153 } 1154 1155 static bool zloop_dev_exists(struct zloop_device *zlo) 1156 { 1157 struct file *cnv, *seq; 1158 bool exists; 1159 1160 cnv = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/cnv-%06u", 1161 zlo->base_dir, zlo->id, 0); 1162 seq = zloop_filp_open_fmt(O_RDONLY, 0600, "%s/%u/seq-%06u", 1163 zlo->base_dir, zlo->id, 0); 1164 exists = !IS_ERR(cnv) || !IS_ERR(seq); 1165 1166 if (!IS_ERR(cnv)) 1167 fput(cnv); 1168 if (!IS_ERR(seq)) 1169 fput(seq); 1170 1171 return exists; 1172 } 1173 1174 static int zloop_ctl_add(struct zloop_options *opts) 1175 { 1176 struct queue_limits lim = { 1177 .max_hw_sectors = SZ_1M >> SECTOR_SHIFT, 1178 .chunk_sectors = opts->zone_size, 1179 .features = BLK_FEAT_ZONED | BLK_FEAT_WRITE_CACHE, 1180 1181 }; 1182 unsigned int nr_zones, i, j; 1183 struct zloop_device *zlo; 1184 int ret = -EINVAL; 1185 bool restore; 1186 1187 __module_get(THIS_MODULE); 1188 1189 nr_zones = opts->capacity >> ilog2(opts->zone_size); 1190 if (opts->nr_conv_zones >= nr_zones) { 1191 pr_err("Invalid number of conventional zones %u\n", 1192 opts->nr_conv_zones); 1193 goto out; 1194 } 1195 1196 if (opts->max_open_zones > nr_zones - opts->nr_conv_zones) { 1197 pr_err("Invalid maximum number of open zones %u\n", 1198 opts->max_open_zones); 1199 goto out; 1200 } 1201 1202 zlo = kvzalloc_flex(*zlo, zones, nr_zones); 1203 if (!zlo) { 1204 ret = -ENOMEM; 1205 
goto out; 1206 } 1207 WRITE_ONCE(zlo->state, Zlo_creating); 1208 spin_lock_init(&zlo->open_zones_lock); 1209 INIT_LIST_HEAD(&zlo->open_zones_lru_list); 1210 1211 ret = mutex_lock_killable(&zloop_ctl_mutex); 1212 if (ret) 1213 goto out_free_dev; 1214 1215 /* Allocate id, if @opts->id >= 0, we're requesting that specific id */ 1216 if (opts->id >= 0) { 1217 ret = idr_alloc(&zloop_index_idr, zlo, 1218 opts->id, opts->id + 1, GFP_KERNEL); 1219 if (ret == -ENOSPC) 1220 ret = -EEXIST; 1221 } else { 1222 ret = idr_alloc(&zloop_index_idr, zlo, 0, 0, GFP_KERNEL); 1223 } 1224 mutex_unlock(&zloop_ctl_mutex); 1225 if (ret < 0) 1226 goto out_free_dev; 1227 1228 zlo->id = ret; 1229 zlo->zone_shift = ilog2(opts->zone_size); 1230 zlo->zone_size = opts->zone_size; 1231 if (opts->zone_capacity) 1232 zlo->zone_capacity = opts->zone_capacity; 1233 else 1234 zlo->zone_capacity = zlo->zone_size; 1235 zlo->nr_zones = nr_zones; 1236 zlo->nr_conv_zones = opts->nr_conv_zones; 1237 zlo->max_open_zones = opts->max_open_zones; 1238 zlo->buffered_io = opts->buffered_io; 1239 zlo->zone_append = opts->zone_append; 1240 if (zlo->zone_append) 1241 zlo->ordered_zone_append = opts->ordered_zone_append; 1242 zlo->discard_write_cache = opts->discard_write_cache; 1243 1244 zlo->workqueue = alloc_workqueue("zloop%d", WQ_UNBOUND | WQ_FREEZABLE, 1245 opts->nr_queues * opts->queue_depth, zlo->id); 1246 if (!zlo->workqueue) { 1247 ret = -ENOMEM; 1248 goto out_free_idr; 1249 } 1250 1251 if (opts->base_dir) 1252 zlo->base_dir = kstrdup(opts->base_dir, GFP_KERNEL); 1253 else 1254 zlo->base_dir = kstrdup(ZLOOP_DEF_BASE_DIR, GFP_KERNEL); 1255 if (!zlo->base_dir) { 1256 ret = -ENOMEM; 1257 goto out_destroy_workqueue; 1258 } 1259 1260 zlo->data_dir = zloop_filp_open_fmt(O_RDONLY | O_DIRECTORY, 0, "%s/%u", 1261 zlo->base_dir, zlo->id); 1262 if (IS_ERR(zlo->data_dir)) { 1263 ret = PTR_ERR(zlo->data_dir); 1264 pr_warn("Failed to open directory %s/%u (err=%d)\n", 1265 zlo->base_dir, zlo->id, ret); 1266 goto 
out_free_base_dir; 1267 } 1268 1269 /* 1270 * If we already have zone files, we are restoring a device created by a 1271 * previous add operation. In this case, zloop_init_zone() will check 1272 * that the zone files are consistent with the zone configuration given. 1273 */ 1274 restore = zloop_dev_exists(zlo); 1275 for (i = 0; i < nr_zones; i++) { 1276 ret = zloop_init_zone(zlo, opts, i, restore); 1277 if (ret) 1278 goto out_close_files; 1279 } 1280 1281 lim.physical_block_size = zlo->block_size; 1282 lim.logical_block_size = zlo->block_size; 1283 if (zlo->zone_append) 1284 lim.max_hw_zone_append_sectors = lim.max_hw_sectors; 1285 lim.max_open_zones = zlo->max_open_zones; 1286 1287 zlo->tag_set.ops = &zloop_mq_ops; 1288 zlo->tag_set.nr_hw_queues = opts->nr_queues; 1289 zlo->tag_set.queue_depth = opts->queue_depth; 1290 zlo->tag_set.numa_node = NUMA_NO_NODE; 1291 zlo->tag_set.cmd_size = sizeof(struct zloop_cmd); 1292 zlo->tag_set.driver_data = zlo; 1293 1294 ret = blk_mq_alloc_tag_set(&zlo->tag_set); 1295 if (ret) { 1296 pr_err("blk_mq_alloc_tag_set failed (err=%d)\n", ret); 1297 goto out_close_files; 1298 } 1299 1300 zlo->disk = blk_mq_alloc_disk(&zlo->tag_set, &lim, zlo); 1301 if (IS_ERR(zlo->disk)) { 1302 pr_err("blk_mq_alloc_disk failed (err=%d)\n", ret); 1303 ret = PTR_ERR(zlo->disk); 1304 goto out_cleanup_tags; 1305 } 1306 zlo->disk->flags = GENHD_FL_NO_PART; 1307 zlo->disk->fops = &zloop_fops; 1308 zlo->disk->private_data = zlo; 1309 sprintf(zlo->disk->disk_name, "zloop%d", zlo->id); 1310 set_capacity(zlo->disk, (u64)lim.chunk_sectors * zlo->nr_zones); 1311 1312 ret = blk_revalidate_disk_zones(zlo->disk); 1313 if (ret) 1314 goto out_cleanup_disk; 1315 1316 ret = add_disk(zlo->disk); 1317 if (ret) { 1318 pr_err("add_disk failed (err=%d)\n", ret); 1319 goto out_cleanup_disk; 1320 } 1321 1322 mutex_lock(&zloop_ctl_mutex); 1323 WRITE_ONCE(zlo->state, Zlo_live); 1324 mutex_unlock(&zloop_ctl_mutex); 1325 1326 pr_info("zloop: device %d, %u zones of %llu MiB, %u B 
block size\n", 1327 zlo->id, zlo->nr_zones, 1328 ((sector_t)zlo->zone_size << SECTOR_SHIFT) >> 20, 1329 zlo->block_size); 1330 pr_info("zloop%d: using %s%s zone append\n", 1331 zlo->id, 1332 zlo->ordered_zone_append ? "ordered " : "", 1333 zlo->zone_append ? "native" : "emulated"); 1334 1335 return 0; 1336 1337 out_cleanup_disk: 1338 put_disk(zlo->disk); 1339 out_cleanup_tags: 1340 blk_mq_free_tag_set(&zlo->tag_set); 1341 out_close_files: 1342 for (j = 0; j < i; j++) { 1343 struct zloop_zone *zone = &zlo->zones[j]; 1344 1345 if (!IS_ERR_OR_NULL(zone->file)) 1346 fput(zone->file); 1347 } 1348 fput(zlo->data_dir); 1349 out_free_base_dir: 1350 kfree(zlo->base_dir); 1351 out_destroy_workqueue: 1352 destroy_workqueue(zlo->workqueue); 1353 out_free_idr: 1354 mutex_lock(&zloop_ctl_mutex); 1355 idr_remove(&zloop_index_idr, zlo->id); 1356 mutex_unlock(&zloop_ctl_mutex); 1357 out_free_dev: 1358 kvfree(zlo); 1359 out: 1360 module_put(THIS_MODULE); 1361 if (ret == -ENOENT) 1362 ret = -EINVAL; 1363 return ret; 1364 } 1365 1366 static void zloop_truncate(struct file *file, loff_t pos) 1367 { 1368 struct mnt_idmap *idmap = file_mnt_idmap(file); 1369 struct dentry *dentry = file_dentry(file); 1370 struct iattr newattrs; 1371 1372 newattrs.ia_size = pos; 1373 newattrs.ia_valid = ATTR_SIZE; 1374 1375 inode_lock(dentry->d_inode); 1376 notify_change(idmap, dentry, &newattrs, NULL); 1377 inode_unlock(dentry->d_inode); 1378 } 1379 1380 static void zloop_forget_cache(struct zloop_device *zlo) 1381 { 1382 unsigned int i; 1383 int ret; 1384 1385 pr_info("%pg: discarding volatile write cache\n", zlo->disk->part0); 1386 1387 for (i = 0; i < zlo->nr_zones; i++) { 1388 struct zloop_zone *zone = &zlo->zones[i]; 1389 struct file *file = zone->file; 1390 sector_t old_wp; 1391 1392 if (!zloop_zone_is_active(zone)) 1393 continue; 1394 1395 ret = vfs_getxattr(file_mnt_idmap(file), file_dentry(file), 1396 "user.zloop.wp", &old_wp, sizeof(old_wp)); 1397 if (ret == -ENODATA) { 1398 old_wp = 0; 1399 } 
else if (ret != sizeof(old_wp)) { 1400 pr_err("%pg: failed to retrieve write pointer (%d)\n", 1401 zlo->disk->part0, ret); 1402 continue; 1403 } 1404 if (old_wp < zone->wp) 1405 zloop_truncate(file, old_wp); 1406 } 1407 } 1408 1409 static int zloop_ctl_remove(struct zloop_options *opts) 1410 { 1411 struct zloop_device *zlo; 1412 int ret; 1413 1414 if (!(opts->mask & ZLOOP_OPT_ID)) { 1415 pr_err("No ID specified for remove\n"); 1416 return -EINVAL; 1417 } 1418 1419 if (opts->mask & ~ZLOOP_OPT_ID) { 1420 pr_err("Invalid option specified for remove\n"); 1421 return -EINVAL; 1422 } 1423 1424 ret = mutex_lock_killable(&zloop_ctl_mutex); 1425 if (ret) 1426 return ret; 1427 1428 zlo = idr_find(&zloop_index_idr, opts->id); 1429 if (!zlo || zlo->state == Zlo_creating) { 1430 ret = -ENODEV; 1431 } else if (zlo->state == Zlo_deleting) { 1432 ret = -EINVAL; 1433 } else { 1434 idr_remove(&zloop_index_idr, zlo->id); 1435 WRITE_ONCE(zlo->state, Zlo_deleting); 1436 } 1437 1438 mutex_unlock(&zloop_ctl_mutex); 1439 if (ret) 1440 return ret; 1441 1442 del_gendisk(zlo->disk); 1443 1444 if (zlo->discard_write_cache) 1445 zloop_forget_cache(zlo); 1446 1447 put_disk(zlo->disk); 1448 1449 pr_info("Removed device %d\n", opts->id); 1450 1451 module_put(THIS_MODULE); 1452 1453 return 0; 1454 } 1455 1456 static int zloop_parse_options(struct zloop_options *opts, const char *buf) 1457 { 1458 substring_t args[MAX_OPT_ARGS]; 1459 char *options, *o, *p; 1460 unsigned int token; 1461 int ret = 0; 1462 1463 /* Set defaults. 
*/ 1464 opts->mask = 0; 1465 opts->id = ZLOOP_DEF_ID; 1466 opts->capacity = ZLOOP_DEF_ZONE_SIZE * ZLOOP_DEF_NR_ZONES; 1467 opts->zone_size = ZLOOP_DEF_ZONE_SIZE; 1468 opts->nr_conv_zones = ZLOOP_DEF_NR_CONV_ZONES; 1469 opts->max_open_zones = ZLOOP_DEF_MAX_OPEN_ZONES; 1470 opts->nr_queues = ZLOOP_DEF_NR_QUEUES; 1471 opts->queue_depth = ZLOOP_DEF_QUEUE_DEPTH; 1472 opts->buffered_io = ZLOOP_DEF_BUFFERED_IO; 1473 opts->zone_append = ZLOOP_DEF_ZONE_APPEND; 1474 opts->ordered_zone_append = ZLOOP_DEF_ORDERED_ZONE_APPEND; 1475 1476 if (!buf) 1477 return 0; 1478 1479 /* Skip leading spaces before the options. */ 1480 while (isspace(*buf)) 1481 buf++; 1482 1483 options = o = kstrdup(buf, GFP_KERNEL); 1484 if (!options) 1485 return -ENOMEM; 1486 1487 /* Parse the options, doing only some light invalid value checks. */ 1488 while ((p = strsep(&o, ",\n")) != NULL) { 1489 if (!*p) 1490 continue; 1491 1492 token = match_token(p, zloop_opt_tokens, args); 1493 opts->mask |= token; 1494 switch (token) { 1495 case ZLOOP_OPT_ID: 1496 if (match_int(args, &opts->id)) { 1497 ret = -EINVAL; 1498 goto out; 1499 } 1500 break; 1501 case ZLOOP_OPT_CAPACITY: 1502 if (match_uint(args, &token)) { 1503 ret = -EINVAL; 1504 goto out; 1505 } 1506 if (!token) { 1507 pr_err("Invalid capacity\n"); 1508 ret = -EINVAL; 1509 goto out; 1510 } 1511 opts->capacity = 1512 ((sector_t)token * SZ_1M) >> SECTOR_SHIFT; 1513 break; 1514 case ZLOOP_OPT_ZONE_SIZE: 1515 if (match_uint(args, &token)) { 1516 ret = -EINVAL; 1517 goto out; 1518 } 1519 if (!token || token > ZLOOP_MAX_ZONE_SIZE_MB || 1520 !is_power_of_2(token)) { 1521 pr_err("Invalid zone size %u\n", token); 1522 ret = -EINVAL; 1523 goto out; 1524 } 1525 opts->zone_size = 1526 ((sector_t)token * SZ_1M) >> SECTOR_SHIFT; 1527 break; 1528 case ZLOOP_OPT_ZONE_CAPACITY: 1529 if (match_uint(args, &token)) { 1530 ret = -EINVAL; 1531 goto out; 1532 } 1533 if (!token) { 1534 pr_err("Invalid zone capacity\n"); 1535 ret = -EINVAL; 1536 goto out; 1537 } 1538 
opts->zone_capacity = 1539 ((sector_t)token * SZ_1M) >> SECTOR_SHIFT; 1540 break; 1541 case ZLOOP_OPT_NR_CONV_ZONES: 1542 if (match_uint(args, &token)) { 1543 ret = -EINVAL; 1544 goto out; 1545 } 1546 opts->nr_conv_zones = token; 1547 break; 1548 case ZLOOP_OPT_MAX_OPEN_ZONES: 1549 if (match_uint(args, &token)) { 1550 ret = -EINVAL; 1551 goto out; 1552 } 1553 opts->max_open_zones = token; 1554 break; 1555 case ZLOOP_OPT_BASE_DIR: 1556 p = match_strdup(args); 1557 if (!p) { 1558 ret = -ENOMEM; 1559 goto out; 1560 } 1561 kfree(opts->base_dir); 1562 opts->base_dir = p; 1563 break; 1564 case ZLOOP_OPT_NR_QUEUES: 1565 if (match_uint(args, &token)) { 1566 ret = -EINVAL; 1567 goto out; 1568 } 1569 if (!token) { 1570 pr_err("Invalid number of queues\n"); 1571 ret = -EINVAL; 1572 goto out; 1573 } 1574 opts->nr_queues = min(token, num_online_cpus()); 1575 break; 1576 case ZLOOP_OPT_QUEUE_DEPTH: 1577 if (match_uint(args, &token)) { 1578 ret = -EINVAL; 1579 goto out; 1580 } 1581 if (!token) { 1582 pr_err("Invalid queue depth\n"); 1583 ret = -EINVAL; 1584 goto out; 1585 } 1586 opts->queue_depth = token; 1587 break; 1588 case ZLOOP_OPT_BUFFERED_IO: 1589 opts->buffered_io = true; 1590 break; 1591 case ZLOOP_OPT_ZONE_APPEND: 1592 if (match_uint(args, &token)) { 1593 ret = -EINVAL; 1594 goto out; 1595 } 1596 if (token != 0 && token != 1) { 1597 pr_err("Invalid zone_append value\n"); 1598 ret = -EINVAL; 1599 goto out; 1600 } 1601 opts->zone_append = token; 1602 break; 1603 case ZLOOP_OPT_ORDERED_ZONE_APPEND: 1604 opts->ordered_zone_append = true; 1605 break; 1606 case ZLOOP_OPT_DISCARD_WRITE_CACHE: 1607 opts->discard_write_cache = true; 1608 break; 1609 case ZLOOP_OPT_ERR: 1610 default: 1611 pr_warn("unknown parameter or missing value '%s'\n", p); 1612 ret = -EINVAL; 1613 goto out; 1614 } 1615 } 1616 1617 ret = -EINVAL; 1618 if (opts->capacity <= opts->zone_size) { 1619 pr_err("Invalid capacity\n"); 1620 goto out; 1621 } 1622 1623 if (opts->zone_capacity > opts->zone_size) { 1624 
pr_err("Invalid zone capacity\n"); 1625 goto out; 1626 } 1627 1628 ret = 0; 1629 out: 1630 kfree(options); 1631 return ret; 1632 } 1633 1634 enum { 1635 ZLOOP_CTL_ADD, 1636 ZLOOP_CTL_REMOVE, 1637 }; 1638 1639 static struct zloop_ctl_op { 1640 int code; 1641 const char *name; 1642 } zloop_ctl_ops[] = { 1643 { ZLOOP_CTL_ADD, "add" }, 1644 { ZLOOP_CTL_REMOVE, "remove" }, 1645 { -1, NULL }, 1646 }; 1647 1648 static ssize_t zloop_ctl_write(struct file *file, const char __user *ubuf, 1649 size_t count, loff_t *pos) 1650 { 1651 struct zloop_options opts = { }; 1652 struct zloop_ctl_op *op; 1653 const char *buf, *opts_buf; 1654 int i, ret; 1655 1656 if (count > PAGE_SIZE) 1657 return -ENOMEM; 1658 1659 buf = memdup_user_nul(ubuf, count); 1660 if (IS_ERR(buf)) 1661 return PTR_ERR(buf); 1662 1663 for (i = 0; i < ARRAY_SIZE(zloop_ctl_ops); i++) { 1664 op = &zloop_ctl_ops[i]; 1665 if (!op->name) { 1666 pr_err("Invalid operation\n"); 1667 ret = -EINVAL; 1668 goto out; 1669 } 1670 if (!strncmp(buf, op->name, strlen(op->name))) 1671 break; 1672 } 1673 1674 if (count <= strlen(op->name)) 1675 opts_buf = NULL; 1676 else 1677 opts_buf = buf + strlen(op->name); 1678 1679 ret = zloop_parse_options(&opts, opts_buf); 1680 if (ret) { 1681 pr_err("Failed to parse options\n"); 1682 goto out; 1683 } 1684 1685 switch (op->code) { 1686 case ZLOOP_CTL_ADD: 1687 ret = zloop_ctl_add(&opts); 1688 break; 1689 case ZLOOP_CTL_REMOVE: 1690 ret = zloop_ctl_remove(&opts); 1691 break; 1692 default: 1693 pr_err("Invalid operation\n"); 1694 ret = -EINVAL; 1695 goto out; 1696 } 1697 1698 out: 1699 kfree(opts.base_dir); 1700 kfree(buf); 1701 return ret ? 
ret : count; 1702 } 1703 1704 static int zloop_ctl_show(struct seq_file *seq_file, void *private) 1705 { 1706 const struct match_token *tok; 1707 int i; 1708 1709 /* Add operation */ 1710 seq_printf(seq_file, "%s ", zloop_ctl_ops[0].name); 1711 for (i = 0; i < ARRAY_SIZE(zloop_opt_tokens); i++) { 1712 tok = &zloop_opt_tokens[i]; 1713 if (!tok->pattern) 1714 break; 1715 if (i) 1716 seq_putc(seq_file, ','); 1717 seq_puts(seq_file, tok->pattern); 1718 } 1719 seq_putc(seq_file, '\n'); 1720 1721 /* Remove operation */ 1722 seq_puts(seq_file, zloop_ctl_ops[1].name); 1723 seq_puts(seq_file, " id=%d\n"); 1724 1725 return 0; 1726 } 1727 1728 static int zloop_ctl_open(struct inode *inode, struct file *file) 1729 { 1730 file->private_data = NULL; 1731 return single_open(file, zloop_ctl_show, NULL); 1732 } 1733 1734 static int zloop_ctl_release(struct inode *inode, struct file *file) 1735 { 1736 return single_release(inode, file); 1737 } 1738 1739 static const struct file_operations zloop_ctl_fops = { 1740 .owner = THIS_MODULE, 1741 .open = zloop_ctl_open, 1742 .release = zloop_ctl_release, 1743 .write = zloop_ctl_write, 1744 .read = seq_read, 1745 }; 1746 1747 static struct miscdevice zloop_misc = { 1748 .minor = MISC_DYNAMIC_MINOR, 1749 .name = "zloop-control", 1750 .fops = &zloop_ctl_fops, 1751 }; 1752 1753 static int __init zloop_init(void) 1754 { 1755 int ret; 1756 1757 ret = misc_register(&zloop_misc); 1758 if (ret) { 1759 pr_err("Failed to register misc device: %d\n", ret); 1760 return ret; 1761 } 1762 pr_info("Module loaded\n"); 1763 1764 return 0; 1765 } 1766 1767 static void __exit zloop_exit(void) 1768 { 1769 misc_deregister(&zloop_misc); 1770 idr_destroy(&zloop_index_idr); 1771 } 1772 1773 module_init(zloop_init); 1774 module_exit(zloop_exit); 1775 1776 MODULE_DESCRIPTION("Zoned loopback device"); 1777 MODULE_LICENSE("GPL"); 1778