// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2024, 2025, Rob Norris <robn@despairlabs.com>
 * Copyright (c) 2024, 2025, Klara, Inc.
 */

#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
#include <cityhash.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/workqueue.h>
#include <linux/blk-mq.h>

static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
    struct request *rq, boolean_t force_sync);

static unsigned int zvol_major = ZVOL_MAJOR;
static unsigned long zvol_max_discard_blocks = 16384;

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static unsigned int zvol_open_timeout_ms = 1000;
#endif

static unsigned int zvol_blk_mq_threads = 0;
static unsigned int zvol_blk_mq_actual_threads;
static boolean_t zvol_use_blk_mq = B_FALSE;

/*
 * The maximum number of volblocksize blocks to process per thread. Typically,
 * write heavy workloads perform better with higher values here, and read
 * heavy workloads perform better with lower values, but that's not a hard
 * and fast rule. It's basically a knob to tune between "less overhead with
 * less parallelism" and "more overhead, but more parallelism".
 *
 * '8' was chosen as a reasonable, balanced, default based on sequential
 * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
 */
static unsigned int zvol_blk_mq_blocks_per_thread = 8;

#ifndef BLKDEV_DEFAULT_RQ
/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
#define	BLKDEV_DEFAULT_RQ	BLKDEV_MAX_RQ
#endif

/*
 * Finalize our BIO or request.
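 * Maps the (positive) errno into a blk_status_t and completes either the
 * BIO or the blk-mq request, whichever was supplied.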
 */
static inline void
zvol_end_io(struct bio *bio, struct request *rq, int error)
{
	ASSERT3U(error, >=, 0);
	if (bio) {
		bio->bi_status = errno_to_bi_status(error);
		bio_endio(bio);
	} else {
		blk_mq_end_request(rq, errno_to_bi_status(error));
	}
}

static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
static unsigned int zvol_actual_blk_mq_queue_depth;

struct zvol_state_os {
	struct gendisk		*zvo_disk;	/* generic disk */
	struct request_queue	*zvo_queue;	/* request queue */
	dev_t			zvo_dev;	/* device id */

	struct blk_mq_tag_set tag_set;

	/* Set from the global 'zvol_use_blk_mq' at zvol load */
	boolean_t use_blk_mq;
};

static struct ida zvol_ida;

/*
 * This is called when a new block multiqueue request comes in. A request
 * contains one or more BIOs.
 */
static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
    const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	zvol_state_t *zv = rq->q->queuedata;

	/* Tell the kernel that we are starting to process this request */
	blk_mq_start_request(rq);

	if (blk_rq_is_passthrough(rq)) {
		/* Skip non-filesystem requests */
		blk_mq_end_request(rq, BLK_STS_IOERR);
		return (BLK_STS_IOERR);
	}

	zvol_request_impl(zv, NULL, rq, 0);

	/* Acknowledge to the kernel that we got this request */
	return (BLK_STS_OK);
}

static struct blk_mq_ops zvol_blk_mq_queue_ops = {
	.queue_rq = zvol_mq_queue_rq,
};

/* Initialize our blk-mq struct */
static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
{
	struct zvol_state_os *zso = zv->zv_zso;

	memset(&zso->tag_set, 0, sizeof (zso->tag_set));

	/* Initialize tag set. */
	zso->tag_set.ops = &zvol_blk_mq_queue_ops;
	zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads;
	zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth;
	zso->tag_set.numa_node = NUMA_NO_NODE;
	zso->tag_set.cmd_size = 0;

	/*
	 * We need BLK_MQ_F_BLOCKING here since we do blocking calls in
	 * zvol_request_impl()
	 */
	zso->tag_set.flags = BLK_MQ_F_BLOCKING;

#ifdef BLK_MQ_F_SHOULD_MERGE
	/*
	 * Linux 6.14 removed BLK_MQ_F_SHOULD_MERGE and made it implicit.
	 * For older kernels, we set it.
	 */
	zso->tag_set.flags |= BLK_MQ_F_SHOULD_MERGE;
#endif

	zso->tag_set.driver_data = zv;

	return (blk_mq_alloc_tag_set(&zso->tag_set));
}

/*
 * Given a path, return TRUE if path is a ZVOL.
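 * This is determined by checking whether the path's device major number
 * matches zvol_major.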
 */
boolean_t
zvol_os_is_zvol(const char *path)
{
	dev_t dev = 0;

	if (vdev_lookup_bdev(path, &dev) != 0)
		return (B_FALSE);

	if (MAJOR(dev) == zvol_major)
		return (B_TRUE);

	return (B_FALSE);
}

static void
zvol_write(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	int error = 0;
	zfs_uio_t uio;
	zvol_state_t *zv = zvr->zv;
	struct request_queue *q;
	struct gendisk *disk;
	unsigned long start_time = 0;
	boolean_t acct = B_FALSE;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	q = zv->zv_zso->zvo_queue;
	disk = zv->zv_zso->zvo_disk;

	/* BIOs marked as FLUSH need to flush before the write */
	if (io_is_flush(bio, rq)) {
		error = zil_commit(zv->zv_zilog, ZVOL_OBJ);
		if (error != 0) {
			rw_exit(&zv->zv_suspend_lock);
			zvol_end_io(bio, rq, -error);
			return;
		}
	}

	/* Some requests are just for flush and nothing else. */
	if (io_size(bio, rq) == 0) {
		rw_exit(&zv->zv_suspend_lock);
		zvol_end_io(bio, rq, 0);
		return;
	}

	zfs_uio_bvec_init(&uio, bio, rq);

	ssize_t start_resid = uio.uio_resid;

	/*
	 * With use_blk_mq, accounting is done by blk_mq_start_request()
	 * and blk_mq_end_request(), so we can skip it here.
	 */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct) {
			start_time = blk_generic_start_io_acct(q, disk, WRITE,
			    bio);
		}
	}

	boolean_t sync =
	    io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_WRITER);

	uint64_t volsize = zv->zv_volsize;
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio.uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);

		/* This will only fail for ENOSPC */
		error = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
		    DMU_READ_PREFETCH);
		if (error == 0) {
			zvol_log_write(zv, tx, off, bytes, sync);
		}
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);

	int64_t nwritten = start_resid - uio.uio_resid;
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	task_io_account_write(nwritten);

	if (error == 0 && sync)
		error = zil_commit(zv->zv_zilog, ZVOL_OBJ);

	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
	}

	zvol_end_io(bio, rq, error);
}

static void
zvol_write_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_write(&task->zvr);
	zv_request_task_free(task);
}

static void
zvol_discard(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	zvol_state_t *zv = zvr->zv;
	uint64_t start = io_offset(bio, rq);
	uint64_t size = io_size(bio, rq);
	uint64_t end = start + size;
	boolean_t sync;
	int error = 0;
	dmu_tx_t *tx;
	struct request_queue *q = zv->zv_zso->zvo_queue;
	struct gendisk *disk = zv->zv_zso->zvo_disk;
	unsigned long start_time = 0;
	boolean_t acct = B_FALSE;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct) {
			start_time = blk_generic_start_io_acct(q, disk, WRITE,
			    bio);
		}
	}

	sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	if (end > zv->zv_volsize) {
		error = SET_ERROR(EIO);
		goto unlock;
	}

	/*
	 * Align the request to volume block boundaries. This will prevent
	 * dnode_free_range() from zeroing out the unaligned parts which is
	 * slow (read-modify-write) and useless since we are not freeing any
	 * space by doing so.
	 */
	start = P2ROUNDUP(start, zv->zv_volblocksize);
	end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
	size = end - start;

	if (start >= end)
		goto unlock;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    start, size, RL_WRITER);

	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		zvol_log_truncate(zv, tx, start, size);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset,
		    ZVOL_OBJ, start, size);
	}
	zfs_rangelock_exit(lr);

	if (error == 0 && sync)
		error = zil_commit(zv->zv_zilog, ZVOL_OBJ);

unlock:
	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, WRITE, bio,
		    start_time);
	}

	zvol_end_io(bio, rq, error);
}

static void
zvol_discard_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_discard(&task->zvr);
	zv_request_task_free(task);
}

static void
zvol_read(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	int error = 0;
	zfs_uio_t uio;
	boolean_t acct = B_FALSE;
	zvol_state_t *zv = zvr->zv;
	struct request_queue *q;
	struct gendisk *disk;
	unsigned long start_time = 0;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);

	zfs_uio_bvec_init(&uio, bio, rq);

	q = zv->zv_zso->zvo_queue;
	disk = zv->zv_zso->zvo_disk;

	ssize_t start_resid = uio.uio_resid;

	/*
	 * When blk-mq is being used, accounting is done by
	 * blk_mq_start_request() and blk_mq_end_request().
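	 * Here we only need to account for the BIO path.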
	 */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct)
			start_time = blk_generic_start_io_acct(q, disk, READ,
			    bio);
	}

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_READER);

	uint64_t volsize = zv->zv_volsize;

	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio.uio_loffset)
			bytes = volsize - uio.uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
		    DMU_READ_PREFETCH);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	int64_t nread = start_resid - uio.uio_resid;
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	task_io_account_read(nread);

	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, READ, bio, start_time);
	}

	zvol_end_io(bio, rq, error);
}

static void
zvol_read_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_read(&task->zvr);
	zv_request_task_free(task);
}

/*
 * Note:
 *
 * The kernel uses different enum names for the IO opcode, depending on the
 * kernel version ('req_opf', 'req_op'). To sidestep this, use macros rather
 * than inline functions for these checks.
 */
/* Should this IO go down the zvol write path? */
#define	ZVOL_OP_IS_WRITE(op) \
	(op == REQ_OP_WRITE || \
	op == REQ_OP_FLUSH || \
	op == REQ_OP_DISCARD)

/* Is this IO type supported by zvols? */
#define	ZVOL_OP_IS_SUPPORTED(op) (op == REQ_OP_READ || ZVOL_OP_IS_WRITE(op))

/* Get the IO opcode */
#define	ZVOL_OP(bio, rq) (bio != NULL ? bio_op(bio) : req_op(rq))

/*
 * Process a BIO or request
 *
 * Either 'bio' or 'rq' should be set depending on if we are processing a
 * bio or a request (both should not be set).
 *
 * force_sync:	Set to 0 to defer processing to a background taskq
 *		Set to 1 to process data synchronously
 */
static void
zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
    boolean_t force_sync)
{
	fstrans_cookie_t cookie = spl_fstrans_mark();
	uint64_t offset = io_offset(bio, rq);
	uint64_t size = io_size(bio, rq);
	int rw;

	if (unlikely(!ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)))) {
		zfs_dbgmsg("Unsupported zvol %s, op=%d, flags=0x%x",
		    rq != NULL ? "request" : "BIO",
		    ZVOL_OP(bio, rq),
		    rq != NULL ? rq->cmd_flags : bio->bi_opf);
		ASSERT(ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)));
		zvol_end_io(bio, rq, SET_ERROR(ENOTSUPP));
		goto out;
	}

	if (ZVOL_OP_IS_WRITE(ZVOL_OP(bio, rq))) {
		rw = WRITE;
	} else {
		rw = READ;
	}

	/*
	 * Sanity check
	 *
	 * If we're a BIO, check our rw matches the kernel's
	 * bio_data_dir(bio) rw. We need to check because we support fewer
	 * IO operations, and want to verify that what we think are reads and
	 * writes from those operations match what the kernel thinks.
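	 * (The check below is skipped for blk-mq requests.)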
	 */
	ASSERT(rq != NULL || rw == bio_data_dir(bio));

	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
		zvol_end_io(bio, rq, SET_ERROR(ENXIO));
		goto out;
	}

	if (zvol_request_sync || zv->zv_threading == B_FALSE)
		force_sync = 1;

	zv_request_t zvr = {
		.zv = zv,
		.bio = bio,
		.rq = rq,
	};

	if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
		printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
		    zv->zv_zso->zvo_disk->disk_name,
		    (long long unsigned)offset,
		    (long unsigned)size);

		zvol_end_io(bio, rq, SET_ERROR(EIO));
		goto out;
	}

	zv_request_task_t *task;
	zv_taskq_t *ztqs = &zvol_taskqs;
	uint_t blk_mq_hw_queue = 0;
	uint_t tq_idx;
	uint_t taskq_hash;
	if (rq)
#ifdef HAVE_BLK_MQ_RQ_HCTX
		blk_mq_hw_queue = rq->mq_hctx->queue_num;
#else
		blk_mq_hw_queue = rq->q->queue_hw_ctx[
		    rq->q->mq_map[raw_smp_processor_id()]]->queue_num;
#endif
	taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
	    blk_mq_hw_queue);
	tq_idx = taskq_hash % ztqs->tqs_cnt;

	if (rw == WRITE) {
		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
			zvol_end_io(bio, rq, SET_ERROR(EROFS));
			goto out;
		}

		/*
		 * Prevents the zvol from being suspended, or the ZIL being
		 * concurrently opened. Will be released after the i/o
		 * completes.
		 */
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/*
		 * Open a ZIL if this is the first time we have written to this
		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
		 * than zv_state_lock so that we don't need to acquire an
		 * additional lock in this path.
		 */
		if (zv->zv_zilog == NULL) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
			if (zv->zv_zilog == NULL) {
				zv->zv_zilog = zil_open(zv->zv_objset,
				    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
				zv->zv_flags |= ZVOL_WRITTEN_TO;
				/* replay / destroy done in zvol_create_minor */
				VERIFY0((zv->zv_zilog->zl_header->zh_flags &
				    ZIL_REPLAY_NEEDED));
			}
			rw_downgrade(&zv->zv_suspend_lock);
		}

		/*
		 * We don't want this thread to be blocked waiting for i/o to
		 * complete, so we instead wait from a taskq callback. The
		 * i/o may be a ZIL write (via zil_commit()), or a read of an
		 * indirect block, or a read of a data block (if this is a
		 * partial-block write). We will indicate that the i/o is
		 * complete by calling END_IO() from the taskq callback.
		 *
		 * This design allows the calling thread to continue and
		 * initiate more concurrent operations by calling
		 * zvol_request() again. There are typically only a small
		 * number of threads available to call zvol_request() (e.g.
		 * one per iSCSI target), so keeping the latency of
		 * zvol_request() low is important for performance.
		 *
		 * The zvol_request_sync module parameter allows this
		 * behavior to be altered, for performance evaluation
		 * purposes. If the callback blocks, setting
		 * zvol_request_sync=1 will result in much worse performance.
		 *
		 * We can have up to zvol_threads concurrent i/o's being
		 * processed for all zvols on the system. This is typically
		 * a vast improvement over the zvol_request_sync=1 behavior
		 * of one i/o at a time per zvol. However, an even better
		 * design would be for zvol_request() to initiate the zio
		 * directly, and then be notified by the zio_done callback,
		 * which would call END_IO(). Unfortunately, the DMU/ZIL
		 * interfaces lack this functionality (they block waiting for
		 * the i/o to complete).
		 */
		if (io_is_discard(bio, rq)) {
			if (force_sync) {
				zvol_discard(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
				    zvol_discard_task, task, 0, &task->ent);
			}
		} else {
			if (force_sync) {
				zvol_write(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
				    zvol_write_task, task, 0, &task->ent);
			}
		}
	} else {
		/*
		 * The SCST driver, and possibly others, may issue READ I/Os
		 * with a length of zero bytes. These empty I/Os contain no
		 * data and require no additional handling.
		 */
		if (size == 0) {
			zvol_end_io(bio, rq, 0);
			goto out;
		}

		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/* See comment in WRITE case above. */
		if (force_sync) {
			zvol_read(&zvr);
		} else {
			task = zv_request_task_create(zvr);
			taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
			    zvol_read_task, task, 0, &task->ent);
		}
	}

out:
	spl_fstrans_unmark(cookie);
}

#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
static void
zvol_submit_bio(struct bio *bio)
#else
static blk_qc_t
zvol_submit_bio(struct bio *bio)
#endif
#else
static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio)
#endif
{
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
#endif
	zvol_state_t *zv = q->queuedata;

	zvol_request_impl(zv, bio, NULL, 0);
#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
	defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
	!defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)
	return (BLK_QC_T_NONE);
#endif
}

static int
#ifdef HAVE_BLK_MODE_T
zvol_open(struct gendisk *disk, blk_mode_t flag)
#else
zvol_open(struct block_device *bdev, fmode_t flag)
#endif
{
	zvol_state_t *zv;
	int error = 0;
	boolean_t drop_suspend = B_FALSE;
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
	hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
	hrtime_t start = gethrtime();

retry:
#endif

#ifdef HAVE_BLK_MODE_T
	zv = atomic_load_ptr(&disk->private_data);
#else
	zv = atomic_load_ptr(&bdev->bd_disk->private_data);
#endif
	if (zv == NULL) {
		return (-SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
		mutex_exit(&zv->zv_state_lock);
		return (-SET_ERROR(ENXIO));
	}

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);

			/*
			 * Removal may happen while the locks are down, so
			 * we can't trust zv any longer; we have to start over.
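			 * We re-read private_data and re-check for removal
			 * below.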
			 */
#ifdef HAVE_BLK_MODE_T
			zv = atomic_load_ptr(&disk->private_data);
#else
			zv = atomic_load_ptr(&bdev->bd_disk->private_data);
#endif
			if (zv == NULL)
				return (-SET_ERROR(ENXIO));

			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);

			if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				return (-SET_ERROR(ENXIO));
			}

			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
			} else {
				drop_suspend = B_TRUE;
			}
		} else {
			drop_suspend = B_TRUE;
		}
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * In all other call paths the spa_namespace_lock is taken
		 * before the bdev->bd_mutex lock. However, on open(2)
		 * the __blkdev_get() function calls fops->open() with the
		 * bdev->bd_mutex lock held. This can result in a deadlock
		 * when zvols from one pool are used as vdevs in another.
		 *
		 * To prevent a lock inversion deadlock we preemptively
		 * take the spa_namespace_lock. Normally the lock will not
		 * be contended and this is safe because spa_open_common()
		 * handles the case where the caller already holds the
		 * spa_namespace_lock.
		 *
		 * When the lock cannot be acquired after multiple retries
		 * this must be the vdev on zvol deadlock case and we have
		 * no choice but to return an error. For 5.12 and older
		 * kernels returning -ERESTARTSYS will result in the
		 * bdev->bd_mutex being dropped, then reacquired, and
		 * fops->open() being called again. This process can be
		 * repeated safely until both locks are acquired. For 5.13
		 * and newer the -ERESTARTSYS retry logic was removed from
		 * the kernel so the only option is to return the error for
		 * the caller to handle it.
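		 * On such kernels we retry here for up to
		 * zvol_open_timeout_ms before giving up.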
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;

#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
				schedule();
				return (-SET_ERROR(ERESTARTSYS));
#else
				if ((gethrtime() - start) > timeout)
					return (-SET_ERROR(ERESTARTSYS));

				schedule_timeout_interruptible(
				    MSEC_TO_TICK(10));
				goto retry;
#endif
			} else {
				drop_namespace = B_TRUE;
			}
		}

		error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));

		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
	}

	if (error == 0) {
		if ((blk_mode_is_open_write(flag)) &&
		    (zv->zv_flags & ZVOL_RDONLY)) {
			if (zv->zv_open_count == 0)
				zvol_last_close(zv);

			error = -SET_ERROR(EROFS);
		} else {
			zv->zv_open_count++;
		}
	}

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);

	if (error == 0)
#ifdef HAVE_BLK_MODE_T
		disk_check_media_change(disk);
#else
		zfs_check_media_change(bdev);
#endif

	return (error);
}

static void
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
zvol_release(struct gendisk *disk)
#else
zvol_release(struct gendisk *disk, fmode_t unused)
#endif
{
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
	(void) unused;
#endif
	boolean_t drop_suspend = B_TRUE;

	zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
	if (zv == NULL)
		return;

	mutex_enter(&zv->zv_state_lock);
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);

			/*
			 * Unlike in zvol_open(), we don't check if removal
			 * started here, because we might be one of the openers
			 * that needs to be thrown out! If we're the last, we
			 * need to call zvol_last_close() below to finish
			 * cleanup. So, no special treatment for us.
			 */

			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	zv->zv_open_count--;
	if (zv->zv_open_count == 0) {
		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
}

static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned int cmd, unsigned long arg)
{
	int error = 0;

	zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data);
	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);

	switch (cmd) {
	case BLKFLSBUF:
#ifdef HAVE_FSYNC_BDEV
		fsync_bdev(bdev);
#elif defined(HAVE_SYNC_BLOCKDEV)
		sync_blockdev(bdev);
#else
#error "Neither fsync_bdev() nor sync_blockdev() found"
#endif
		invalidate_bdev(bdev);
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		if (!(zv->zv_flags & ZVOL_RDONLY))
			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);

		rw_exit(&zv->zv_suspend_lock);
		break;

	case BLKZNAME:
		mutex_enter(&zv->zv_state_lock);
		error = -copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
		mutex_exit(&zv->zv_state_lock);
		if (error)
			error = SET_ERROR(error);
		break;

	default:
		error = SET_ERROR(ENOTTY);
		break;
	}

	return (-error);
}

#ifdef CONFIG_COMPAT
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	return (zvol_ioctl(bdev, mode, cmd, arg));
}
#else
#define	zvol_compat_ioctl	NULL
#endif

static unsigned int
zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
	unsigned int mask = 0;

	zvol_state_t *zv = atomic_load_ptr(&disk->private_data);

	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
		zv->zv_changed = 0;
		mutex_exit(&zv->zv_state_lock);
	}

	return (mask);
}

static int
zvol_revalidate_disk(struct gendisk *disk)
{
	zvol_state_t *zv = atomic_load_ptr(&disk->private_data);

	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		set_capacity(zv->zv_zso->zvo_disk,
		    zv->zv_volsize >> SECTOR_BITS);
		mutex_exit(&zv->zv_state_lock);
	}

	return (0);
}

int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	struct gendisk *disk = zv->zv_zso->zvo_disk;

#if defined(HAVE_REVALIDATE_DISK_SIZE)
	revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
#elif defined(HAVE_REVALIDATE_DISK)
	revalidate_disk(disk);
#else
	zvol_revalidate_disk(disk);
#endif
	return (0);
}

/*
 * Provide a simple virtual geometry for legacy compatibility. For devices
 * smaller than 1 MiB a small head and sector count is used to allow very
 * tiny devices. For devices over 1 MiB a standard head and sector count
 * is used to keep the cylinders count reasonable.
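 * The cutoff used below is 2048 sectors (1 MiB of 512-byte sectors).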
 */
static inline int
zvol_getgeo_impl(struct gendisk *disk, struct hd_geometry *geo)
{
	zvol_state_t *zv = atomic_load_ptr(&disk->private_data);
	sector_t sectors;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);

	sectors = get_capacity(zv->zv_zso->zvo_disk);

	if (sectors > 2048) {
		geo->heads = 16;
		geo->sectors = 63;
	} else {
		geo->heads = 2;
		geo->sectors = 4;
	}

	geo->start = 0;
	geo->cylinders = sectors / (geo->heads * geo->sectors);

	return (0);
}

#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_GETGEO_GENDISK
static int
zvol_getgeo(struct gendisk *disk, struct hd_geometry *geo)
{
	return (zvol_getgeo_impl(disk, geo));
}
#else
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	return (zvol_getgeo_impl(bdev->bd_disk, geo));
}
#endif

/*
 * Why have two separate block_device_operations structs?
 *
 * Normally we'd just have one, and assign 'submit_bio' as needed. However,
 * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
 * can't just change submit_bio dynamically at runtime. So just create two
 * separate structs to get around this.
 */
static const struct block_device_operations zvol_ops_blk_mq = {
	.open = zvol_open,
	.release = zvol_release,
	.ioctl = zvol_ioctl,
	.compat_ioctl = zvol_compat_ioctl,
	.check_events = zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk = zvol_revalidate_disk,
#endif
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
};

static const struct block_device_operations zvol_ops = {
	.open = zvol_open,
	.release = zvol_release,
	.ioctl = zvol_ioctl,
	.compat_ioctl = zvol_compat_ioctl,
	.check_events = zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk = zvol_revalidate_disk,
#endif
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	.submit_bio = zvol_submit_bio,
#endif
};

/*
 * Since 6.9, Linux has been removing queue limit setters in favour of an
 * initial queue_limits struct applied when the device is open. Since 6.11,
 * queue_limits is being extended to allow more things to be applied when the
 * device is open. Setters are also being removed for this.
 *
 * For OpenZFS, this means that depending on kernel version, some options may
 * be set up before the device is open, and some applied to an open device
 * (queue) after the fact.
 *
 * We manage this complexity by having our own limits struct,
 * zvol_queue_limits_t, in which we carry any queue config that we're
 * interested in setting. This structure is the same on all kernels.
 *
 * These limits are then applied to the queue at device open time by the most
 * appropriate method for the kernel.
 *
 * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of
 * blk_alloc_disk() exists). This converts our limits struct to a proper Linux
 * struct queue_limits, and passes it in. Any fields added in later kernels are
 * (obviously) not set up here.
 *
 * zvol_queue_limits_apply() is called on all kernel versions after the queue
 * is created, and applies any remaining config. Before 6.9 that will be
 * everything, via setter methods. After 6.9 that will be whatever couldn't be
 * put into struct queue_limits. (This implies that zvol_queue_limits_apply()
 * will always be a no-op on the latest kernel we support).
 */
typedef struct zvol_queue_limits {
	unsigned int	zql_max_hw_sectors;
	unsigned short	zql_max_segments;
	unsigned int	zql_max_segment_size;
	unsigned int	zql_io_opt;
	unsigned int	zql_physical_block_size;
	unsigned int	zql_max_discard_sectors;
	unsigned int	zql_discard_granularity;
} zvol_queue_limits_t;

static void
zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
    boolean_t use_blk_mq)
{
	limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9;

	if (use_blk_mq) {
		/*
		 * IO requests can be really big (1MB). When an IO request
		 * comes in, it is passed off to zvol_read() or zvol_write()
		 * in a new thread, where it is chunked up into 'volblocksize'
		 * sized pieces and processed. So for example, if the request
		 * is a 1MB write and your volblocksize is 128k, one zvol_write
		 * thread will take that request and sequentially do eight 128k
		 * IOs. This is due to the fact that the thread needs to lock
		 * each volblocksize sized block. So you might be wondering:
		 * "instead of passing the whole 1MB request to one thread,
		 * why not pass eight individual 128k chunks to eight threads
		 * and process the whole write in parallel?" The short answer
		 * is that there's a sweet spot number of chunks that balances
		 * the greater parallelism with the added overhead of more
		 * threads. The sweet spot can be different depending on
		 * whether you have a read or write heavy workload. Writes
		 * typically want high chunk counts while reads typically want
		 * lower ones. On a test pool with 6 NVMe drives in a
		 * 3x 2-disk mirror configuration, with volblocksize=8k, the
		 * sweet spot for good sequential reads and writes was at
		 * 8 chunks.
		 */

		/*
		 * Below we tell the kernel how big we want our requests
		 * to be. You would think that blk_queue_io_opt() would be
		 * used to do this since it is used to "set optimal request
		 * size for the queue", but that doesn't seem to do
		 * anything - the kernel still gives you huge requests
		 * with tons of little PAGE_SIZE segments contained within it.
		 *
		 * Knowing that the kernel will just give you PAGE_SIZE segments
		 * no matter what, you can say "ok, I want PAGE_SIZE byte
		 * segments, and I want 'N' of them per request", where N is
		 * the correct number of segments for the volblocksize and
		 * number of chunks you want.
		 */
		if (zvol_blk_mq_blocks_per_thread != 0) {
			unsigned int chunks;
			chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);

			limits->zql_max_segment_size = PAGE_SIZE;
			limits->zql_max_segments =
			    (zv->zv_volblocksize * chunks) / PAGE_SIZE;
		} else {
			/*
			 * Special case: zvol_blk_mq_blocks_per_thread = 0
			 * Max everything out.
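			 * (no practical limit on segment count or size)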
			 */
			limits->zql_max_segments = UINT16_MAX;
			limits->zql_max_segment_size = UINT_MAX;
		}
	} else {
		limits->zql_max_segments = UINT16_MAX;
		limits->zql_max_segment_size = UINT_MAX;
	}

	limits->zql_io_opt = DMU_MAX_ACCESS / 2;

	limits->zql_physical_block_size = zv->zv_volblocksize;
	limits->zql_max_discard_sectors =
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9;
	limits->zql_discard_granularity = zv->zv_volblocksize;
}

#ifdef HAVE_BLK_ALLOC_DISK_2ARG
static void
zvol_queue_limits_convert(zvol_queue_limits_t *limits,
    struct queue_limits *qlimits)
{
	memset(qlimits, 0, sizeof (struct queue_limits));
	qlimits->max_hw_sectors = limits->zql_max_hw_sectors;
	qlimits->max_segments = limits->zql_max_segments;
	qlimits->max_segment_size = limits->zql_max_segment_size;
	qlimits->io_opt = limits->zql_io_opt;
	qlimits->physical_block_size = limits->zql_physical_block_size;
	qlimits->max_discard_sectors = limits->zql_max_discard_sectors;
	qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors;
	qlimits->discard_granularity = limits->zql_discard_granularity;
#ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
	qlimits->features =
	    BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT;
#endif
}
#endif

static void
zvol_queue_limits_apply(zvol_queue_limits_t *limits,
    struct request_queue *queue)
{
#ifndef HAVE_BLK_ALLOC_DISK_2ARG
	blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);
	blk_queue_max_segments(queue, limits->zql_max_segments);
	blk_queue_max_segment_size(queue, limits->zql_max_segment_size);
	blk_queue_io_opt(queue, limits->zql_io_opt);
	blk_queue_physical_block_size(queue, limits->zql_physical_block_size);
	blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors);
	blk_queue_discard_granularity(queue, limits->zql_discard_granularity);
#endif
#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
	blk_queue_set_write_cache(queue, B_TRUE);
	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue);
#endif
}

static int
zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
{
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
#if defined(HAVE_BLK_ALLOC_DISK)
	zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
	if (zso->zvo_disk == NULL)
		return (1);

	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
	struct queue_limits qlimits;
	zvol_queue_limits_convert(limits, &qlimits);
	struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
	if (IS_ERR(disk)) {
		zso->zvo_disk = NULL;
		return (1);
	}

	zso->zvo_disk = disk;
	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;

#else
	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)
		return (1);

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		return (1);
	}

	zso->zvo_disk->queue = zso->zvo_queue;
#endif /* HAVE_BLK_ALLOC_DISK */
#else
	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)
		return (1);

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		return (1);
	}

	zso->zvo_disk->queue = zso->zvo_queue;
#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */

	zvol_queue_limits_apply(limits, zso->zvo_queue);

	return (0);

}

static int
zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
{
	struct zvol_state_os *zso = zv->zv_zso;

	/* Allocate our blk-mq tag_set */
	if (zvol_blk_mq_alloc_tag_set(zv) != 0)
		return (1);

#if defined(HAVE_BLK_ALLOC_DISK)
	zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
	if (zso->zvo_disk == NULL) {
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}
	zso->zvo_queue = zso->zvo_disk->queue;
	zso->zvo_disk->minors = ZVOL_MINORS;
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
	struct queue_limits qlimits;
	zvol_queue_limits_convert(limits, &qlimits);
	struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv);
	if (IS_ERR(disk)) {
		zso->zvo_disk = NULL;
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}

	zso->zvo_disk = disk;
	zso->zvo_queue = zso->zvo_disk->queue;
	zso->zvo_disk->minors = ZVOL_MINORS;
#else
	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}
	/* Allocate queue */
	zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
	if (IS_ERR(zso->zvo_queue)) {
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}

	/* Our queue is now created, assign it to our disk */
	zso->zvo_disk->queue = zso->zvo_queue;
#endif

	zvol_queue_limits_apply(limits, zso->zvo_queue);

	return (0);
}

/*
 * Allocate memory for a new zvol_state_t and setup the required
 * request queue and generic disk structures for the block device.
 */
static int
zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize,
    zvol_state_t **zvp)
{
	zvol_state_t *zv;
	struct zvol_state_os *zso;
	uint64_t volmode;
	int ret;

	ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL);
	if (ret)
		return (ret);

	if (volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;

	if (volmode == ZFS_VOLMODE_NONE)
		return (0);

	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_zso = zso;
	zv->zv_volmode = volmode;
	zv->zv_volsize = volsize;
	zv->zv_volblocksize = volblocksize;

	list_link_init(&zv->zv_next);
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);

	zv->zv_zso->use_blk_mq = zvol_use_blk_mq;

	zvol_queue_limits_t limits;
	zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq);

	/*
	 * The block layer has 3 interfaces for getting BIOs:
	 *
	 * 1. blk-mq request queues (new)
	 * 2. submit_bio() (oldest)
	 * 3. regular request queues (old).
	 *
	 * Each of those interfaces has two permutations:
	 *
	 * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates
	 *    both the disk and its queue (5.14 kernel or newer)
	 *
	 * b) We don't have blk_*alloc_disk(), and have to allocate the
	 *    disk and the queue separately. (5.13 kernel or older)
	 */
	if (zv->zv_zso->use_blk_mq) {
		ret = zvol_alloc_blk_mq(zv, &limits);
		if (ret != 0)
			goto out_kmem;
		zso->zvo_disk->fops = &zvol_ops_blk_mq;
	} else {
		ret = zvol_alloc_non_blk_mq(zso, &limits);
		if (ret != 0)
			goto out_kmem;
		zso->zvo_disk->fops = &zvol_ops;
	}

	/* Limit read-ahead to a single page to prevent over-prefetching. */
	blk_queue_set_read_ahead(zso->zvo_queue, 1);

	if (!zv->zv_zso->use_blk_mq) {
		/* Disable write merging in favor of the ZIO pipeline. */
		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
	}

	zso->zvo_queue->queuedata = zv;
	zso->zvo_dev = dev;
	zv->zv_open_count = 0;
	strlcpy(zv->zv_name, name, sizeof (zv->zv_name));

	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);

	zso->zvo_disk->major = zvol_major;
	zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;

	/*
	 * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices.
	 * This is accomplished by limiting the number of minors for the
	 * device to one and explicitly disabling partition scanning.
	 */
	if (volmode == ZFS_VOLMODE_DEV) {
		zso->zvo_disk->minors = 1;
		zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
		zso->zvo_disk->flags |= GENHD_FL_NO_PART;
	}

	zso->zvo_disk->first_minor = (dev & MINORMASK);
	zso->zvo_disk->private_data = zv;
	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
	    ZVOL_DEV_NAME, (dev & MINORMASK));

	*zvp = zv;
	return (ret);

out_kmem:
	kmem_free(zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	return (ret);
}

void
zvol_os_remove_minor(zvol_state_t *zv)
{
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);
	ASSERT0(atomic_read(&zv->zv_suspend_ref));
	ASSERT(zv->zv_flags & ZVOL_REMOVING);

	struct zvol_state_os *zso = zv->zv_zso;
	zv->zv_zso = NULL;

	/* Clearing private_data will make new callers return immediately. */
	atomic_store_ptr(&zso->zvo_disk->private_data, NULL);

	/*
	 * Drop the state lock before calling del_gendisk(). There may be
	 * callers waiting to acquire it, but del_gendisk() will block until
	 * they exit, which would deadlock.
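	 * The state lock is re-acquired before returning, since our caller
	 * expects it to still be held.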
	 */
	mutex_exit(&zv->zv_state_lock);

	del_gendisk(zso->zvo_disk);
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
	(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
#if defined(HAVE_BLK_CLEANUP_DISK)
	blk_cleanup_disk(zso->zvo_disk);
#else
	put_disk(zso->zvo_disk);
#endif
#else
	blk_cleanup_queue(zso->zvo_queue);
	put_disk(zso->zvo_disk);
#endif

	if (zso->use_blk_mq)
		blk_mq_free_tag_set(&zso->tag_set);

	ida_free(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS);

	kmem_free(zso, sizeof (struct zvol_state_os));

	mutex_enter(&zv->zv_state_lock);
}

void
zvol_os_free(zvol_state_t *zv)
{

	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);
	ASSERT0P(zv->zv_zso);

	ASSERT0P(zv->zv_objset);
	ASSERT0P(zv->zv_zilog);
	ASSERT0P(zv->zv_dn);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	cv_destroy(&zv->zv_removing_cv);
	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);

	kmem_free(zv, sizeof (zvol_state_t));
}

void
zvol_wait_close(zvol_state_t *zv)
{
}

struct add_disk_work {
	struct delayed_work work;
	struct gendisk *disk;
	int error;
};

static int
__zvol_os_add_disk(struct gendisk *disk)
{
	int error = 0;
#ifdef HAVE_ADD_DISK_RET
	error = -add_disk(disk);
	if (error)
		error = SET_ERROR(error);
#else
	add_disk(disk);
#endif
	return (error);
}

#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
static void
zvol_os_add_disk_work(struct work_struct *work)
{
	struct add_disk_work *add_disk_work;
	add_disk_work = container_of(work, struct add_disk_work, work.work);
	add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk);
}
#endif

/*
 * SPECIAL CASE:
 *
 * This function basically calls add_disk() from a workqueue. You may be
 * thinking: why not just call add_disk() directly?
 *
 * When you call add_disk(), the zvol appears to the world. When this happens,
 * the kernel calls disk_scan_partitions() on the zvol, which behaves
 * differently on the 6.9+ kernels:
 *
 * - 6.8 and older kernels -
 * disk_scan_partitions()
 *	handle = bdev_open_by_dev(
 *		zvol_open()
 *	bdev_release(handle);
 *		zvol_release()
 *
 *
 * - 6.9+ kernels -
 * disk_scan_partitions()
 *	file = bdev_file_open_by_dev()
 *		zvol_open()
 *	fput(file)
 *	< wait for return to userspace >
 *		zvol_release()
 *
 * The difference is that the bdev_release() from the 6.8 kernel is synchronous
 * while the fput() from the 6.9 kernel is async. Or more specifically it's
 * async that has to wait until we return to userspace (since it adds the fput
 * into the caller's work queue with the TWA_RESUME flag set). This is not the
 * behavior we want, since we want to do things like create+destroy a zvol
 * within a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release
 * the reference to the zvol while we're in the IOCTL, which can't wait until
 * we return to userspace.
 *
 * We can get around this since fput() has a special codepath for when it's
 * running in a kernel thread or interrupt. In those cases, it just puts the
 * fput into the system workqueue, which we can force to run with
 * __flush_workqueue(). That is why we call add_disk() from a workqueue - so it
 * runs from a kernel thread and "tricks" the fput() codepaths.
 *
 * Note that __flush_workqueue() is slowly getting deprecated. This may be ok
 * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via
 * fput) to happen, which it eventually, naturally, will from the system_wq
 * without us explicitly calling __flush_workqueue().
 */
static int
zvol_os_add_disk(struct gendisk *disk)
{
#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)	/* 6.9+ kernel */
	struct add_disk_work add_disk_work;

	INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work);
	add_disk_work.disk = disk;
	add_disk_work.error = 0;

	/* Use *_delayed_work functions since they're not GPL'd */
	schedule_delayed_work(&add_disk_work.work, 0);
	flush_delayed_work(&add_disk_work.work);

	__flush_workqueue(system_wq);
	return (add_disk_work.error);
#else	/* <= 6.8 kernel */
	return (__zvol_os_add_disk(disk));
#endif
}

/*
 * Create a block device minor node and setup the linkage between it
 * and the specified volume. Once this function returns the block
 * device is live and ready for use.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv = NULL;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t len;
	unsigned minor = 0;
	int error = 0;
	int idx;
	uint64_t hash = zvol_name_hash(name);
	uint64_t volthreading;
	bool replayed_zil = B_FALSE;

	if (zvol_inhibit_dev)
		return (0);

	idx = ida_alloc(&zvol_ida, kmem_flags_convert(KM_SLEEP));
	if (idx < 0)
		return (SET_ERROR(-idx));
	minor = idx << ZVOL_MINOR_BITS;
	if (MINOR(minor) != minor) {
		/* too many partitions can cause an overflow */
		zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
		    name, minor, MINOR(minor));
		ida_free(&zvol_ida, idx);
		return (SET_ERROR(EINVAL));
	}

	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
	if (zv) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		ida_free(&zvol_ida, idx);
		return (SET_ERROR(EEXIST));
	}

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	error = zvol_alloc(MKDEV(zvol_major, minor), name,
	    volsize, doi->doi_data_block_size, &zv);
	if (error || zv == NULL)
		goto out_dmu_objset_disown;

	zv->zv_hash = hash;

	if (dmu_objset_is_snapshot(os))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_objset = os;

	/* Default */
	zv->zv_threading = B_TRUE;
	if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL)
	    == 0)
		zv->zv_threading = volthreading;

	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);

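	/*
	 * Apply any remaining queue flags that this kernel still exposes:
	 * advertise discard support, mark the volume as non-rotational, and
	 * keep it from feeding the entropy pool.
	 */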
#ifdef QUEUE_FLAG_DISCARD
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_NONROT
	blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_ADD_RANDOM
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
#endif
	/* This flag was introduced in kernel version 4.12. */
#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif

	ASSERT0P(zv->zv_kstat.dk_kstats);
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT0P(zv->zv_zilog);
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
		else
			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
	}
	if (replayed_zil)
		zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	/*
	 * When udev detects the addition of the device it will immediately
	 * invoke blkid(8) to determine the type of content on the device.
	 * Prefetching the blocks commonly scanned by blkid(8) will speed
	 * up this process.
	 */
	len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
	if (len > 0) {
		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
		    ZIO_PRIORITY_SYNC_READ);
	}

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));

	/*
	 * Keep in mind that once add_disk() is called, the zvol is
	 * announced to the world, and zvol_open()/zvol_release() can
	 * be called at any time. Incidentally, add_disk() itself calls
	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
	 * directly as well.
	 */
	if (error == 0 && zv) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		rw_exit(&zvol_state_lock);
		error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
	} else {
		ida_free(&zvol_ida, idx);
	}

	return (error);
}

int
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	int readonly = get_disk_ro(zv->zv_zso->zvo_disk);

	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));

	/* move to new hashtable entry */
	zv->zv_hash = zvol_name_hash(newname);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	/*
	 * The block device's read-only state is briefly changed causing
	 * a KOBJ_CHANGE uevent to be issued. This ensures udev detects
	 * the name change and fixes the symlinks. This does not change
	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
	 * changes. This would normally be done using kobject_uevent() but
	 * that is a GPL-only symbol which is why we need this workaround.
	 */
	set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
	set_disk_ro(zv->zv_zso->zvo_disk, readonly);

	dataset_kstats_rename(&zv->zv_kstat, newname);

	return (0);
}

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{

	set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{

	set_capacity(zv->zv_zso->zvo_disk, capacity);
}

int
zvol_init(void)
{
	int error;

	error = zvol_init_impl();
	if (error) {
		printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error);
		return (error);
	}

	error = -register_blkdev(zvol_major, ZVOL_DRIVER);
	if (error) {
		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
		return (SET_ERROR(error));
	}

	if (zvol_blk_mq_queue_depth == 0) {
		zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
	} else {
		zvol_actual_blk_mq_queue_depth =
		    MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
	}

	if (zvol_blk_mq_threads == 0) {
		zvol_blk_mq_actual_threads = num_online_cpus();
	} else {
		zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
		    1024);
	}

	ida_init(&zvol_ida);
	return (0);
}

void
zvol_fini(void)
{
	unregister_blkdev(zvol_major, ZVOL_DRIVER);

	zvol_fini_impl();

	ida_destroy(&zvol_ida);
}

module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");

module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

module_param(zvol_blk_mq_queue_depth, uint, 0644);
MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");

module_param(zvol_use_blk_mq, uint, 0644);
MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");

module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
	"Process volblocksize blocks per thread");

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
module_param(zvol_open_timeout_ms, uint, 0644);
MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
#endif