1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 2012, 2020 by Delphix. All rights reserved. 24 * Copyright (c) 2024, Rob Norris <robn@despairlabs.com> 25 * Copyright (c) 2024, 2025, Klara, Inc. 
 */

#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
#include <cityhash.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/workqueue.h>
#include <linux/blk-mq.h>

/* Common entry point for both the BIO and the blk-mq request paths. */
static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
    struct request *rq, boolean_t force_sync);

/* Block device major number; compared against MAJOR(dev) to spot zvols. */
static unsigned int zvol_major = ZVOL_MAJOR;
/* Upper bound, in volblocksize blocks, used to derive the discard limit. */
static unsigned long zvol_max_discard_blocks = 16384;

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
/* How long zvol_open() keeps retrying for spa_namespace_lock, in ms. */
static unsigned int zvol_open_timeout_ms = 1000;
#endif

static unsigned int zvol_blk_mq_threads = 0;
/* Used as the number of hardware queues when the tag set is allocated. */
static unsigned int zvol_blk_mq_actual_threads;
static boolean_t zvol_use_blk_mq = B_FALSE;

/*
 * The maximum number of volblocksize blocks to process per thread.  Typically,
 * write heavy workloads perform better with higher values here, and read
 * heavy workloads perform better with lower values, but that's not a hard
 * and fast rule.  It's basically a knob to tune between "less overhead with
 * less parallelism" and "more overhead, but more parallelism".
 *
 * '8' was chosen as a reasonable, balanced, default based off of sequential
 * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
 */
static unsigned int zvol_blk_mq_blocks_per_thread = 8;

#ifndef BLKDEV_DEFAULT_RQ
/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
#define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
#endif

/*
 * Finalize our BIO or request.
83 */ 84 static inline void 85 zvol_end_io(struct bio *bio, struct request *rq, int error) 86 { 87 ASSERT3U(error, >=, 0); 88 if (bio) { 89 bio->bi_status = errno_to_bi_status(error); 90 bio_endio(bio); 91 } else { 92 blk_mq_end_request(rq, errno_to_bi_status(error)); 93 } 94 } 95 96 static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; 97 static unsigned int zvol_actual_blk_mq_queue_depth; 98 99 struct zvol_state_os { 100 struct gendisk *zvo_disk; /* generic disk */ 101 struct request_queue *zvo_queue; /* request queue */ 102 dev_t zvo_dev; /* device id */ 103 104 struct blk_mq_tag_set tag_set; 105 106 /* Set from the global 'zvol_use_blk_mq' at zvol load */ 107 boolean_t use_blk_mq; 108 }; 109 110 static struct ida zvol_ida; 111 112 /* 113 * This is called when a new block multiqueue request comes in. A request 114 * contains one or more BIOs. 115 */ 116 static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, 117 const struct blk_mq_queue_data *bd) 118 { 119 struct request *rq = bd->rq; 120 zvol_state_t *zv = rq->q->queuedata; 121 122 /* Tell the kernel that we are starting to process this request */ 123 blk_mq_start_request(rq); 124 125 if (blk_rq_is_passthrough(rq)) { 126 /* Skip non filesystem request */ 127 blk_mq_end_request(rq, BLK_STS_IOERR); 128 return (BLK_STS_IOERR); 129 } 130 131 zvol_request_impl(zv, NULL, rq, 0); 132 133 /* Acknowledge to the kernel that we got this request */ 134 return (BLK_STS_OK); 135 } 136 137 static struct blk_mq_ops zvol_blk_mq_queue_ops = { 138 .queue_rq = zvol_mq_queue_rq, 139 }; 140 141 /* Initialize our blk-mq struct */ 142 static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) 143 { 144 struct zvol_state_os *zso = zv->zv_zso; 145 146 memset(&zso->tag_set, 0, sizeof (zso->tag_set)); 147 148 /* Initialize tag set. 
*/ 149 zso->tag_set.ops = &zvol_blk_mq_queue_ops; 150 zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads; 151 zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; 152 zso->tag_set.numa_node = NUMA_NO_NODE; 153 zso->tag_set.cmd_size = 0; 154 155 /* 156 * We need BLK_MQ_F_BLOCKING here since we do blocking calls in 157 * zvol_request_impl() 158 */ 159 zso->tag_set.flags = BLK_MQ_F_BLOCKING; 160 161 #ifdef BLK_MQ_F_SHOULD_MERGE 162 /* 163 * Linux 6.14 removed BLK_MQ_F_SHOULD_MERGE and made it implicit. 164 * For older kernels, we set it. 165 */ 166 zso->tag_set.flags |= BLK_MQ_F_SHOULD_MERGE; 167 #endif 168 169 zso->tag_set.driver_data = zv; 170 171 return (blk_mq_alloc_tag_set(&zso->tag_set)); 172 } 173 174 /* 175 * Given a path, return TRUE if path is a ZVOL. 176 */ 177 boolean_t 178 zvol_os_is_zvol(const char *path) 179 { 180 dev_t dev = 0; 181 182 if (vdev_lookup_bdev(path, &dev) != 0) 183 return (B_FALSE); 184 185 if (MAJOR(dev) == zvol_major) 186 return (B_TRUE); 187 188 return (B_FALSE); 189 } 190 191 static void 192 zvol_write(zv_request_t *zvr) 193 { 194 struct bio *bio = zvr->bio; 195 struct request *rq = zvr->rq; 196 int error = 0; 197 zfs_uio_t uio; 198 zvol_state_t *zv = zvr->zv; 199 struct request_queue *q; 200 struct gendisk *disk; 201 unsigned long start_time = 0; 202 boolean_t acct = B_FALSE; 203 204 ASSERT3P(zv, !=, NULL); 205 ASSERT3U(zv->zv_open_count, >, 0); 206 ASSERT3P(zv->zv_zilog, !=, NULL); 207 208 q = zv->zv_zso->zvo_queue; 209 disk = zv->zv_zso->zvo_disk; 210 211 /* bio marked as FLUSH need to flush before write */ 212 if (io_is_flush(bio, rq)) { 213 error = zil_commit(zv->zv_zilog, ZVOL_OBJ); 214 if (error != 0) { 215 rw_exit(&zv->zv_suspend_lock); 216 zvol_end_io(bio, rq, -error); 217 return; 218 } 219 } 220 221 /* Some requests are just for flush and nothing else. 
*/ 222 if (io_size(bio, rq) == 0) { 223 rw_exit(&zv->zv_suspend_lock); 224 zvol_end_io(bio, rq, 0); 225 return; 226 } 227 228 zfs_uio_bvec_init(&uio, bio, rq); 229 230 ssize_t start_resid = uio.uio_resid; 231 232 /* 233 * With use_blk_mq, accounting is done by blk_mq_start_request() 234 * and blk_mq_end_request(), so we can skip it here. 235 */ 236 if (bio) { 237 acct = blk_queue_io_stat(q); 238 if (acct) { 239 start_time = blk_generic_start_io_acct(q, disk, WRITE, 240 bio); 241 } 242 } 243 244 boolean_t sync = 245 io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; 246 247 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 248 uio.uio_loffset, uio.uio_resid, RL_WRITER); 249 250 uint64_t volsize = zv->zv_volsize; 251 while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { 252 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); 253 uint64_t off = uio.uio_loffset; 254 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); 255 256 if (bytes > volsize - off) /* don't write past the end */ 257 bytes = volsize - off; 258 259 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); 260 261 /* This will only fail for ENOSPC */ 262 error = dmu_tx_assign(tx, DMU_TX_WAIT); 263 if (error) { 264 dmu_tx_abort(tx); 265 break; 266 } 267 error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx, 268 DMU_READ_PREFETCH); 269 if (error == 0) { 270 zvol_log_write(zv, tx, off, bytes, sync); 271 } 272 dmu_tx_commit(tx); 273 274 if (error) 275 break; 276 } 277 zfs_rangelock_exit(lr); 278 279 int64_t nwritten = start_resid - uio.uio_resid; 280 dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); 281 task_io_account_write(nwritten); 282 283 if (error == 0 && sync) 284 error = zil_commit(zv->zv_zilog, ZVOL_OBJ); 285 286 rw_exit(&zv->zv_suspend_lock); 287 288 if (bio && acct) { 289 blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); 290 } 291 292 zvol_end_io(bio, rq, error); 293 } 294 295 static void 296 zvol_write_task(void *arg) 297 { 298 zv_request_task_t 
*task = arg; 299 zvol_write(&task->zvr); 300 zv_request_task_free(task); 301 } 302 303 static void 304 zvol_discard(zv_request_t *zvr) 305 { 306 struct bio *bio = zvr->bio; 307 struct request *rq = zvr->rq; 308 zvol_state_t *zv = zvr->zv; 309 uint64_t start = io_offset(bio, rq); 310 uint64_t size = io_size(bio, rq); 311 uint64_t end = start + size; 312 boolean_t sync; 313 int error = 0; 314 dmu_tx_t *tx; 315 struct request_queue *q = zv->zv_zso->zvo_queue; 316 struct gendisk *disk = zv->zv_zso->zvo_disk; 317 unsigned long start_time = 0; 318 boolean_t acct = B_FALSE; 319 320 ASSERT3P(zv, !=, NULL); 321 ASSERT3U(zv->zv_open_count, >, 0); 322 ASSERT3P(zv->zv_zilog, !=, NULL); 323 324 if (bio) { 325 acct = blk_queue_io_stat(q); 326 if (acct) { 327 start_time = blk_generic_start_io_acct(q, disk, WRITE, 328 bio); 329 } 330 } 331 332 sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; 333 334 if (end > zv->zv_volsize) { 335 error = SET_ERROR(EIO); 336 goto unlock; 337 } 338 339 /* 340 * Align the request to volume block boundaries. This will prevent 341 * dnode_free_range() from zeroing out the unaligned parts which is 342 * slow (read-modify-write) and useless since we are not freeing any 343 * space by doing so. 
344 */ 345 start = P2ROUNDUP(start, zv->zv_volblocksize); 346 end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); 347 size = end - start; 348 349 if (start >= end) 350 goto unlock; 351 352 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 353 start, size, RL_WRITER); 354 355 tx = dmu_tx_create(zv->zv_objset); 356 dmu_tx_mark_netfree(tx); 357 error = dmu_tx_assign(tx, DMU_TX_WAIT); 358 if (error != 0) { 359 dmu_tx_abort(tx); 360 } else { 361 zvol_log_truncate(zv, tx, start, size); 362 dmu_tx_commit(tx); 363 error = dmu_free_long_range(zv->zv_objset, 364 ZVOL_OBJ, start, size); 365 } 366 zfs_rangelock_exit(lr); 367 368 if (error == 0 && sync) 369 error = zil_commit(zv->zv_zilog, ZVOL_OBJ); 370 371 unlock: 372 rw_exit(&zv->zv_suspend_lock); 373 374 if (bio && acct) { 375 blk_generic_end_io_acct(q, disk, WRITE, bio, 376 start_time); 377 } 378 379 zvol_end_io(bio, rq, error); 380 } 381 382 static void 383 zvol_discard_task(void *arg) 384 { 385 zv_request_task_t *task = arg; 386 zvol_discard(&task->zvr); 387 zv_request_task_free(task); 388 } 389 390 static void 391 zvol_read(zv_request_t *zvr) 392 { 393 struct bio *bio = zvr->bio; 394 struct request *rq = zvr->rq; 395 int error = 0; 396 zfs_uio_t uio; 397 boolean_t acct = B_FALSE; 398 zvol_state_t *zv = zvr->zv; 399 struct request_queue *q; 400 struct gendisk *disk; 401 unsigned long start_time = 0; 402 403 ASSERT3P(zv, !=, NULL); 404 ASSERT3U(zv->zv_open_count, >, 0); 405 406 zfs_uio_bvec_init(&uio, bio, rq); 407 408 q = zv->zv_zso->zvo_queue; 409 disk = zv->zv_zso->zvo_disk; 410 411 ssize_t start_resid = uio.uio_resid; 412 413 /* 414 * When blk-mq is being used, accounting is done by 415 * blk_mq_start_request() and blk_mq_end_request(). 
416 */ 417 if (bio) { 418 acct = blk_queue_io_stat(q); 419 if (acct) 420 start_time = blk_generic_start_io_acct(q, disk, READ, 421 bio); 422 } 423 424 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 425 uio.uio_loffset, uio.uio_resid, RL_READER); 426 427 uint64_t volsize = zv->zv_volsize; 428 429 while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { 430 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); 431 432 /* don't read past the end */ 433 if (bytes > volsize - uio.uio_loffset) 434 bytes = volsize - uio.uio_loffset; 435 436 error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes, 437 DMU_READ_PREFETCH); 438 if (error) { 439 /* convert checksum errors into IO errors */ 440 if (error == ECKSUM) 441 error = SET_ERROR(EIO); 442 break; 443 } 444 } 445 zfs_rangelock_exit(lr); 446 447 int64_t nread = start_resid - uio.uio_resid; 448 dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); 449 task_io_account_read(nread); 450 451 rw_exit(&zv->zv_suspend_lock); 452 453 if (bio && acct) { 454 blk_generic_end_io_acct(q, disk, READ, bio, start_time); 455 } 456 457 zvol_end_io(bio, rq, error); 458 } 459 460 static void 461 zvol_read_task(void *arg) 462 { 463 zv_request_task_t *task = arg; 464 zvol_read(&task->zvr); 465 zv_request_task_free(task); 466 } 467 468 /* 469 * Note: 470 * 471 * The kernel uses different enum names for the IO opcode, depending on the 472 * kernel version ('req_opf', 'req_op'). To sidestep this, use macros rather 473 * than inline functions for these checks. 474 */ 475 /* Should this IO go down the zvol write path? */ 476 #define ZVOL_OP_IS_WRITE(op) \ 477 (op == REQ_OP_WRITE || \ 478 op == REQ_OP_FLUSH || \ 479 op == REQ_OP_DISCARD) 480 481 /* Is this IO type supported by zvols? */ 482 #define ZVOL_OP_IS_SUPPORTED(op) (op == REQ_OP_READ || ZVOL_OP_IS_WRITE(op)) 483 484 /* Get the IO opcode */ 485 #define ZVOL_OP(bio, rq) (bio != NULL ? 
bio_op(bio) : req_op(rq)) 486 487 /* 488 * Process a BIO or request 489 * 490 * Either 'bio' or 'rq' should be set depending on if we are processing a 491 * bio or a request (both should not be set). 492 * 493 * force_sync: Set to 0 to defer processing to a background taskq 494 * Set to 1 to process data synchronously 495 */ 496 static void 497 zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, 498 boolean_t force_sync) 499 { 500 fstrans_cookie_t cookie = spl_fstrans_mark(); 501 uint64_t offset = io_offset(bio, rq); 502 uint64_t size = io_size(bio, rq); 503 int rw; 504 505 if (unlikely(!ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq)))) { 506 zfs_dbgmsg("Unsupported zvol %s, op=%d, flags=0x%x", 507 rq != NULL ? "request" : "BIO", 508 ZVOL_OP(bio, rq), 509 rq != NULL ? rq->cmd_flags : bio->bi_opf); 510 ASSERT(ZVOL_OP_IS_SUPPORTED(ZVOL_OP(bio, rq))); 511 zvol_end_io(bio, rq, SET_ERROR(ENOTSUPP)); 512 goto out; 513 } 514 515 if (ZVOL_OP_IS_WRITE(ZVOL_OP(bio, rq))) { 516 rw = WRITE; 517 } else { 518 rw = READ; 519 } 520 521 /* 522 * Sanity check 523 * 524 * If we're a BIO, check our rw matches the kernel's 525 * bio_data_dir(bio) rw. We need to check because we support fewer 526 * IO operations, and want to verify that what we think are reads and 527 * writes from those operations match what the kernel thinks. 
528 */ 529 ASSERT(rq != NULL || rw == bio_data_dir(bio)); 530 531 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { 532 zvol_end_io(bio, rq, SET_ERROR(ENXIO)); 533 goto out; 534 } 535 536 if (zvol_request_sync || zv->zv_threading == B_FALSE) 537 force_sync = 1; 538 539 zv_request_t zvr = { 540 .zv = zv, 541 .bio = bio, 542 .rq = rq, 543 }; 544 545 if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { 546 printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", 547 zv->zv_zso->zvo_disk->disk_name, 548 (long long unsigned)offset, 549 (long unsigned)size); 550 551 zvol_end_io(bio, rq, SET_ERROR(EIO)); 552 goto out; 553 } 554 555 zv_request_task_t *task; 556 zv_taskq_t *ztqs = &zvol_taskqs; 557 uint_t blk_mq_hw_queue = 0; 558 uint_t tq_idx; 559 uint_t taskq_hash; 560 if (rq) 561 #ifdef HAVE_BLK_MQ_RQ_HCTX 562 blk_mq_hw_queue = rq->mq_hctx->queue_num; 563 #else 564 blk_mq_hw_queue = rq->q->queue_hw_ctx[ 565 rq->q->mq_map[raw_smp_processor_id()]]->queue_num; 566 #endif 567 taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, 568 blk_mq_hw_queue); 569 tq_idx = taskq_hash % ztqs->tqs_cnt; 570 571 if (rw == WRITE) { 572 if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { 573 zvol_end_io(bio, rq, SET_ERROR(EROFS)); 574 goto out; 575 } 576 577 /* 578 * Prevents the zvol from being suspended, or the ZIL being 579 * concurrently opened. Will be released after the i/o 580 * completes. 581 */ 582 rw_enter(&zv->zv_suspend_lock, RW_READER); 583 584 /* 585 * Open a ZIL if this is the first time we have written to this 586 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather 587 * than zv_state_lock so that we don't need to acquire an 588 * additional lock in this path. 
589 */ 590 if (zv->zv_zilog == NULL) { 591 rw_exit(&zv->zv_suspend_lock); 592 rw_enter(&zv->zv_suspend_lock, RW_WRITER); 593 if (zv->zv_zilog == NULL) { 594 zv->zv_zilog = zil_open(zv->zv_objset, 595 zvol_get_data, &zv->zv_kstat.dk_zil_sums); 596 zv->zv_flags |= ZVOL_WRITTEN_TO; 597 /* replay / destroy done in zvol_create_minor */ 598 VERIFY0((zv->zv_zilog->zl_header->zh_flags & 599 ZIL_REPLAY_NEEDED)); 600 } 601 rw_downgrade(&zv->zv_suspend_lock); 602 } 603 604 /* 605 * We don't want this thread to be blocked waiting for i/o to 606 * complete, so we instead wait from a taskq callback. The 607 * i/o may be a ZIL write (via zil_commit()), or a read of an 608 * indirect block, or a read of a data block (if this is a 609 * partial-block write). We will indicate that the i/o is 610 * complete by calling END_IO() from the taskq callback. 611 * 612 * This design allows the calling thread to continue and 613 * initiate more concurrent operations by calling 614 * zvol_request() again. There are typically only a small 615 * number of threads available to call zvol_request() (e.g. 616 * one per iSCSI target), so keeping the latency of 617 * zvol_request() low is important for performance. 618 * 619 * The zvol_request_sync module parameter allows this 620 * behavior to be altered, for performance evaluation 621 * purposes. If the callback blocks, setting 622 * zvol_request_sync=1 will result in much worse performance. 623 * 624 * We can have up to zvol_threads concurrent i/o's being 625 * processed for all zvols on the system. This is typically 626 * a vast improvement over the zvol_request_sync=1 behavior 627 * of one i/o at a time per zvol. However, an even better 628 * design would be for zvol_request() to initiate the zio 629 * directly, and then be notified by the zio_done callback, 630 * which would call END_IO(). Unfortunately, the DMU/ZIL 631 * interfaces lack this functionality (they block waiting for 632 * the i/o to complete). 
633 */ 634 if (io_is_discard(bio, rq)) { 635 if (force_sync) { 636 zvol_discard(&zvr); 637 } else { 638 task = zv_request_task_create(zvr); 639 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 640 zvol_discard_task, task, 0, &task->ent); 641 } 642 } else { 643 if (force_sync) { 644 zvol_write(&zvr); 645 } else { 646 task = zv_request_task_create(zvr); 647 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 648 zvol_write_task, task, 0, &task->ent); 649 } 650 } 651 } else { 652 /* 653 * The SCST driver, and possibly others, may issue READ I/Os 654 * with a length of zero bytes. These empty I/Os contain no 655 * data and require no additional handling. 656 */ 657 if (size == 0) { 658 zvol_end_io(bio, rq, 0); 659 goto out; 660 } 661 662 rw_enter(&zv->zv_suspend_lock, RW_READER); 663 664 /* See comment in WRITE case above. */ 665 if (force_sync) { 666 zvol_read(&zvr); 667 } else { 668 task = zv_request_task_create(zvr); 669 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 670 zvol_read_task, task, 0, &task->ent); 671 } 672 } 673 674 out: 675 spl_fstrans_unmark(cookie); 676 } 677 678 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 679 #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID 680 static void 681 zvol_submit_bio(struct bio *bio) 682 #else 683 static blk_qc_t 684 zvol_submit_bio(struct bio *bio) 685 #endif 686 #else 687 static MAKE_REQUEST_FN_RET 688 zvol_request(struct request_queue *q, struct bio *bio) 689 #endif 690 { 691 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 692 #if defined(HAVE_BIO_BDEV_DISK) 693 struct request_queue *q = bio->bi_bdev->bd_disk->queue; 694 #else 695 struct request_queue *q = bio->bi_disk->queue; 696 #endif 697 #endif 698 zvol_state_t *zv = q->queuedata; 699 700 zvol_request_impl(zv, bio, NULL, 0); 701 #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ 702 defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ 703 !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) 704 return (BLK_QC_T_NONE); 705 #endif 706 } 707 708 static int 709 #ifdef HAVE_BLK_MODE_T 710 
zvol_open(struct gendisk *disk, blk_mode_t flag) 711 #else 712 zvol_open(struct block_device *bdev, fmode_t flag) 713 #endif 714 { 715 zvol_state_t *zv; 716 int error = 0; 717 boolean_t drop_suspend = B_FALSE; 718 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 719 hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); 720 hrtime_t start = gethrtime(); 721 722 retry: 723 #endif 724 725 #ifdef HAVE_BLK_MODE_T 726 zv = atomic_load_ptr(&disk->private_data); 727 #else 728 zv = atomic_load_ptr(&bdev->bd_disk->private_data); 729 #endif 730 if (zv == NULL) { 731 return (-SET_ERROR(ENXIO)); 732 } 733 734 mutex_enter(&zv->zv_state_lock); 735 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { 736 mutex_exit(&zv->zv_state_lock); 737 return (-SET_ERROR(ENXIO)); 738 } 739 740 /* 741 * Make sure zvol is not suspended during first open 742 * (hold zv_suspend_lock) and respect proper lock acquisition 743 * ordering - zv_suspend_lock before zv_state_lock 744 */ 745 if (zv->zv_open_count == 0) { 746 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { 747 mutex_exit(&zv->zv_state_lock); 748 749 /* 750 * Removal may happen while the locks are down, so 751 * we can't trust zv any longer; we have to start over. 
752 */ 753 #ifdef HAVE_BLK_MODE_T 754 zv = atomic_load_ptr(&disk->private_data); 755 #else 756 zv = atomic_load_ptr(&bdev->bd_disk->private_data); 757 #endif 758 if (zv == NULL) 759 return (-SET_ERROR(ENXIO)); 760 761 rw_enter(&zv->zv_suspend_lock, RW_READER); 762 mutex_enter(&zv->zv_state_lock); 763 764 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { 765 mutex_exit(&zv->zv_state_lock); 766 rw_exit(&zv->zv_suspend_lock); 767 return (-SET_ERROR(ENXIO)); 768 } 769 770 /* check to see if zv_suspend_lock is needed */ 771 if (zv->zv_open_count != 0) { 772 rw_exit(&zv->zv_suspend_lock); 773 } else { 774 drop_suspend = B_TRUE; 775 } 776 } else { 777 drop_suspend = B_TRUE; 778 } 779 } 780 781 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 782 783 if (zv->zv_open_count == 0) { 784 boolean_t drop_namespace = B_FALSE; 785 786 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); 787 788 /* 789 * In all other call paths the spa_namespace_lock is taken 790 * before the bdev->bd_mutex lock. However, on open(2) 791 * the __blkdev_get() function calls fops->open() with the 792 * bdev->bd_mutex lock held. This can result in a deadlock 793 * when zvols from one pool are used as vdevs in another. 794 * 795 * To prevent a lock inversion deadlock we preemptively 796 * take the spa_namespace_lock. Normally the lock will not 797 * be contended and this is safe because spa_open_common() 798 * handles the case where the caller already holds the 799 * spa_namespace_lock. 800 * 801 * When the lock cannot be aquired after multiple retries 802 * this must be the vdev on zvol deadlock case and we have 803 * no choice but to return an error. For 5.12 and older 804 * kernels returning -ERESTARTSYS will result in the 805 * bdev->bd_mutex being dropped, then reacquired, and 806 * fops->open() being called again. This process can be 807 * repeated safely until both locks are acquired. 
For 5.13 808 * and newer the -ERESTARTSYS retry logic was removed from 809 * the kernel so the only option is to return the error for 810 * the caller to handle it. 811 */ 812 if (!mutex_owned(&spa_namespace_lock)) { 813 if (!mutex_tryenter(&spa_namespace_lock)) { 814 mutex_exit(&zv->zv_state_lock); 815 rw_exit(&zv->zv_suspend_lock); 816 drop_suspend = B_FALSE; 817 818 #ifdef HAVE_BLKDEV_GET_ERESTARTSYS 819 schedule(); 820 return (-SET_ERROR(ERESTARTSYS)); 821 #else 822 if ((gethrtime() - start) > timeout) 823 return (-SET_ERROR(ERESTARTSYS)); 824 825 schedule_timeout_interruptible( 826 MSEC_TO_TICK(10)); 827 goto retry; 828 #endif 829 } else { 830 drop_namespace = B_TRUE; 831 } 832 } 833 834 error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); 835 836 if (drop_namespace) 837 mutex_exit(&spa_namespace_lock); 838 } 839 840 if (error == 0) { 841 if ((blk_mode_is_open_write(flag)) && 842 (zv->zv_flags & ZVOL_RDONLY)) { 843 if (zv->zv_open_count == 0) 844 zvol_last_close(zv); 845 846 error = -SET_ERROR(EROFS); 847 } else { 848 zv->zv_open_count++; 849 } 850 } 851 852 mutex_exit(&zv->zv_state_lock); 853 if (drop_suspend) 854 rw_exit(&zv->zv_suspend_lock); 855 856 if (error == 0) 857 #ifdef HAVE_BLK_MODE_T 858 disk_check_media_change(disk); 859 #else 860 zfs_check_media_change(bdev); 861 #endif 862 863 return (error); 864 } 865 866 static void 867 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG 868 zvol_release(struct gendisk *disk) 869 #else 870 zvol_release(struct gendisk *disk, fmode_t unused) 871 #endif 872 { 873 #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) 874 (void) unused; 875 #endif 876 boolean_t drop_suspend = B_TRUE; 877 878 zvol_state_t *zv = atomic_load_ptr(&disk->private_data); 879 if (zv == NULL) 880 return; 881 882 mutex_enter(&zv->zv_state_lock); 883 ASSERT3U(zv->zv_open_count, >, 0); 884 /* 885 * make sure zvol is not suspended during last close 886 * (hold zv_suspend_lock) and respect proper lock acquisition 887 * ordering - 
zv_suspend_lock before zv_state_lock 888 */ 889 if (zv->zv_open_count == 1) { 890 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { 891 mutex_exit(&zv->zv_state_lock); 892 rw_enter(&zv->zv_suspend_lock, RW_READER); 893 mutex_enter(&zv->zv_state_lock); 894 895 /* 896 * Unlike in zvol_open(), we don't check if removal 897 * started here, because we might be one of the openers 898 * that needs to be thrown out! If we're the last, we 899 * need to call zvol_last_close() below to finish 900 * cleanup. So, no special treatment for us. 901 */ 902 903 /* check to see if zv_suspend_lock is needed */ 904 if (zv->zv_open_count != 1) { 905 rw_exit(&zv->zv_suspend_lock); 906 drop_suspend = B_FALSE; 907 } 908 } 909 } else { 910 drop_suspend = B_FALSE; 911 } 912 913 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 914 915 zv->zv_open_count--; 916 if (zv->zv_open_count == 0) { 917 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); 918 zvol_last_close(zv); 919 } 920 921 mutex_exit(&zv->zv_state_lock); 922 923 if (drop_suspend) 924 rw_exit(&zv->zv_suspend_lock); 925 } 926 927 static int 928 zvol_ioctl(struct block_device *bdev, fmode_t mode, 929 unsigned int cmd, unsigned long arg) 930 { 931 int error = 0; 932 933 zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data); 934 ASSERT3P(zv, !=, NULL); 935 ASSERT3U(zv->zv_open_count, >, 0); 936 937 switch (cmd) { 938 case BLKFLSBUF: 939 #ifdef HAVE_FSYNC_BDEV 940 fsync_bdev(bdev); 941 #elif defined(HAVE_SYNC_BLOCKDEV) 942 sync_blockdev(bdev); 943 #else 944 #error "Neither fsync_bdev() nor sync_blockdev() found" 945 #endif 946 invalidate_bdev(bdev); 947 rw_enter(&zv->zv_suspend_lock, RW_READER); 948 949 if (!(zv->zv_flags & ZVOL_RDONLY)) 950 txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); 951 952 rw_exit(&zv->zv_suspend_lock); 953 break; 954 955 case BLKZNAME: 956 mutex_enter(&zv->zv_state_lock); 957 error = -copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); 958 mutex_exit(&zv->zv_state_lock); 959 if (error) 960 error = 
SET_ERROR(error); 961 break; 962 963 default: 964 error = SET_ERROR(ENOTTY); 965 break; 966 } 967 968 return (-error); 969 } 970 971 #ifdef CONFIG_COMPAT 972 static int 973 zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, 974 unsigned cmd, unsigned long arg) 975 { 976 return (zvol_ioctl(bdev, mode, cmd, arg)); 977 } 978 #else 979 #define zvol_compat_ioctl NULL 980 #endif 981 982 static unsigned int 983 zvol_check_events(struct gendisk *disk, unsigned int clearing) 984 { 985 unsigned int mask = 0; 986 987 zvol_state_t *zv = atomic_load_ptr(&disk->private_data); 988 989 if (zv != NULL) { 990 mutex_enter(&zv->zv_state_lock); 991 mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; 992 zv->zv_changed = 0; 993 mutex_exit(&zv->zv_state_lock); 994 } 995 996 return (mask); 997 } 998 999 static int 1000 zvol_revalidate_disk(struct gendisk *disk) 1001 { 1002 zvol_state_t *zv = atomic_load_ptr(&disk->private_data); 1003 1004 if (zv != NULL) { 1005 mutex_enter(&zv->zv_state_lock); 1006 set_capacity(zv->zv_zso->zvo_disk, 1007 zv->zv_volsize >> SECTOR_BITS); 1008 mutex_exit(&zv->zv_state_lock); 1009 } 1010 1011 return (0); 1012 } 1013 1014 int 1015 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) 1016 { 1017 struct gendisk *disk = zv->zv_zso->zvo_disk; 1018 1019 #if defined(HAVE_REVALIDATE_DISK_SIZE) 1020 revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); 1021 #elif defined(HAVE_REVALIDATE_DISK) 1022 revalidate_disk(disk); 1023 #else 1024 zvol_revalidate_disk(disk); 1025 #endif 1026 return (0); 1027 } 1028 1029 /* 1030 * Provide a simple virtual geometry for legacy compatibility. For devices 1031 * smaller than 1 MiB a small head and sector count is used to allow very 1032 * tiny devices. For devices over 1 Mib a standard head and sector count 1033 * is used to keep the cylinders count reasonable. 
1034 */ 1035 static int 1036 zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) 1037 { 1038 sector_t sectors; 1039 1040 zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data); 1041 ASSERT3P(zv, !=, NULL); 1042 ASSERT3U(zv->zv_open_count, >, 0); 1043 1044 sectors = get_capacity(zv->zv_zso->zvo_disk); 1045 1046 if (sectors > 2048) { 1047 geo->heads = 16; 1048 geo->sectors = 63; 1049 } else { 1050 geo->heads = 2; 1051 geo->sectors = 4; 1052 } 1053 1054 geo->start = 0; 1055 geo->cylinders = sectors / (geo->heads * geo->sectors); 1056 1057 return (0); 1058 } 1059 1060 /* 1061 * Why have two separate block_device_operations structs? 1062 * 1063 * Normally we'd just have one, and assign 'submit_bio' as needed. However, 1064 * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we 1065 * can't just change submit_bio dynamically at runtime. So just create two 1066 * separate structs to get around this. 1067 */ 1068 static const struct block_device_operations zvol_ops_blk_mq = { 1069 .open = zvol_open, 1070 .release = zvol_release, 1071 .ioctl = zvol_ioctl, 1072 .compat_ioctl = zvol_compat_ioctl, 1073 .check_events = zvol_check_events, 1074 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK 1075 .revalidate_disk = zvol_revalidate_disk, 1076 #endif 1077 .getgeo = zvol_getgeo, 1078 .owner = THIS_MODULE, 1079 }; 1080 1081 static const struct block_device_operations zvol_ops = { 1082 .open = zvol_open, 1083 .release = zvol_release, 1084 .ioctl = zvol_ioctl, 1085 .compat_ioctl = zvol_compat_ioctl, 1086 .check_events = zvol_check_events, 1087 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK 1088 .revalidate_disk = zvol_revalidate_disk, 1089 #endif 1090 .getgeo = zvol_getgeo, 1091 .owner = THIS_MODULE, 1092 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 1093 .submit_bio = zvol_submit_bio, 1094 #endif 1095 }; 1096 1097 /* 1098 * Since 6.9, Linux has been removing queue limit setters in favour of an 1099 * initial queue_limits struct 
applied when the device is open. Since 6.11, 1100 * queue_limits is being extended to allow more things to be applied when the 1101 * device is open. Setters are also being removed for this. 1102 * 1103 * For OpenZFS, this means that depending on kernel version, some options may 1104 * be set up before the device is open, and some applied to an open device 1105 * (queue) after the fact. 1106 * 1107 * We manage this complexity by having our own limits struct, 1108 * zvol_queue_limits_t, in which we carry any queue config that we're 1109 * interested in setting. This structure is the same on all kernels. 1110 * 1111 * These limits are then applied to the queue at device open time by the most 1112 * appropriate method for the kernel. 1113 * 1114 * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of 1115 * blk_alloc_disk() exists). This converts our limits struct to a proper Linux 1116 * struct queue_limits, and passes it in. Any fields added in later kernels are 1117 * (obviously) not set up here. 1118 * 1119 * zvol_queue_limits_apply() is called on all kernel versions after the queue 1120 * is created, and applies any remaining config. Before 6.9 that will be 1121 * everything, via setter methods. After 6.9 that will be whatever couldn't be 1122 * put into struct queue_limits. (This implies that zvol_queue_limits_apply() 1123 * will always be a no-op on the latest kernel we support). 
1124 */ 1125 typedef struct zvol_queue_limits { 1126 unsigned int zql_max_hw_sectors; 1127 unsigned short zql_max_segments; 1128 unsigned int zql_max_segment_size; 1129 unsigned int zql_io_opt; 1130 unsigned int zql_physical_block_size; 1131 unsigned int zql_max_discard_sectors; 1132 unsigned int zql_discard_granularity; 1133 } zvol_queue_limits_t; 1134 1135 static void 1136 zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv, 1137 boolean_t use_blk_mq) 1138 { 1139 limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9; 1140 1141 if (use_blk_mq) { 1142 /* 1143 * IO requests can be really big (1MB). When an IO request 1144 * comes in, it is passed off to zvol_read() or zvol_write() 1145 * in a new thread, where it is chunked up into 'volblocksize' 1146 * sized pieces and processed. So for example, if the request 1147 * is a 1MB write and your volblocksize is 128k, one zvol_write 1148 * thread will take that request and sequentially do ten 128k 1149 * IOs. This is due to the fact that the thread needs to lock 1150 * each volblocksize sized block. So you might be wondering: 1151 * "instead of passing the whole 1MB request to one thread, 1152 * why not pass ten individual 128k chunks to ten threads and 1153 * process the whole write in parallel?" The short answer is 1154 * that there's a sweet spot number of chunks that balances 1155 * the greater parallelism with the added overhead of more 1156 * threads. The sweet spot can be different depending on if you 1157 * have a read or write heavy workload. Writes typically want 1158 * high chunk counts while reads typically want lower ones. On 1159 * a test pool with 6 NVMe drives in a 3x 2-disk mirror 1160 * configuration, with volblocksize=8k, the sweet spot for good 1161 * sequential reads and writes was at 8 chunks. 1162 */ 1163 1164 /* 1165 * Below we tell the kernel how big we want our requests 1166 * to be. 
You would think that blk_queue_io_opt() would be 1167 * used to do this since it is used to "set optimal request 1168 * size for the queue", but that doesn't seem to do 1169 * anything - the kernel still gives you huge requests 1170 * with tons of little PAGE_SIZE segments contained within it. 1171 * 1172 * Knowing that the kernel will just give you PAGE_SIZE segments 1173 * no matter what, you can say "ok, I want PAGE_SIZE byte 1174 * segments, and I want 'N' of them per request", where N is 1175 * the correct number of segments for the volblocksize and 1176 * number of chunks you want. 1177 */ 1178 if (zvol_blk_mq_blocks_per_thread != 0) { 1179 unsigned int chunks; 1180 chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); 1181 1182 limits->zql_max_segment_size = PAGE_SIZE; 1183 limits->zql_max_segments = 1184 (zv->zv_volblocksize * chunks) / PAGE_SIZE; 1185 } else { 1186 /* 1187 * Special case: zvol_blk_mq_blocks_per_thread = 0 1188 * Max everything out. 1189 */ 1190 limits->zql_max_segments = UINT16_MAX; 1191 limits->zql_max_segment_size = UINT_MAX; 1192 } 1193 } else { 1194 limits->zql_max_segments = UINT16_MAX; 1195 limits->zql_max_segment_size = UINT_MAX; 1196 } 1197 1198 limits->zql_io_opt = DMU_MAX_ACCESS / 2; 1199 1200 limits->zql_physical_block_size = zv->zv_volblocksize; 1201 limits->zql_max_discard_sectors = 1202 (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9; 1203 limits->zql_discard_granularity = zv->zv_volblocksize; 1204 } 1205 1206 #ifdef HAVE_BLK_ALLOC_DISK_2ARG 1207 static void 1208 zvol_queue_limits_convert(zvol_queue_limits_t *limits, 1209 struct queue_limits *qlimits) 1210 { 1211 memset(qlimits, 0, sizeof (struct queue_limits)); 1212 qlimits->max_hw_sectors = limits->zql_max_hw_sectors; 1213 qlimits->max_segments = limits->zql_max_segments; 1214 qlimits->max_segment_size = limits->zql_max_segment_size; 1215 qlimits->io_opt = limits->zql_io_opt; 1216 qlimits->physical_block_size = limits->zql_physical_block_size; 1217 
qlimits->max_discard_sectors = limits->zql_max_discard_sectors; 1218 qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors; 1219 qlimits->discard_granularity = limits->zql_discard_granularity; 1220 #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES 1221 qlimits->features = 1222 BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT; 1223 #endif 1224 } 1225 #endif 1226 1227 static void 1228 zvol_queue_limits_apply(zvol_queue_limits_t *limits, 1229 struct request_queue *queue) 1230 { 1231 #ifndef HAVE_BLK_ALLOC_DISK_2ARG 1232 blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors); 1233 blk_queue_max_segments(queue, limits->zql_max_segments); 1234 blk_queue_max_segment_size(queue, limits->zql_max_segment_size); 1235 blk_queue_io_opt(queue, limits->zql_io_opt); 1236 blk_queue_physical_block_size(queue, limits->zql_physical_block_size); 1237 blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors); 1238 blk_queue_discard_granularity(queue, limits->zql_discard_granularity); 1239 #endif 1240 #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES 1241 blk_queue_set_write_cache(queue, B_TRUE); 1242 blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue); 1243 #endif 1244 } 1245 1246 static int 1247 zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) 1248 { 1249 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) 1250 #if defined(HAVE_BLK_ALLOC_DISK) 1251 zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); 1252 if (zso->zvo_disk == NULL) 1253 return (1); 1254 1255 zso->zvo_disk->minors = ZVOL_MINORS; 1256 zso->zvo_queue = zso->zvo_disk->queue; 1257 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) 1258 struct queue_limits qlimits; 1259 zvol_queue_limits_convert(limits, &qlimits); 1260 struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE); 1261 if (IS_ERR(disk)) { 1262 zso->zvo_disk = NULL; 1263 return (1); 1264 } 1265 1266 zso->zvo_disk = disk; 1267 zso->zvo_disk->minors = ZVOL_MINORS; 1268 zso->zvo_queue = zso->zvo_disk->queue; 1269 1270 #else 1271 
zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); 1272 if (zso->zvo_queue == NULL) 1273 return (1); 1274 1275 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1276 if (zso->zvo_disk == NULL) { 1277 blk_cleanup_queue(zso->zvo_queue); 1278 return (1); 1279 } 1280 1281 zso->zvo_disk->queue = zso->zvo_queue; 1282 #endif /* HAVE_BLK_ALLOC_DISK */ 1283 #else 1284 zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); 1285 if (zso->zvo_queue == NULL) 1286 return (1); 1287 1288 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1289 if (zso->zvo_disk == NULL) { 1290 blk_cleanup_queue(zso->zvo_queue); 1291 return (1); 1292 } 1293 1294 zso->zvo_disk->queue = zso->zvo_queue; 1295 #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ 1296 1297 zvol_queue_limits_apply(limits, zso->zvo_queue); 1298 1299 return (0); 1300 1301 } 1302 1303 static int 1304 zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) 1305 { 1306 struct zvol_state_os *zso = zv->zv_zso; 1307 1308 /* Allocate our blk-mq tag_set */ 1309 if (zvol_blk_mq_alloc_tag_set(zv) != 0) 1310 return (1); 1311 1312 #if defined(HAVE_BLK_ALLOC_DISK) 1313 zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); 1314 if (zso->zvo_disk == NULL) { 1315 blk_mq_free_tag_set(&zso->tag_set); 1316 return (1); 1317 } 1318 zso->zvo_queue = zso->zvo_disk->queue; 1319 zso->zvo_disk->minors = ZVOL_MINORS; 1320 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) 1321 struct queue_limits qlimits; 1322 zvol_queue_limits_convert(limits, &qlimits); 1323 struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv); 1324 if (IS_ERR(disk)) { 1325 zso->zvo_disk = NULL; 1326 blk_mq_free_tag_set(&zso->tag_set); 1327 return (1); 1328 } 1329 1330 zso->zvo_disk = disk; 1331 zso->zvo_queue = zso->zvo_disk->queue; 1332 zso->zvo_disk->minors = ZVOL_MINORS; 1333 #else 1334 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1335 if (zso->zvo_disk == NULL) { 1336 blk_cleanup_queue(zso->zvo_queue); 1337 blk_mq_free_tag_set(&zso->tag_set); 1338 return (1); 1339 } 1340 
/* Allocate queue */ 1341 zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); 1342 if (IS_ERR(zso->zvo_queue)) { 1343 blk_mq_free_tag_set(&zso->tag_set); 1344 return (1); 1345 } 1346 1347 /* Our queue is now created, assign it to our disk */ 1348 zso->zvo_disk->queue = zso->zvo_queue; 1349 #endif 1350 1351 zvol_queue_limits_apply(limits, zso->zvo_queue); 1352 1353 return (0); 1354 } 1355 1356 /* 1357 * Allocate memory for a new zvol_state_t and setup the required 1358 * request queue and generic disk structures for the block device. 1359 */ 1360 static int 1361 zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize, 1362 zvol_state_t **zvp) 1363 { 1364 zvol_state_t *zv; 1365 struct zvol_state_os *zso; 1366 uint64_t volmode; 1367 int ret; 1368 1369 ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL); 1370 if (ret) 1371 return (ret); 1372 1373 if (volmode == ZFS_VOLMODE_DEFAULT) 1374 volmode = zvol_volmode; 1375 1376 if (volmode == ZFS_VOLMODE_NONE) 1377 return (0); 1378 1379 zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); 1380 zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); 1381 zv->zv_zso = zso; 1382 zv->zv_volmode = volmode; 1383 zv->zv_volsize = volsize; 1384 zv->zv_volblocksize = volblocksize; 1385 1386 list_link_init(&zv->zv_next); 1387 mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); 1388 cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); 1389 1390 zv->zv_zso->use_blk_mq = zvol_use_blk_mq; 1391 1392 zvol_queue_limits_t limits; 1393 zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq); 1394 1395 /* 1396 * The block layer has 3 interfaces for getting BIOs: 1397 * 1398 * 1. blk-mq request queues (new) 1399 * 2. submit_bio() (oldest) 1400 * 3. regular request queues (old). 
1401 * 1402 * Each of those interfaces has two permutations: 1403 * 1404 * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates 1405 * both the disk and its queue (5.14 kernel or newer) 1406 * 1407 * b) We don't have blk_*alloc_disk(), and have to allocate the 1408 * disk and the queue separately. (5.13 kernel or older) 1409 */ 1410 if (zv->zv_zso->use_blk_mq) { 1411 ret = zvol_alloc_blk_mq(zv, &limits); 1412 if (ret != 0) 1413 goto out_kmem; 1414 zso->zvo_disk->fops = &zvol_ops_blk_mq; 1415 } else { 1416 ret = zvol_alloc_non_blk_mq(zso, &limits); 1417 if (ret != 0) 1418 goto out_kmem; 1419 zso->zvo_disk->fops = &zvol_ops; 1420 } 1421 1422 /* Limit read-ahead to a single page to prevent over-prefetching. */ 1423 blk_queue_set_read_ahead(zso->zvo_queue, 1); 1424 1425 if (!zv->zv_zso->use_blk_mq) { 1426 /* Disable write merging in favor of the ZIO pipeline. */ 1427 blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); 1428 } 1429 1430 zso->zvo_queue->queuedata = zv; 1431 zso->zvo_dev = dev; 1432 zv->zv_open_count = 0; 1433 strlcpy(zv->zv_name, name, sizeof (zv->zv_name)); 1434 1435 zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); 1436 rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); 1437 1438 zso->zvo_disk->major = zvol_major; 1439 zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; 1440 1441 /* 1442 * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. 1443 * This is accomplished by limiting the number of minors for the 1444 * device to one and explicitly disabling partition scanning. 
1445 */ 1446 if (volmode == ZFS_VOLMODE_DEV) { 1447 zso->zvo_disk->minors = 1; 1448 zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT; 1449 zso->zvo_disk->flags |= GENHD_FL_NO_PART; 1450 } 1451 1452 zso->zvo_disk->first_minor = (dev & MINORMASK); 1453 zso->zvo_disk->private_data = zv; 1454 snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", 1455 ZVOL_DEV_NAME, (dev & MINORMASK)); 1456 1457 *zvp = zv; 1458 return (ret); 1459 1460 out_kmem: 1461 kmem_free(zso, sizeof (struct zvol_state_os)); 1462 kmem_free(zv, sizeof (zvol_state_t)); 1463 return (ret); 1464 } 1465 1466 void 1467 zvol_os_remove_minor(zvol_state_t *zv) 1468 { 1469 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1470 ASSERT0(zv->zv_open_count); 1471 ASSERT0(atomic_read(&zv->zv_suspend_ref)); 1472 ASSERT(zv->zv_flags & ZVOL_REMOVING); 1473 1474 struct zvol_state_os *zso = zv->zv_zso; 1475 zv->zv_zso = NULL; 1476 1477 /* Clearing private_data will make new callers return immediately. */ 1478 atomic_store_ptr(&zso->zvo_disk->private_data, NULL); 1479 1480 /* 1481 * Drop the state lock before calling del_gendisk(). There may be 1482 * callers waiting to acquire it, but del_gendisk() will block until 1483 * they exit, which would deadlock. 
1484 */ 1485 mutex_exit(&zv->zv_state_lock); 1486 1487 del_gendisk(zso->zvo_disk); 1488 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ 1489 (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) 1490 #if defined(HAVE_BLK_CLEANUP_DISK) 1491 blk_cleanup_disk(zso->zvo_disk); 1492 #else 1493 put_disk(zso->zvo_disk); 1494 #endif 1495 #else 1496 blk_cleanup_queue(zso->zvo_queue); 1497 put_disk(zso->zvo_disk); 1498 #endif 1499 1500 if (zso->use_blk_mq) 1501 blk_mq_free_tag_set(&zso->tag_set); 1502 1503 ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS); 1504 1505 kmem_free(zso, sizeof (struct zvol_state_os)); 1506 1507 mutex_enter(&zv->zv_state_lock); 1508 } 1509 1510 void 1511 zvol_os_free(zvol_state_t *zv) 1512 { 1513 1514 ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); 1515 ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); 1516 ASSERT0(zv->zv_open_count); 1517 ASSERT0P(zv->zv_zso); 1518 1519 ASSERT0P(zv->zv_objset); 1520 ASSERT0P(zv->zv_zilog); 1521 ASSERT0P(zv->zv_dn); 1522 1523 rw_destroy(&zv->zv_suspend_lock); 1524 zfs_rangelock_fini(&zv->zv_rangelock); 1525 1526 cv_destroy(&zv->zv_removing_cv); 1527 mutex_destroy(&zv->zv_state_lock); 1528 dataset_kstats_destroy(&zv->zv_kstat); 1529 1530 kmem_free(zv, sizeof (zvol_state_t)); 1531 } 1532 1533 void 1534 zvol_wait_close(zvol_state_t *zv) 1535 { 1536 } 1537 1538 struct add_disk_work { 1539 struct delayed_work work; 1540 struct gendisk *disk; 1541 int error; 1542 }; 1543 1544 static int 1545 __zvol_os_add_disk(struct gendisk *disk) 1546 { 1547 int error = 0; 1548 #ifdef HAVE_ADD_DISK_RET 1549 error = -add_disk(disk); 1550 if (error) 1551 error = SET_ERROR(error); 1552 #else 1553 add_disk(disk); 1554 #endif 1555 return (error); 1556 } 1557 1558 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) 1559 static void 1560 zvol_os_add_disk_work(struct work_struct *work) 1561 { 1562 struct add_disk_work *add_disk_work; 1563 add_disk_work = container_of(work, struct add_disk_work, work.work); 1564 
add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk); 1565 } 1566 #endif 1567 1568 /* 1569 * SPECIAL CASE: 1570 * 1571 * This function basically calls add_disk() from a workqueue. You may be 1572 * thinking: why not just call add_disk() directly? 1573 * 1574 * When you call add_disk(), the zvol appears to the world. When this happens, 1575 * the kernel calls disk_scan_partitions() on the zvol, which behaves 1576 * differently on the 6.9+ kernels: 1577 * 1578 * - 6.8 and older kernels - 1579 * disk_scan_partitions() 1580 * handle = bdev_open_by_dev( 1581 * zvol_open() 1582 * bdev_release(handle); 1583 * zvol_release() 1584 * 1585 * 1586 * - 6.9+ kernels - 1587 * disk_scan_partitions() 1588 * file = bdev_file_open_by_dev() 1589 * zvol_open() 1590 * fput(file) 1591 * < wait for return to userspace > 1592 * zvol_release() 1593 * 1594 * The difference is that the bdev_release() from the 6.8 kernel is synchronous 1595 * while the fput() from the 6.9 kernel is async. Or more specifically it's 1596 * async that has to wait until we return to userspace (since it adds the fput 1597 * into the caller's work queue with the TWA_RESUME flag set). This is not the 1598 * behavior we want, since we want do things like create+destroy a zvol within 1599 * a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release the 1600 * reference to the zvol while we're in the IOCTL, which can't wait until we 1601 * return to userspace. 1602 * 1603 * We can get around this since fput() has a special codepath for when it's 1604 * running in a kernel thread or interrupt. In those cases, it just puts the 1605 * fput into the system workqueue, which we can force to run with 1606 * __flush_workqueue(). That is why we call add_disk() from a workqueue - so it 1607 * run from a kernel thread and "tricks" the fput() codepaths. 1608 * 1609 * Note that __flush_workqueue() is slowly getting deprecated. 
This may be ok 1610 * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via 1611 * fput) to happen, which it eventually, naturally, will from the system_wq 1612 * without us explicitly calling __flush_workqueue(). 1613 */ 1614 static int 1615 zvol_os_add_disk(struct gendisk *disk) 1616 { 1617 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) /* 6.9+ kernel */ 1618 struct add_disk_work add_disk_work; 1619 1620 INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work); 1621 add_disk_work.disk = disk; 1622 add_disk_work.error = 0; 1623 1624 /* Use *_delayed_work functions since they're not GPL'd */ 1625 schedule_delayed_work(&add_disk_work.work, 0); 1626 flush_delayed_work(&add_disk_work.work); 1627 1628 __flush_workqueue(system_wq); 1629 return (add_disk_work.error); 1630 #else /* <= 6.8 kernel */ 1631 return (__zvol_os_add_disk(disk)); 1632 #endif 1633 } 1634 1635 /* 1636 * Create a block device minor node and setup the linkage between it 1637 * and the specified volume. Once this function returns the block 1638 * device is live and ready for use. 
1639 */ 1640 int 1641 zvol_os_create_minor(const char *name) 1642 { 1643 zvol_state_t *zv = NULL; 1644 objset_t *os; 1645 dmu_object_info_t *doi; 1646 uint64_t volsize; 1647 uint64_t len; 1648 unsigned minor = 0; 1649 int error = 0; 1650 int idx; 1651 uint64_t hash = zvol_name_hash(name); 1652 uint64_t volthreading; 1653 bool replayed_zil = B_FALSE; 1654 1655 if (zvol_inhibit_dev) 1656 return (0); 1657 1658 idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); 1659 if (idx < 0) 1660 return (SET_ERROR(-idx)); 1661 minor = idx << ZVOL_MINOR_BITS; 1662 if (MINOR(minor) != minor) { 1663 /* too many partitions can cause an overflow */ 1664 zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", 1665 name, minor, MINOR(minor)); 1666 ida_simple_remove(&zvol_ida, idx); 1667 return (SET_ERROR(EINVAL)); 1668 } 1669 1670 zv = zvol_find_by_name_hash(name, hash, RW_NONE); 1671 if (zv) { 1672 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1673 mutex_exit(&zv->zv_state_lock); 1674 ida_simple_remove(&zvol_ida, idx); 1675 return (SET_ERROR(EEXIST)); 1676 } 1677 1678 doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); 1679 1680 error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); 1681 if (error) 1682 goto out_doi; 1683 1684 error = dmu_object_info(os, ZVOL_OBJ, doi); 1685 if (error) 1686 goto out_dmu_objset_disown; 1687 1688 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); 1689 if (error) 1690 goto out_dmu_objset_disown; 1691 1692 error = zvol_alloc(MKDEV(zvol_major, minor), name, 1693 volsize, doi->doi_data_block_size, &zv); 1694 if (error || zv == NULL) 1695 goto out_dmu_objset_disown; 1696 1697 zv->zv_hash = hash; 1698 1699 if (dmu_objset_is_snapshot(os)) 1700 zv->zv_flags |= ZVOL_RDONLY; 1701 1702 zv->zv_objset = os; 1703 1704 /* Default */ 1705 zv->zv_threading = B_TRUE; 1706 if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL) 1707 == 0) 1708 zv->zv_threading = volthreading; 1709 1710 
set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); 1711 1712 #ifdef QUEUE_FLAG_DISCARD 1713 blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); 1714 #endif 1715 #ifdef QUEUE_FLAG_NONROT 1716 blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); 1717 #endif 1718 #ifdef QUEUE_FLAG_ADD_RANDOM 1719 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); 1720 #endif 1721 /* This flag was introduced in kernel version 4.12. */ 1722 #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH 1723 blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); 1724 #endif 1725 1726 ASSERT0P(zv->zv_kstat.dk_kstats); 1727 error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); 1728 if (error) 1729 goto out_dmu_objset_disown; 1730 ASSERT0P(zv->zv_zilog); 1731 zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); 1732 if (spa_writeable(dmu_objset_spa(os))) { 1733 if (zil_replay_disable) 1734 replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); 1735 else 1736 replayed_zil = zil_replay(os, zv, zvol_replay_vector); 1737 } 1738 if (replayed_zil) 1739 zil_close(zv->zv_zilog); 1740 zv->zv_zilog = NULL; 1741 1742 /* 1743 * When udev detects the addition of the device it will immediately 1744 * invoke blkid(8) to determine the type of content on the device. 1745 * Prefetching the blocks commonly scanned by blkid(8) will speed 1746 * up this process. 1747 */ 1748 len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); 1749 if (len > 0) { 1750 dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); 1751 dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, 1752 ZIO_PRIORITY_SYNC_READ); 1753 } 1754 1755 zv->zv_objset = NULL; 1756 out_dmu_objset_disown: 1757 dmu_objset_disown(os, B_TRUE, FTAG); 1758 out_doi: 1759 kmem_free(doi, sizeof (dmu_object_info_t)); 1760 1761 /* 1762 * Keep in mind that once add_disk() is called, the zvol is 1763 * announced to the world, and zvol_open()/zvol_release() can 1764 * be called at any time. 
Incidentally, add_disk() itself calls 1765 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() 1766 * directly as well. 1767 */ 1768 if (error == 0 && zv) { 1769 rw_enter(&zvol_state_lock, RW_WRITER); 1770 zvol_insert(zv); 1771 rw_exit(&zvol_state_lock); 1772 error = zvol_os_add_disk(zv->zv_zso->zvo_disk); 1773 } else { 1774 ida_simple_remove(&zvol_ida, idx); 1775 } 1776 1777 return (error); 1778 } 1779 1780 int 1781 zvol_os_rename_minor(zvol_state_t *zv, const char *newname) 1782 { 1783 int readonly = get_disk_ro(zv->zv_zso->zvo_disk); 1784 1785 ASSERT(RW_LOCK_HELD(&zvol_state_lock)); 1786 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1787 1788 strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); 1789 1790 /* move to new hashtable entry */ 1791 zv->zv_hash = zvol_name_hash(newname); 1792 hlist_del(&zv->zv_hlink); 1793 hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); 1794 1795 /* 1796 * The block device's read-only state is briefly changed causing 1797 * a KOBJ_CHANGE uevent to be issued. This ensures udev detects 1798 * the name change and fixes the symlinks. This does not change 1799 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never 1800 * changes. This would normally be done using kobject_uevent() but 1801 * that is a GPL-only symbol which is why we need this workaround. 
1802 */ 1803 set_disk_ro(zv->zv_zso->zvo_disk, !readonly); 1804 set_disk_ro(zv->zv_zso->zvo_disk, readonly); 1805 1806 dataset_kstats_rename(&zv->zv_kstat, newname); 1807 1808 return (0); 1809 } 1810 1811 void 1812 zvol_os_set_disk_ro(zvol_state_t *zv, int flags) 1813 { 1814 1815 set_disk_ro(zv->zv_zso->zvo_disk, flags); 1816 } 1817 1818 void 1819 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) 1820 { 1821 1822 set_capacity(zv->zv_zso->zvo_disk, capacity); 1823 } 1824 1825 int 1826 zvol_init(void) 1827 { 1828 int error; 1829 1830 error = zvol_init_impl(); 1831 if (error) { 1832 printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error); 1833 return (error); 1834 } 1835 1836 error = -register_blkdev(zvol_major, ZVOL_DRIVER); 1837 if (error) { 1838 printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); 1839 return (SET_ERROR(error)); 1840 } 1841 1842 if (zvol_blk_mq_queue_depth == 0) { 1843 zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; 1844 } else { 1845 zvol_actual_blk_mq_queue_depth = 1846 MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); 1847 } 1848 1849 if (zvol_blk_mq_threads == 0) { 1850 zvol_blk_mq_actual_threads = num_online_cpus(); 1851 } else { 1852 zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1), 1853 1024); 1854 } 1855 1856 ida_init(&zvol_ida); 1857 return (0); 1858 } 1859 1860 void 1861 zvol_fini(void) 1862 { 1863 unregister_blkdev(zvol_major, ZVOL_DRIVER); 1864 1865 zvol_fini_impl(); 1866 1867 ida_destroy(&zvol_ida); 1868 } 1869 1870 module_param(zvol_major, uint, 0444); 1871 MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); 1872 1873 module_param(zvol_max_discard_blocks, ulong, 0444); 1874 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); 1875 1876 module_param(zvol_blk_mq_queue_depth, uint, 0644); 1877 MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); 1878 1879 module_param(zvol_use_blk_mq, uint, 0644); 1880 MODULE_PARM_DESC(zvol_use_blk_mq, "Use the 
blk-mq API for zvols"); 1881 1882 module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); 1883 MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, 1884 "Process volblocksize blocks per thread"); 1885 1886 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 1887 module_param(zvol_open_timeout_ms, uint, 0644); 1888 MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); 1889 #endif 1890