/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 */

#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
#include <cityhash.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/workqueue.h>

#ifdef HAVE_BLK_MQ
#include <linux/blk-mq.h>
#endif

static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
    struct request *rq, boolean_t force_sync);

static unsigned int zvol_major = ZVOL_MAJOR;
static unsigned int zvol_request_sync = 0;
static unsigned int zvol_prefetch_bytes = (128 * 1024);
static unsigned long zvol_max_discard_blocks = 16384;

/*
 * Switch taskq at multiple of 512 MB offset. This can be set to a lower value
 * to utilize more threads for small files but may affect prefetch hits.
 */
#define	ZVOL_TASKQ_OFFSET_SHIFT 29

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static unsigned int zvol_open_timeout_ms = 1000;
#endif

static unsigned int zvol_threads = 0;
#ifdef HAVE_BLK_MQ
static unsigned int zvol_blk_mq_threads = 0;
static unsigned int zvol_blk_mq_actual_threads;
static boolean_t zvol_use_blk_mq = B_FALSE;

/*
 * The maximum number of volblocksize blocks to process per thread. Typically,
 * write heavy workloads perform better with higher values here, and read
 * heavy workloads perform better with lower values, but that's not a hard
 * and fast rule. It's basically a knob to tune between "less overhead with
 * less parallelism" and "more overhead, but more parallelism".
 *
 * '8' was chosen as a reasonable, balanced, default based off of sequential
 * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
 */
static unsigned int zvol_blk_mq_blocks_per_thread = 8;
#endif

static unsigned int zvol_num_taskqs = 0;

#ifndef BLKDEV_DEFAULT_RQ
/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
#define	BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
#endif

/*
 * Finalize our BIO or request.
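 * With blk-mq support compiled in this completes whichever of the BIO or
 * the request carried the I/O, translating the errno into a blk_status_t
 * for requests; without it only BIOs are possible.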
 */
#ifdef HAVE_BLK_MQ
#define	END_IO(zv, bio, rq, error)  do { \
	if (bio) { \
		BIO_END_IO(bio, error); \
	} else { \
		blk_mq_end_request(rq, errno_to_bi_status(error)); \
	} \
} while (0)
#else
#define	END_IO(zv, bio, rq, error)  BIO_END_IO(bio, error)
#endif

#ifdef HAVE_BLK_MQ
static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
static unsigned int zvol_actual_blk_mq_queue_depth;
#endif

struct zvol_state_os {
	struct gendisk		*zvo_disk;	/* generic disk */
	struct request_queue	*zvo_queue;	/* request queue */
	dev_t			zvo_dev;	/* device id */

#ifdef HAVE_BLK_MQ
	struct blk_mq_tag_set tag_set;
#endif

	/* Set from the global 'zvol_use_blk_mq' at zvol load */
	boolean_t use_blk_mq;
};

typedef struct zv_taskq {
	uint_t tqs_cnt;
	taskq_t **tqs_taskq;
} zv_taskq_t;
static zv_taskq_t zvol_taskqs;
static struct ida zvol_ida;

typedef struct zv_request_stack {
	zvol_state_t	*zv;
	struct bio	*bio;
	struct request	*rq;
} zv_request_t;

typedef struct zv_work {
	struct request	*rq;
	struct work_struct work;
} zv_work_t;

typedef struct zv_request_task {
	zv_request_t zvr;
	taskq_ent_t	ent;
} zv_request_task_t;

static zv_request_task_t *
zv_request_task_create(zv_request_t zvr)
{
	zv_request_task_t *task;
	task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
	taskq_init_ent(&task->ent);
	task->zvr = zvr;
	return (task);
}

static void
zv_request_task_free(zv_request_task_t *task)
{
	kmem_free(task, sizeof (*task));
}

#ifdef HAVE_BLK_MQ

/*
 * This is called when a new block multiqueue request comes in. A request
 * contains one or more BIOs.
 */
static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
    const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	zvol_state_t *zv = rq->q->queuedata;

	/* Tell the kernel that we are starting to process this request */
	blk_mq_start_request(rq);

	if (blk_rq_is_passthrough(rq)) {
		/* Skip non filesystem request */
		blk_mq_end_request(rq, BLK_STS_IOERR);
		return (BLK_STS_IOERR);
	}

	zvol_request_impl(zv, NULL, rq, 0);

	/* Acknowledge to the kernel that we got this request */
	return (BLK_STS_OK);
}

static struct blk_mq_ops zvol_blk_mq_queue_ops = {
	.queue_rq = zvol_mq_queue_rq,
};

/* Initialize our blk-mq struct */
static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
{
	struct zvol_state_os *zso = zv->zv_zso;

	memset(&zso->tag_set, 0, sizeof (zso->tag_set));

	/* Initialize tag set. */
	zso->tag_set.ops = &zvol_blk_mq_queue_ops;
	zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads;
	zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth;
	zso->tag_set.numa_node = NUMA_NO_NODE;
	zso->tag_set.cmd_size = 0;

	/*
	 * We need BLK_MQ_F_BLOCKING here since we do blocking calls in
	 * zvol_request_impl()
	 */
	zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
	zso->tag_set.driver_data = zv;

	return (blk_mq_alloc_tag_set(&zso->tag_set));
}
#endif /* HAVE_BLK_MQ */

/*
 * Given a path, return TRUE if path is a ZVOL.
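 * This is determined by resolving the path to a block device and comparing
 * its major number against the registered zvol_major.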
 */
boolean_t
zvol_os_is_zvol(const char *path)
{
	dev_t dev = 0;

	if (vdev_lookup_bdev(path, &dev) != 0)
		return (B_FALSE);

	if (MAJOR(dev) == zvol_major)
		return (B_TRUE);

	return (B_FALSE);
}

static void
zvol_write(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	int error = 0;
	zfs_uio_t uio;
	zvol_state_t *zv = zvr->zv;
	struct request_queue *q;
	struct gendisk *disk;
	unsigned long start_time = 0;
	boolean_t acct = B_FALSE;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	q = zv->zv_zso->zvo_queue;
	disk = zv->zv_zso->zvo_disk;

	/* BIOs marked as FLUSH need to flush before the write */
	if (io_is_flush(bio, rq))
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	/* Some requests are just for flush and nothing else. */
	if (io_size(bio, rq) == 0) {
		rw_exit(&zv->zv_suspend_lock);
		END_IO(zv, bio, rq, 0);
		return;
	}

	zfs_uio_bvec_init(&uio, bio, rq);

	ssize_t start_resid = uio.uio_resid;

	/*
	 * With use_blk_mq, accounting is done by blk_mq_start_request()
	 * and blk_mq_end_request(), so we can skip it here.
	 */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct) {
			start_time = blk_generic_start_io_acct(q, disk, WRITE,
			    bio);
		}
	}

	boolean_t sync =
	    io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_WRITER);

	uint64_t volsize = zv->zv_volsize;
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio.uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);

		/* This will only fail for ENOSPC */
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0) {
			zvol_log_write(zv, tx, off, bytes, sync);
		}
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);

	int64_t nwritten = start_resid - uio.uio_resid;
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	task_io_account_write(nwritten);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
	}

	END_IO(zv, bio, rq, -error);
}

static void
zvol_write_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_write(&task->zvr);
	zv_request_task_free(task);
}

static void
zvol_discard(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	zvol_state_t *zv = zvr->zv;
	uint64_t start = io_offset(bio, rq);
	uint64_t size = io_size(bio, rq);
	uint64_t end = start + size;
	boolean_t sync;
	int error = 0;
	dmu_tx_t *tx;
	struct request_queue *q = zv->zv_zso->zvo_queue;
	struct gendisk *disk = zv->zv_zso->zvo_disk;
	unsigned long start_time = 0;
	boolean_t acct = B_FALSE;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct) {
			start_time = blk_generic_start_io_acct(q, disk, WRITE,
			    bio);
		}
	}

	sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	if (end > zv->zv_volsize) {
		error = SET_ERROR(EIO);
		goto unlock;
	}

	/*
	 * Align the request to volume block boundaries when a secure erase is
	 * not required. This will prevent dnode_free_range() from zeroing out
	 * the unaligned parts which is slow (read-modify-write) and useless
	 * since we are not freeing any space by doing so.
	 */
	if (!io_is_secure_erase(bio, rq)) {
		start = P2ROUNDUP(start, zv->zv_volblocksize);
		end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
		size = end - start;
	}

	if (start >= end)
		goto unlock;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    start, size, RL_WRITER);

	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		zvol_log_truncate(zv, tx, start, size);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset,
		    ZVOL_OBJ, start, size);
	}
	zfs_rangelock_exit(lr);

	if (error == 0 && sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

unlock:
	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, WRITE, bio,
		    start_time);
	}

	END_IO(zv, bio, rq, -error);
}

static void
zvol_discard_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_discard(&task->zvr);
	zv_request_task_free(task);
}

static void
zvol_read(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	int error = 0;
	zfs_uio_t uio;
	boolean_t acct = B_FALSE;
	zvol_state_t *zv = zvr->zv;
	struct request_queue *q;
	struct gendisk *disk;
	unsigned long start_time = 0;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);

	zfs_uio_bvec_init(&uio, bio, rq);

	q = zv->zv_zso->zvo_queue;
	disk = zv->zv_zso->zvo_disk;

	ssize_t start_resid = uio.uio_resid;

	/*
	 * When blk-mq is being used, accounting is done by
	 * blk_mq_start_request() and blk_mq_end_request().
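	 * Only BIO-based I/O therefore needs the generic accounting calls
	 * below.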
	 */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct)
			start_time = blk_generic_start_io_acct(q, disk, READ,
			    bio);
	}

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_READER);

	uint64_t volsize = zv->zv_volsize;

	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio.uio_loffset)
			bytes = volsize - uio.uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	int64_t nread = start_resid - uio.uio_resid;
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	task_io_account_read(nread);

	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, READ, bio, start_time);
	}

	END_IO(zv, bio, rq, -error);
}

static void
zvol_read_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_read(&task->zvr);
	zv_request_task_free(task);
}

/*
 * Process a BIO or request
 *
 * Either 'bio' or 'rq' should be set depending on if we are processing a
 * bio or a request (both should not be set).
 *
 * force_sync:	Set to 0 to defer processing to a background taskq
 *		Set to 1 to process data synchronously
 */
static void
zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
    boolean_t force_sync)
{
	fstrans_cookie_t cookie = spl_fstrans_mark();
	uint64_t offset = io_offset(bio, rq);
	uint64_t size = io_size(bio, rq);
	int rw = io_data_dir(bio, rq);

	if (zvol_request_sync || zv->zv_threading == B_FALSE)
		force_sync = 1;

	zv_request_t zvr = {
		.zv = zv,
		.bio = bio,
		.rq = rq,
	};

	if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
		printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
		    zv->zv_zso->zvo_disk->disk_name,
		    (long long unsigned)offset,
		    (long unsigned)size);

		END_IO(zv, bio, rq, -SET_ERROR(EIO));
		goto out;
	}

	zv_request_task_t *task;
	zv_taskq_t *ztqs = &zvol_taskqs;
	uint_t blk_mq_hw_queue = 0;
	uint_t tq_idx;
	uint_t taskq_hash;
#ifdef HAVE_BLK_MQ
	if (rq)
#ifdef HAVE_BLK_MQ_RQ_HCTX
		blk_mq_hw_queue = rq->mq_hctx->queue_num;
#else
		blk_mq_hw_queue =
		    rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
#endif
#endif
	taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
	    blk_mq_hw_queue, 0);
	tq_idx = taskq_hash % ztqs->tqs_cnt;

	if (rw == WRITE) {
		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
			END_IO(zv, bio, rq, -SET_ERROR(EROFS));
			goto out;
		}

		/*
		 * Prevents the zvol from being suspended, or the ZIL being
		 * concurrently opened. Will be released after the i/o
		 * completes.
		 */
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/*
		 * Open a ZIL if this is the first time we have written to this
		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
		 * than zv_state_lock so that we don't need to acquire an
		 * additional lock in this path.
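		 * The NULL check is repeated after upgrading to a writer
		 * because another thread may have opened the ZIL while the
		 * lock was dropped.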
		 */
		if (zv->zv_zilog == NULL) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
			if (zv->zv_zilog == NULL) {
				zv->zv_zilog = zil_open(zv->zv_objset,
				    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
				zv->zv_flags |= ZVOL_WRITTEN_TO;
				/* replay / destroy done in zvol_create_minor */
				VERIFY0((zv->zv_zilog->zl_header->zh_flags &
				    ZIL_REPLAY_NEEDED));
			}
			rw_downgrade(&zv->zv_suspend_lock);
		}

		/*
		 * We don't want this thread to be blocked waiting for i/o to
		 * complete, so we instead wait from a taskq callback. The
		 * i/o may be a ZIL write (via zil_commit()), or a read of an
		 * indirect block, or a read of a data block (if this is a
		 * partial-block write). We will indicate that the i/o is
		 * complete by calling END_IO() from the taskq callback.
		 *
		 * This design allows the calling thread to continue and
		 * initiate more concurrent operations by calling
		 * zvol_request() again. There are typically only a small
		 * number of threads available to call zvol_request() (e.g.
		 * one per iSCSI target), so keeping the latency of
		 * zvol_request() low is important for performance.
		 *
		 * The zvol_request_sync module parameter allows this
		 * behavior to be altered, for performance evaluation
		 * purposes. If the callback blocks, setting
		 * zvol_request_sync=1 will result in much worse performance.
		 *
		 * We can have up to zvol_threads concurrent i/o's being
		 * processed for all zvols on the system. This is typically
		 * a vast improvement over the zvol_request_sync=1 behavior
		 * of one i/o at a time per zvol. However, an even better
		 * design would be for zvol_request() to initiate the zio
		 * directly, and then be notified by the zio_done callback,
		 * which would call END_IO(). Unfortunately, the DMU/ZIL
		 * interfaces lack this functionality (they block waiting for
		 * the i/o to complete).
		 */
		if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
			if (force_sync) {
				zvol_discard(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
				    zvol_discard_task, task, 0, &task->ent);
			}
		} else {
			if (force_sync) {
				zvol_write(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
				    zvol_write_task, task, 0, &task->ent);
			}
		}
	} else {
		/*
		 * The SCST driver, and possibly others, may issue READ I/Os
		 * with a length of zero bytes. These empty I/Os contain no
		 * data and require no additional handling.
		 */
		if (size == 0) {
			END_IO(zv, bio, rq, 0);
			goto out;
		}

		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/*
		 * See comment in WRITE case above.
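		 * Reads never need to open the ZIL, so only zv_suspend_lock
		 * is taken (as reader) before dispatching.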
		 */
		if (force_sync) {
			zvol_read(&zvr);
		} else {
			task = zv_request_task_create(zvr);
			taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
			    zvol_read_task, task, 0, &task->ent);
		}
	}

out:
	spl_fstrans_unmark(cookie);
}

#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
static void
zvol_submit_bio(struct bio *bio)
#else
static blk_qc_t
zvol_submit_bio(struct bio *bio)
#endif
#else
static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio)
#endif
{
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
#endif
	zvol_state_t *zv = q->queuedata;

	zvol_request_impl(zv, bio, NULL, 0);
#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
	defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
	!defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)
	return (BLK_QC_T_NONE);
#endif
}

static int
#ifdef HAVE_BLK_MODE_T
zvol_open(struct gendisk *disk, blk_mode_t flag)
#else
zvol_open(struct block_device *bdev, fmode_t flag)
#endif
{
	zvol_state_t *zv;
	int error = 0;
	boolean_t drop_suspend = B_FALSE;
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
	hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
	hrtime_t start = gethrtime();

retry:
#endif
	rw_enter(&zvol_state_lock, RW_READER);
	/*
	 * Obtain a copy of private_data under the zvol_state_lock to make
	 * sure that either the result of zvol free code path setting
	 * disk->private_data to NULL is observed, or zvol_os_free()
	 * is not called on this zv because of the positive zv_open_count.
	 */
#ifdef HAVE_BLK_MODE_T
	zv = disk->private_data;
#else
	zv = bdev->bd_disk->private_data;
#endif
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(-ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
			} else {
				drop_suspend = B_TRUE;
			}
		} else {
			drop_suspend = B_TRUE;
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * In all other call paths the spa_namespace_lock is taken
		 * before the bdev->bd_mutex lock. However, on open(2)
		 * the __blkdev_get() function calls fops->open() with the
		 * bdev->bd_mutex lock held. This can result in a deadlock
		 * when zvols from one pool are used as vdevs in another.
		 *
		 * To prevent a lock inversion deadlock we preemptively
		 * take the spa_namespace_lock. Normally the lock will not
		 * be contended and this is safe because spa_open_common()
		 * handles the case where the caller already holds the
		 * spa_namespace_lock.
		 *
		 * When the lock cannot be acquired after multiple retries
		 * this must be the vdev on zvol deadlock case and we have
		 * no choice but to return an error. For 5.12 and older
		 * kernels returning -ERESTARTSYS will result in the
		 * bdev->bd_mutex being dropped, then reacquired, and
		 * fops->open() being called again. This process can be
		 * repeated safely until both locks are acquired. For 5.13
		 * and newer the -ERESTARTSYS retry logic was removed from
		 * the kernel so the only option is to return the error for
		 * the caller to handle it.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;

#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
				schedule();
				return (SET_ERROR(-ERESTARTSYS));
#else
				if ((gethrtime() - start) > timeout)
					return (SET_ERROR(-ERESTARTSYS));

				schedule_timeout_interruptible(
				    MSEC_TO_TICK(10));
				goto retry;
#endif
			} else {
				drop_namespace = B_TRUE;
			}
		}

		error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));

		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
	}

	if (error == 0) {
		if ((blk_mode_is_open_write(flag)) &&
		    (zv->zv_flags & ZVOL_RDONLY)) {
			if (zv->zv_open_count == 0)
				zvol_last_close(zv);

			error = SET_ERROR(-EROFS);
		} else {
			zv->zv_open_count++;
		}
	}

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);

	if (error == 0)
#ifdef HAVE_BLK_MODE_T
		disk_check_media_change(disk);
#else
		zfs_check_media_change(bdev);
#endif

	return (error);
}

static void
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
zvol_release(struct gendisk *disk)
#else
zvol_release(struct gendisk *disk, fmode_t unused)
#endif
{
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
	(void) unused;
#endif
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	zv = disk->private_data;

	mutex_enter(&zv->zv_state_lock);
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	zv->zv_open_count--;
	if (zv->zv_open_count == 0) {
		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
}

static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned int cmd, unsigned long arg)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	int error = 0;

	ASSERT3U(zv->zv_open_count, >, 0);

	switch (cmd) {
	case BLKFLSBUF:
#ifdef HAVE_FSYNC_BDEV
		fsync_bdev(bdev);
#elif defined(HAVE_SYNC_BLOCKDEV)
		sync_blockdev(bdev);
#else
#error "Neither fsync_bdev() nor sync_blockdev() found"
#endif
		invalidate_bdev(bdev);
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		if (!(zv->zv_flags & ZVOL_RDONLY))
			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);

		rw_exit(&zv->zv_suspend_lock);
		break;

	case BLKZNAME:
		mutex_enter(&zv->zv_state_lock);
		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
		mutex_exit(&zv->zv_state_lock);
		break;

	default:
		error = -ENOTTY;
		break;
	}

	return (SET_ERROR(error));
}

#ifdef CONFIG_COMPAT
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	return (zvol_ioctl(bdev, mode, cmd, arg));
}
#else
#define	zvol_compat_ioctl	NULL
#endif

static unsigned int
zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
	unsigned int mask = 0;

	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
		zv->zv_changed = 0;
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (mask);
}

static int
zvol_revalidate_disk(struct gendisk *disk)
{
	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		set_capacity(zv->zv_zso->zvo_disk,
		    zv->zv_volsize >> SECTOR_BITS);
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (0);
}

int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	struct gendisk *disk = zv->zv_zso->zvo_disk;

#if defined(HAVE_REVALIDATE_DISK_SIZE)
	revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
#elif defined(HAVE_REVALIDATE_DISK)
	revalidate_disk(disk);
#else
	zvol_revalidate_disk(disk);
#endif
	return (0);
}

void
zvol_os_clear_private(zvol_state_t *zv)
{
	/*
	 * Cleared while holding zvol_state_lock as a writer
	 * which will prevent zvol_open() from opening it.
	 */
	zv->zv_zso->zvo_disk->private_data = NULL;
}

/*
 * Provide a simple virtual geometry for legacy compatibility. For devices
 * smaller than 1 MiB a small head and sector count is used to allow very
 * tiny devices. For devices over 1 MiB a standard head and sector count
 * is used to keep the cylinders count reasonable.
 */
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	sector_t sectors;

	ASSERT3U(zv->zv_open_count, >, 0);

	sectors = get_capacity(zv->zv_zso->zvo_disk);

	if (sectors > 2048) {
		geo->heads = 16;
		geo->sectors = 63;
	} else {
		geo->heads = 2;
		geo->sectors = 4;
	}

	geo->start = 0;
	geo->cylinders = sectors / (geo->heads * geo->sectors);

	return (0);
}

/*
 * Why have two separate block_device_operations structs?
 *
 * Normally we'd just have one, and assign 'submit_bio' as needed. However,
 * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
 * can't just change submit_bio dynamically at runtime. So just create two
 * separate structs to get around this.
 */
static const struct block_device_operations zvol_ops_blk_mq = {
	.open			= zvol_open,
	.release		= zvol_release,
	.ioctl			= zvol_ioctl,
	.compat_ioctl		= zvol_compat_ioctl,
	.check_events		= zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk	= zvol_revalidate_disk,
#endif
	.getgeo			= zvol_getgeo,
	.owner			= THIS_MODULE,
};

static const struct block_device_operations zvol_ops = {
	.open			= zvol_open,
	.release		= zvol_release,
	.ioctl			= zvol_ioctl,
	.compat_ioctl		= zvol_compat_ioctl,
	.check_events		= zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk	= zvol_revalidate_disk,
#endif
	.getgeo			= zvol_getgeo,
	.owner			= THIS_MODULE,
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	.submit_bio		= zvol_submit_bio,
#endif
};

typedef struct zvol_queue_limits {
	unsigned int	zql_max_hw_sectors;
	unsigned short	zql_max_segments;
	unsigned int	zql_max_segment_size;
	unsigned int	zql_io_opt;
} zvol_queue_limits_t;

static void
zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
    boolean_t use_blk_mq)
{
	limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9;

	if (use_blk_mq) {
		/*
		 * IO requests can be really big (1MB). When an IO request
		 * comes in, it is passed off to zvol_read() or zvol_write()
		 * in a new thread, where it is chunked up into 'volblocksize'
		 * sized pieces and processed. So for example, if the request
		 * is a 1MB write and your volblocksize is 128k, one zvol_write
		 * thread will take that request and sequentially do eight 128k
		 * IOs. This is due to the fact that the thread needs to lock
		 * each volblocksize sized block. So you might be wondering:
		 * "instead of passing the whole 1MB request to one thread,
		 * why not pass eight individual 128k chunks to eight threads
		 * and process the whole write in parallel?" The short answer
		 * is that there's a sweet spot number of chunks that balances
		 * the greater parallelism with the added overhead of more
		 * threads. The sweet spot can be different depending on
		 * whether you have a read or write heavy workload. Writes
		 * typically want high chunk counts while reads typically want
		 * lower ones. On a test pool with 6 NVMe drives in a
		 * 3x 2-disk mirror configuration, with volblocksize=8k, the
		 * sweet spot for good sequential reads and writes was at
		 * 8 chunks.
		 */

		/*
		 * Below we tell the kernel how big we want our requests
		 * to be. You would think that blk_queue_io_opt() would be
		 * used to do this since it is used to "set optimal request
		 * size for the queue", but that doesn't seem to do
		 * anything - the kernel still gives you huge requests
		 * with tons of little PAGE_SIZE segments contained within it.
		 *
		 * Knowing that the kernel will just give you PAGE_SIZE segments
		 * no matter what, you can say "ok, I want PAGE_SIZE byte
		 * segments, and I want 'N' of them per request", where N is
		 * the correct number of segments for the volblocksize and
		 * number of chunks you want.
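		 * For example, with volblocksize=8k and the default of 8
		 * blocks per thread, that is (8192 * 8) / PAGE_SIZE = 16
		 * segments per request on systems with 4K pages.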
		 */
#ifdef HAVE_BLK_MQ
		if (zvol_blk_mq_blocks_per_thread != 0) {
			unsigned int chunks;
			chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);

			limits->zql_max_segment_size = PAGE_SIZE;
			limits->zql_max_segments =
			    (zv->zv_volblocksize * chunks) / PAGE_SIZE;
		} else {
			/*
			 * Special case: zvol_blk_mq_blocks_per_thread = 0
			 * Max everything out.
			 */
			limits->zql_max_segments = UINT16_MAX;
			limits->zql_max_segment_size = UINT_MAX;
		}
	} else {
#endif
		limits->zql_max_segments = UINT16_MAX;
		limits->zql_max_segment_size = UINT_MAX;
	}

	limits->zql_io_opt = zv->zv_volblocksize;
}

#ifdef HAVE_BLK_ALLOC_DISK_2ARG
static void
zvol_queue_limits_convert(zvol_queue_limits_t *limits,
    struct queue_limits *qlimits)
{
	memset(qlimits, 0, sizeof (struct queue_limits));
	qlimits->max_hw_sectors = limits->zql_max_hw_sectors;
	qlimits->max_segments = limits->zql_max_segments;
	qlimits->max_segment_size = limits->zql_max_segment_size;
	qlimits->io_opt = limits->zql_io_opt;
}
#else
static void
zvol_queue_limits_apply(zvol_queue_limits_t *limits,
    struct request_queue *queue)
{
	blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);
	blk_queue_max_segments(queue, limits->zql_max_segments);
	blk_queue_max_segment_size(queue, limits->zql_max_segment_size);
	blk_queue_io_opt(queue, limits->zql_io_opt);
}
#endif

static int
zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
{
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
#if defined(HAVE_BLK_ALLOC_DISK)
	zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
	if (zso->zvo_disk == NULL)
		return (1);

	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;
	zvol_queue_limits_apply(limits, zso->zvo_queue);
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
	struct queue_limits qlimits;
	zvol_queue_limits_convert(limits, &qlimits);
	struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
	if (IS_ERR(disk)) {
		zso->zvo_disk = NULL;
		return (1);
	}

	zso->zvo_disk = disk;
	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;
#else
	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)
		return (1);

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		return (1);
	}

	zso->zvo_disk->queue = zso->zvo_queue;
	zvol_queue_limits_apply(limits, zso->zvo_queue);
#endif /* HAVE_BLK_ALLOC_DISK */
#else
	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)
		return (1);

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		return (1);
	}

	zso->zvo_disk->queue = zso->zvo_queue;
	zvol_queue_limits_apply(limits, zso->zvo_queue);
#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */
	return (0);

}

static int
zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
{
#ifdef HAVE_BLK_MQ
	struct zvol_state_os *zso = zv->zv_zso;

	/* Allocate our blk-mq tag_set */
	if (zvol_blk_mq_alloc_tag_set(zv) != 0)
		return (1);

#if defined(HAVE_BLK_ALLOC_DISK)
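	/* 5.14+ kernels: blk_mq_alloc_disk() sets up the disk and its queue */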
	zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
	if (zso->zvo_disk == NULL) {
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}
	zso->zvo_queue = zso->zvo_disk->queue;
	zvol_queue_limits_apply(limits, zso->zvo_queue);
	zso->zvo_disk->minors = ZVOL_MINORS;
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
	struct queue_limits qlimits;
	zvol_queue_limits_convert(limits, &qlimits);
	struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv);
	if (IS_ERR(disk)) {
		zso->zvo_disk = NULL;
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}

	zso->zvo_disk = disk;
	zso->zvo_queue = zso->zvo_disk->queue;
	zso->zvo_disk->minors = ZVOL_MINORS;
#else
	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}
	/* Allocate queue */
	zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
	if (IS_ERR(zso->zvo_queue)) {
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}

	/* Our queue is now created, assign it to our disk */
	zso->zvo_disk->queue = zso->zvo_queue;
	zvol_queue_limits_apply(limits, zso->zvo_queue);

#endif
#endif
	return (0);
}

/*
 * Allocate memory for a new zvol_state_t and setup the required
 * request queue and generic disk structures for the block device.
 */
static zvol_state_t *
zvol_alloc(dev_t dev, const char *name)
{
	zvol_state_t *zv;
	struct zvol_state_os *zso;
	uint64_t volmode;
	int ret;

	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
		return (NULL);

	if (volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;

	if (volmode == ZFS_VOLMODE_NONE)
		return (NULL);

	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_zso = zso;
	zv->zv_volmode = volmode;

	list_link_init(&zv->zv_next);
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);

#ifdef HAVE_BLK_MQ
	zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
#endif

	zvol_queue_limits_t limits;
	zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq);

	/*
	 * The block layer has 3 interfaces for getting BIOs:
	 *
	 * 1. blk-mq request queues (new)
	 * 2. submit_bio() (oldest)
	 * 3. regular request queues (old).
	 *
	 * Each of those interfaces has two permutations:
	 *
	 * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates
	 *    both the disk and its queue (5.14 kernel or newer)
	 *
	 * b) We don't have blk_*alloc_disk(), and have to allocate the
	 *    disk and the queue separately. (5.13 kernel or older)
	 */
	if (zv->zv_zso->use_blk_mq) {
		ret = zvol_alloc_blk_mq(zv, &limits);
		zso->zvo_disk->fops = &zvol_ops_blk_mq;
	} else {
		ret = zvol_alloc_non_blk_mq(zso, &limits);
		zso->zvo_disk->fops = &zvol_ops;
	}
	if (ret != 0)
		goto out_kmem;

	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);

	/* Limit read-ahead to a single page to prevent over-prefetching. */
	blk_queue_set_read_ahead(zso->zvo_queue, 1);

	if (!zv->zv_zso->use_blk_mq) {
		/*
		 * Disable write merging in favor of the ZIO pipeline.
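		 * Any aggregation worth doing is already performed by the
		 * ZIO pipeline, so elevator merging only adds overhead here.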
		 */
		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
	}

	/* Enable /proc/diskstats */
	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue);

	zso->zvo_queue->queuedata = zv;
	zso->zvo_dev = dev;
	zv->zv_open_count = 0;
	strlcpy(zv->zv_name, name, sizeof (zv->zv_name));

	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);

	zso->zvo_disk->major = zvol_major;
	zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;

	/*
	 * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices.
	 * This is accomplished by limiting the number of minors for the
	 * device to one and explicitly disabling partition scanning.
	 */
	if (volmode == ZFS_VOLMODE_DEV) {
		zso->zvo_disk->minors = 1;
		zso->zvo_disk->flags &= ~ZFS_GENHD_FL_EXT_DEVT;
		zso->zvo_disk->flags |= ZFS_GENHD_FL_NO_PART;
	}

	zso->zvo_disk->first_minor = (dev & MINORMASK);
	zso->zvo_disk->private_data = zv;
	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
	    ZVOL_DEV_NAME, (dev & MINORMASK));

	return (zv);

out_kmem:
	kmem_free(zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	return (NULL);
}

/*
 * Cleanup then free a zvol_state_t which was created by zvol_alloc().
 * At this time, the structure is not opened by anyone, is taken off
 * the zvol_state_list, and has its private data set to NULL.
 * The zvol_state_lock is dropped.
 *
 * This function may take many milliseconds to complete (e.g. we've seen
 * it take over 256ms), due to the calls to "blk_cleanup_queue" and
 * "del_gendisk". Thus, consumers need to be careful to account for this
 * latency when calling this function.
 */
void
zvol_os_free(zvol_state_t *zv)
{

	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);
	ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	del_gendisk(zv->zv_zso->zvo_disk);
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
	(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
#if defined(HAVE_BLK_CLEANUP_DISK)
	blk_cleanup_disk(zv->zv_zso->zvo_disk);
#else
	put_disk(zv->zv_zso->zvo_disk);
#endif
#else
	blk_cleanup_queue(zv->zv_zso->zvo_queue);
	put_disk(zv->zv_zso->zvo_disk);
#endif

#ifdef HAVE_BLK_MQ
	if (zv->zv_zso->use_blk_mq)
		blk_mq_free_tag_set(&zv->zv_zso->tag_set);
#endif

	ida_simple_remove(&zvol_ida,
	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);

	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
}

void
zvol_wait_close(zvol_state_t *zv)
{
}

struct add_disk_work {
	struct delayed_work work;
	struct gendisk *disk;
	int error;
};

static int
__zvol_os_add_disk(struct gendisk *disk)
{
	int error = 0;
#ifdef HAVE_ADD_DISK_RET
	error = add_disk(disk);
#else
	add_disk(disk);
#endif
	return (error);
}

#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
static void
zvol_os_add_disk_work(struct work_struct *work)
{
	struct add_disk_work *add_disk_work;
	add_disk_work = container_of(work, struct add_disk_work, work.work);
	add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk);
}
#endif

/*
 * SPECIAL CASE:
 *
 * This function basically calls add_disk() from a workqueue. You may be
 * thinking: why not just call add_disk() directly?
 *
 * When you call add_disk(), the zvol appears to the world. When this happens,
 * the kernel calls disk_scan_partitions() on the zvol, which behaves
 * differently on the 6.9+ kernels:
 *
 * - 6.8 and older kernels -
 * disk_scan_partitions()
 *	handle = bdev_open_by_dev(
 *		zvol_open()
 *	bdev_release(handle);
 *		zvol_release()
 *
 *
 * - 6.9+ kernels -
 * disk_scan_partitions()
 *	file = bdev_file_open_by_dev()
 *		zvol_open()
 *	fput(file)
 *	< wait for return to userspace >
 *		zvol_release()
 *
 * The difference is that the bdev_release() from the 6.8 kernel is synchronous
 * while the fput() from the 6.9 kernel is async. Or more specifically it's
 * async that has to wait until we return to userspace (since it adds the fput
 * into the caller's work queue with the TWA_RESUME flag set). This is not the
 * behavior we want, since we want to do things like create+destroy a zvol
 * within a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release
 * the reference to the zvol while we're in the IOCTL, which can't wait until
 * we return to userspace.
 *
 * We can get around this since fput() has a special codepath for when it's
 * running in a kernel thread or interrupt. In those cases, it just puts the
 * fput into the system workqueue, which we can force to run with
 * __flush_workqueue(). That is why we call add_disk() from a workqueue - so
 * it runs from a kernel thread and "tricks" the fput() codepaths.
 *
 * Note that __flush_workqueue() is slowly getting deprecated. This may be ok
 * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via
 * fput) to happen, which it eventually, naturally, will from the system_wq
 * without us explicitly calling __flush_workqueue().
 */
static int
zvol_os_add_disk(struct gendisk *disk)
{
#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)	/* 6.9+ kernel */
	struct add_disk_work add_disk_work;

	INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work);
	add_disk_work.disk = disk;
	add_disk_work.error = 0;

	/* Use *_delayed_work functions since they're not GPL'd */
	schedule_delayed_work(&add_disk_work.work, 0);
	flush_delayed_work(&add_disk_work.work);

	__flush_workqueue(system_wq);
	return (add_disk_work.error);
#else	/* <= 6.8 kernel */
	return (__zvol_os_add_disk(disk));
#endif
}

/*
 * Create a block device minor node and setup the linkage between it
 * and the specified volume. Once this function returns the block
 * device is live and ready for use.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t len;
	unsigned minor = 0;
	int error = 0;
	int idx;
	uint64_t hash = zvol_name_hash(name);
	uint64_t volthreading;
	bool replayed_zil = B_FALSE;

	if (zvol_inhibit_dev)
		return (0);

	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
	if (idx < 0)
		return (SET_ERROR(-idx));
	minor = idx << ZVOL_MINOR_BITS;
	if (MINOR(minor) != minor) {
		/* too many partitions can cause an overflow */
		zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
		    name, minor, MINOR(minor));
		ida_simple_remove(&zvol_ida, idx);
		return (SET_ERROR(EINVAL));
	}

	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
	if (zv) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		ida_simple_remove(&zvol_ida, idx);
		return (SET_ERROR(EEXIST));
	}

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
	if (zv == NULL) {
		error = SET_ERROR(EAGAIN);
		goto out_dmu_objset_disown;
	}
	zv->zv_hash = hash;

	if (dmu_objset_is_snapshot(os))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	/* Default */
	zv->zv_threading = B_TRUE;
	if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL)
	    == 0)
		zv->zv_threading = volthreading;

	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);

	blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
	blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
	blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
#ifdef QUEUE_FLAG_DISCARD
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_NONROT
	blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_ADD_RANDOM
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
#endif
	/* This flag was introduced in kernel version 4.12. */
#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif

	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
		else
			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
	}
	if (replayed_zil)
		zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	/*
	 * When udev detects the addition of the device it will immediately
	 * invoke blkid(8) to determine the type of content on the device.
	 * Prefetching the blocks commonly scanned by blkid(8) will speed
	 * up this process.
	 */
	len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
	if (len > 0) {
		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
		    ZIO_PRIORITY_SYNC_READ);
	}

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));

	/*
	 * Keep in mind that once add_disk() is called, the zvol is
	 * announced to the world, and zvol_open()/zvol_release() can
	 * be called at any time. Incidentally, add_disk() itself calls
	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
	 * directly as well.
	 */
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		rw_exit(&zvol_state_lock);
		error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
	} else {
		ida_simple_remove(&zvol_ida, idx);
	}

	return (error);
}

void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	int readonly = get_disk_ro(zv->zv_zso->zvo_disk);

	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));

	/* move to new hashtable entry */
	zv->zv_hash = zvol_name_hash(newname);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	/*
	 * The block device's read-only state is briefly changed causing
	 * a KOBJ_CHANGE uevent to be issued. This ensures udev detects
	 * the name change and fixes the symlinks. This does not change
	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
	 * changes. This would normally be done using kobject_uevent() but
	 * that is a GPL-only symbol which is why we need this workaround.
	 */
	set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
	set_disk_ro(zv->zv_zso->zvo_disk, readonly);

	dataset_kstats_rename(&zv->zv_kstat, newname);
}

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{

	set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{

	set_capacity(zv->zv_zso->zvo_disk, capacity);
}

int
zvol_init(void)
{
	int error;

	/*
	 * zvol_threads is the module param the user passes in.
	 *
	 * zvol_actual_threads is what we use internally, since the user can
	 * pass zvol_threads = 0 to mean "use all the CPUs" (the default).
	 */
	static unsigned int zvol_actual_threads;

	if (zvol_threads == 0) {
		/*
		 * See dde9380a1 for why 32 was chosen here. This should
		 * probably be refined to be some multiple of the number
		 * of CPUs.
		 */
		zvol_actual_threads = MAX(num_online_cpus(), 32);
	} else {
		zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
	}

	/*
	 * Use at least 32 zvol_threads. On many-core systems, prefer about
	 * 6 threads per taskq, but keep the number of taskqs no larger than
	 * the number of threads in each of them on large systems.
	 *
	 *                 taskq   total
	 * cpus    taskqs  threads threads
	 * ------- ------- ------- -------
	 * 1       1       32       32
	 * 2       1       32       32
	 * 4       1       32       32
	 * 8       2       16       32
	 * 16      3       11       33
	 * 32      5       7        35
	 * 64      8       8        64
	 * 128     11      12       132
	 * 256     16      16       256
	 */
	zv_taskq_t *ztqs = &zvol_taskqs;
	uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
	if (num_tqs == 0) {
		num_tqs = 1 + num_online_cpus() / 6;
		while (num_tqs * num_tqs > zvol_actual_threads)
			num_tqs--;
	}
	uint_t per_tq_thread = zvol_actual_threads / num_tqs;
	if (per_tq_thread * num_tqs < zvol_actual_threads)
		per_tq_thread++;
	ztqs->tqs_cnt = num_tqs;
	ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
	error = register_blkdev(zvol_major, ZVOL_DRIVER);
	if (error) {
		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
		ztqs->tqs_taskq = NULL;
		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
		return (error);
	}

#ifdef HAVE_BLK_MQ
	if (zvol_blk_mq_queue_depth == 0) {
		zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
	} else {
		zvol_actual_blk_mq_queue_depth =
		    MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
	}

	if (zvol_blk_mq_threads == 0) {
		zvol_blk_mq_actual_threads = num_online_cpus();
	} else {
		zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
		    1024);
	}
#endif
	for (uint_t i = 0; i < num_tqs; i++) {
		char name[32];
		(void) snprintf(name, sizeof (name), "%s_tq-%u",
		    ZVOL_DRIVER, i);
		ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
		    maxclsyspri, per_tq_thread, INT_MAX,
		    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
		if (ztqs->tqs_taskq[i] == NULL) {
			for (int j = i - 1; j >= 0; j--)
				taskq_destroy(ztqs->tqs_taskq[j]);
			unregister_blkdev(zvol_major, ZVOL_DRIVER);
			kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
			    sizeof (taskq_t *));
			ztqs->tqs_taskq = NULL;
			return (-ENOMEM);
		}
	}

	zvol_init_impl();
	ida_init(&zvol_ida);
	return (0);
}

void
zvol_fini(void)
{
	zv_taskq_t *ztqs = &zvol_taskqs;
	zvol_fini_impl();
	unregister_blkdev(zvol_major, ZVOL_DRIVER);

	if (ztqs->tqs_taskq == NULL) {
		ASSERT3U(ztqs->tqs_cnt, ==, 0);
	} else {
		for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
			ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
			taskq_destroy(ztqs->tqs_taskq[i]);
		}
		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
		    sizeof (taskq_t *));
		ztqs->tqs_taskq = NULL;
	}

	ida_destroy(&zvol_ida);
}

/* BEGIN CSTYLED */
module_param(zvol_inhibit_dev, uint, 0644);
MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");

module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");

module_param(zvol_threads, uint, 0444);
MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set"
	" to 0 to use all active CPUs");

module_param(zvol_request_sync, uint, 0644);
MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");

module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

module_param(zvol_num_taskqs, uint, 0444);
MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");

module_param(zvol_prefetch_bytes, uint, 0644);
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");

module_param(zvol_volmode, uint, 0644);
MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");

#ifdef HAVE_BLK_MQ
module_param(zvol_blk_mq_queue_depth, uint, 0644);
MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");

module_param(zvol_use_blk_mq, uint, 0644);
MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");

module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
	"Process volblocksize blocks per thread");
#endif

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
module_param(zvol_open_timeout_ms, uint, 0644);
MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
#endif

/* END CSTYLED */