/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
 * Copyright (c) 2024, Klara, Inc.
 */

#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
#include <cityhash.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/workqueue.h>

#ifdef HAVE_BLK_MQ
#include <linux/blk-mq.h>
#endif

static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
    struct request *rq, boolean_t force_sync);

static unsigned int zvol_major = ZVOL_MAJOR;
static unsigned int zvol_request_sync = 0;
static unsigned int zvol_prefetch_bytes = (128 * 1024);
static unsigned long zvol_max_discard_blocks = 16384;

/*
 * Switch taskqs at multiples of a 512 MB offset. This can be set to a lower
 * value to utilize more threads for small files but may affect prefetch hits.
 */
#define	ZVOL_TASKQ_OFFSET_SHIFT 29

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static unsigned int zvol_open_timeout_ms = 1000;
#endif

static unsigned int zvol_threads = 0;
#ifdef HAVE_BLK_MQ
static unsigned int zvol_blk_mq_threads = 0;
static unsigned int zvol_blk_mq_actual_threads;
static boolean_t zvol_use_blk_mq = B_FALSE;

/*
 * The maximum number of volblocksize blocks to process per thread. Typically,
 * write heavy workloads perform better with higher values here, and read
 * heavy workloads perform better with lower values, but that's not a hard
 * and fast rule. It's basically a knob to tune between "less overhead with
 * less parallelism" and "more overhead, but more parallelism".
 *
 * '8' was chosen as a reasonable, balanced, default based on sequential
 * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
 */
static unsigned int zvol_blk_mq_blocks_per_thread = 8;
#endif

static unsigned int zvol_num_taskqs = 0;

#ifndef BLKDEV_DEFAULT_RQ
/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
#define	BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ
#endif

/*
 * Finalize our BIO or request.
 */
#ifdef HAVE_BLK_MQ
#define	END_IO(zv, bio, rq, error)  do { \
	if (bio) { \
		BIO_END_IO(bio, error); \
	} else { \
		blk_mq_end_request(rq, errno_to_bi_status(error)); \
	} \
} while (0)
#else
#define	END_IO(zv, bio, rq, error)	BIO_END_IO(bio, error)
#endif

#ifdef HAVE_BLK_MQ
static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
static unsigned int zvol_actual_blk_mq_queue_depth;
#endif

struct zvol_state_os {
	struct gendisk		*zvo_disk;	/* generic disk */
	struct request_queue	*zvo_queue;	/* request queue */
	dev_t			zvo_dev;	/* device id */

#ifdef HAVE_BLK_MQ
	struct blk_mq_tag_set tag_set;
#endif

	/* Set from the global 'zvol_use_blk_mq' at zvol load */
	boolean_t use_blk_mq;
};

typedef struct zv_taskq {
	uint_t tqs_cnt;
	taskq_t **tqs_taskq;
} zv_taskq_t;
static zv_taskq_t zvol_taskqs;
static struct ida zvol_ida;

typedef struct zv_request_stack {
	zvol_state_t	*zv;
	struct bio	*bio;
	struct request	*rq;
} zv_request_t;

typedef struct zv_work {
	struct request	*rq;
	struct work_struct work;
} zv_work_t;

typedef struct zv_request_task {
	zv_request_t zvr;
	taskq_ent_t	ent;
} zv_request_task_t;

static zv_request_task_t *
zv_request_task_create(zv_request_t zvr)
{
	zv_request_task_t *task;
	task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
	taskq_init_ent(&task->ent);
	task->zvr = zvr;
	return (task);
}

static void
zv_request_task_free(zv_request_task_t *task)
{
	kmem_free(task, sizeof (*task));
}

#ifdef HAVE_BLK_MQ

/*
 * This is called when a new block multiqueue request comes in. A request
 * contains one or more BIOs.
 */
static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
    const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	zvol_state_t *zv = rq->q->queuedata;

	/* Tell the kernel that we are starting to process this request */
	blk_mq_start_request(rq);

	if (blk_rq_is_passthrough(rq)) {
		/* Skip non-filesystem requests */
		blk_mq_end_request(rq, BLK_STS_IOERR);
		return (BLK_STS_IOERR);
	}

	zvol_request_impl(zv, NULL, rq, 0);

	/* Acknowledge to the kernel that we got this request */
	return (BLK_STS_OK);
}

static struct blk_mq_ops zvol_blk_mq_queue_ops = {
	.queue_rq = zvol_mq_queue_rq,
};

/* Initialize our blk-mq struct */
static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
{
	struct zvol_state_os *zso = zv->zv_zso;

	memset(&zso->tag_set, 0, sizeof (zso->tag_set));

	/* Initialize tag set. */
	zso->tag_set.ops = &zvol_blk_mq_queue_ops;
	zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads;
	zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth;
	zso->tag_set.numa_node = NUMA_NO_NODE;
	zso->tag_set.cmd_size = 0;

	/*
	 * We need BLK_MQ_F_BLOCKING here since we do blocking calls in
	 * zvol_request_impl()
	 */
	zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
	zso->tag_set.driver_data = zv;

	return (blk_mq_alloc_tag_set(&zso->tag_set));
}
#endif /* HAVE_BLK_MQ */

/*
 * Given a path, return TRUE if path is a ZVOL.
 */
boolean_t
zvol_os_is_zvol(const char *path)
{
	dev_t dev = 0;

	if (vdev_lookup_bdev(path, &dev) != 0)
		return (B_FALSE);

	if (MAJOR(dev) == zvol_major)
		return (B_TRUE);

	return (B_FALSE);
}
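
/*
 * Service a write BIO or request: flush the ZIL first if requested, then
 * copy the data into the DMU under a writer range lock in chunks of at most
 * DMU_MAX_ACCESS / 2, logging each chunk to the ZIL, and finally commit the
 * ZIL when the I/O is FUA or the dataset uses sync=always.
 */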
static void
zvol_write(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	int error = 0;
	zfs_uio_t uio;
	zvol_state_t *zv = zvr->zv;
	struct request_queue *q;
	struct gendisk *disk;
	unsigned long start_time = 0;
	boolean_t acct = B_FALSE;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	q = zv->zv_zso->zvo_queue;
	disk = zv->zv_zso->zvo_disk;

	/* Requests marked FLUSH need to flush before the write */
	if (io_is_flush(bio, rq))
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	/* Some requests are just for flush and nothing else. */
	if (io_size(bio, rq) == 0) {
		rw_exit(&zv->zv_suspend_lock);
		END_IO(zv, bio, rq, 0);
		return;
	}

	zfs_uio_bvec_init(&uio, bio, rq);

	ssize_t start_resid = uio.uio_resid;

	/*
	 * With use_blk_mq, accounting is done by blk_mq_start_request()
	 * and blk_mq_end_request(), so we can skip it here.
	 */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct) {
			start_time = blk_generic_start_io_acct(q, disk, WRITE,
			    bio);
		}
	}

	boolean_t sync =
	    io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_WRITER);

	uint64_t volsize = zv->zv_volsize;
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio.uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);

		/* This will only fail for ENOSPC */
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0) {
			zvol_log_write(zv, tx, off, bytes, sync);
		}
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);

	int64_t nwritten = start_resid - uio.uio_resid;
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	task_io_account_write(nwritten);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
	}

	END_IO(zv, bio, rq, -error);
}

static void
zvol_write_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_write(&task->zvr);
	zv_request_task_free(task);
}
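
/*
 * Service a discard (TRIM) or secure-erase BIO or request by freeing the
 * affected range of the zvol object.  Unless a secure erase was requested,
 * the range is first aligned to volblocksize boundaries, since freeing
 * partial blocks would only trigger slow read-modify-write zeroing.
 */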
static void
zvol_discard(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	zvol_state_t *zv = zvr->zv;
	uint64_t start = io_offset(bio, rq);
	uint64_t size = io_size(bio, rq);
	uint64_t end = start + size;
	boolean_t sync;
	int error = 0;
	dmu_tx_t *tx;
	struct request_queue *q = zv->zv_zso->zvo_queue;
	struct gendisk *disk = zv->zv_zso->zvo_disk;
	unsigned long start_time = 0;
	boolean_t acct = B_FALSE;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct) {
			start_time = blk_generic_start_io_acct(q, disk, WRITE,
			    bio);
		}
	}

	sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	if (end > zv->zv_volsize) {
		error = SET_ERROR(EIO);
		goto unlock;
	}

	/*
	 * Align the request to volume block boundaries when a secure erase is
	 * not required.  This will prevent dnode_free_range() from zeroing out
	 * the unaligned parts which is slow (read-modify-write) and useless
	 * since we are not freeing any space by doing so.
	 */
	if (!io_is_secure_erase(bio, rq)) {
		start = P2ROUNDUP(start, zv->zv_volblocksize);
		end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
		size = end - start;
	}

	if (start >= end)
		goto unlock;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    start, size, RL_WRITER);

	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		zvol_log_truncate(zv, tx, start, size);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset,
		    ZVOL_OBJ, start, size);
	}
	zfs_rangelock_exit(lr);

	if (error == 0 && sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

unlock:
	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, WRITE, bio,
		    start_time);
	}

	END_IO(zv, bio, rq, -error);
}

static void
zvol_discard_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_discard(&task->zvr);
	zv_request_task_free(task);
}
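
/*
 * Service a read BIO or request: copy data out of the DMU under a reader
 * range lock in chunks of at most DMU_MAX_ACCESS / 2, converting checksum
 * errors into EIO before completing the I/O.
 */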
static void
zvol_read(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	int error = 0;
	zfs_uio_t uio;
	boolean_t acct = B_FALSE;
	zvol_state_t *zv = zvr->zv;
	struct request_queue *q;
	struct gendisk *disk;
	unsigned long start_time = 0;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);

	zfs_uio_bvec_init(&uio, bio, rq);

	q = zv->zv_zso->zvo_queue;
	disk = zv->zv_zso->zvo_disk;

	ssize_t start_resid = uio.uio_resid;

	/*
	 * When blk-mq is being used, accounting is done by
	 * blk_mq_start_request() and blk_mq_end_request().
	 */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct)
			start_time = blk_generic_start_io_acct(q, disk, READ,
			    bio);
	}

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_READER);

	uint64_t volsize = zv->zv_volsize;

	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio.uio_loffset)
			bytes = volsize - uio.uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	int64_t nread = start_resid - uio.uio_resid;
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	task_io_account_read(nread);

	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, READ, bio, start_time);
	}

	END_IO(zv, bio, rq, -error);
}

static void
zvol_read_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_read(&task->zvr);
	zv_request_task_free(task);
}


/*
 * Process a BIO or request.
 *
 * Either 'bio' or 'rq' should be set depending on whether we are processing
 * a bio or a request (both should not be set).
 *
 * force_sync:	Set to 0 to defer processing to a background taskq
 *		Set to 1 to process data synchronously
 */
static void
zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
    boolean_t force_sync)
{
	fstrans_cookie_t cookie = spl_fstrans_mark();
	uint64_t offset = io_offset(bio, rq);
	uint64_t size = io_size(bio, rq);
	int rw = io_data_dir(bio, rq);

	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
		END_IO(zv, bio, rq, -SET_ERROR(ENXIO));
		goto out;
	}

	if (zvol_request_sync || zv->zv_threading == B_FALSE)
		force_sync = 1;

	zv_request_t zvr = {
		.zv = zv,
		.bio = bio,
		.rq = rq,
	};

	if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
		printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
		    zv->zv_zso->zvo_disk->disk_name,
		    (long long unsigned)offset,
		    (long unsigned)size);

		END_IO(zv, bio, rq, -SET_ERROR(EIO));
		goto out;
	}

	zv_request_task_t *task;
	zv_taskq_t *ztqs = &zvol_taskqs;
	uint_t blk_mq_hw_queue = 0;
	uint_t tq_idx;
	uint_t taskq_hash;
#ifdef HAVE_BLK_MQ
	if (rq)
#ifdef HAVE_BLK_MQ_RQ_HCTX
		blk_mq_hw_queue = rq->mq_hctx->queue_num;
#else
		blk_mq_hw_queue =
		    rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
#endif
#endif
	taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
	    blk_mq_hw_queue, 0);
	tq_idx = taskq_hash % ztqs->tqs_cnt;

	if (rw == WRITE) {
		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
			END_IO(zv, bio, rq, -SET_ERROR(EROFS));
			goto out;
		}

		/*
		 * Prevents the zvol from being suspended, or the ZIL being
		 * concurrently opened.  Will be released after the i/o
		 * completes.
		 */
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/*
		 * Open a ZIL if this is the first time we have written to this
		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
		 * than zv_state_lock so that we don't need to acquire an
		 * additional lock in this path.
		 */
		if (zv->zv_zilog == NULL) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
			if (zv->zv_zilog == NULL) {
				zv->zv_zilog = zil_open(zv->zv_objset,
				    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
				zv->zv_flags |= ZVOL_WRITTEN_TO;
				/* replay / destroy done in zvol_create_minor */
				VERIFY0((zv->zv_zilog->zl_header->zh_flags &
				    ZIL_REPLAY_NEEDED));
			}
			rw_downgrade(&zv->zv_suspend_lock);
		}

		/*
		 * We don't want this thread to be blocked waiting for i/o to
		 * complete, so we instead wait from a taskq callback. The
		 * i/o may be a ZIL write (via zil_commit()), or a read of an
		 * indirect block, or a read of a data block (if this is a
		 * partial-block write).  We will indicate that the i/o is
		 * complete by calling END_IO() from the taskq callback.
		 *
		 * This design allows the calling thread to continue and
		 * initiate more concurrent operations by calling
		 * zvol_request() again. There are typically only a small
		 * number of threads available to call zvol_request() (e.g.
		 * one per iSCSI target), so keeping the latency of
		 * zvol_request() low is important for performance.
		 *
		 * The zvol_request_sync module parameter allows this
		 * behavior to be altered, for performance evaluation
		 * purposes.  If the callback blocks, setting
		 * zvol_request_sync=1 will result in much worse performance.
		 *
		 * We can have up to zvol_threads concurrent i/o's being
		 * processed for all zvols on the system.  This is typically
		 * a vast improvement over the zvol_request_sync=1 behavior
		 * of one i/o at a time per zvol.  However, an even better
		 * design would be for zvol_request() to initiate the zio
		 * directly, and then be notified by the zio_done callback,
		 * which would call END_IO().  Unfortunately, the DMU/ZIL
		 * interfaces lack this functionality (they block waiting for
		 * the i/o to complete).
		 */
		if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
			if (force_sync) {
				zvol_discard(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
				    zvol_discard_task, task, 0, &task->ent);
			}
		} else {
			if (force_sync) {
				zvol_write(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
				    zvol_write_task, task, 0, &task->ent);
			}
		}
	} else {
		/*
		 * The SCST driver, and possibly others, may issue READ I/Os
		 * with a length of zero bytes.  These empty I/Os contain no
		 * data and require no additional handling.
		 */
		if (size == 0) {
			END_IO(zv, bio, rq, 0);
			goto out;
		}

		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/* See comment in WRITE case above. */
		if (force_sync) {
			zvol_read(&zvr);
		} else {
			task = zv_request_task_create(zvr);
			taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
			    zvol_read_task, task, 0, &task->ent);
		}
	}

out:
	spl_fstrans_unmark(cookie);
}

#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
static void
zvol_submit_bio(struct bio *bio)
#else
static blk_qc_t
zvol_submit_bio(struct bio *bio)
#endif
#else
static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio)
#endif
{
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
#endif
	zvol_state_t *zv = q->queuedata;

	zvol_request_impl(zv, bio, NULL, 0);
#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
	defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
	!defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)
	return (BLK_QC_T_NONE);
#endif
}
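
/*
 * Open the zvol block device.  Called by the kernel when the device node is
 * opened (including during partition scanning after add_disk()).  On the
 * first open this takes zv_suspend_lock and, to avoid the lock inversion
 * that can occur when a zvol backs a vdev in another pool, may preemptively
 * take the spa_namespace_lock, retrying or returning ERESTARTSYS when it is
 * contended.
 */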
static int
#ifdef HAVE_BLK_MODE_T
zvol_open(struct gendisk *disk, blk_mode_t flag)
#else
zvol_open(struct block_device *bdev, fmode_t flag)
#endif
{
	zvol_state_t *zv;
	int error = 0;
	boolean_t drop_suspend = B_FALSE;
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
	hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
	hrtime_t start = gethrtime();

retry:
#endif
	rw_enter(&zvol_state_lock, RW_READER);
	/*
	 * Obtain a copy of private_data under the zvol_state_lock to make
	 * sure that either the result of zvol free code path setting
	 * disk->private_data to NULL is observed, or zvol_os_free()
	 * is not called on this zv because of the positive zv_open_count.
	 */
#ifdef HAVE_BLK_MODE_T
	zv = disk->private_data;
#else
	zv = bdev->bd_disk->private_data;
#endif
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (-SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);

	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
		mutex_exit(&zv->zv_state_lock);
		rw_exit(&zvol_state_lock);
		return (-SET_ERROR(ENXIO));
	}

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
			} else {
				drop_suspend = B_TRUE;
			}
		} else {
			drop_suspend = B_TRUE;
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * In all other call paths the spa_namespace_lock is taken
		 * before the bdev->bd_mutex lock.  However, on open(2)
		 * the __blkdev_get() function calls fops->open() with the
		 * bdev->bd_mutex lock held.  This can result in a deadlock
		 * when zvols from one pool are used as vdevs in another.
		 *
		 * To prevent a lock inversion deadlock we preemptively
		 * take the spa_namespace_lock.  Normally the lock will not
		 * be contended and this is safe because spa_open_common()
		 * handles the case where the caller already holds the
		 * spa_namespace_lock.
		 *
		 * When the lock cannot be acquired after multiple retries
		 * this must be the vdev on zvol deadlock case and we have
		 * no choice but to return an error.  For 5.12 and older
		 * kernels returning -ERESTARTSYS will result in the
		 * bdev->bd_mutex being dropped, then reacquired, and
		 * fops->open() being called again.  This process can be
		 * repeated safely until both locks are acquired.  For 5.13
		 * and newer the -ERESTARTSYS retry logic was removed from
		 * the kernel so the only option is to return the error for
		 * the caller to handle it.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;

#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
				schedule();
				return (-SET_ERROR(ERESTARTSYS));
#else
				if ((gethrtime() - start) > timeout)
					return (-SET_ERROR(ERESTARTSYS));

				schedule_timeout_interruptible(
				    MSEC_TO_TICK(10));
				goto retry;
#endif
			} else {
				drop_namespace = B_TRUE;
			}
		}

		error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));

		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
	}

	if (error == 0) {
		if ((blk_mode_is_open_write(flag)) &&
		    (zv->zv_flags & ZVOL_RDONLY)) {
			if (zv->zv_open_count == 0)
				zvol_last_close(zv);

			error = -SET_ERROR(EROFS);
		} else {
			zv->zv_open_count++;
		}
	}

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);

	if (error == 0)
#ifdef HAVE_BLK_MODE_T
		disk_check_media_change(disk);
#else
		zfs_check_media_change(bdev);
#endif

	return (error);
}
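
/*
 * Close the zvol block device.  Called by the kernel when an opener goes
 * away; drops the open count and, on the last close, calls zvol_last_close()
 * while holding zv_suspend_lock.
 */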
static void
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
zvol_release(struct gendisk *disk)
#else
zvol_release(struct gendisk *disk, fmode_t unused)
#endif
{
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
	(void) unused;
#endif
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	zv = disk->private_data;

	mutex_enter(&zv->zv_state_lock);
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	zv->zv_open_count--;
	if (zv->zv_open_count == 0) {
		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
}
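
/*
 * Handle the block device ioctls we support: BLKFLSBUF flushes dirty data
 * out to stable storage and BLKZNAME copies the dataset name back to
 * userspace.  Everything else is rejected with ENOTTY.
 */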
static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned int cmd, unsigned long arg)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	int error = 0;

	ASSERT3U(zv->zv_open_count, >, 0);

	switch (cmd) {
	case BLKFLSBUF:
#ifdef HAVE_FSYNC_BDEV
		fsync_bdev(bdev);
#elif defined(HAVE_SYNC_BLOCKDEV)
		sync_blockdev(bdev);
#else
#error "Neither fsync_bdev() nor sync_blockdev() found"
#endif
		invalidate_bdev(bdev);
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		if (!(zv->zv_flags & ZVOL_RDONLY))
			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);

		rw_exit(&zv->zv_suspend_lock);
		break;

	case BLKZNAME:
		mutex_enter(&zv->zv_state_lock);
		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
		mutex_exit(&zv->zv_state_lock);
		break;

	default:
		error = -ENOTTY;
		break;
	}

	return (SET_ERROR(error));
}

#ifdef CONFIG_COMPAT
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	return (zvol_ioctl(bdev, mode, cmd, arg));
}
#else
#define	zvol_compat_ioctl	NULL
#endif
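
/*
 * Report a media-change event to the kernel: returns DISK_EVENT_MEDIA_CHANGE
 * when zv_changed has been set, then clears the flag.
 */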
static unsigned int
zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
	unsigned int mask = 0;

	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
		zv->zv_changed = 0;
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (mask);
}

static int
zvol_revalidate_disk(struct gendisk *disk)
{
	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		set_capacity(zv->zv_zso->zvo_disk,
		    zv->zv_volsize >> SECTOR_BITS);
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (0);
}

int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	struct gendisk *disk = zv->zv_zso->zvo_disk;

#if defined(HAVE_REVALIDATE_DISK_SIZE)
	revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
#elif defined(HAVE_REVALIDATE_DISK)
	revalidate_disk(disk);
#else
	zvol_revalidate_disk(disk);
#endif
	return (0);
}

void
zvol_os_clear_private(zvol_state_t *zv)
{
	/*
	 * Cleared while holding zvol_state_lock as a writer
	 * which will prevent zvol_open() from opening it.
	 */
	zv->zv_zso->zvo_disk->private_data = NULL;
}

/*
 * Provide a simple virtual geometry for legacy compatibility.  For devices
 * smaller than 1 MiB a small head and sector count is used to allow very
 * tiny devices.  For devices over 1 MiB a standard head and sector count
 * is used to keep the cylinders count reasonable.
 */
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	sector_t sectors;

	ASSERT3U(zv->zv_open_count, >, 0);

	sectors = get_capacity(zv->zv_zso->zvo_disk);

	if (sectors > 2048) {
		geo->heads = 16;
		geo->sectors = 63;
	} else {
		geo->heads = 2;
		geo->sectors = 4;
	}

	geo->start = 0;
	geo->cylinders = sectors / (geo->heads * geo->sectors);

	return (0);
}

/*
 * Why have two separate block_device_operations structs?
 *
 * Normally we'd just have one, and assign 'submit_bio' as needed.  However,
 * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
 * can't just change submit_bio dynamically at runtime.  So just create two
 * separate structs to get around this.
 */
static const struct block_device_operations zvol_ops_blk_mq = {
	.open			= zvol_open,
	.release		= zvol_release,
	.ioctl			= zvol_ioctl,
	.compat_ioctl		= zvol_compat_ioctl,
	.check_events		= zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk	= zvol_revalidate_disk,
#endif
	.getgeo			= zvol_getgeo,
	.owner			= THIS_MODULE,
};

static const struct block_device_operations zvol_ops = {
	.open			= zvol_open,
	.release		= zvol_release,
	.ioctl			= zvol_ioctl,
	.compat_ioctl		= zvol_compat_ioctl,
	.check_events		= zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk	= zvol_revalidate_disk,
#endif
	.getgeo			= zvol_getgeo,
	.owner			= THIS_MODULE,
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	.submit_bio		= zvol_submit_bio,
#endif
};

/*
 * Since 6.9, Linux has been removing queue limit setters in favour of an
 * initial queue_limits struct applied when the device is open. Since 6.11,
 * queue_limits is being extended to allow more things to be applied when the
 * device is open. Setters are also being removed for this.
 *
 * For OpenZFS, this means that depending on kernel version, some options may
 * be set up before the device is open, and some applied to an open device
 * (queue) after the fact.
 *
 * We manage this complexity by having our own limits struct,
 * zvol_queue_limits_t, in which we carry any queue config that we're
 * interested in setting. This structure is the same on all kernels.
 *
 * These limits are then applied to the queue at device open time by the most
 * appropriate method for the kernel.
 *
 * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of
 * blk_alloc_disk() exists). This converts our limits struct to a proper Linux
 * struct queue_limits, and passes it in. Any fields added in later kernels are
 * (obviously) not set up here.
 *
 * zvol_queue_limits_apply() is called on all kernel versions after the queue
 * is created, and applies any remaining config. Before 6.9 that will be
 * everything, via setter methods. After 6.9 that will be whatever couldn't be
 * put into struct queue_limits. (This implies that zvol_queue_limits_apply()
 * will always be a no-op on the latest kernel we support).
 */
typedef struct zvol_queue_limits {
	unsigned int	zql_max_hw_sectors;
	unsigned short	zql_max_segments;
	unsigned int	zql_max_segment_size;
	unsigned int	zql_io_opt;
	unsigned int	zql_physical_block_size;
	unsigned int	zql_max_discard_sectors;
	unsigned int	zql_discard_granularity;
} zvol_queue_limits_t;
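
/*
 * Populate a zvol_queue_limits_t from the zvol's volblocksize and the module
 * parameters.  See the block comment above for how and when these limits are
 * later applied to the request queue.
 */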
static void
zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
    boolean_t use_blk_mq)
{
	limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9;

	if (use_blk_mq) {
		/*
		 * IO requests can be really big (1MB).  When an IO request
		 * comes in, it is passed off to zvol_read() or zvol_write()
		 * in a new thread, where it is chunked up into 'volblocksize'
		 * sized pieces and processed.  So for example, if the request
		 * is a 1MB write and your volblocksize is 128k, one zvol_write
		 * thread will take that request and sequentially do ten 128k
		 * IOs.  This is due to the fact that the thread needs to lock
		 * each volblocksize sized block.  So you might be wondering:
		 * "instead of passing the whole 1MB request to one thread,
		 * why not pass ten individual 128k chunks to ten threads and
		 * process the whole write in parallel?"  The short answer is
		 * that there's a sweet spot number of chunks that balances
		 * the greater parallelism with the added overhead of more
		 * threads. The sweet spot can be different depending on if you
		 * have a read or write heavy workload.  Writes typically want
		 * high chunk counts while reads typically want lower ones. On
		 * a test pool with 6 NVMe drives in a 3x 2-disk mirror
		 * configuration, with volblocksize=8k, the sweet spot for good
		 * sequential reads and writes was at 8 chunks.
		 */

		/*
		 * Below we tell the kernel how big we want our requests
		 * to be.  You would think that blk_queue_io_opt() would be
		 * used to do this since it is used to "set optimal request
		 * size for the queue", but that doesn't seem to do
		 * anything - the kernel still gives you huge requests
		 * with tons of little PAGE_SIZE segments contained within it.
		 *
		 * Knowing that the kernel will just give you PAGE_SIZE segments
		 * no matter what, you can say "ok, I want PAGE_SIZE byte
		 * segments, and I want 'N' of them per request", where N is
		 * the correct number of segments for the volblocksize and
		 * number of chunks you want.
		 */
#ifdef HAVE_BLK_MQ
		if (zvol_blk_mq_blocks_per_thread != 0) {
			unsigned int chunks;
			chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);

			limits->zql_max_segment_size = PAGE_SIZE;
			limits->zql_max_segments =
			    (zv->zv_volblocksize * chunks) / PAGE_SIZE;
		} else {
			/*
			 * Special case: zvol_blk_mq_blocks_per_thread = 0
			 * Max everything out.
			 */
			limits->zql_max_segments = UINT16_MAX;
			limits->zql_max_segment_size = UINT_MAX;
		}
	} else {
#endif
		limits->zql_max_segments = UINT16_MAX;
		limits->zql_max_segment_size = UINT_MAX;
	}

	limits->zql_io_opt = zv->zv_volblocksize;

	limits->zql_physical_block_size = zv->zv_volblocksize;
	limits->zql_max_discard_sectors =
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9;
	limits->zql_discard_granularity = zv->zv_volblocksize;
}

#ifdef HAVE_BLK_ALLOC_DISK_2ARG
static void
zvol_queue_limits_convert(zvol_queue_limits_t *limits,
    struct queue_limits *qlimits)
{
	memset(qlimits, 0, sizeof (struct queue_limits));
	qlimits->max_hw_sectors = limits->zql_max_hw_sectors;
	qlimits->max_segments = limits->zql_max_segments;
	qlimits->max_segment_size = limits->zql_max_segment_size;
	qlimits->io_opt = limits->zql_io_opt;
	qlimits->physical_block_size = limits->zql_physical_block_size;
	qlimits->max_discard_sectors = limits->zql_max_discard_sectors;
	qlimits->discard_granularity = limits->zql_discard_granularity;
#ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
	qlimits->features =
	    BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT;
#endif
}
#endif

static void
zvol_queue_limits_apply(zvol_queue_limits_t *limits,
    struct request_queue *queue)
{
#ifndef HAVE_BLK_ALLOC_DISK_2ARG
	blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);
	blk_queue_max_segments(queue, limits->zql_max_segments);
	blk_queue_max_segment_size(queue, limits->zql_max_segment_size);
	blk_queue_io_opt(queue, limits->zql_io_opt);
	blk_queue_physical_block_size(queue, limits->zql_physical_block_size);
	blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors);
	blk_queue_discard_granularity(queue, limits->zql_discard_granularity);
#endif
#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
	blk_queue_set_write_cache(queue, B_TRUE);
	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue);
#endif
}

static int
zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
{
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
#if defined(HAVE_BLK_ALLOC_DISK)
	zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
	if (zso->zvo_disk == NULL)
		return (1);

	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;
	zvol_queue_limits_apply(limits, zso->zvo_queue);
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
	struct queue_limits qlimits;
	zvol_queue_limits_convert(limits, &qlimits);
	struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
	if (IS_ERR(disk)) {
		zso->zvo_disk = NULL;
		return (1);
	}

#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE);
#endif

	zso->zvo_disk = disk;
	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;
#else
	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)
		return (1);

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		return (1);
	}

	zso->zvo_disk->queue = zso->zvo_queue;
#endif /* HAVE_BLK_ALLOC_DISK */
#else
	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)
		return (1);

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		return (1);
	}

	zso->zvo_disk->queue = zso->zvo_queue;
#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */

	zvol_queue_limits_apply(limits, zso->zvo_queue);

	return (0);

}
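
/*
 * Allocate the blk-mq tag set, disk, and request queue for a zvol, using
 * whichever blk-mq allocation interface the running kernel provides.
 */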
static int
zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
{
#ifdef HAVE_BLK_MQ
	struct zvol_state_os *zso = zv->zv_zso;

	/* Allocate our blk-mq tag_set */
	if (zvol_blk_mq_alloc_tag_set(zv) != 0)
		return (1);

#if defined(HAVE_BLK_ALLOC_DISK)
	zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
	if (zso->zvo_disk == NULL) {
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}
	zso->zvo_queue = zso->zvo_disk->queue;
	zso->zvo_disk->minors = ZVOL_MINORS;
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
	struct queue_limits qlimits;
	zvol_queue_limits_convert(limits, &qlimits);
	struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv);
	if (IS_ERR(disk)) {
		zso->zvo_disk = NULL;
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}

	zso->zvo_disk = disk;
	zso->zvo_queue = zso->zvo_disk->queue;
	zso->zvo_disk->minors = ZVOL_MINORS;
#else
	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}
	/* Allocate queue */
	zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
	if (IS_ERR(zso->zvo_queue)) {
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}

	/* Our queue is now created, assign it to our disk */
	zso->zvo_disk->queue = zso->zvo_queue;
#endif

	zvol_queue_limits_apply(limits, zso->zvo_queue);
#endif

	return (0);
}

/*
 * Allocate memory for a new zvol_state_t and setup the required
 * request queue and generic disk structures for the block device.
 */
static zvol_state_t *
zvol_alloc(dev_t dev, const char *name)
{
	zvol_state_t *zv;
	struct zvol_state_os *zso;
	uint64_t volmode;
	int ret;

	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
		return (NULL);

	if (volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;

	if (volmode == ZFS_VOLMODE_NONE)
		return (NULL);

	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_zso = zso;
	zv->zv_volmode = volmode;

	list_link_init(&zv->zv_next);
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);

#ifdef HAVE_BLK_MQ
	zv->zv_zso->use_blk_mq = zvol_use_blk_mq;
#endif

	zvol_queue_limits_t limits;
	zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq);

	/*
	 * The block layer has 3 interfaces for getting BIOs:
	 *
	 * 1. blk-mq request queues (new)
	 * 2. submit_bio() (oldest)
	 * 3. regular request queues (old).
	 *
	 * Each of those interfaces has two permutations:
	 *
	 * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates
	 *    both the disk and its queue (5.14 kernel or newer)
	 *
	 * b) We don't have blk_*alloc_disk(), and have to allocate the
	 *    disk and the queue separately. (5.13 kernel or older)
	 */
	if (zv->zv_zso->use_blk_mq) {
		ret = zvol_alloc_blk_mq(zv, &limits);
		zso->zvo_disk->fops = &zvol_ops_blk_mq;
	} else {
		ret = zvol_alloc_non_blk_mq(zso, &limits);
		zso->zvo_disk->fops = &zvol_ops;
	}
	if (ret != 0)
		goto out_kmem;

	/* Limit read-ahead to a single page to prevent over-prefetching. */
	blk_queue_set_read_ahead(zso->zvo_queue, 1);

	if (!zv->zv_zso->use_blk_mq) {
		/* Disable write merging in favor of the ZIO pipeline. */
		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
	}

	zso->zvo_queue->queuedata = zv;
	zso->zvo_dev = dev;
	zv->zv_open_count = 0;
	strlcpy(zv->zv_name, name, sizeof (zv->zv_name));

	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);

	zso->zvo_disk->major = zvol_major;
	zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;

	/*
	 * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices.
	 * This is accomplished by limiting the number of minors for the
	 * device to one and explicitly disabling partition scanning.
	 */
	if (volmode == ZFS_VOLMODE_DEV) {
		zso->zvo_disk->minors = 1;
		zso->zvo_disk->flags &= ~ZFS_GENHD_FL_EXT_DEVT;
		zso->zvo_disk->flags |= ZFS_GENHD_FL_NO_PART;
	}

	zso->zvo_disk->first_minor = (dev & MINORMASK);
	zso->zvo_disk->private_data = zv;
	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
	    ZVOL_DEV_NAME, (dev & MINORMASK));

	return (zv);

out_kmem:
	kmem_free(zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	return (NULL);
}

/*
 * Cleanup then free a zvol_state_t which was created by zvol_alloc().
 * At this time, the structure is not opened by anyone, is taken off
 * the zvol_state_list, and has its private data set to NULL.
 * The zvol_state_lock is dropped.
 *
 * This function may take many milliseconds to complete (e.g. we've seen
 * it take over 256ms), due to the calls to "blk_cleanup_queue" and
 * "del_gendisk".  Thus, consumers need to be careful to account for this
 * latency when calling this function.
 */
void
zvol_os_free(zvol_state_t *zv)
{

	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);
	ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	del_gendisk(zv->zv_zso->zvo_disk);
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
	(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
#if defined(HAVE_BLK_CLEANUP_DISK)
	blk_cleanup_disk(zv->zv_zso->zvo_disk);
#else
	put_disk(zv->zv_zso->zvo_disk);
#endif
#else
	blk_cleanup_queue(zv->zv_zso->zvo_queue);
	put_disk(zv->zv_zso->zvo_disk);
#endif

#ifdef HAVE_BLK_MQ
	if (zv->zv_zso->use_blk_mq)
		blk_mq_free_tag_set(&zv->zv_zso->tag_set);
#endif

	ida_simple_remove(&zvol_ida,
	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);

	cv_destroy(&zv->zv_removing_cv);
	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);

	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
}

void
zvol_wait_close(zvol_state_t *zv)
{
}

struct add_disk_work {
	struct delayed_work work;
	struct gendisk *disk;
	int error;
};

static int
__zvol_os_add_disk(struct gendisk *disk)
{
	int error = 0;
#ifdef HAVE_ADD_DISK_RET
	error = add_disk(disk);
#else
	add_disk(disk);
#endif
	return (error);
}

#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
static void
zvol_os_add_disk_work(struct work_struct *work)
{
	struct add_disk_work *add_disk_work;
	add_disk_work = container_of(work, struct add_disk_work, work.work);
	add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk);
}
#endif

/*
 * SPECIAL CASE:
 *
 * This function basically calls add_disk() from a workqueue.  You may be
 * thinking: why not just call add_disk() directly?
 *
 * When you call add_disk(), the zvol appears to the world.  When this happens,
 * the kernel calls disk_scan_partitions() on the zvol, which behaves
 * differently on the 6.9+ kernels:
 *
 * - 6.8 and older kernels -
 * disk_scan_partitions()
 *	handle = bdev_open_by_dev(
 *		zvol_open()
 *	bdev_release(handle);
 *		zvol_release()
 *
 *
 * - 6.9+ kernels -
 * disk_scan_partitions()
 *	file = bdev_file_open_by_dev()
 *		zvol_open()
 *	fput(file)
 *	< wait for return to userspace >
 *		zvol_release()
 *
 * The difference is that the bdev_release() from the 6.8 kernel is synchronous
 * while the fput() from the 6.9 kernel is async.  Or more specifically, it's
 * async in a way that has to wait until we return to userspace (since it adds
 * the fput into the caller's work queue with the TWA_RESUME flag set).  This
 * is not the behavior we want, since we want to do things like create+destroy
 * a zvol within a single ZFS_IOC_CREATE ioctl, and the "create" part needs to
 * release the reference to the zvol while we're in the IOCTL, which can't wait
 * until we return to userspace.
 *
 * We can get around this since fput() has a special codepath for when it's
 * running in a kernel thread or interrupt.  In those cases, it just puts the
 * fput into the system workqueue, which we can force to run with
 * __flush_workqueue().  That is why we call add_disk() from a workqueue - so it
 * runs from a kernel thread and "tricks" the fput() codepaths.
 *
 * Note that __flush_workqueue() is slowly getting deprecated.  This may be ok
 * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via
 * fput) to happen, which it eventually, naturally, will from the system_wq
 * without us explicitly calling __flush_workqueue().
 */
static int
zvol_os_add_disk(struct gendisk *disk)
{
#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)	/* 6.9+ kernel */
	struct add_disk_work add_disk_work;

	INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work);
	add_disk_work.disk = disk;
	add_disk_work.error = 0;

	/* Use *_delayed_work functions since they're not GPL'd */
	schedule_delayed_work(&add_disk_work.work, 0);
	flush_delayed_work(&add_disk_work.work);

	__flush_workqueue(system_wq);
	return (add_disk_work.error);
#else	/* <= 6.8 kernel */
	return (__zvol_os_add_disk(disk));
#endif
}

/*
 * Create a block device minor node and setup the linkage between it
 * and the specified volume.  Once this function returns the block
 * device is live and ready for use.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t len;
	unsigned minor = 0;
	int error = 0;
	int idx;
	uint64_t hash = zvol_name_hash(name);
	uint64_t volthreading;
	bool replayed_zil = B_FALSE;

	if (zvol_inhibit_dev)
		return (0);

	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
	if (idx < 0)
		return (SET_ERROR(-idx));
	minor = idx << ZVOL_MINOR_BITS;
	if (MINOR(minor) != minor) {
		/* too many partitions can cause an overflow */
		zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
		    name, minor, MINOR(minor));
		ida_simple_remove(&zvol_ida, idx);
		return (SET_ERROR(EINVAL));
	}

	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
	if (zv) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		ida_simple_remove(&zvol_ida, idx);
		return (SET_ERROR(EEXIST));
	}

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
	if (zv == NULL) {
		error = SET_ERROR(EAGAIN);
		goto out_dmu_objset_disown;
	}
	zv->zv_hash = hash;

	if (dmu_objset_is_snapshot(os))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	/* Default */
	zv->zv_threading = B_TRUE;
	if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL)
	    == 0)
		zv->zv_threading = volthreading;

	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);

#ifdef QUEUE_FLAG_DISCARD
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_NONROT
	blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_ADD_RANDOM
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
#endif
	/* This flag was introduced in kernel version 4.12. */
#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif

	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
		else
			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
	}
	if (replayed_zil)
		zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	/*
	 * When udev detects the addition of the device it will immediately
	 * invoke blkid(8) to determine the type of content on the device.
	 * Prefetching the blocks commonly scanned by blkid(8) will speed
	 * up this process.
	 */
	len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
	if (len > 0) {
		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
		    ZIO_PRIORITY_SYNC_READ);
	}

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));

	/*
	 * Keep in mind that once add_disk() is called, the zvol is
	 * announced to the world, and zvol_open()/zvol_release() can
	 * be called at any time.  Incidentally, add_disk() itself calls
	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
	 * directly as well.
	 */
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		rw_exit(&zvol_state_lock);
		error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
	} else {
		ida_simple_remove(&zvol_ida, idx);
	}

	return (error);
}

void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	int readonly = get_disk_ro(zv->zv_zso->zvo_disk);

	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));

	/* move to new hashtable entry */
	zv->zv_hash = zvol_name_hash(newname);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	/*
	 * The block device's read-only state is briefly changed causing
	 * a KOBJ_CHANGE uevent to be issued.  This ensures udev detects
	 * the name change and fixes the symlinks.  This does not change
	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
	 * changes.  This would normally be done using kobject_uevent() but
	 * that is a GPL-only symbol which is why we need this workaround.
	 */
	set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
	set_disk_ro(zv->zv_zso->zvo_disk, readonly);

	dataset_kstats_rename(&zv->zv_kstat, newname);
}

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{

	set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{

	set_capacity(zv->zv_zso->zvo_disk, capacity);
}
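
/*
 * Set up the Linux zvol support at module load: size and create the per-zvol
 * I/O taskqs and register the zvol block device major.
 */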
int
zvol_init(void)
{
	int error;

	/*
	 * zvol_threads is the module param the user passes in.
	 *
	 * zvol_actual_threads is what we use internally, since the user can
	 * pass zvol_threads = 0 to mean "use all the CPUs" (the default).
	 */
	static unsigned int zvol_actual_threads;

	if (zvol_threads == 0) {
		/*
		 * See dde9380a1 for why 32 was chosen here.  This should
		 * probably be refined to be some multiple of the number
		 * of CPUs.
		 */
		zvol_actual_threads = MAX(num_online_cpus(), 32);
	} else {
		zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
	}

	/*
	 * Use at least 32 zvol_threads, but on many-core systems prefer
	 * 6 threads per taskq, with no more taskqs than threads in them
	 * on large systems.
	 *
	 *                 taskq   total
	 * cpus    taskqs  threads threads
	 * ------- ------- ------- -------
	 * 1       1       32      32
	 * 2       1       32      32
	 * 4       1       32      32
	 * 8       2       16      32
	 * 16      3       11      33
	 * 32      5       7       35
	 * 64      8       8       64
	 * 128     11      12      132
	 * 256     16      16      256
	 */
	zv_taskq_t *ztqs = &zvol_taskqs;
	uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
	if (num_tqs == 0) {
		num_tqs = 1 + num_online_cpus() / 6;
		while (num_tqs * num_tqs > zvol_actual_threads)
			num_tqs--;
	}
	uint_t per_tq_thread = zvol_actual_threads / num_tqs;
	if (per_tq_thread * num_tqs < zvol_actual_threads)
		per_tq_thread++;
	ztqs->tqs_cnt = num_tqs;
	ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
	error = register_blkdev(zvol_major, ZVOL_DRIVER);
	if (error) {
		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
		ztqs->tqs_taskq = NULL;
		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
		return (error);
	}

#ifdef HAVE_BLK_MQ
	if (zvol_blk_mq_queue_depth == 0) {
		zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
	} else {
		zvol_actual_blk_mq_queue_depth =
		    MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
	}

	if (zvol_blk_mq_threads == 0) {
		zvol_blk_mq_actual_threads = num_online_cpus();
	} else {
		zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
		    1024);
	}
#endif
	for (uint_t i = 0; i < num_tqs; i++) {
		char name[32];
		(void) snprintf(name, sizeof (name), "%s_tq-%u",
		    ZVOL_DRIVER, i);
		ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
		    maxclsyspri, per_tq_thread, INT_MAX,
		    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
		if (ztqs->tqs_taskq[i] == NULL) {
			for (int j = i - 1; j >= 0; j--)
				taskq_destroy(ztqs->tqs_taskq[j]);
			unregister_blkdev(zvol_major, ZVOL_DRIVER);
			kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
			    sizeof (taskq_t *));
			ztqs->tqs_taskq = NULL;
			return (-ENOMEM);
		}
	}

	zvol_init_impl();
	ida_init(&zvol_ida);
	return (0);
}
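
/*
 * Tear down zvol support at module unload: unregister the block device
 * major, destroy the I/O taskqs, and release the minor-number IDA.
 */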
void
zvol_fini(void)
{
	zv_taskq_t *ztqs = &zvol_taskqs;
	zvol_fini_impl();
	unregister_blkdev(zvol_major, ZVOL_DRIVER);

	if (ztqs->tqs_taskq == NULL) {
		ASSERT3U(ztqs->tqs_cnt, ==, 0);
	} else {
		for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
			ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
			taskq_destroy(ztqs->tqs_taskq[i]);
		}
		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
		    sizeof (taskq_t *));
		ztqs->tqs_taskq = NULL;
	}

	ida_destroy(&zvol_ida);
}

/* BEGIN CSTYLED */
module_param(zvol_inhibit_dev, uint, 0644);
MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");

module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");

module_param(zvol_threads, uint, 0444);
MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set"
	" to 0 to use all active CPUs");

module_param(zvol_request_sync, uint, 0644);
MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");

module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

module_param(zvol_num_taskqs, uint, 0444);
MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");

module_param(zvol_prefetch_bytes, uint, 0644);
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");

module_param(zvol_volmode, uint, 0644);
MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");

#ifdef HAVE_BLK_MQ
module_param(zvol_blk_mq_queue_depth, uint, 0644);
MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");

module_param(zvol_use_blk_mq, uint, 0644);
MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");

module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
	"Process volblocksize blocks per thread");
#endif

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
module_param(zvol_open_timeout_ms, uint, 0644);
MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
#endif

/* END CSTYLED */