/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
 * Copyright (c) 2024, Klara, Inc.
 */

#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
#include <cityhash.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/workqueue.h>
#include <linux/blk-mq.h>

static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
    struct request *rq, boolean_t force_sync);

static unsigned int zvol_major = ZVOL_MAJOR;
static unsigned int zvol_request_sync = 0;
static unsigned int zvol_prefetch_bytes = (128 * 1024);
static unsigned long zvol_max_discard_blocks = 16384;

/*
 * Switch taskq at multiple of 512 MB offset. This can be set to a lower value
 * to utilize more threads for small files but may affect prefetch hits.
 */
#define	ZVOL_TASKQ_OFFSET_SHIFT	29

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static unsigned int zvol_open_timeout_ms = 1000;
#endif

static unsigned int zvol_threads = 0;
static unsigned int zvol_blk_mq_threads = 0;
static unsigned int zvol_blk_mq_actual_threads;
static boolean_t zvol_use_blk_mq = B_FALSE;

/*
 * The maximum number of volblocksize blocks to process per thread. Typically,
 * write heavy workloads perform better with higher values here, and read
 * heavy workloads perform better with lower values, but that's not a hard
 * and fast rule. It's basically a knob to tune between "less overhead with
 * less parallelism" and "more overhead, but more parallelism".
 *
 * '8' was chosen as a reasonable, balanced, default based on sequential
 * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
 */
static unsigned int zvol_blk_mq_blocks_per_thread = 8;

static unsigned int zvol_num_taskqs = 0;

#ifndef BLKDEV_DEFAULT_RQ
/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
#define	BLKDEV_DEFAULT_RQ	BLKDEV_MAX_RQ
#endif

/*
 * Finalize our BIO or request.
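 *
 * (Note added for clarity: exactly one of 'bio' or 'rq' is non-NULL here.
 * The submit_bio path completes via bio_endio(), the blk-mq path via
 * blk_mq_end_request(); callers pass in zero or a negative errno.)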
93 */ 94 static inline void 95 zvol_end_io(struct bio *bio, struct request *rq, int error) 96 { 97 if (bio) { 98 bio->bi_status = errno_to_bi_status(-error); 99 bio_endio(bio); 100 } else { 101 blk_mq_end_request(rq, errno_to_bi_status(error)); 102 } 103 } 104 105 static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; 106 static unsigned int zvol_actual_blk_mq_queue_depth; 107 108 struct zvol_state_os { 109 struct gendisk *zvo_disk; /* generic disk */ 110 struct request_queue *zvo_queue; /* request queue */ 111 dev_t zvo_dev; /* device id */ 112 113 struct blk_mq_tag_set tag_set; 114 115 /* Set from the global 'zvol_use_blk_mq' at zvol load */ 116 boolean_t use_blk_mq; 117 }; 118 119 typedef struct zv_taskq { 120 uint_t tqs_cnt; 121 taskq_t **tqs_taskq; 122 } zv_taskq_t; 123 static zv_taskq_t zvol_taskqs; 124 static struct ida zvol_ida; 125 126 typedef struct zv_request_stack { 127 zvol_state_t *zv; 128 struct bio *bio; 129 struct request *rq; 130 } zv_request_t; 131 132 typedef struct zv_work { 133 struct request *rq; 134 struct work_struct work; 135 } zv_work_t; 136 137 typedef struct zv_request_task { 138 zv_request_t zvr; 139 taskq_ent_t ent; 140 } zv_request_task_t; 141 142 static zv_request_task_t * 143 zv_request_task_create(zv_request_t zvr) 144 { 145 zv_request_task_t *task; 146 task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); 147 taskq_init_ent(&task->ent); 148 task->zvr = zvr; 149 return (task); 150 } 151 152 static void 153 zv_request_task_free(zv_request_task_t *task) 154 { 155 kmem_free(task, sizeof (*task)); 156 } 157 158 /* 159 * This is called when a new block multiqueue request comes in. A request 160 * contains one or more BIOs. 161 */ 162 static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, 163 const struct blk_mq_queue_data *bd) 164 { 165 struct request *rq = bd->rq; 166 zvol_state_t *zv = rq->q->queuedata; 167 168 /* Tell the kernel that we are starting to process this request */ 169 blk_mq_start_request(rq); 170 171 if (blk_rq_is_passthrough(rq)) { 172 /* Skip non filesystem request */ 173 blk_mq_end_request(rq, BLK_STS_IOERR); 174 return (BLK_STS_IOERR); 175 } 176 177 zvol_request_impl(zv, NULL, rq, 0); 178 179 /* Acknowledge to the kernel that we got this request */ 180 return (BLK_STS_OK); 181 } 182 183 static struct blk_mq_ops zvol_blk_mq_queue_ops = { 184 .queue_rq = zvol_mq_queue_rq, 185 }; 186 187 /* Initialize our blk-mq struct */ 188 static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) 189 { 190 struct zvol_state_os *zso = zv->zv_zso; 191 192 memset(&zso->tag_set, 0, sizeof (zso->tag_set)); 193 194 /* Initialize tag set. */ 195 zso->tag_set.ops = &zvol_blk_mq_queue_ops; 196 zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads; 197 zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; 198 zso->tag_set.numa_node = NUMA_NO_NODE; 199 zso->tag_set.cmd_size = 0; 200 201 /* 202 * We need BLK_MQ_F_BLOCKING here since we do blocking calls in 203 * zvol_request_impl() 204 */ 205 zso->tag_set.flags = BLK_MQ_F_BLOCKING; 206 207 #ifdef BLK_MQ_F_SHOULD_MERGE 208 /* 209 * Linux 6.14 removed BLK_MQ_F_SHOULD_MERGE and made it implicit. 210 * For older kernels, we set it. 211 */ 212 zso->tag_set.flags |= BLK_MQ_F_SHOULD_MERGE; 213 #endif 214 215 zso->tag_set.driver_data = zv; 216 217 return (blk_mq_alloc_tag_set(&zso->tag_set)); 218 } 219 220 /* 221 * Given a path, return TRUE if path is a ZVOL. 
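 *
 * (Note added for clarity: the path is resolved to a block device and its
 * major number compared against zvol_major, so any node created by this
 * driver, including partitions of a zvol, matches.)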
222 */ 223 boolean_t 224 zvol_os_is_zvol(const char *path) 225 { 226 dev_t dev = 0; 227 228 if (vdev_lookup_bdev(path, &dev) != 0) 229 return (B_FALSE); 230 231 if (MAJOR(dev) == zvol_major) 232 return (B_TRUE); 233 234 return (B_FALSE); 235 } 236 237 static void 238 zvol_write(zv_request_t *zvr) 239 { 240 struct bio *bio = zvr->bio; 241 struct request *rq = zvr->rq; 242 int error = 0; 243 zfs_uio_t uio; 244 zvol_state_t *zv = zvr->zv; 245 struct request_queue *q; 246 struct gendisk *disk; 247 unsigned long start_time = 0; 248 boolean_t acct = B_FALSE; 249 250 ASSERT3P(zv, !=, NULL); 251 ASSERT3U(zv->zv_open_count, >, 0); 252 ASSERT3P(zv->zv_zilog, !=, NULL); 253 254 q = zv->zv_zso->zvo_queue; 255 disk = zv->zv_zso->zvo_disk; 256 257 /* bio marked as FLUSH need to flush before write */ 258 if (io_is_flush(bio, rq)) 259 zil_commit(zv->zv_zilog, ZVOL_OBJ); 260 261 /* Some requests are just for flush and nothing else. */ 262 if (io_size(bio, rq) == 0) { 263 rw_exit(&zv->zv_suspend_lock); 264 zvol_end_io(bio, rq, 0); 265 return; 266 } 267 268 zfs_uio_bvec_init(&uio, bio, rq); 269 270 ssize_t start_resid = uio.uio_resid; 271 272 /* 273 * With use_blk_mq, accounting is done by blk_mq_start_request() 274 * and blk_mq_end_request(), so we can skip it here. 275 */ 276 if (bio) { 277 acct = blk_queue_io_stat(q); 278 if (acct) { 279 start_time = blk_generic_start_io_acct(q, disk, WRITE, 280 bio); 281 } 282 } 283 284 boolean_t sync = 285 io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; 286 287 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 288 uio.uio_loffset, uio.uio_resid, RL_WRITER); 289 290 uint64_t volsize = zv->zv_volsize; 291 while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { 292 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); 293 uint64_t off = uio.uio_loffset; 294 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); 295 296 if (bytes > volsize - off) /* don't write past the end */ 297 bytes = volsize - off; 298 299 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); 300 301 /* This will only fail for ENOSPC */ 302 error = dmu_tx_assign(tx, TXG_WAIT); 303 if (error) { 304 dmu_tx_abort(tx); 305 break; 306 } 307 error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); 308 if (error == 0) { 309 zvol_log_write(zv, tx, off, bytes, sync); 310 } 311 dmu_tx_commit(tx); 312 313 if (error) 314 break; 315 } 316 zfs_rangelock_exit(lr); 317 318 int64_t nwritten = start_resid - uio.uio_resid; 319 dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); 320 task_io_account_write(nwritten); 321 322 if (sync) 323 zil_commit(zv->zv_zilog, ZVOL_OBJ); 324 325 rw_exit(&zv->zv_suspend_lock); 326 327 if (bio && acct) { 328 blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); 329 } 330 331 zvol_end_io(bio, rq, -error); 332 } 333 334 static void 335 zvol_write_task(void *arg) 336 { 337 zv_request_task_t *task = arg; 338 zvol_write(&task->zvr); 339 zv_request_task_free(task); 340 } 341 342 static void 343 zvol_discard(zv_request_t *zvr) 344 { 345 struct bio *bio = zvr->bio; 346 struct request *rq = zvr->rq; 347 zvol_state_t *zv = zvr->zv; 348 uint64_t start = io_offset(bio, rq); 349 uint64_t size = io_size(bio, rq); 350 uint64_t end = start + size; 351 boolean_t sync; 352 int error = 0; 353 dmu_tx_t *tx; 354 struct request_queue *q = zv->zv_zso->zvo_queue; 355 struct gendisk *disk = zv->zv_zso->zvo_disk; 356 unsigned long start_time = 0; 357 boolean_t acct = B_FALSE; 358 359 ASSERT3P(zv, !=, NULL); 360 ASSERT3U(zv->zv_open_count, >, 0); 361 ASSERT3P(zv->zv_zilog, !=, 
NULL); 362 363 if (bio) { 364 acct = blk_queue_io_stat(q); 365 if (acct) { 366 start_time = blk_generic_start_io_acct(q, disk, WRITE, 367 bio); 368 } 369 } 370 371 sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; 372 373 if (end > zv->zv_volsize) { 374 error = SET_ERROR(EIO); 375 goto unlock; 376 } 377 378 /* 379 * Align the request to volume block boundaries when a secure erase is 380 * not required. This will prevent dnode_free_range() from zeroing out 381 * the unaligned parts which is slow (read-modify-write) and useless 382 * since we are not freeing any space by doing so. 383 */ 384 if (!io_is_secure_erase(bio, rq)) { 385 start = P2ROUNDUP(start, zv->zv_volblocksize); 386 end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); 387 size = end - start; 388 } 389 390 if (start >= end) 391 goto unlock; 392 393 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 394 start, size, RL_WRITER); 395 396 tx = dmu_tx_create(zv->zv_objset); 397 dmu_tx_mark_netfree(tx); 398 error = dmu_tx_assign(tx, TXG_WAIT); 399 if (error != 0) { 400 dmu_tx_abort(tx); 401 } else { 402 zvol_log_truncate(zv, tx, start, size); 403 dmu_tx_commit(tx); 404 error = dmu_free_long_range(zv->zv_objset, 405 ZVOL_OBJ, start, size); 406 } 407 zfs_rangelock_exit(lr); 408 409 if (error == 0 && sync) 410 zil_commit(zv->zv_zilog, ZVOL_OBJ); 411 412 unlock: 413 rw_exit(&zv->zv_suspend_lock); 414 415 if (bio && acct) { 416 blk_generic_end_io_acct(q, disk, WRITE, bio, 417 start_time); 418 } 419 420 zvol_end_io(bio, rq, -error); 421 } 422 423 static void 424 zvol_discard_task(void *arg) 425 { 426 zv_request_task_t *task = arg; 427 zvol_discard(&task->zvr); 428 zv_request_task_free(task); 429 } 430 431 static void 432 zvol_read(zv_request_t *zvr) 433 { 434 struct bio *bio = zvr->bio; 435 struct request *rq = zvr->rq; 436 int error = 0; 437 zfs_uio_t uio; 438 boolean_t acct = B_FALSE; 439 zvol_state_t *zv = zvr->zv; 440 struct request_queue *q; 441 struct gendisk *disk; 442 unsigned long start_time = 0; 443 444 ASSERT3P(zv, !=, NULL); 445 ASSERT3U(zv->zv_open_count, >, 0); 446 447 zfs_uio_bvec_init(&uio, bio, rq); 448 449 q = zv->zv_zso->zvo_queue; 450 disk = zv->zv_zso->zvo_disk; 451 452 ssize_t start_resid = uio.uio_resid; 453 454 /* 455 * When blk-mq is being used, accounting is done by 456 * blk_mq_start_request() and blk_mq_end_request(). 
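	 * (Note added for clarity: for the BIO path, generic accounting is
	 * started and ended explicitly below, and only when
	 * blk_queue_io_stat() reports that I/O statistics are enabled for
	 * the queue.)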
457 */ 458 if (bio) { 459 acct = blk_queue_io_stat(q); 460 if (acct) 461 start_time = blk_generic_start_io_acct(q, disk, READ, 462 bio); 463 } 464 465 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 466 uio.uio_loffset, uio.uio_resid, RL_READER); 467 468 uint64_t volsize = zv->zv_volsize; 469 470 while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { 471 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); 472 473 /* don't read past the end */ 474 if (bytes > volsize - uio.uio_loffset) 475 bytes = volsize - uio.uio_loffset; 476 477 error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); 478 if (error) { 479 /* convert checksum errors into IO errors */ 480 if (error == ECKSUM) 481 error = SET_ERROR(EIO); 482 break; 483 } 484 } 485 zfs_rangelock_exit(lr); 486 487 int64_t nread = start_resid - uio.uio_resid; 488 dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); 489 task_io_account_read(nread); 490 491 rw_exit(&zv->zv_suspend_lock); 492 493 if (bio && acct) { 494 blk_generic_end_io_acct(q, disk, READ, bio, start_time); 495 } 496 497 zvol_end_io(bio, rq, -error); 498 } 499 500 static void 501 zvol_read_task(void *arg) 502 { 503 zv_request_task_t *task = arg; 504 zvol_read(&task->zvr); 505 zv_request_task_free(task); 506 } 507 508 509 /* 510 * Process a BIO or request 511 * 512 * Either 'bio' or 'rq' should be set depending on if we are processing a 513 * bio or a request (both should not be set). 514 * 515 * force_sync: Set to 0 to defer processing to a background taskq 516 * Set to 1 to process data synchronously 517 */ 518 static void 519 zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, 520 boolean_t force_sync) 521 { 522 fstrans_cookie_t cookie = spl_fstrans_mark(); 523 uint64_t offset = io_offset(bio, rq); 524 uint64_t size = io_size(bio, rq); 525 int rw = io_data_dir(bio, rq); 526 527 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { 528 zvol_end_io(bio, rq, -SET_ERROR(ENXIO)); 529 goto out; 530 } 531 532 if (zvol_request_sync || zv->zv_threading == B_FALSE) 533 force_sync = 1; 534 535 zv_request_t zvr = { 536 .zv = zv, 537 .bio = bio, 538 .rq = rq, 539 }; 540 541 if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { 542 printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", 543 zv->zv_zso->zvo_disk->disk_name, 544 (long long unsigned)offset, 545 (long unsigned)size); 546 547 zvol_end_io(bio, rq, -SET_ERROR(EIO)); 548 goto out; 549 } 550 551 zv_request_task_t *task; 552 zv_taskq_t *ztqs = &zvol_taskqs; 553 uint_t blk_mq_hw_queue = 0; 554 uint_t tq_idx; 555 uint_t taskq_hash; 556 if (rq) 557 #ifdef HAVE_BLK_MQ_RQ_HCTX 558 blk_mq_hw_queue = rq->mq_hctx->queue_num; 559 #else 560 blk_mq_hw_queue = 561 rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num; 562 #endif 563 taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, 564 blk_mq_hw_queue); 565 tq_idx = taskq_hash % ztqs->tqs_cnt; 566 567 if (rw == WRITE) { 568 if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { 569 zvol_end_io(bio, rq, -SET_ERROR(EROFS)); 570 goto out; 571 } 572 573 /* 574 * Prevents the zvol from being suspended, or the ZIL being 575 * concurrently opened. Will be released after the i/o 576 * completes. 577 */ 578 rw_enter(&zv->zv_suspend_lock, RW_READER); 579 580 /* 581 * Open a ZIL if this is the first time we have written to this 582 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather 583 * than zv_state_lock so that we don't need to acquire an 584 * additional lock in this path. 
585 */ 586 if (zv->zv_zilog == NULL) { 587 rw_exit(&zv->zv_suspend_lock); 588 rw_enter(&zv->zv_suspend_lock, RW_WRITER); 589 if (zv->zv_zilog == NULL) { 590 zv->zv_zilog = zil_open(zv->zv_objset, 591 zvol_get_data, &zv->zv_kstat.dk_zil_sums); 592 zv->zv_flags |= ZVOL_WRITTEN_TO; 593 /* replay / destroy done in zvol_create_minor */ 594 VERIFY0((zv->zv_zilog->zl_header->zh_flags & 595 ZIL_REPLAY_NEEDED)); 596 } 597 rw_downgrade(&zv->zv_suspend_lock); 598 } 599 600 /* 601 * We don't want this thread to be blocked waiting for i/o to 602 * complete, so we instead wait from a taskq callback. The 603 * i/o may be a ZIL write (via zil_commit()), or a read of an 604 * indirect block, or a read of a data block (if this is a 605 * partial-block write). We will indicate that the i/o is 606 * complete by calling END_IO() from the taskq callback. 607 * 608 * This design allows the calling thread to continue and 609 * initiate more concurrent operations by calling 610 * zvol_request() again. There are typically only a small 611 * number of threads available to call zvol_request() (e.g. 612 * one per iSCSI target), so keeping the latency of 613 * zvol_request() low is important for performance. 614 * 615 * The zvol_request_sync module parameter allows this 616 * behavior to be altered, for performance evaluation 617 * purposes. If the callback blocks, setting 618 * zvol_request_sync=1 will result in much worse performance. 619 * 620 * We can have up to zvol_threads concurrent i/o's being 621 * processed for all zvols on the system. This is typically 622 * a vast improvement over the zvol_request_sync=1 behavior 623 * of one i/o at a time per zvol. However, an even better 624 * design would be for zvol_request() to initiate the zio 625 * directly, and then be notified by the zio_done callback, 626 * which would call END_IO(). Unfortunately, the DMU/ZIL 627 * interfaces lack this functionality (they block waiting for 628 * the i/o to complete). 629 */ 630 if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { 631 if (force_sync) { 632 zvol_discard(&zvr); 633 } else { 634 task = zv_request_task_create(zvr); 635 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 636 zvol_discard_task, task, 0, &task->ent); 637 } 638 } else { 639 if (force_sync) { 640 zvol_write(&zvr); 641 } else { 642 task = zv_request_task_create(zvr); 643 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 644 zvol_write_task, task, 0, &task->ent); 645 } 646 } 647 } else { 648 /* 649 * The SCST driver, and possibly others, may issue READ I/Os 650 * with a length of zero bytes. These empty I/Os contain no 651 * data and require no additional handling. 652 */ 653 if (size == 0) { 654 zvol_end_io(bio, rq, 0); 655 goto out; 656 } 657 658 rw_enter(&zv->zv_suspend_lock, RW_READER); 659 660 /* See comment in WRITE case above. 
*/ 661 if (force_sync) { 662 zvol_read(&zvr); 663 } else { 664 task = zv_request_task_create(zvr); 665 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 666 zvol_read_task, task, 0, &task->ent); 667 } 668 } 669 670 out: 671 spl_fstrans_unmark(cookie); 672 } 673 674 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 675 #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID 676 static void 677 zvol_submit_bio(struct bio *bio) 678 #else 679 static blk_qc_t 680 zvol_submit_bio(struct bio *bio) 681 #endif 682 #else 683 static MAKE_REQUEST_FN_RET 684 zvol_request(struct request_queue *q, struct bio *bio) 685 #endif 686 { 687 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 688 #if defined(HAVE_BIO_BDEV_DISK) 689 struct request_queue *q = bio->bi_bdev->bd_disk->queue; 690 #else 691 struct request_queue *q = bio->bi_disk->queue; 692 #endif 693 #endif 694 zvol_state_t *zv = q->queuedata; 695 696 zvol_request_impl(zv, bio, NULL, 0); 697 #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ 698 defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ 699 !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) 700 return (BLK_QC_T_NONE); 701 #endif 702 } 703 704 static int 705 #ifdef HAVE_BLK_MODE_T 706 zvol_open(struct gendisk *disk, blk_mode_t flag) 707 #else 708 zvol_open(struct block_device *bdev, fmode_t flag) 709 #endif 710 { 711 zvol_state_t *zv; 712 int error = 0; 713 boolean_t drop_suspend = B_FALSE; 714 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 715 hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); 716 hrtime_t start = gethrtime(); 717 718 retry: 719 #endif 720 rw_enter(&zvol_state_lock, RW_READER); 721 /* 722 * Obtain a copy of private_data under the zvol_state_lock to make 723 * sure that either the result of zvol free code path setting 724 * disk->private_data to NULL is observed, or zvol_os_free() 725 * is not called on this zv because of the positive zv_open_count. 726 */ 727 #ifdef HAVE_BLK_MODE_T 728 zv = disk->private_data; 729 #else 730 zv = bdev->bd_disk->private_data; 731 #endif 732 if (zv == NULL) { 733 rw_exit(&zvol_state_lock); 734 return (-SET_ERROR(ENXIO)); 735 } 736 737 mutex_enter(&zv->zv_state_lock); 738 739 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { 740 mutex_exit(&zv->zv_state_lock); 741 rw_exit(&zvol_state_lock); 742 return (-SET_ERROR(ENXIO)); 743 } 744 745 /* 746 * Make sure zvol is not suspended during first open 747 * (hold zv_suspend_lock) and respect proper lock acquisition 748 * ordering - zv_suspend_lock before zv_state_lock 749 */ 750 if (zv->zv_open_count == 0) { 751 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { 752 mutex_exit(&zv->zv_state_lock); 753 rw_enter(&zv->zv_suspend_lock, RW_READER); 754 mutex_enter(&zv->zv_state_lock); 755 /* check to see if zv_suspend_lock is needed */ 756 if (zv->zv_open_count != 0) { 757 rw_exit(&zv->zv_suspend_lock); 758 } else { 759 drop_suspend = B_TRUE; 760 } 761 } else { 762 drop_suspend = B_TRUE; 763 } 764 } 765 rw_exit(&zvol_state_lock); 766 767 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 768 769 if (zv->zv_open_count == 0) { 770 boolean_t drop_namespace = B_FALSE; 771 772 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); 773 774 /* 775 * In all other call paths the spa_namespace_lock is taken 776 * before the bdev->bd_mutex lock. However, on open(2) 777 * the __blkdev_get() function calls fops->open() with the 778 * bdev->bd_mutex lock held. This can result in a deadlock 779 * when zvols from one pool are used as vdevs in another. 780 * 781 * To prevent a lock inversion deadlock we preemptively 782 * take the spa_namespace_lock. 
Normally the lock will not 783 * be contended and this is safe because spa_open_common() 784 * handles the case where the caller already holds the 785 * spa_namespace_lock. 786 * 787 * When the lock cannot be aquired after multiple retries 788 * this must be the vdev on zvol deadlock case and we have 789 * no choice but to return an error. For 5.12 and older 790 * kernels returning -ERESTARTSYS will result in the 791 * bdev->bd_mutex being dropped, then reacquired, and 792 * fops->open() being called again. This process can be 793 * repeated safely until both locks are acquired. For 5.13 794 * and newer the -ERESTARTSYS retry logic was removed from 795 * the kernel so the only option is to return the error for 796 * the caller to handle it. 797 */ 798 if (!mutex_owned(&spa_namespace_lock)) { 799 if (!mutex_tryenter(&spa_namespace_lock)) { 800 mutex_exit(&zv->zv_state_lock); 801 rw_exit(&zv->zv_suspend_lock); 802 drop_suspend = B_FALSE; 803 804 #ifdef HAVE_BLKDEV_GET_ERESTARTSYS 805 schedule(); 806 return (-SET_ERROR(ERESTARTSYS)); 807 #else 808 if ((gethrtime() - start) > timeout) 809 return (-SET_ERROR(ERESTARTSYS)); 810 811 schedule_timeout_interruptible( 812 MSEC_TO_TICK(10)); 813 goto retry; 814 #endif 815 } else { 816 drop_namespace = B_TRUE; 817 } 818 } 819 820 error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); 821 822 if (drop_namespace) 823 mutex_exit(&spa_namespace_lock); 824 } 825 826 if (error == 0) { 827 if ((blk_mode_is_open_write(flag)) && 828 (zv->zv_flags & ZVOL_RDONLY)) { 829 if (zv->zv_open_count == 0) 830 zvol_last_close(zv); 831 832 error = -SET_ERROR(EROFS); 833 } else { 834 zv->zv_open_count++; 835 } 836 } 837 838 mutex_exit(&zv->zv_state_lock); 839 if (drop_suspend) 840 rw_exit(&zv->zv_suspend_lock); 841 842 if (error == 0) 843 #ifdef HAVE_BLK_MODE_T 844 disk_check_media_change(disk); 845 #else 846 zfs_check_media_change(bdev); 847 #endif 848 849 return (error); 850 } 851 852 static void 853 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG 854 zvol_release(struct gendisk *disk) 855 #else 856 zvol_release(struct gendisk *disk, fmode_t unused) 857 #endif 858 { 859 #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) 860 (void) unused; 861 #endif 862 zvol_state_t *zv; 863 boolean_t drop_suspend = B_TRUE; 864 865 rw_enter(&zvol_state_lock, RW_READER); 866 zv = disk->private_data; 867 868 mutex_enter(&zv->zv_state_lock); 869 ASSERT3U(zv->zv_open_count, >, 0); 870 /* 871 * make sure zvol is not suspended during last close 872 * (hold zv_suspend_lock) and respect proper lock acquisition 873 * ordering - zv_suspend_lock before zv_state_lock 874 */ 875 if (zv->zv_open_count == 1) { 876 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { 877 mutex_exit(&zv->zv_state_lock); 878 rw_enter(&zv->zv_suspend_lock, RW_READER); 879 mutex_enter(&zv->zv_state_lock); 880 /* check to see if zv_suspend_lock is needed */ 881 if (zv->zv_open_count != 1) { 882 rw_exit(&zv->zv_suspend_lock); 883 drop_suspend = B_FALSE; 884 } 885 } 886 } else { 887 drop_suspend = B_FALSE; 888 } 889 rw_exit(&zvol_state_lock); 890 891 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 892 893 zv->zv_open_count--; 894 if (zv->zv_open_count == 0) { 895 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); 896 zvol_last_close(zv); 897 } 898 899 mutex_exit(&zv->zv_state_lock); 900 901 if (drop_suspend) 902 rw_exit(&zv->zv_suspend_lock); 903 } 904 905 static int 906 zvol_ioctl(struct block_device *bdev, fmode_t mode, 907 unsigned int cmd, unsigned long arg) 908 { 909 zvol_state_t *zv = bdev->bd_disk->private_data; 910 int 
error = 0; 911 912 ASSERT3U(zv->zv_open_count, >, 0); 913 914 switch (cmd) { 915 case BLKFLSBUF: 916 #ifdef HAVE_FSYNC_BDEV 917 fsync_bdev(bdev); 918 #elif defined(HAVE_SYNC_BLOCKDEV) 919 sync_blockdev(bdev); 920 #else 921 #error "Neither fsync_bdev() nor sync_blockdev() found" 922 #endif 923 invalidate_bdev(bdev); 924 rw_enter(&zv->zv_suspend_lock, RW_READER); 925 926 if (!(zv->zv_flags & ZVOL_RDONLY)) 927 txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); 928 929 rw_exit(&zv->zv_suspend_lock); 930 break; 931 932 case BLKZNAME: 933 mutex_enter(&zv->zv_state_lock); 934 error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); 935 mutex_exit(&zv->zv_state_lock); 936 break; 937 938 default: 939 error = -ENOTTY; 940 break; 941 } 942 943 return (SET_ERROR(error)); 944 } 945 946 #ifdef CONFIG_COMPAT 947 static int 948 zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, 949 unsigned cmd, unsigned long arg) 950 { 951 return (zvol_ioctl(bdev, mode, cmd, arg)); 952 } 953 #else 954 #define zvol_compat_ioctl NULL 955 #endif 956 957 static unsigned int 958 zvol_check_events(struct gendisk *disk, unsigned int clearing) 959 { 960 unsigned int mask = 0; 961 962 rw_enter(&zvol_state_lock, RW_READER); 963 964 zvol_state_t *zv = disk->private_data; 965 if (zv != NULL) { 966 mutex_enter(&zv->zv_state_lock); 967 mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; 968 zv->zv_changed = 0; 969 mutex_exit(&zv->zv_state_lock); 970 } 971 972 rw_exit(&zvol_state_lock); 973 974 return (mask); 975 } 976 977 static int 978 zvol_revalidate_disk(struct gendisk *disk) 979 { 980 rw_enter(&zvol_state_lock, RW_READER); 981 982 zvol_state_t *zv = disk->private_data; 983 if (zv != NULL) { 984 mutex_enter(&zv->zv_state_lock); 985 set_capacity(zv->zv_zso->zvo_disk, 986 zv->zv_volsize >> SECTOR_BITS); 987 mutex_exit(&zv->zv_state_lock); 988 } 989 990 rw_exit(&zvol_state_lock); 991 992 return (0); 993 } 994 995 int 996 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) 997 { 998 struct gendisk *disk = zv->zv_zso->zvo_disk; 999 1000 #if defined(HAVE_REVALIDATE_DISK_SIZE) 1001 revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); 1002 #elif defined(HAVE_REVALIDATE_DISK) 1003 revalidate_disk(disk); 1004 #else 1005 zvol_revalidate_disk(disk); 1006 #endif 1007 return (0); 1008 } 1009 1010 void 1011 zvol_os_clear_private(zvol_state_t *zv) 1012 { 1013 /* 1014 * Cleared while holding zvol_state_lock as a writer 1015 * which will prevent zvol_open() from opening it. 1016 */ 1017 zv->zv_zso->zvo_disk->private_data = NULL; 1018 } 1019 1020 /* 1021 * Provide a simple virtual geometry for legacy compatibility. For devices 1022 * smaller than 1 MiB a small head and sector count is used to allow very 1023 * tiny devices. For devices over 1 Mib a standard head and sector count 1024 * is used to keep the cylinders count reasonable. 1025 */ 1026 static int 1027 zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) 1028 { 1029 zvol_state_t *zv = bdev->bd_disk->private_data; 1030 sector_t sectors; 1031 1032 ASSERT3U(zv->zv_open_count, >, 0); 1033 1034 sectors = get_capacity(zv->zv_zso->zvo_disk); 1035 1036 if (sectors > 2048) { 1037 geo->heads = 16; 1038 geo->sectors = 63; 1039 } else { 1040 geo->heads = 2; 1041 geo->sectors = 4; 1042 } 1043 1044 geo->start = 0; 1045 geo->cylinders = sectors / (geo->heads * geo->sectors); 1046 1047 return (0); 1048 } 1049 1050 /* 1051 * Why have two separate block_device_operations structs? 1052 * 1053 * Normally we'd just have one, and assign 'submit_bio' as needed. 
However, 1054 * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we 1055 * can't just change submit_bio dynamically at runtime. So just create two 1056 * separate structs to get around this. 1057 */ 1058 static const struct block_device_operations zvol_ops_blk_mq = { 1059 .open = zvol_open, 1060 .release = zvol_release, 1061 .ioctl = zvol_ioctl, 1062 .compat_ioctl = zvol_compat_ioctl, 1063 .check_events = zvol_check_events, 1064 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK 1065 .revalidate_disk = zvol_revalidate_disk, 1066 #endif 1067 .getgeo = zvol_getgeo, 1068 .owner = THIS_MODULE, 1069 }; 1070 1071 static const struct block_device_operations zvol_ops = { 1072 .open = zvol_open, 1073 .release = zvol_release, 1074 .ioctl = zvol_ioctl, 1075 .compat_ioctl = zvol_compat_ioctl, 1076 .check_events = zvol_check_events, 1077 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK 1078 .revalidate_disk = zvol_revalidate_disk, 1079 #endif 1080 .getgeo = zvol_getgeo, 1081 .owner = THIS_MODULE, 1082 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 1083 .submit_bio = zvol_submit_bio, 1084 #endif 1085 }; 1086 1087 /* 1088 * Since 6.9, Linux has been removing queue limit setters in favour of an 1089 * initial queue_limits struct applied when the device is open. Since 6.11, 1090 * queue_limits is being extended to allow more things to be applied when the 1091 * device is open. Setters are also being removed for this. 1092 * 1093 * For OpenZFS, this means that depending on kernel version, some options may 1094 * be set up before the device is open, and some applied to an open device 1095 * (queue) after the fact. 1096 * 1097 * We manage this complexity by having our own limits struct, 1098 * zvol_queue_limits_t, in which we carry any queue config that we're 1099 * interested in setting. This structure is the same on all kernels. 1100 * 1101 * These limits are then applied to the queue at device open time by the most 1102 * appropriate method for the kernel. 1103 * 1104 * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of 1105 * blk_alloc_disk() exists). This converts our limits struct to a proper Linux 1106 * struct queue_limits, and passes it in. Any fields added in later kernels are 1107 * (obviously) not set up here. 1108 * 1109 * zvol_queue_limits_apply() is called on all kernel versions after the queue 1110 * is created, and applies any remaining config. Before 6.9 that will be 1111 * everything, via setter methods. After 6.9 that will be whatever couldn't be 1112 * put into struct queue_limits. (This implies that zvol_queue_limits_apply() 1113 * will always be a no-op on the latest kernel we support). 1114 */ 1115 typedef struct zvol_queue_limits { 1116 unsigned int zql_max_hw_sectors; 1117 unsigned short zql_max_segments; 1118 unsigned int zql_max_segment_size; 1119 unsigned int zql_io_opt; 1120 unsigned int zql_physical_block_size; 1121 unsigned int zql_max_discard_sectors; 1122 unsigned int zql_discard_granularity; 1123 } zvol_queue_limits_t; 1124 1125 static void 1126 zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv, 1127 boolean_t use_blk_mq) 1128 { 1129 limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9; 1130 1131 if (use_blk_mq) { 1132 /* 1133 * IO requests can be really big (1MB). When an IO request 1134 * comes in, it is passed off to zvol_read() or zvol_write() 1135 * in a new thread, where it is chunked up into 'volblocksize' 1136 * sized pieces and processed. 
		 * So for example, if the request is a 1MB write and your
		 * volblocksize is 128k, one zvol_write thread will take that
		 * request and sequentially do eight 128k IOs. This is due to
		 * the fact that the thread needs to lock each volblocksize
		 * sized block. So you might be wondering: "instead of passing
		 * the whole 1MB request to one thread, why not pass eight
		 * individual 128k chunks to eight threads and process the
		 * whole write in parallel?" The short answer is that there's
		 * a sweet spot number of chunks that balances the greater
		 * parallelism with the added overhead of more threads. The
		 * sweet spot can be different depending on whether you have a
		 * read or write heavy workload. Writes typically want high
		 * chunk counts while reads typically want lower ones. On a
		 * test pool with 6 NVMe drives in a 3x 2-disk mirror
		 * configuration, with volblocksize=8k, the sweet spot for good
		 * sequential reads and writes was at 8 chunks.
		 */

		/*
		 * Below we tell the kernel how big we want our requests
		 * to be. You would think that blk_queue_io_opt() would be
		 * used to do this since it is used to "set optimal request
		 * size for the queue", but that doesn't seem to do
		 * anything - the kernel still gives you huge requests
		 * with tons of little PAGE_SIZE segments contained within it.
		 *
		 * Knowing that the kernel will just give you PAGE_SIZE segments
		 * no matter what, you can say "ok, I want PAGE_SIZE byte
		 * segments, and I want 'N' of them per request", where N is
		 * the correct number of segments for the volblocksize and
		 * number of chunks you want.
		 */
		if (zvol_blk_mq_blocks_per_thread != 0) {
			unsigned int chunks;
			chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);

			limits->zql_max_segment_size = PAGE_SIZE;
			limits->zql_max_segments =
			    (zv->zv_volblocksize * chunks) / PAGE_SIZE;
		} else {
			/*
			 * Special case: zvol_blk_mq_blocks_per_thread = 0
			 * Max everything out.
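			 *
			 * (Worked example, added for illustration and assuming
			 * a 4k PAGE_SIZE: with volblocksize=8k and
			 * zvol_blk_mq_blocks_per_thread=8, the branch above
			 * asks for max_segment_size=4k and max_segments =
			 * (8k * 8) / 4k = 16, i.e. at most 64k of data per
			 * request. With the parameter set to 0, both limits
			 * are simply maxed out below.)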
1179 */ 1180 limits->zql_max_segments = UINT16_MAX; 1181 limits->zql_max_segment_size = UINT_MAX; 1182 } 1183 } else { 1184 limits->zql_max_segments = UINT16_MAX; 1185 limits->zql_max_segment_size = UINT_MAX; 1186 } 1187 1188 limits->zql_io_opt = DMU_MAX_ACCESS / 2; 1189 1190 limits->zql_physical_block_size = zv->zv_volblocksize; 1191 limits->zql_max_discard_sectors = 1192 (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9; 1193 limits->zql_discard_granularity = zv->zv_volblocksize; 1194 } 1195 1196 #ifdef HAVE_BLK_ALLOC_DISK_2ARG 1197 static void 1198 zvol_queue_limits_convert(zvol_queue_limits_t *limits, 1199 struct queue_limits *qlimits) 1200 { 1201 memset(qlimits, 0, sizeof (struct queue_limits)); 1202 qlimits->max_hw_sectors = limits->zql_max_hw_sectors; 1203 qlimits->max_segments = limits->zql_max_segments; 1204 qlimits->max_segment_size = limits->zql_max_segment_size; 1205 qlimits->io_opt = limits->zql_io_opt; 1206 qlimits->physical_block_size = limits->zql_physical_block_size; 1207 qlimits->max_discard_sectors = limits->zql_max_discard_sectors; 1208 qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors; 1209 qlimits->discard_granularity = limits->zql_discard_granularity; 1210 #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES 1211 qlimits->features = 1212 BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT; 1213 #endif 1214 } 1215 #endif 1216 1217 static void 1218 zvol_queue_limits_apply(zvol_queue_limits_t *limits, 1219 struct request_queue *queue) 1220 { 1221 #ifndef HAVE_BLK_ALLOC_DISK_2ARG 1222 blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors); 1223 blk_queue_max_segments(queue, limits->zql_max_segments); 1224 blk_queue_max_segment_size(queue, limits->zql_max_segment_size); 1225 blk_queue_io_opt(queue, limits->zql_io_opt); 1226 blk_queue_physical_block_size(queue, limits->zql_physical_block_size); 1227 blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors); 1228 blk_queue_discard_granularity(queue, limits->zql_discard_granularity); 1229 #endif 1230 #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES 1231 blk_queue_set_write_cache(queue, B_TRUE); 1232 blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue); 1233 #endif 1234 } 1235 1236 static int 1237 zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) 1238 { 1239 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) 1240 #if defined(HAVE_BLK_ALLOC_DISK) 1241 zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); 1242 if (zso->zvo_disk == NULL) 1243 return (1); 1244 1245 zso->zvo_disk->minors = ZVOL_MINORS; 1246 zso->zvo_queue = zso->zvo_disk->queue; 1247 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) 1248 struct queue_limits qlimits; 1249 zvol_queue_limits_convert(limits, &qlimits); 1250 struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE); 1251 if (IS_ERR(disk)) { 1252 zso->zvo_disk = NULL; 1253 return (1); 1254 } 1255 1256 zso->zvo_disk = disk; 1257 zso->zvo_disk->minors = ZVOL_MINORS; 1258 zso->zvo_queue = zso->zvo_disk->queue; 1259 1260 #else 1261 zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); 1262 if (zso->zvo_queue == NULL) 1263 return (1); 1264 1265 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1266 if (zso->zvo_disk == NULL) { 1267 blk_cleanup_queue(zso->zvo_queue); 1268 return (1); 1269 } 1270 1271 zso->zvo_disk->queue = zso->zvo_queue; 1272 #endif /* HAVE_BLK_ALLOC_DISK */ 1273 #else 1274 zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); 1275 if (zso->zvo_queue == NULL) 1276 return (1); 1277 1278 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1279 if (zso->zvo_disk == NULL) { 
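		/*
		 * (Note added for clarity: alloc_disk() failed, so release
		 * the request queue allocated just above before bailing out.)
		 */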
1280 blk_cleanup_queue(zso->zvo_queue); 1281 return (1); 1282 } 1283 1284 zso->zvo_disk->queue = zso->zvo_queue; 1285 #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ 1286 1287 zvol_queue_limits_apply(limits, zso->zvo_queue); 1288 1289 return (0); 1290 1291 } 1292 1293 static int 1294 zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) 1295 { 1296 struct zvol_state_os *zso = zv->zv_zso; 1297 1298 /* Allocate our blk-mq tag_set */ 1299 if (zvol_blk_mq_alloc_tag_set(zv) != 0) 1300 return (1); 1301 1302 #if defined(HAVE_BLK_ALLOC_DISK) 1303 zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); 1304 if (zso->zvo_disk == NULL) { 1305 blk_mq_free_tag_set(&zso->tag_set); 1306 return (1); 1307 } 1308 zso->zvo_queue = zso->zvo_disk->queue; 1309 zso->zvo_disk->minors = ZVOL_MINORS; 1310 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) 1311 struct queue_limits qlimits; 1312 zvol_queue_limits_convert(limits, &qlimits); 1313 struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv); 1314 if (IS_ERR(disk)) { 1315 zso->zvo_disk = NULL; 1316 blk_mq_free_tag_set(&zso->tag_set); 1317 return (1); 1318 } 1319 1320 zso->zvo_disk = disk; 1321 zso->zvo_queue = zso->zvo_disk->queue; 1322 zso->zvo_disk->minors = ZVOL_MINORS; 1323 #else 1324 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1325 if (zso->zvo_disk == NULL) { 1326 blk_cleanup_queue(zso->zvo_queue); 1327 blk_mq_free_tag_set(&zso->tag_set); 1328 return (1); 1329 } 1330 /* Allocate queue */ 1331 zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); 1332 if (IS_ERR(zso->zvo_queue)) { 1333 blk_mq_free_tag_set(&zso->tag_set); 1334 return (1); 1335 } 1336 1337 /* Our queue is now created, assign it to our disk */ 1338 zso->zvo_disk->queue = zso->zvo_queue; 1339 #endif 1340 1341 zvol_queue_limits_apply(limits, zso->zvo_queue); 1342 1343 return (0); 1344 } 1345 1346 /* 1347 * Allocate memory for a new zvol_state_t and setup the required 1348 * request queue and generic disk structures for the block device. 1349 */ 1350 static zvol_state_t * 1351 zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize) 1352 { 1353 zvol_state_t *zv; 1354 struct zvol_state_os *zso; 1355 uint64_t volmode; 1356 int ret; 1357 1358 if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) 1359 return (NULL); 1360 1361 if (volmode == ZFS_VOLMODE_DEFAULT) 1362 volmode = zvol_volmode; 1363 1364 if (volmode == ZFS_VOLMODE_NONE) 1365 return (NULL); 1366 1367 zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); 1368 zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); 1369 zv->zv_zso = zso; 1370 zv->zv_volmode = volmode; 1371 zv->zv_volblocksize = volblocksize; 1372 1373 list_link_init(&zv->zv_next); 1374 mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); 1375 cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); 1376 1377 zv->zv_zso->use_blk_mq = zvol_use_blk_mq; 1378 1379 zvol_queue_limits_t limits; 1380 zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq); 1381 1382 /* 1383 * The block layer has 3 interfaces for getting BIOs: 1384 * 1385 * 1. blk-mq request queues (new) 1386 * 2. submit_bio() (oldest) 1387 * 3. regular request queues (old). 1388 * 1389 * Each of those interfaces has two permutations: 1390 * 1391 * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates 1392 * both the disk and its queue (5.14 kernel or newer) 1393 * 1394 * b) We don't have blk_*alloc_disk(), and have to allocate the 1395 * disk and the queue separately. 
(5.13 kernel or older) 1396 */ 1397 if (zv->zv_zso->use_blk_mq) { 1398 ret = zvol_alloc_blk_mq(zv, &limits); 1399 zso->zvo_disk->fops = &zvol_ops_blk_mq; 1400 } else { 1401 ret = zvol_alloc_non_blk_mq(zso, &limits); 1402 zso->zvo_disk->fops = &zvol_ops; 1403 } 1404 if (ret != 0) 1405 goto out_kmem; 1406 1407 /* Limit read-ahead to a single page to prevent over-prefetching. */ 1408 blk_queue_set_read_ahead(zso->zvo_queue, 1); 1409 1410 if (!zv->zv_zso->use_blk_mq) { 1411 /* Disable write merging in favor of the ZIO pipeline. */ 1412 blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); 1413 } 1414 1415 zso->zvo_queue->queuedata = zv; 1416 zso->zvo_dev = dev; 1417 zv->zv_open_count = 0; 1418 strlcpy(zv->zv_name, name, sizeof (zv->zv_name)); 1419 1420 zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); 1421 rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); 1422 1423 zso->zvo_disk->major = zvol_major; 1424 zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; 1425 1426 /* 1427 * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. 1428 * This is accomplished by limiting the number of minors for the 1429 * device to one and explicitly disabling partition scanning. 1430 */ 1431 if (volmode == ZFS_VOLMODE_DEV) { 1432 zso->zvo_disk->minors = 1; 1433 zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT; 1434 zso->zvo_disk->flags |= GENHD_FL_NO_PART; 1435 } 1436 1437 zso->zvo_disk->first_minor = (dev & MINORMASK); 1438 zso->zvo_disk->private_data = zv; 1439 snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", 1440 ZVOL_DEV_NAME, (dev & MINORMASK)); 1441 1442 return (zv); 1443 1444 out_kmem: 1445 kmem_free(zso, sizeof (struct zvol_state_os)); 1446 kmem_free(zv, sizeof (zvol_state_t)); 1447 return (NULL); 1448 } 1449 1450 /* 1451 * Cleanup then free a zvol_state_t which was created by zvol_alloc(). 1452 * At this time, the structure is not opened by anyone, is taken off 1453 * the zvol_state_list, and has its private data set to NULL. 1454 * The zvol_state_lock is dropped. 1455 * 1456 * This function may take many milliseconds to complete (e.g. we've seen 1457 * it take over 256ms), due to the calls to "blk_cleanup_queue" and 1458 * "del_gendisk". Thus, consumers need to be careful to account for this 1459 * latency when calling this function. 
1460 */ 1461 void 1462 zvol_os_free(zvol_state_t *zv) 1463 { 1464 1465 ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); 1466 ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); 1467 ASSERT0(zv->zv_open_count); 1468 ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); 1469 1470 rw_destroy(&zv->zv_suspend_lock); 1471 zfs_rangelock_fini(&zv->zv_rangelock); 1472 1473 del_gendisk(zv->zv_zso->zvo_disk); 1474 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ 1475 (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) 1476 #if defined(HAVE_BLK_CLEANUP_DISK) 1477 blk_cleanup_disk(zv->zv_zso->zvo_disk); 1478 #else 1479 put_disk(zv->zv_zso->zvo_disk); 1480 #endif 1481 #else 1482 blk_cleanup_queue(zv->zv_zso->zvo_queue); 1483 put_disk(zv->zv_zso->zvo_disk); 1484 #endif 1485 1486 if (zv->zv_zso->use_blk_mq) 1487 blk_mq_free_tag_set(&zv->zv_zso->tag_set); 1488 1489 ida_simple_remove(&zvol_ida, 1490 MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); 1491 1492 cv_destroy(&zv->zv_removing_cv); 1493 mutex_destroy(&zv->zv_state_lock); 1494 dataset_kstats_destroy(&zv->zv_kstat); 1495 1496 kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); 1497 kmem_free(zv, sizeof (zvol_state_t)); 1498 } 1499 1500 void 1501 zvol_wait_close(zvol_state_t *zv) 1502 { 1503 } 1504 1505 struct add_disk_work { 1506 struct delayed_work work; 1507 struct gendisk *disk; 1508 int error; 1509 }; 1510 1511 static int 1512 __zvol_os_add_disk(struct gendisk *disk) 1513 { 1514 int error = 0; 1515 #ifdef HAVE_ADD_DISK_RET 1516 error = add_disk(disk); 1517 #else 1518 add_disk(disk); 1519 #endif 1520 return (error); 1521 } 1522 1523 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) 1524 static void 1525 zvol_os_add_disk_work(struct work_struct *work) 1526 { 1527 struct add_disk_work *add_disk_work; 1528 add_disk_work = container_of(work, struct add_disk_work, work.work); 1529 add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk); 1530 } 1531 #endif 1532 1533 /* 1534 * SPECIAL CASE: 1535 * 1536 * This function basically calls add_disk() from a workqueue. You may be 1537 * thinking: why not just call add_disk() directly? 1538 * 1539 * When you call add_disk(), the zvol appears to the world. When this happens, 1540 * the kernel calls disk_scan_partitions() on the zvol, which behaves 1541 * differently on the 6.9+ kernels: 1542 * 1543 * - 6.8 and older kernels - 1544 * disk_scan_partitions() 1545 * handle = bdev_open_by_dev( 1546 * zvol_open() 1547 * bdev_release(handle); 1548 * zvol_release() 1549 * 1550 * 1551 * - 6.9+ kernels - 1552 * disk_scan_partitions() 1553 * file = bdev_file_open_by_dev() 1554 * zvol_open() 1555 * fput(file) 1556 * < wait for return to userspace > 1557 * zvol_release() 1558 * 1559 * The difference is that the bdev_release() from the 6.8 kernel is synchronous 1560 * while the fput() from the 6.9 kernel is async. Or more specifically it's 1561 * async that has to wait until we return to userspace (since it adds the fput 1562 * into the caller's work queue with the TWA_RESUME flag set). This is not the 1563 * behavior we want, since we want do things like create+destroy a zvol within 1564 * a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release the 1565 * reference to the zvol while we're in the IOCTL, which can't wait until we 1566 * return to userspace. 1567 * 1568 * We can get around this since fput() has a special codepath for when it's 1569 * running in a kernel thread or interrupt. 
In those cases, it just puts the 1570 * fput into the system workqueue, which we can force to run with 1571 * __flush_workqueue(). That is why we call add_disk() from a workqueue - so it 1572 * run from a kernel thread and "tricks" the fput() codepaths. 1573 * 1574 * Note that __flush_workqueue() is slowly getting deprecated. This may be ok 1575 * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via 1576 * fput) to happen, which it eventually, naturally, will from the system_wq 1577 * without us explicitly calling __flush_workqueue(). 1578 */ 1579 static int 1580 zvol_os_add_disk(struct gendisk *disk) 1581 { 1582 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) /* 6.9+ kernel */ 1583 struct add_disk_work add_disk_work; 1584 1585 INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work); 1586 add_disk_work.disk = disk; 1587 add_disk_work.error = 0; 1588 1589 /* Use *_delayed_work functions since they're not GPL'd */ 1590 schedule_delayed_work(&add_disk_work.work, 0); 1591 flush_delayed_work(&add_disk_work.work); 1592 1593 __flush_workqueue(system_wq); 1594 return (add_disk_work.error); 1595 #else /* <= 6.8 kernel */ 1596 return (__zvol_os_add_disk(disk)); 1597 #endif 1598 } 1599 1600 /* 1601 * Create a block device minor node and setup the linkage between it 1602 * and the specified volume. Once this function returns the block 1603 * device is live and ready for use. 1604 */ 1605 int 1606 zvol_os_create_minor(const char *name) 1607 { 1608 zvol_state_t *zv; 1609 objset_t *os; 1610 dmu_object_info_t *doi; 1611 uint64_t volsize; 1612 uint64_t len; 1613 unsigned minor = 0; 1614 int error = 0; 1615 int idx; 1616 uint64_t hash = zvol_name_hash(name); 1617 uint64_t volthreading; 1618 bool replayed_zil = B_FALSE; 1619 1620 if (zvol_inhibit_dev) 1621 return (0); 1622 1623 idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); 1624 if (idx < 0) 1625 return (SET_ERROR(-idx)); 1626 minor = idx << ZVOL_MINOR_BITS; 1627 if (MINOR(minor) != minor) { 1628 /* too many partitions can cause an overflow */ 1629 zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", 1630 name, minor, MINOR(minor)); 1631 ida_simple_remove(&zvol_ida, idx); 1632 return (SET_ERROR(EINVAL)); 1633 } 1634 1635 zv = zvol_find_by_name_hash(name, hash, RW_NONE); 1636 if (zv) { 1637 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1638 mutex_exit(&zv->zv_state_lock); 1639 ida_simple_remove(&zvol_ida, idx); 1640 return (SET_ERROR(EEXIST)); 1641 } 1642 1643 doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); 1644 1645 error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); 1646 if (error) 1647 goto out_doi; 1648 1649 error = dmu_object_info(os, ZVOL_OBJ, doi); 1650 if (error) 1651 goto out_dmu_objset_disown; 1652 1653 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); 1654 if (error) 1655 goto out_dmu_objset_disown; 1656 1657 zv = zvol_alloc(MKDEV(zvol_major, minor), name, 1658 doi->doi_data_block_size); 1659 if (zv == NULL) { 1660 error = SET_ERROR(EAGAIN); 1661 goto out_dmu_objset_disown; 1662 } 1663 zv->zv_hash = hash; 1664 1665 if (dmu_objset_is_snapshot(os)) 1666 zv->zv_flags |= ZVOL_RDONLY; 1667 1668 zv->zv_volsize = volsize; 1669 zv->zv_objset = os; 1670 1671 /* Default */ 1672 zv->zv_threading = B_TRUE; 1673 if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL) 1674 == 0) 1675 zv->zv_threading = volthreading; 1676 1677 set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); 1678 1679 #ifdef QUEUE_FLAG_DISCARD 1680 blk_queue_flag_set(QUEUE_FLAG_DISCARD, 
zv->zv_zso->zvo_queue); 1681 #endif 1682 #ifdef QUEUE_FLAG_NONROT 1683 blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); 1684 #endif 1685 #ifdef QUEUE_FLAG_ADD_RANDOM 1686 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); 1687 #endif 1688 /* This flag was introduced in kernel version 4.12. */ 1689 #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH 1690 blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); 1691 #endif 1692 1693 ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); 1694 error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); 1695 if (error) 1696 goto out_dmu_objset_disown; 1697 ASSERT3P(zv->zv_zilog, ==, NULL); 1698 zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); 1699 if (spa_writeable(dmu_objset_spa(os))) { 1700 if (zil_replay_disable) 1701 replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); 1702 else 1703 replayed_zil = zil_replay(os, zv, zvol_replay_vector); 1704 } 1705 if (replayed_zil) 1706 zil_close(zv->zv_zilog); 1707 zv->zv_zilog = NULL; 1708 1709 /* 1710 * When udev detects the addition of the device it will immediately 1711 * invoke blkid(8) to determine the type of content on the device. 1712 * Prefetching the blocks commonly scanned by blkid(8) will speed 1713 * up this process. 1714 */ 1715 len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); 1716 if (len > 0) { 1717 dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); 1718 dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, 1719 ZIO_PRIORITY_SYNC_READ); 1720 } 1721 1722 zv->zv_objset = NULL; 1723 out_dmu_objset_disown: 1724 dmu_objset_disown(os, B_TRUE, FTAG); 1725 out_doi: 1726 kmem_free(doi, sizeof (dmu_object_info_t)); 1727 1728 /* 1729 * Keep in mind that once add_disk() is called, the zvol is 1730 * announced to the world, and zvol_open()/zvol_release() can 1731 * be called at any time. Incidentally, add_disk() itself calls 1732 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() 1733 * directly as well. 1734 */ 1735 if (error == 0) { 1736 rw_enter(&zvol_state_lock, RW_WRITER); 1737 zvol_insert(zv); 1738 rw_exit(&zvol_state_lock); 1739 error = zvol_os_add_disk(zv->zv_zso->zvo_disk); 1740 } else { 1741 ida_simple_remove(&zvol_ida, idx); 1742 } 1743 1744 return (error); 1745 } 1746 1747 void 1748 zvol_os_rename_minor(zvol_state_t *zv, const char *newname) 1749 { 1750 int readonly = get_disk_ro(zv->zv_zso->zvo_disk); 1751 1752 ASSERT(RW_LOCK_HELD(&zvol_state_lock)); 1753 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1754 1755 strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); 1756 1757 /* move to new hashtable entry */ 1758 zv->zv_hash = zvol_name_hash(newname); 1759 hlist_del(&zv->zv_hlink); 1760 hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); 1761 1762 /* 1763 * The block device's read-only state is briefly changed causing 1764 * a KOBJ_CHANGE uevent to be issued. This ensures udev detects 1765 * the name change and fixes the symlinks. This does not change 1766 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never 1767 * changes. This would normally be done using kobject_uevent() but 1768 * that is a GPL-only symbol which is why we need this workaround. 
	 */
	set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
	set_disk_ro(zv->zv_zso->zvo_disk, readonly);

	dataset_kstats_rename(&zv->zv_kstat, newname);
}

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{

	set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{

	set_capacity(zv->zv_zso->zvo_disk, capacity);
}

int
zvol_init(void)
{
	int error;

	/*
	 * zvol_threads is the module param the user passes in.
	 *
	 * zvol_actual_threads is what we use internally, since the user can
	 * pass zvol_threads = 0 to mean "use all the CPUs" (the default).
	 */
	static unsigned int zvol_actual_threads;

	if (zvol_threads == 0) {
		/*
		 * See dde9380a1 for why 32 was chosen here. This should
		 * probably be refined to be some multiple of the number
		 * of CPUs.
		 */
		zvol_actual_threads = MAX(num_online_cpus(), 32);
	} else {
		zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024);
	}

	/*
	 * Use at least 32 zvol_threads, but for many-core systems prefer
	 * 6 threads per taskq, and no more taskqs than threads in them
	 * on large systems.
	 *
	 *                     taskq    total
	 *    cpus    taskqs  threads  threads
	 *  -------  -------  -------  -------
	 *       1        1       32       32
	 *       2        1       32       32
	 *       4        1       32       32
	 *       8        2       16       32
	 *      16        3       11       33
	 *      32        5        7       35
	 *      64        8        8       64
	 *     128       11       12      132
	 *     256       16       16      256
	 */
	zv_taskq_t *ztqs = &zvol_taskqs;
	uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs);
	if (num_tqs == 0) {
		num_tqs = 1 + num_online_cpus() / 6;
		while (num_tqs * num_tqs > zvol_actual_threads)
			num_tqs--;
	}
	uint_t per_tq_thread = zvol_actual_threads / num_tqs;
	if (per_tq_thread * num_tqs < zvol_actual_threads)
		per_tq_thread++;
	ztqs->tqs_cnt = num_tqs;
	ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP);
	error = register_blkdev(zvol_major, ZVOL_DRIVER);
	if (error) {
		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *));
		ztqs->tqs_taskq = NULL;
		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
		return (error);
	}

	if (zvol_blk_mq_queue_depth == 0) {
		zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
	} else {
		zvol_actual_blk_mq_queue_depth =
		    MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
	}

	if (zvol_blk_mq_threads == 0) {
		zvol_blk_mq_actual_threads = num_online_cpus();
	} else {
		zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
		    1024);
	}

	for (uint_t i = 0; i < num_tqs; i++) {
		char name[32];
		(void) snprintf(name, sizeof (name), "%s_tq-%u",
		    ZVOL_DRIVER, i);
		ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread,
		    maxclsyspri, per_tq_thread, INT_MAX,
		    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
		if (ztqs->tqs_taskq[i] == NULL) {
			for (int j = i - 1; j >= 0; j--)
				taskq_destroy(ztqs->tqs_taskq[j]);
			unregister_blkdev(zvol_major, ZVOL_DRIVER);
			kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
			    sizeof (taskq_t *));
			ztqs->tqs_taskq = NULL;
			return (-ENOMEM);
		}
	}

	zvol_init_impl();
	ida_init(&zvol_ida);
	return (0);
}

void
zvol_fini(void)
{
	zv_taskq_t *ztqs = &zvol_taskqs;
	zvol_fini_impl();
	unregister_blkdev(zvol_major, ZVOL_DRIVER);

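	/*
	 * (Note added for clarity: the taskqs destroyed below were sized in
	 * zvol_init() above. As a worked example of that sizing, a 16-CPU
	 * system with the defaults gets zvol_actual_threads = 32,
	 * num_tqs = 1 + 16 / 6 = 3 and per_tq_thread = 32 / 3 rounded up
	 * to 11, i.e. 3 taskqs of 11 threads, matching the table above.)
	 */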
	if (ztqs->tqs_taskq == NULL) {
		ASSERT3U(ztqs->tqs_cnt, ==, 0);
	} else {
		for (uint_t i = 0; i < ztqs->tqs_cnt; i++) {
			ASSERT3P(ztqs->tqs_taskq[i], !=, NULL);
			taskq_destroy(ztqs->tqs_taskq[i]);
		}
		kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt *
		    sizeof (taskq_t *));
		ztqs->tqs_taskq = NULL;
	}

	ida_destroy(&zvol_ida);
}

module_param(zvol_inhibit_dev, uint, 0644);
MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");

module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");

module_param(zvol_threads, uint, 0444);
MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set "
	"to 0 to use all active CPUs");

module_param(zvol_request_sync, uint, 0644);
MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");

module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

module_param(zvol_num_taskqs, uint, 0444);
MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs");

module_param(zvol_prefetch_bytes, uint, 0644);
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");

module_param(zvol_volmode, uint, 0644);
MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");

module_param(zvol_blk_mq_queue_depth, uint, 0644);
MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");

module_param(zvol_use_blk_mq, uint, 0644);
MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");

module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
	"Process volblocksize blocks per thread");

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
module_param(zvol_open_timeout_ms, uint, 0644);
MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
#endif
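/*
 * (Illustrative usage note, not part of the original code: like other ZFS
 * module options, these parameters can be set at module load time, e.g. via
 * a hypothetical /etc/modprobe.d/zfs.conf entry such as
 *
 *     options zfs zvol_use_blk_mq=1 zvol_blk_mq_blocks_per_thread=8
 *
 * and the 0644 parameters can also be changed at runtime through
 * /sys/module/zfs/parameters/. The values shown are examples only.)
 */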