1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2012, 2020 by Delphix. All rights reserved. 23 * Copyright (c) 2024, Rob Norris <robn@despairlabs.com> 24 * Copyright (c) 2024, Klara, Inc. 
25 */ 26 27 #include <sys/dataset_kstats.h> 28 #include <sys/dbuf.h> 29 #include <sys/dmu_traverse.h> 30 #include <sys/dsl_dataset.h> 31 #include <sys/dsl_prop.h> 32 #include <sys/dsl_dir.h> 33 #include <sys/zap.h> 34 #include <sys/zfeature.h> 35 #include <sys/zil_impl.h> 36 #include <sys/dmu_tx.h> 37 #include <sys/zio.h> 38 #include <sys/zfs_rlock.h> 39 #include <sys/spa_impl.h> 40 #include <sys/zvol.h> 41 #include <sys/zvol_impl.h> 42 #include <cityhash.h> 43 44 #include <linux/blkdev_compat.h> 45 #include <linux/task_io_accounting_ops.h> 46 #include <linux/workqueue.h> 47 #include <linux/blk-mq.h> 48 49 static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, 50 struct request *rq, boolean_t force_sync); 51 52 static unsigned int zvol_major = ZVOL_MAJOR; 53 static unsigned int zvol_request_sync = 0; 54 static unsigned int zvol_prefetch_bytes = (128 * 1024); 55 static unsigned long zvol_max_discard_blocks = 16384; 56 57 /* 58 * Switch taskq at multiple of 512 MB offset. This can be set to a lower value 59 * to utilize more threads for small files but may affect prefetch hits. 60 */ 61 #define ZVOL_TASKQ_OFFSET_SHIFT 29 62 63 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 64 static unsigned int zvol_open_timeout_ms = 1000; 65 #endif 66 67 static unsigned int zvol_threads = 0; 68 static unsigned int zvol_blk_mq_threads = 0; 69 static unsigned int zvol_blk_mq_actual_threads; 70 static boolean_t zvol_use_blk_mq = B_FALSE; 71 72 /* 73 * The maximum number of volblocksize blocks to process per thread. Typically, 74 * write heavy workloads preform better with higher values here, and read 75 * heavy workloads preform better with lower values, but that's not a hard 76 * and fast rule. It's basically a knob to tune between "less overhead with 77 * less parallelism" and "more overhead, but more parallelism". 78 * 79 * '8' was chosen as a reasonable, balanced, default based off of sequential 80 * read and write tests to a zvol in an NVMe pool (with 16 CPUs). 
81 */ 82 static unsigned int zvol_blk_mq_blocks_per_thread = 8; 83 84 static unsigned int zvol_num_taskqs = 0; 85 86 #ifndef BLKDEV_DEFAULT_RQ 87 /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ 88 #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ 89 #endif 90 91 /* 92 * Finalize our BIO or request. 93 */ 94 static inline void 95 zvol_end_io(struct bio *bio, struct request *rq, int error) 96 { 97 if (bio) { 98 bio->bi_status = errno_to_bi_status(-error); 99 bio_endio(bio); 100 } else { 101 blk_mq_end_request(rq, errno_to_bi_status(error)); 102 } 103 } 104 105 static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; 106 static unsigned int zvol_actual_blk_mq_queue_depth; 107 108 struct zvol_state_os { 109 struct gendisk *zvo_disk; /* generic disk */ 110 struct request_queue *zvo_queue; /* request queue */ 111 dev_t zvo_dev; /* device id */ 112 113 struct blk_mq_tag_set tag_set; 114 115 /* Set from the global 'zvol_use_blk_mq' at zvol load */ 116 boolean_t use_blk_mq; 117 }; 118 119 typedef struct zv_taskq { 120 uint_t tqs_cnt; 121 taskq_t **tqs_taskq; 122 } zv_taskq_t; 123 static zv_taskq_t zvol_taskqs; 124 static struct ida zvol_ida; 125 126 typedef struct zv_request_stack { 127 zvol_state_t *zv; 128 struct bio *bio; 129 struct request *rq; 130 } zv_request_t; 131 132 typedef struct zv_work { 133 struct request *rq; 134 struct work_struct work; 135 } zv_work_t; 136 137 typedef struct zv_request_task { 138 zv_request_t zvr; 139 taskq_ent_t ent; 140 } zv_request_task_t; 141 142 static zv_request_task_t * 143 zv_request_task_create(zv_request_t zvr) 144 { 145 zv_request_task_t *task; 146 task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); 147 taskq_init_ent(&task->ent); 148 task->zvr = zvr; 149 return (task); 150 } 151 152 static void 153 zv_request_task_free(zv_request_task_t *task) 154 { 155 kmem_free(task, sizeof (*task)); 156 } 157 158 /* 159 * This is called when a new block multiqueue request comes in. 
A request 160 * contains one or more BIOs. 161 */ 162 static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, 163 const struct blk_mq_queue_data *bd) 164 { 165 struct request *rq = bd->rq; 166 zvol_state_t *zv = rq->q->queuedata; 167 168 /* Tell the kernel that we are starting to process this request */ 169 blk_mq_start_request(rq); 170 171 if (blk_rq_is_passthrough(rq)) { 172 /* Skip non filesystem request */ 173 blk_mq_end_request(rq, BLK_STS_IOERR); 174 return (BLK_STS_IOERR); 175 } 176 177 zvol_request_impl(zv, NULL, rq, 0); 178 179 /* Acknowledge to the kernel that we got this request */ 180 return (BLK_STS_OK); 181 } 182 183 static struct blk_mq_ops zvol_blk_mq_queue_ops = { 184 .queue_rq = zvol_mq_queue_rq, 185 }; 186 187 /* Initialize our blk-mq struct */ 188 static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) 189 { 190 struct zvol_state_os *zso = zv->zv_zso; 191 192 memset(&zso->tag_set, 0, sizeof (zso->tag_set)); 193 194 /* Initialize tag set. */ 195 zso->tag_set.ops = &zvol_blk_mq_queue_ops; 196 zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads; 197 zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; 198 zso->tag_set.numa_node = NUMA_NO_NODE; 199 zso->tag_set.cmd_size = 0; 200 201 /* 202 * We need BLK_MQ_F_BLOCKING here since we do blocking calls in 203 * zvol_request_impl() 204 */ 205 zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; 206 zso->tag_set.driver_data = zv; 207 208 return (blk_mq_alloc_tag_set(&zso->tag_set)); 209 } 210 211 /* 212 * Given a path, return TRUE if path is a ZVOL. 
213 */ 214 boolean_t 215 zvol_os_is_zvol(const char *path) 216 { 217 dev_t dev = 0; 218 219 if (vdev_lookup_bdev(path, &dev) != 0) 220 return (B_FALSE); 221 222 if (MAJOR(dev) == zvol_major) 223 return (B_TRUE); 224 225 return (B_FALSE); 226 } 227 228 static void 229 zvol_write(zv_request_t *zvr) 230 { 231 struct bio *bio = zvr->bio; 232 struct request *rq = zvr->rq; 233 int error = 0; 234 zfs_uio_t uio; 235 zvol_state_t *zv = zvr->zv; 236 struct request_queue *q; 237 struct gendisk *disk; 238 unsigned long start_time = 0; 239 boolean_t acct = B_FALSE; 240 241 ASSERT3P(zv, !=, NULL); 242 ASSERT3U(zv->zv_open_count, >, 0); 243 ASSERT3P(zv->zv_zilog, !=, NULL); 244 245 q = zv->zv_zso->zvo_queue; 246 disk = zv->zv_zso->zvo_disk; 247 248 /* bio marked as FLUSH need to flush before write */ 249 if (io_is_flush(bio, rq)) 250 zil_commit(zv->zv_zilog, ZVOL_OBJ); 251 252 /* Some requests are just for flush and nothing else. */ 253 if (io_size(bio, rq) == 0) { 254 rw_exit(&zv->zv_suspend_lock); 255 zvol_end_io(bio, rq, 0); 256 return; 257 } 258 259 zfs_uio_bvec_init(&uio, bio, rq); 260 261 ssize_t start_resid = uio.uio_resid; 262 263 /* 264 * With use_blk_mq, accounting is done by blk_mq_start_request() 265 * and blk_mq_end_request(), so we can skip it here. 
266 */ 267 if (bio) { 268 acct = blk_queue_io_stat(q); 269 if (acct) { 270 start_time = blk_generic_start_io_acct(q, disk, WRITE, 271 bio); 272 } 273 } 274 275 boolean_t sync = 276 io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; 277 278 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 279 uio.uio_loffset, uio.uio_resid, RL_WRITER); 280 281 uint64_t volsize = zv->zv_volsize; 282 while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { 283 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); 284 uint64_t off = uio.uio_loffset; 285 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); 286 287 if (bytes > volsize - off) /* don't write past the end */ 288 bytes = volsize - off; 289 290 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); 291 292 /* This will only fail for ENOSPC */ 293 error = dmu_tx_assign(tx, TXG_WAIT); 294 if (error) { 295 dmu_tx_abort(tx); 296 break; 297 } 298 error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); 299 if (error == 0) { 300 zvol_log_write(zv, tx, off, bytes, sync); 301 } 302 dmu_tx_commit(tx); 303 304 if (error) 305 break; 306 } 307 zfs_rangelock_exit(lr); 308 309 int64_t nwritten = start_resid - uio.uio_resid; 310 dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); 311 task_io_account_write(nwritten); 312 313 if (sync) 314 zil_commit(zv->zv_zilog, ZVOL_OBJ); 315 316 rw_exit(&zv->zv_suspend_lock); 317 318 if (bio && acct) { 319 blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); 320 } 321 322 zvol_end_io(bio, rq, -error); 323 } 324 325 static void 326 zvol_write_task(void *arg) 327 { 328 zv_request_task_t *task = arg; 329 zvol_write(&task->zvr); 330 zv_request_task_free(task); 331 } 332 333 static void 334 zvol_discard(zv_request_t *zvr) 335 { 336 struct bio *bio = zvr->bio; 337 struct request *rq = zvr->rq; 338 zvol_state_t *zv = zvr->zv; 339 uint64_t start = io_offset(bio, rq); 340 uint64_t size = io_size(bio, rq); 341 uint64_t end = start + size; 342 boolean_t sync; 343 int error = 0; 
344 dmu_tx_t *tx; 345 struct request_queue *q = zv->zv_zso->zvo_queue; 346 struct gendisk *disk = zv->zv_zso->zvo_disk; 347 unsigned long start_time = 0; 348 boolean_t acct = B_FALSE; 349 350 ASSERT3P(zv, !=, NULL); 351 ASSERT3U(zv->zv_open_count, >, 0); 352 ASSERT3P(zv->zv_zilog, !=, NULL); 353 354 if (bio) { 355 acct = blk_queue_io_stat(q); 356 if (acct) { 357 start_time = blk_generic_start_io_acct(q, disk, WRITE, 358 bio); 359 } 360 } 361 362 sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; 363 364 if (end > zv->zv_volsize) { 365 error = SET_ERROR(EIO); 366 goto unlock; 367 } 368 369 /* 370 * Align the request to volume block boundaries when a secure erase is 371 * not required. This will prevent dnode_free_range() from zeroing out 372 * the unaligned parts which is slow (read-modify-write) and useless 373 * since we are not freeing any space by doing so. 374 */ 375 if (!io_is_secure_erase(bio, rq)) { 376 start = P2ROUNDUP(start, zv->zv_volblocksize); 377 end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); 378 size = end - start; 379 } 380 381 if (start >= end) 382 goto unlock; 383 384 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 385 start, size, RL_WRITER); 386 387 tx = dmu_tx_create(zv->zv_objset); 388 dmu_tx_mark_netfree(tx); 389 error = dmu_tx_assign(tx, TXG_WAIT); 390 if (error != 0) { 391 dmu_tx_abort(tx); 392 } else { 393 zvol_log_truncate(zv, tx, start, size); 394 dmu_tx_commit(tx); 395 error = dmu_free_long_range(zv->zv_objset, 396 ZVOL_OBJ, start, size); 397 } 398 zfs_rangelock_exit(lr); 399 400 if (error == 0 && sync) 401 zil_commit(zv->zv_zilog, ZVOL_OBJ); 402 403 unlock: 404 rw_exit(&zv->zv_suspend_lock); 405 406 if (bio && acct) { 407 blk_generic_end_io_acct(q, disk, WRITE, bio, 408 start_time); 409 } 410 411 zvol_end_io(bio, rq, -error); 412 } 413 414 static void 415 zvol_discard_task(void *arg) 416 { 417 zv_request_task_t *task = arg; 418 zvol_discard(&task->zvr); 419 zv_request_task_free(task); 420 
} 421 422 static void 423 zvol_read(zv_request_t *zvr) 424 { 425 struct bio *bio = zvr->bio; 426 struct request *rq = zvr->rq; 427 int error = 0; 428 zfs_uio_t uio; 429 boolean_t acct = B_FALSE; 430 zvol_state_t *zv = zvr->zv; 431 struct request_queue *q; 432 struct gendisk *disk; 433 unsigned long start_time = 0; 434 435 ASSERT3P(zv, !=, NULL); 436 ASSERT3U(zv->zv_open_count, >, 0); 437 438 zfs_uio_bvec_init(&uio, bio, rq); 439 440 q = zv->zv_zso->zvo_queue; 441 disk = zv->zv_zso->zvo_disk; 442 443 ssize_t start_resid = uio.uio_resid; 444 445 /* 446 * When blk-mq is being used, accounting is done by 447 * blk_mq_start_request() and blk_mq_end_request(). 448 */ 449 if (bio) { 450 acct = blk_queue_io_stat(q); 451 if (acct) 452 start_time = blk_generic_start_io_acct(q, disk, READ, 453 bio); 454 } 455 456 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 457 uio.uio_loffset, uio.uio_resid, RL_READER); 458 459 uint64_t volsize = zv->zv_volsize; 460 461 while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { 462 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); 463 464 /* don't read past the end */ 465 if (bytes > volsize - uio.uio_loffset) 466 bytes = volsize - uio.uio_loffset; 467 468 error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); 469 if (error) { 470 /* convert checksum errors into IO errors */ 471 if (error == ECKSUM) 472 error = SET_ERROR(EIO); 473 break; 474 } 475 } 476 zfs_rangelock_exit(lr); 477 478 int64_t nread = start_resid - uio.uio_resid; 479 dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); 480 task_io_account_read(nread); 481 482 rw_exit(&zv->zv_suspend_lock); 483 484 if (bio && acct) { 485 blk_generic_end_io_acct(q, disk, READ, bio, start_time); 486 } 487 488 zvol_end_io(bio, rq, -error); 489 } 490 491 static void 492 zvol_read_task(void *arg) 493 { 494 zv_request_task_t *task = arg; 495 zvol_read(&task->zvr); 496 zv_request_task_free(task); 497 } 498 499 500 /* 501 * Process a BIO or request 502 * 503 * Either 'bio' or 
'rq' should be set depending on if we are processing a 504 * bio or a request (both should not be set). 505 * 506 * force_sync: Set to 0 to defer processing to a background taskq 507 * Set to 1 to process data synchronously 508 */ 509 static void 510 zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, 511 boolean_t force_sync) 512 { 513 fstrans_cookie_t cookie = spl_fstrans_mark(); 514 uint64_t offset = io_offset(bio, rq); 515 uint64_t size = io_size(bio, rq); 516 int rw = io_data_dir(bio, rq); 517 518 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { 519 zvol_end_io(bio, rq, -SET_ERROR(ENXIO)); 520 goto out; 521 } 522 523 if (zvol_request_sync || zv->zv_threading == B_FALSE) 524 force_sync = 1; 525 526 zv_request_t zvr = { 527 .zv = zv, 528 .bio = bio, 529 .rq = rq, 530 }; 531 532 if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { 533 printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", 534 zv->zv_zso->zvo_disk->disk_name, 535 (long long unsigned)offset, 536 (long unsigned)size); 537 538 zvol_end_io(bio, rq, -SET_ERROR(EIO)); 539 goto out; 540 } 541 542 zv_request_task_t *task; 543 zv_taskq_t *ztqs = &zvol_taskqs; 544 uint_t blk_mq_hw_queue = 0; 545 uint_t tq_idx; 546 uint_t taskq_hash; 547 if (rq) 548 #ifdef HAVE_BLK_MQ_RQ_HCTX 549 blk_mq_hw_queue = rq->mq_hctx->queue_num; 550 #else 551 blk_mq_hw_queue = 552 rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num; 553 #endif 554 taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, 555 blk_mq_hw_queue); 556 tq_idx = taskq_hash % ztqs->tqs_cnt; 557 558 if (rw == WRITE) { 559 if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { 560 zvol_end_io(bio, rq, -SET_ERROR(EROFS)); 561 goto out; 562 } 563 564 /* 565 * Prevents the zvol from being suspended, or the ZIL being 566 * concurrently opened. Will be released after the i/o 567 * completes. 
568 */ 569 rw_enter(&zv->zv_suspend_lock, RW_READER); 570 571 /* 572 * Open a ZIL if this is the first time we have written to this 573 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather 574 * than zv_state_lock so that we don't need to acquire an 575 * additional lock in this path. 576 */ 577 if (zv->zv_zilog == NULL) { 578 rw_exit(&zv->zv_suspend_lock); 579 rw_enter(&zv->zv_suspend_lock, RW_WRITER); 580 if (zv->zv_zilog == NULL) { 581 zv->zv_zilog = zil_open(zv->zv_objset, 582 zvol_get_data, &zv->zv_kstat.dk_zil_sums); 583 zv->zv_flags |= ZVOL_WRITTEN_TO; 584 /* replay / destroy done in zvol_create_minor */ 585 VERIFY0((zv->zv_zilog->zl_header->zh_flags & 586 ZIL_REPLAY_NEEDED)); 587 } 588 rw_downgrade(&zv->zv_suspend_lock); 589 } 590 591 /* 592 * We don't want this thread to be blocked waiting for i/o to 593 * complete, so we instead wait from a taskq callback. The 594 * i/o may be a ZIL write (via zil_commit()), or a read of an 595 * indirect block, or a read of a data block (if this is a 596 * partial-block write). We will indicate that the i/o is 597 * complete by calling END_IO() from the taskq callback. 598 * 599 * This design allows the calling thread to continue and 600 * initiate more concurrent operations by calling 601 * zvol_request() again. There are typically only a small 602 * number of threads available to call zvol_request() (e.g. 603 * one per iSCSI target), so keeping the latency of 604 * zvol_request() low is important for performance. 605 * 606 * The zvol_request_sync module parameter allows this 607 * behavior to be altered, for performance evaluation 608 * purposes. If the callback blocks, setting 609 * zvol_request_sync=1 will result in much worse performance. 610 * 611 * We can have up to zvol_threads concurrent i/o's being 612 * processed for all zvols on the system. This is typically 613 * a vast improvement over the zvol_request_sync=1 behavior 614 * of one i/o at a time per zvol. 
However, an even better 615 * design would be for zvol_request() to initiate the zio 616 * directly, and then be notified by the zio_done callback, 617 * which would call END_IO(). Unfortunately, the DMU/ZIL 618 * interfaces lack this functionality (they block waiting for 619 * the i/o to complete). 620 */ 621 if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { 622 if (force_sync) { 623 zvol_discard(&zvr); 624 } else { 625 task = zv_request_task_create(zvr); 626 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 627 zvol_discard_task, task, 0, &task->ent); 628 } 629 } else { 630 if (force_sync) { 631 zvol_write(&zvr); 632 } else { 633 task = zv_request_task_create(zvr); 634 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 635 zvol_write_task, task, 0, &task->ent); 636 } 637 } 638 } else { 639 /* 640 * The SCST driver, and possibly others, may issue READ I/Os 641 * with a length of zero bytes. These empty I/Os contain no 642 * data and require no additional handling. 643 */ 644 if (size == 0) { 645 zvol_end_io(bio, rq, 0); 646 goto out; 647 } 648 649 rw_enter(&zv->zv_suspend_lock, RW_READER); 650 651 /* See comment in WRITE case above. 
*/ 652 if (force_sync) { 653 zvol_read(&zvr); 654 } else { 655 task = zv_request_task_create(zvr); 656 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 657 zvol_read_task, task, 0, &task->ent); 658 } 659 } 660 661 out: 662 spl_fstrans_unmark(cookie); 663 } 664 665 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 666 #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID 667 static void 668 zvol_submit_bio(struct bio *bio) 669 #else 670 static blk_qc_t 671 zvol_submit_bio(struct bio *bio) 672 #endif 673 #else 674 static MAKE_REQUEST_FN_RET 675 zvol_request(struct request_queue *q, struct bio *bio) 676 #endif 677 { 678 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 679 #if defined(HAVE_BIO_BDEV_DISK) 680 struct request_queue *q = bio->bi_bdev->bd_disk->queue; 681 #else 682 struct request_queue *q = bio->bi_disk->queue; 683 #endif 684 #endif 685 zvol_state_t *zv = q->queuedata; 686 687 zvol_request_impl(zv, bio, NULL, 0); 688 #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ 689 defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ 690 !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) 691 return (BLK_QC_T_NONE); 692 #endif 693 } 694 695 static int 696 #ifdef HAVE_BLK_MODE_T 697 zvol_open(struct gendisk *disk, blk_mode_t flag) 698 #else 699 zvol_open(struct block_device *bdev, fmode_t flag) 700 #endif 701 { 702 zvol_state_t *zv; 703 int error = 0; 704 boolean_t drop_suspend = B_FALSE; 705 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 706 hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); 707 hrtime_t start = gethrtime(); 708 709 retry: 710 #endif 711 rw_enter(&zvol_state_lock, RW_READER); 712 /* 713 * Obtain a copy of private_data under the zvol_state_lock to make 714 * sure that either the result of zvol free code path setting 715 * disk->private_data to NULL is observed, or zvol_os_free() 716 * is not called on this zv because of the positive zv_open_count. 
717 */ 718 #ifdef HAVE_BLK_MODE_T 719 zv = disk->private_data; 720 #else 721 zv = bdev->bd_disk->private_data; 722 #endif 723 if (zv == NULL) { 724 rw_exit(&zvol_state_lock); 725 return (-SET_ERROR(ENXIO)); 726 } 727 728 mutex_enter(&zv->zv_state_lock); 729 730 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { 731 mutex_exit(&zv->zv_state_lock); 732 rw_exit(&zvol_state_lock); 733 return (-SET_ERROR(ENXIO)); 734 } 735 736 /* 737 * Make sure zvol is not suspended during first open 738 * (hold zv_suspend_lock) and respect proper lock acquisition 739 * ordering - zv_suspend_lock before zv_state_lock 740 */ 741 if (zv->zv_open_count == 0) { 742 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { 743 mutex_exit(&zv->zv_state_lock); 744 rw_enter(&zv->zv_suspend_lock, RW_READER); 745 mutex_enter(&zv->zv_state_lock); 746 /* check to see if zv_suspend_lock is needed */ 747 if (zv->zv_open_count != 0) { 748 rw_exit(&zv->zv_suspend_lock); 749 } else { 750 drop_suspend = B_TRUE; 751 } 752 } else { 753 drop_suspend = B_TRUE; 754 } 755 } 756 rw_exit(&zvol_state_lock); 757 758 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 759 760 if (zv->zv_open_count == 0) { 761 boolean_t drop_namespace = B_FALSE; 762 763 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); 764 765 /* 766 * In all other call paths the spa_namespace_lock is taken 767 * before the bdev->bd_mutex lock. However, on open(2) 768 * the __blkdev_get() function calls fops->open() with the 769 * bdev->bd_mutex lock held. This can result in a deadlock 770 * when zvols from one pool are used as vdevs in another. 771 * 772 * To prevent a lock inversion deadlock we preemptively 773 * take the spa_namespace_lock. Normally the lock will not 774 * be contended and this is safe because spa_open_common() 775 * handles the case where the caller already holds the 776 * spa_namespace_lock. 
777 * 778 * When the lock cannot be aquired after multiple retries 779 * this must be the vdev on zvol deadlock case and we have 780 * no choice but to return an error. For 5.12 and older 781 * kernels returning -ERESTARTSYS will result in the 782 * bdev->bd_mutex being dropped, then reacquired, and 783 * fops->open() being called again. This process can be 784 * repeated safely until both locks are acquired. For 5.13 785 * and newer the -ERESTARTSYS retry logic was removed from 786 * the kernel so the only option is to return the error for 787 * the caller to handle it. 788 */ 789 if (!mutex_owned(&spa_namespace_lock)) { 790 if (!mutex_tryenter(&spa_namespace_lock)) { 791 mutex_exit(&zv->zv_state_lock); 792 rw_exit(&zv->zv_suspend_lock); 793 drop_suspend = B_FALSE; 794 795 #ifdef HAVE_BLKDEV_GET_ERESTARTSYS 796 schedule(); 797 return (-SET_ERROR(ERESTARTSYS)); 798 #else 799 if ((gethrtime() - start) > timeout) 800 return (-SET_ERROR(ERESTARTSYS)); 801 802 schedule_timeout_interruptible( 803 MSEC_TO_TICK(10)); 804 goto retry; 805 #endif 806 } else { 807 drop_namespace = B_TRUE; 808 } 809 } 810 811 error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); 812 813 if (drop_namespace) 814 mutex_exit(&spa_namespace_lock); 815 } 816 817 if (error == 0) { 818 if ((blk_mode_is_open_write(flag)) && 819 (zv->zv_flags & ZVOL_RDONLY)) { 820 if (zv->zv_open_count == 0) 821 zvol_last_close(zv); 822 823 error = -SET_ERROR(EROFS); 824 } else { 825 zv->zv_open_count++; 826 } 827 } 828 829 mutex_exit(&zv->zv_state_lock); 830 if (drop_suspend) 831 rw_exit(&zv->zv_suspend_lock); 832 833 if (error == 0) 834 #ifdef HAVE_BLK_MODE_T 835 disk_check_media_change(disk); 836 #else 837 zfs_check_media_change(bdev); 838 #endif 839 840 return (error); 841 } 842 843 static void 844 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG 845 zvol_release(struct gendisk *disk) 846 #else 847 zvol_release(struct gendisk *disk, fmode_t unused) 848 #endif 849 { 850 #if 
!defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) 851 (void) unused; 852 #endif 853 zvol_state_t *zv; 854 boolean_t drop_suspend = B_TRUE; 855 856 rw_enter(&zvol_state_lock, RW_READER); 857 zv = disk->private_data; 858 859 mutex_enter(&zv->zv_state_lock); 860 ASSERT3U(zv->zv_open_count, >, 0); 861 /* 862 * make sure zvol is not suspended during last close 863 * (hold zv_suspend_lock) and respect proper lock acquisition 864 * ordering - zv_suspend_lock before zv_state_lock 865 */ 866 if (zv->zv_open_count == 1) { 867 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { 868 mutex_exit(&zv->zv_state_lock); 869 rw_enter(&zv->zv_suspend_lock, RW_READER); 870 mutex_enter(&zv->zv_state_lock); 871 /* check to see if zv_suspend_lock is needed */ 872 if (zv->zv_open_count != 1) { 873 rw_exit(&zv->zv_suspend_lock); 874 drop_suspend = B_FALSE; 875 } 876 } 877 } else { 878 drop_suspend = B_FALSE; 879 } 880 rw_exit(&zvol_state_lock); 881 882 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 883 884 zv->zv_open_count--; 885 if (zv->zv_open_count == 0) { 886 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); 887 zvol_last_close(zv); 888 } 889 890 mutex_exit(&zv->zv_state_lock); 891 892 if (drop_suspend) 893 rw_exit(&zv->zv_suspend_lock); 894 } 895 896 static int 897 zvol_ioctl(struct block_device *bdev, fmode_t mode, 898 unsigned int cmd, unsigned long arg) 899 { 900 zvol_state_t *zv = bdev->bd_disk->private_data; 901 int error = 0; 902 903 ASSERT3U(zv->zv_open_count, >, 0); 904 905 switch (cmd) { 906 case BLKFLSBUF: 907 #ifdef HAVE_FSYNC_BDEV 908 fsync_bdev(bdev); 909 #elif defined(HAVE_SYNC_BLOCKDEV) 910 sync_blockdev(bdev); 911 #else 912 #error "Neither fsync_bdev() nor sync_blockdev() found" 913 #endif 914 invalidate_bdev(bdev); 915 rw_enter(&zv->zv_suspend_lock, RW_READER); 916 917 if (!(zv->zv_flags & ZVOL_RDONLY)) 918 txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); 919 920 rw_exit(&zv->zv_suspend_lock); 921 break; 922 923 case BLKZNAME: 924 mutex_enter(&zv->zv_state_lock); 925 error = 
copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); 926 mutex_exit(&zv->zv_state_lock); 927 break; 928 929 default: 930 error = -ENOTTY; 931 break; 932 } 933 934 return (SET_ERROR(error)); 935 } 936 937 #ifdef CONFIG_COMPAT 938 static int 939 zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, 940 unsigned cmd, unsigned long arg) 941 { 942 return (zvol_ioctl(bdev, mode, cmd, arg)); 943 } 944 #else 945 #define zvol_compat_ioctl NULL 946 #endif 947 948 static unsigned int 949 zvol_check_events(struct gendisk *disk, unsigned int clearing) 950 { 951 unsigned int mask = 0; 952 953 rw_enter(&zvol_state_lock, RW_READER); 954 955 zvol_state_t *zv = disk->private_data; 956 if (zv != NULL) { 957 mutex_enter(&zv->zv_state_lock); 958 mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; 959 zv->zv_changed = 0; 960 mutex_exit(&zv->zv_state_lock); 961 } 962 963 rw_exit(&zvol_state_lock); 964 965 return (mask); 966 } 967 968 static int 969 zvol_revalidate_disk(struct gendisk *disk) 970 { 971 rw_enter(&zvol_state_lock, RW_READER); 972 973 zvol_state_t *zv = disk->private_data; 974 if (zv != NULL) { 975 mutex_enter(&zv->zv_state_lock); 976 set_capacity(zv->zv_zso->zvo_disk, 977 zv->zv_volsize >> SECTOR_BITS); 978 mutex_exit(&zv->zv_state_lock); 979 } 980 981 rw_exit(&zvol_state_lock); 982 983 return (0); 984 } 985 986 int 987 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) 988 { 989 struct gendisk *disk = zv->zv_zso->zvo_disk; 990 991 #if defined(HAVE_REVALIDATE_DISK_SIZE) 992 revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); 993 #elif defined(HAVE_REVALIDATE_DISK) 994 revalidate_disk(disk); 995 #else 996 zvol_revalidate_disk(disk); 997 #endif 998 return (0); 999 } 1000 1001 void 1002 zvol_os_clear_private(zvol_state_t *zv) 1003 { 1004 /* 1005 * Cleared while holding zvol_state_lock as a writer 1006 * which will prevent zvol_open() from opening it. 
1007 */ 1008 zv->zv_zso->zvo_disk->private_data = NULL; 1009 } 1010 1011 /* 1012 * Provide a simple virtual geometry for legacy compatibility. For devices 1013 * smaller than 1 MiB a small head and sector count is used to allow very 1014 * tiny devices. For devices over 1 Mib a standard head and sector count 1015 * is used to keep the cylinders count reasonable. 1016 */ 1017 static int 1018 zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) 1019 { 1020 zvol_state_t *zv = bdev->bd_disk->private_data; 1021 sector_t sectors; 1022 1023 ASSERT3U(zv->zv_open_count, >, 0); 1024 1025 sectors = get_capacity(zv->zv_zso->zvo_disk); 1026 1027 if (sectors > 2048) { 1028 geo->heads = 16; 1029 geo->sectors = 63; 1030 } else { 1031 geo->heads = 2; 1032 geo->sectors = 4; 1033 } 1034 1035 geo->start = 0; 1036 geo->cylinders = sectors / (geo->heads * geo->sectors); 1037 1038 return (0); 1039 } 1040 1041 /* 1042 * Why have two separate block_device_operations structs? 1043 * 1044 * Normally we'd just have one, and assign 'submit_bio' as needed. However, 1045 * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we 1046 * can't just change submit_bio dynamically at runtime. So just create two 1047 * separate structs to get around this. 
1048 */ 1049 static const struct block_device_operations zvol_ops_blk_mq = { 1050 .open = zvol_open, 1051 .release = zvol_release, 1052 .ioctl = zvol_ioctl, 1053 .compat_ioctl = zvol_compat_ioctl, 1054 .check_events = zvol_check_events, 1055 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK 1056 .revalidate_disk = zvol_revalidate_disk, 1057 #endif 1058 .getgeo = zvol_getgeo, 1059 .owner = THIS_MODULE, 1060 }; 1061 1062 static const struct block_device_operations zvol_ops = { 1063 .open = zvol_open, 1064 .release = zvol_release, 1065 .ioctl = zvol_ioctl, 1066 .compat_ioctl = zvol_compat_ioctl, 1067 .check_events = zvol_check_events, 1068 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK 1069 .revalidate_disk = zvol_revalidate_disk, 1070 #endif 1071 .getgeo = zvol_getgeo, 1072 .owner = THIS_MODULE, 1073 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 1074 .submit_bio = zvol_submit_bio, 1075 #endif 1076 }; 1077 1078 /* 1079 * Since 6.9, Linux has been removing queue limit setters in favour of an 1080 * initial queue_limits struct applied when the device is open. Since 6.11, 1081 * queue_limits is being extended to allow more things to be applied when the 1082 * device is open. Setters are also being removed for this. 1083 * 1084 * For OpenZFS, this means that depending on kernel version, some options may 1085 * be set up before the device is open, and some applied to an open device 1086 * (queue) after the fact. 1087 * 1088 * We manage this complexity by having our own limits struct, 1089 * zvol_queue_limits_t, in which we carry any queue config that we're 1090 * interested in setting. This structure is the same on all kernels. 1091 * 1092 * These limits are then applied to the queue at device open time by the most 1093 * appropriate method for the kernel. 1094 * 1095 * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of 1096 * blk_alloc_disk() exists). 
This converts our limits struct to a proper Linux 1097 * struct queue_limits, and passes it in. Any fields added in later kernels are 1098 * (obviously) not set up here. 1099 * 1100 * zvol_queue_limits_apply() is called on all kernel versions after the queue 1101 * is created, and applies any remaining config. Before 6.9 that will be 1102 * everything, via setter methods. After 6.9 that will be whatever couldn't be 1103 * put into struct queue_limits. (This implies that zvol_queue_limits_apply() 1104 * will always be a no-op on the latest kernel we support). 1105 */ 1106 typedef struct zvol_queue_limits { 1107 unsigned int zql_max_hw_sectors; 1108 unsigned short zql_max_segments; 1109 unsigned int zql_max_segment_size; 1110 unsigned int zql_io_opt; 1111 unsigned int zql_physical_block_size; 1112 unsigned int zql_max_discard_sectors; 1113 unsigned int zql_discard_granularity; 1114 } zvol_queue_limits_t; 1115 1116 static void 1117 zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv, 1118 boolean_t use_blk_mq) 1119 { 1120 limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9; 1121 1122 if (use_blk_mq) { 1123 /* 1124 * IO requests can be really big (1MB). When an IO request 1125 * comes in, it is passed off to zvol_read() or zvol_write() 1126 * in a new thread, where it is chunked up into 'volblocksize' 1127 * sized pieces and processed. So for example, if the request 1128 * is a 1MB write and your volblocksize is 128k, one zvol_write 1129 * thread will take that request and sequentially do ten 128k 1130 * IOs. This is due to the fact that the thread needs to lock 1131 * each volblocksize sized block. So you might be wondering: 1132 * "instead of passing the whole 1MB request to one thread, 1133 * why not pass ten individual 128k chunks to ten threads and 1134 * process the whole write in parallel?" 
The short answer is 1135 * that there's a sweet spot number of chunks that balances 1136 * the greater parallelism with the added overhead of more 1137 * threads. The sweet spot can be different depending on if you 1138 * have a read or write heavy workload. Writes typically want 1139 * high chunk counts while reads typically want lower ones. On 1140 * a test pool with 6 NVMe drives in a 3x 2-disk mirror 1141 * configuration, with volblocksize=8k, the sweet spot for good 1142 * sequential reads and writes was at 8 chunks. 1143 */ 1144 1145 /* 1146 * Below we tell the kernel how big we want our requests 1147 * to be. You would think that blk_queue_io_opt() would be 1148 * used to do this since it is used to "set optimal request 1149 * size for the queue", but that doesn't seem to do 1150 * anything - the kernel still gives you huge requests 1151 * with tons of little PAGE_SIZE segments contained within it. 1152 * 1153 * Knowing that the kernel will just give you PAGE_SIZE segments 1154 * no matter what, you can say "ok, I want PAGE_SIZE byte 1155 * segments, and I want 'N' of them per request", where N is 1156 * the correct number of segments for the volblocksize and 1157 * number of chunks you want. 1158 */ 1159 if (zvol_blk_mq_blocks_per_thread != 0) { 1160 unsigned int chunks; 1161 chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); 1162 1163 limits->zql_max_segment_size = PAGE_SIZE; 1164 limits->zql_max_segments = 1165 (zv->zv_volblocksize * chunks) / PAGE_SIZE; 1166 } else { 1167 /* 1168 * Special case: zvol_blk_mq_blocks_per_thread = 0 1169 * Max everything out. 
1170 */ 1171 limits->zql_max_segments = UINT16_MAX; 1172 limits->zql_max_segment_size = UINT_MAX; 1173 } 1174 } else { 1175 limits->zql_max_segments = UINT16_MAX; 1176 limits->zql_max_segment_size = UINT_MAX; 1177 } 1178 1179 limits->zql_io_opt = DMU_MAX_ACCESS / 2; 1180 1181 limits->zql_physical_block_size = zv->zv_volblocksize; 1182 limits->zql_max_discard_sectors = 1183 (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9; 1184 limits->zql_discard_granularity = zv->zv_volblocksize; 1185 } 1186 1187 #ifdef HAVE_BLK_ALLOC_DISK_2ARG 1188 static void 1189 zvol_queue_limits_convert(zvol_queue_limits_t *limits, 1190 struct queue_limits *qlimits) 1191 { 1192 memset(qlimits, 0, sizeof (struct queue_limits)); 1193 qlimits->max_hw_sectors = limits->zql_max_hw_sectors; 1194 qlimits->max_segments = limits->zql_max_segments; 1195 qlimits->max_segment_size = limits->zql_max_segment_size; 1196 qlimits->io_opt = limits->zql_io_opt; 1197 qlimits->physical_block_size = limits->zql_physical_block_size; 1198 qlimits->max_discard_sectors = limits->zql_max_discard_sectors; 1199 qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors; 1200 qlimits->discard_granularity = limits->zql_discard_granularity; 1201 #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES 1202 qlimits->features = 1203 BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT; 1204 #endif 1205 } 1206 #endif 1207 1208 static void 1209 zvol_queue_limits_apply(zvol_queue_limits_t *limits, 1210 struct request_queue *queue) 1211 { 1212 #ifndef HAVE_BLK_ALLOC_DISK_2ARG 1213 blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors); 1214 blk_queue_max_segments(queue, limits->zql_max_segments); 1215 blk_queue_max_segment_size(queue, limits->zql_max_segment_size); 1216 blk_queue_io_opt(queue, limits->zql_io_opt); 1217 blk_queue_physical_block_size(queue, limits->zql_physical_block_size); 1218 blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors); 1219 blk_queue_discard_granularity(queue, 
limits->zql_discard_granularity); 1220 #endif 1221 #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES 1222 blk_queue_set_write_cache(queue, B_TRUE); 1223 blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue); 1224 #endif 1225 } 1226 1227 static int 1228 zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) 1229 { 1230 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) 1231 #if defined(HAVE_BLK_ALLOC_DISK) 1232 zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); 1233 if (zso->zvo_disk == NULL) 1234 return (1); 1235 1236 zso->zvo_disk->minors = ZVOL_MINORS; 1237 zso->zvo_queue = zso->zvo_disk->queue; 1238 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) 1239 struct queue_limits qlimits; 1240 zvol_queue_limits_convert(limits, &qlimits); 1241 struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE); 1242 if (IS_ERR(disk)) { 1243 zso->zvo_disk = NULL; 1244 return (1); 1245 } 1246 1247 zso->zvo_disk = disk; 1248 zso->zvo_disk->minors = ZVOL_MINORS; 1249 zso->zvo_queue = zso->zvo_disk->queue; 1250 1251 #else 1252 zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); 1253 if (zso->zvo_queue == NULL) 1254 return (1); 1255 1256 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1257 if (zso->zvo_disk == NULL) { 1258 blk_cleanup_queue(zso->zvo_queue); 1259 return (1); 1260 } 1261 1262 zso->zvo_disk->queue = zso->zvo_queue; 1263 #endif /* HAVE_BLK_ALLOC_DISK */ 1264 #else 1265 zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); 1266 if (zso->zvo_queue == NULL) 1267 return (1); 1268 1269 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1270 if (zso->zvo_disk == NULL) { 1271 blk_cleanup_queue(zso->zvo_queue); 1272 return (1); 1273 } 1274 1275 zso->zvo_disk->queue = zso->zvo_queue; 1276 #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ 1277 1278 zvol_queue_limits_apply(limits, zso->zvo_queue); 1279 1280 return (0); 1281 1282 } 1283 1284 static int 1285 zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) 1286 { 1287 struct zvol_state_os *zso = zv->zv_zso; 1288 1289 
/* Allocate our blk-mq tag_set */ 1290 if (zvol_blk_mq_alloc_tag_set(zv) != 0) 1291 return (1); 1292 1293 #if defined(HAVE_BLK_ALLOC_DISK) 1294 zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); 1295 if (zso->zvo_disk == NULL) { 1296 blk_mq_free_tag_set(&zso->tag_set); 1297 return (1); 1298 } 1299 zso->zvo_queue = zso->zvo_disk->queue; 1300 zso->zvo_disk->minors = ZVOL_MINORS; 1301 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) 1302 struct queue_limits qlimits; 1303 zvol_queue_limits_convert(limits, &qlimits); 1304 struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv); 1305 if (IS_ERR(disk)) { 1306 zso->zvo_disk = NULL; 1307 blk_mq_free_tag_set(&zso->tag_set); 1308 return (1); 1309 } 1310 1311 zso->zvo_disk = disk; 1312 zso->zvo_queue = zso->zvo_disk->queue; 1313 zso->zvo_disk->minors = ZVOL_MINORS; 1314 #else 1315 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1316 if (zso->zvo_disk == NULL) { 1317 blk_cleanup_queue(zso->zvo_queue); 1318 blk_mq_free_tag_set(&zso->tag_set); 1319 return (1); 1320 } 1321 /* Allocate queue */ 1322 zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); 1323 if (IS_ERR(zso->zvo_queue)) { 1324 blk_mq_free_tag_set(&zso->tag_set); 1325 return (1); 1326 } 1327 1328 /* Our queue is now created, assign it to our disk */ 1329 zso->zvo_disk->queue = zso->zvo_queue; 1330 #endif 1331 1332 zvol_queue_limits_apply(limits, zso->zvo_queue); 1333 1334 return (0); 1335 } 1336 1337 /* 1338 * Allocate memory for a new zvol_state_t and setup the required 1339 * request queue and generic disk structures for the block device. 
1340 */ 1341 static zvol_state_t * 1342 zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize) 1343 { 1344 zvol_state_t *zv; 1345 struct zvol_state_os *zso; 1346 uint64_t volmode; 1347 int ret; 1348 1349 if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) 1350 return (NULL); 1351 1352 if (volmode == ZFS_VOLMODE_DEFAULT) 1353 volmode = zvol_volmode; 1354 1355 if (volmode == ZFS_VOLMODE_NONE) 1356 return (NULL); 1357 1358 zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); 1359 zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); 1360 zv->zv_zso = zso; 1361 zv->zv_volmode = volmode; 1362 zv->zv_volblocksize = volblocksize; 1363 1364 list_link_init(&zv->zv_next); 1365 mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); 1366 cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); 1367 1368 zv->zv_zso->use_blk_mq = zvol_use_blk_mq; 1369 1370 zvol_queue_limits_t limits; 1371 zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq); 1372 1373 /* 1374 * The block layer has 3 interfaces for getting BIOs: 1375 * 1376 * 1. blk-mq request queues (new) 1377 * 2. submit_bio() (oldest) 1378 * 3. regular request queues (old). 1379 * 1380 * Each of those interfaces has two permutations: 1381 * 1382 * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates 1383 * both the disk and its queue (5.14 kernel or newer) 1384 * 1385 * b) We don't have blk_*alloc_disk(), and have to allocate the 1386 * disk and the queue separately. (5.13 kernel or older) 1387 */ 1388 if (zv->zv_zso->use_blk_mq) { 1389 ret = zvol_alloc_blk_mq(zv, &limits); 1390 zso->zvo_disk->fops = &zvol_ops_blk_mq; 1391 } else { 1392 ret = zvol_alloc_non_blk_mq(zso, &limits); 1393 zso->zvo_disk->fops = &zvol_ops; 1394 } 1395 if (ret != 0) 1396 goto out_kmem; 1397 1398 /* Limit read-ahead to a single page to prevent over-prefetching. 
*/ 1399 blk_queue_set_read_ahead(zso->zvo_queue, 1); 1400 1401 if (!zv->zv_zso->use_blk_mq) { 1402 /* Disable write merging in favor of the ZIO pipeline. */ 1403 blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); 1404 } 1405 1406 zso->zvo_queue->queuedata = zv; 1407 zso->zvo_dev = dev; 1408 zv->zv_open_count = 0; 1409 strlcpy(zv->zv_name, name, sizeof (zv->zv_name)); 1410 1411 zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); 1412 rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); 1413 1414 zso->zvo_disk->major = zvol_major; 1415 zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; 1416 1417 /* 1418 * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. 1419 * This is accomplished by limiting the number of minors for the 1420 * device to one and explicitly disabling partition scanning. 1421 */ 1422 if (volmode == ZFS_VOLMODE_DEV) { 1423 zso->zvo_disk->minors = 1; 1424 zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT; 1425 zso->zvo_disk->flags |= GENHD_FL_NO_PART; 1426 } 1427 1428 zso->zvo_disk->first_minor = (dev & MINORMASK); 1429 zso->zvo_disk->private_data = zv; 1430 snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", 1431 ZVOL_DEV_NAME, (dev & MINORMASK)); 1432 1433 return (zv); 1434 1435 out_kmem: 1436 kmem_free(zso, sizeof (struct zvol_state_os)); 1437 kmem_free(zv, sizeof (zvol_state_t)); 1438 return (NULL); 1439 } 1440 1441 /* 1442 * Cleanup then free a zvol_state_t which was created by zvol_alloc(). 1443 * At this time, the structure is not opened by anyone, is taken off 1444 * the zvol_state_list, and has its private data set to NULL. 1445 * The zvol_state_lock is dropped. 1446 * 1447 * This function may take many milliseconds to complete (e.g. we've seen 1448 * it take over 256ms), due to the calls to "blk_cleanup_queue" and 1449 * "del_gendisk". Thus, consumers need to be careful to account for this 1450 * latency when calling this function. 
1451 */ 1452 void 1453 zvol_os_free(zvol_state_t *zv) 1454 { 1455 1456 ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); 1457 ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); 1458 ASSERT0(zv->zv_open_count); 1459 ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); 1460 1461 rw_destroy(&zv->zv_suspend_lock); 1462 zfs_rangelock_fini(&zv->zv_rangelock); 1463 1464 del_gendisk(zv->zv_zso->zvo_disk); 1465 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ 1466 (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) 1467 #if defined(HAVE_BLK_CLEANUP_DISK) 1468 blk_cleanup_disk(zv->zv_zso->zvo_disk); 1469 #else 1470 put_disk(zv->zv_zso->zvo_disk); 1471 #endif 1472 #else 1473 blk_cleanup_queue(zv->zv_zso->zvo_queue); 1474 put_disk(zv->zv_zso->zvo_disk); 1475 #endif 1476 1477 if (zv->zv_zso->use_blk_mq) 1478 blk_mq_free_tag_set(&zv->zv_zso->tag_set); 1479 1480 ida_simple_remove(&zvol_ida, 1481 MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); 1482 1483 cv_destroy(&zv->zv_removing_cv); 1484 mutex_destroy(&zv->zv_state_lock); 1485 dataset_kstats_destroy(&zv->zv_kstat); 1486 1487 kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); 1488 kmem_free(zv, sizeof (zvol_state_t)); 1489 } 1490 1491 void 1492 zvol_wait_close(zvol_state_t *zv) 1493 { 1494 } 1495 1496 struct add_disk_work { 1497 struct delayed_work work; 1498 struct gendisk *disk; 1499 int error; 1500 }; 1501 1502 static int 1503 __zvol_os_add_disk(struct gendisk *disk) 1504 { 1505 int error = 0; 1506 #ifdef HAVE_ADD_DISK_RET 1507 error = add_disk(disk); 1508 #else 1509 add_disk(disk); 1510 #endif 1511 return (error); 1512 } 1513 1514 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) 1515 static void 1516 zvol_os_add_disk_work(struct work_struct *work) 1517 { 1518 struct add_disk_work *add_disk_work; 1519 add_disk_work = container_of(work, struct add_disk_work, work.work); 1520 add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk); 1521 } 1522 #endif 1523 1524 /* 1525 * SPECIAL CASE: 1526 * 1527 * This 
 * function basically calls add_disk() from a workqueue.   You may be
 * thinking: why not just call add_disk() directly?
 *
 * When you call add_disk(), the zvol appears to the world.  When this happens,
 * the kernel calls disk_scan_partitions() on the zvol, which behaves
 * differently on the 6.9+ kernels:
 *
 * - 6.8 and older kernels -
 * disk_scan_partitions()
 *	handle = bdev_open_by_dev(
 *		zvol_open()
 *	bdev_release(handle);
 *		zvol_release()
 *
 *
 * - 6.9+ kernels -
 * disk_scan_partitions()
 *	file = bdev_file_open_by_dev()
 *		zvol_open()
 *	fput(file)
 *	< wait for return to userspace >
 *		zvol_release()
 *
 * The difference is that the bdev_release() from the 6.8 kernel is synchronous
 * while the fput() from the 6.9 kernel is async.  Or more specifically it's
 * async that has to wait until we return to userspace (since it adds the fput
 * into the caller's work queue with the TWA_RESUME flag set).  This is not the
 * behavior we want, since we want to do things like create+destroy a zvol
 * within a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release
 * the reference to the zvol while we're in the IOCTL, which can't wait until
 * we return to userspace.
 *
 * We can get around this since fput() has a special codepath for when it's
 * running in a kernel thread or interrupt.  In those cases, it just puts the
 * fput into the system workqueue, which we can force to run with
 * __flush_workqueue().  That is why we call add_disk() from a workqueue - so
 * it runs from a kernel thread and "tricks" the fput() codepaths.
 *
 * Note that __flush_workqueue() is slowly getting deprecated.
This may be ok 1566 * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via 1567 * fput) to happen, which it eventually, naturally, will from the system_wq 1568 * without us explicitly calling __flush_workqueue(). 1569 */ 1570 static int 1571 zvol_os_add_disk(struct gendisk *disk) 1572 { 1573 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) /* 6.9+ kernel */ 1574 struct add_disk_work add_disk_work; 1575 1576 INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work); 1577 add_disk_work.disk = disk; 1578 add_disk_work.error = 0; 1579 1580 /* Use *_delayed_work functions since they're not GPL'd */ 1581 schedule_delayed_work(&add_disk_work.work, 0); 1582 flush_delayed_work(&add_disk_work.work); 1583 1584 __flush_workqueue(system_wq); 1585 return (add_disk_work.error); 1586 #else /* <= 6.8 kernel */ 1587 return (__zvol_os_add_disk(disk)); 1588 #endif 1589 } 1590 1591 /* 1592 * Create a block device minor node and setup the linkage between it 1593 * and the specified volume. Once this function returns the block 1594 * device is live and ready for use. 
1595 */ 1596 int 1597 zvol_os_create_minor(const char *name) 1598 { 1599 zvol_state_t *zv; 1600 objset_t *os; 1601 dmu_object_info_t *doi; 1602 uint64_t volsize; 1603 uint64_t len; 1604 unsigned minor = 0; 1605 int error = 0; 1606 int idx; 1607 uint64_t hash = zvol_name_hash(name); 1608 uint64_t volthreading; 1609 bool replayed_zil = B_FALSE; 1610 1611 if (zvol_inhibit_dev) 1612 return (0); 1613 1614 idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); 1615 if (idx < 0) 1616 return (SET_ERROR(-idx)); 1617 minor = idx << ZVOL_MINOR_BITS; 1618 if (MINOR(minor) != minor) { 1619 /* too many partitions can cause an overflow */ 1620 zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", 1621 name, minor, MINOR(minor)); 1622 ida_simple_remove(&zvol_ida, idx); 1623 return (SET_ERROR(EINVAL)); 1624 } 1625 1626 zv = zvol_find_by_name_hash(name, hash, RW_NONE); 1627 if (zv) { 1628 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1629 mutex_exit(&zv->zv_state_lock); 1630 ida_simple_remove(&zvol_ida, idx); 1631 return (SET_ERROR(EEXIST)); 1632 } 1633 1634 doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); 1635 1636 error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); 1637 if (error) 1638 goto out_doi; 1639 1640 error = dmu_object_info(os, ZVOL_OBJ, doi); 1641 if (error) 1642 goto out_dmu_objset_disown; 1643 1644 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); 1645 if (error) 1646 goto out_dmu_objset_disown; 1647 1648 zv = zvol_alloc(MKDEV(zvol_major, minor), name, 1649 doi->doi_data_block_size); 1650 if (zv == NULL) { 1651 error = SET_ERROR(EAGAIN); 1652 goto out_dmu_objset_disown; 1653 } 1654 zv->zv_hash = hash; 1655 1656 if (dmu_objset_is_snapshot(os)) 1657 zv->zv_flags |= ZVOL_RDONLY; 1658 1659 zv->zv_volsize = volsize; 1660 zv->zv_objset = os; 1661 1662 /* Default */ 1663 zv->zv_threading = B_TRUE; 1664 if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL) 1665 == 0) 1666 zv->zv_threading = volthreading; 1667 
1668 set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); 1669 1670 #ifdef QUEUE_FLAG_DISCARD 1671 blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); 1672 #endif 1673 #ifdef QUEUE_FLAG_NONROT 1674 blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); 1675 #endif 1676 #ifdef QUEUE_FLAG_ADD_RANDOM 1677 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); 1678 #endif 1679 /* This flag was introduced in kernel version 4.12. */ 1680 #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH 1681 blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); 1682 #endif 1683 1684 ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); 1685 error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); 1686 if (error) 1687 goto out_dmu_objset_disown; 1688 ASSERT3P(zv->zv_zilog, ==, NULL); 1689 zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); 1690 if (spa_writeable(dmu_objset_spa(os))) { 1691 if (zil_replay_disable) 1692 replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); 1693 else 1694 replayed_zil = zil_replay(os, zv, zvol_replay_vector); 1695 } 1696 if (replayed_zil) 1697 zil_close(zv->zv_zilog); 1698 zv->zv_zilog = NULL; 1699 1700 /* 1701 * When udev detects the addition of the device it will immediately 1702 * invoke blkid(8) to determine the type of content on the device. 1703 * Prefetching the blocks commonly scanned by blkid(8) will speed 1704 * up this process. 1705 */ 1706 len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); 1707 if (len > 0) { 1708 dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); 1709 dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, 1710 ZIO_PRIORITY_SYNC_READ); 1711 } 1712 1713 zv->zv_objset = NULL; 1714 out_dmu_objset_disown: 1715 dmu_objset_disown(os, B_TRUE, FTAG); 1716 out_doi: 1717 kmem_free(doi, sizeof (dmu_object_info_t)); 1718 1719 /* 1720 * Keep in mind that once add_disk() is called, the zvol is 1721 * announced to the world, and zvol_open()/zvol_release() can 1722 * be called at any time. 
Incidentally, add_disk() itself calls 1723 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() 1724 * directly as well. 1725 */ 1726 if (error == 0) { 1727 rw_enter(&zvol_state_lock, RW_WRITER); 1728 zvol_insert(zv); 1729 rw_exit(&zvol_state_lock); 1730 error = zvol_os_add_disk(zv->zv_zso->zvo_disk); 1731 } else { 1732 ida_simple_remove(&zvol_ida, idx); 1733 } 1734 1735 return (error); 1736 } 1737 1738 void 1739 zvol_os_rename_minor(zvol_state_t *zv, const char *newname) 1740 { 1741 int readonly = get_disk_ro(zv->zv_zso->zvo_disk); 1742 1743 ASSERT(RW_LOCK_HELD(&zvol_state_lock)); 1744 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1745 1746 strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); 1747 1748 /* move to new hashtable entry */ 1749 zv->zv_hash = zvol_name_hash(newname); 1750 hlist_del(&zv->zv_hlink); 1751 hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); 1752 1753 /* 1754 * The block device's read-only state is briefly changed causing 1755 * a KOBJ_CHANGE uevent to be issued. This ensures udev detects 1756 * the name change and fixes the symlinks. This does not change 1757 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never 1758 * changes. This would normally be done using kobject_uevent() but 1759 * that is a GPL-only symbol which is why we need this workaround. 1760 */ 1761 set_disk_ro(zv->zv_zso->zvo_disk, !readonly); 1762 set_disk_ro(zv->zv_zso->zvo_disk, readonly); 1763 1764 dataset_kstats_rename(&zv->zv_kstat, newname); 1765 } 1766 1767 void 1768 zvol_os_set_disk_ro(zvol_state_t *zv, int flags) 1769 { 1770 1771 set_disk_ro(zv->zv_zso->zvo_disk, flags); 1772 } 1773 1774 void 1775 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) 1776 { 1777 1778 set_capacity(zv->zv_zso->zvo_disk, capacity); 1779 } 1780 1781 int 1782 zvol_init(void) 1783 { 1784 int error; 1785 1786 /* 1787 * zvol_threads is the module param the user passes in. 
1788 * 1789 * zvol_actual_threads is what we use internally, since the user can 1790 * pass zvol_thread = 0 to mean "use all the CPUs" (the default). 1791 */ 1792 static unsigned int zvol_actual_threads; 1793 1794 if (zvol_threads == 0) { 1795 /* 1796 * See dde9380a1 for why 32 was chosen here. This should 1797 * probably be refined to be some multiple of the number 1798 * of CPUs. 1799 */ 1800 zvol_actual_threads = MAX(num_online_cpus(), 32); 1801 } else { 1802 zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); 1803 } 1804 1805 /* 1806 * Use atleast 32 zvol_threads but for many core system, 1807 * prefer 6 threads per taskq, but no more taskqs 1808 * than threads in them on large systems. 1809 * 1810 * taskq total 1811 * cpus taskqs threads threads 1812 * ------- ------- ------- ------- 1813 * 1 1 32 32 1814 * 2 1 32 32 1815 * 4 1 32 32 1816 * 8 2 16 32 1817 * 16 3 11 33 1818 * 32 5 7 35 1819 * 64 8 8 64 1820 * 128 11 12 132 1821 * 256 16 16 256 1822 */ 1823 zv_taskq_t *ztqs = &zvol_taskqs; 1824 uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs); 1825 if (num_tqs == 0) { 1826 num_tqs = 1 + num_online_cpus() / 6; 1827 while (num_tqs * num_tqs > zvol_actual_threads) 1828 num_tqs--; 1829 } 1830 uint_t per_tq_thread = zvol_actual_threads / num_tqs; 1831 if (per_tq_thread * num_tqs < zvol_actual_threads) 1832 per_tq_thread++; 1833 ztqs->tqs_cnt = num_tqs; 1834 ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP); 1835 error = register_blkdev(zvol_major, ZVOL_DRIVER); 1836 if (error) { 1837 kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); 1838 ztqs->tqs_taskq = NULL; 1839 printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); 1840 return (error); 1841 } 1842 1843 if (zvol_blk_mq_queue_depth == 0) { 1844 zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; 1845 } else { 1846 zvol_actual_blk_mq_queue_depth = 1847 MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); 1848 } 1849 1850 if (zvol_blk_mq_threads == 0) { 1851 
zvol_blk_mq_actual_threads = num_online_cpus(); 1852 } else { 1853 zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1), 1854 1024); 1855 } 1856 1857 for (uint_t i = 0; i < num_tqs; i++) { 1858 char name[32]; 1859 (void) snprintf(name, sizeof (name), "%s_tq-%u", 1860 ZVOL_DRIVER, i); 1861 ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread, 1862 maxclsyspri, per_tq_thread, INT_MAX, 1863 TASKQ_PREPOPULATE | TASKQ_DYNAMIC); 1864 if (ztqs->tqs_taskq[i] == NULL) { 1865 for (int j = i - 1; j >= 0; j--) 1866 taskq_destroy(ztqs->tqs_taskq[j]); 1867 unregister_blkdev(zvol_major, ZVOL_DRIVER); 1868 kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * 1869 sizeof (taskq_t *)); 1870 ztqs->tqs_taskq = NULL; 1871 return (-ENOMEM); 1872 } 1873 } 1874 1875 zvol_init_impl(); 1876 ida_init(&zvol_ida); 1877 return (0); 1878 } 1879 1880 void 1881 zvol_fini(void) 1882 { 1883 zv_taskq_t *ztqs = &zvol_taskqs; 1884 zvol_fini_impl(); 1885 unregister_blkdev(zvol_major, ZVOL_DRIVER); 1886 1887 if (ztqs->tqs_taskq == NULL) { 1888 ASSERT3U(ztqs->tqs_cnt, ==, 0); 1889 } else { 1890 for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { 1891 ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); 1892 taskq_destroy(ztqs->tqs_taskq[i]); 1893 } 1894 kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * 1895 sizeof (taskq_t *)); 1896 ztqs->tqs_taskq = NULL; 1897 } 1898 1899 ida_destroy(&zvol_ida); 1900 } 1901 1902 module_param(zvol_inhibit_dev, uint, 0644); 1903 MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); 1904 1905 module_param(zvol_major, uint, 0444); 1906 MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); 1907 1908 module_param(zvol_threads, uint, 0444); 1909 MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. 
Set" 1910 "to 0 to use all active CPUs"); 1911 1912 module_param(zvol_request_sync, uint, 0644); 1913 MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); 1914 1915 module_param(zvol_max_discard_blocks, ulong, 0444); 1916 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); 1917 1918 module_param(zvol_num_taskqs, uint, 0444); 1919 MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs"); 1920 1921 module_param(zvol_prefetch_bytes, uint, 0644); 1922 MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); 1923 1924 module_param(zvol_volmode, uint, 0644); 1925 MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); 1926 1927 module_param(zvol_blk_mq_queue_depth, uint, 0644); 1928 MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); 1929 1930 module_param(zvol_use_blk_mq, uint, 0644); 1931 MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); 1932 1933 module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); 1934 MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, 1935 "Process volblocksize blocks per thread"); 1936 1937 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 1938 module_param(zvol_open_timeout_ms, uint, 0644); 1939 MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); 1940 #endif 1941