1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 2012, 2020 by Delphix. All rights reserved. 24 * Copyright (c) 2024, Rob Norris <robn@despairlabs.com> 25 * Copyright (c) 2024, Klara, Inc. 
 */

#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
#include <cityhash.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/workqueue.h>
#include <linux/blk-mq.h>

/* Common entry point for both the bio path and the blk-mq request path. */
static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
    struct request *rq, boolean_t force_sync);

static unsigned int zvol_major = ZVOL_MAJOR;
/* When non-zero, zvol_request_impl() handles every I/O in the caller. */
static unsigned int zvol_request_sync = 0;
static unsigned int zvol_prefetch_bytes = (128 * 1024);
/* Multiplied by volblocksize to derive the queue's max discard size. */
static unsigned long zvol_max_discard_blocks = 16384;

/*
 * Switch taskq at multiple of 512 MB offset. This can be set to a lower value
 * to utilize more threads for small files but may affect prefetch hits.
 */
#define	ZVOL_TASKQ_OFFSET_SHIFT 29

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
/* How long zvol_open() keeps retrying for spa_namespace_lock. */
static unsigned int zvol_open_timeout_ms = 1000;
#endif

static unsigned int zvol_threads = 0;
static unsigned int zvol_blk_mq_threads = 0;
static unsigned int zvol_blk_mq_actual_threads;
static boolean_t zvol_use_blk_mq = B_FALSE;

/*
 * The maximum number of volblocksize blocks to process per thread.  Typically,
 * write heavy workloads perform better with higher values here, and read
 * heavy workloads perform better with lower values, but that's not a hard
 * and fast rule.  It's basically a knob to tune between "less overhead with
 * less parallelism" and "more overhead, but more parallelism".
 *
 * '8' was chosen as a reasonable, balanced, default based off of sequential
 * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
82 */ 83 static unsigned int zvol_blk_mq_blocks_per_thread = 8; 84 85 static unsigned int zvol_num_taskqs = 0; 86 87 #ifndef BLKDEV_DEFAULT_RQ 88 /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ 89 #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ 90 #endif 91 92 /* 93 * Finalize our BIO or request. 94 */ 95 static inline void 96 zvol_end_io(struct bio *bio, struct request *rq, int error) 97 { 98 if (bio) { 99 bio->bi_status = errno_to_bi_status(-error); 100 bio_endio(bio); 101 } else { 102 blk_mq_end_request(rq, errno_to_bi_status(error)); 103 } 104 } 105 106 static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; 107 static unsigned int zvol_actual_blk_mq_queue_depth; 108 109 struct zvol_state_os { 110 struct gendisk *zvo_disk; /* generic disk */ 111 struct request_queue *zvo_queue; /* request queue */ 112 dev_t zvo_dev; /* device id */ 113 114 struct blk_mq_tag_set tag_set; 115 116 /* Set from the global 'zvol_use_blk_mq' at zvol load */ 117 boolean_t use_blk_mq; 118 }; 119 120 typedef struct zv_taskq { 121 uint_t tqs_cnt; 122 taskq_t **tqs_taskq; 123 } zv_taskq_t; 124 static zv_taskq_t zvol_taskqs; 125 static struct ida zvol_ida; 126 127 typedef struct zv_request_stack { 128 zvol_state_t *zv; 129 struct bio *bio; 130 struct request *rq; 131 } zv_request_t; 132 133 typedef struct zv_work { 134 struct request *rq; 135 struct work_struct work; 136 } zv_work_t; 137 138 typedef struct zv_request_task { 139 zv_request_t zvr; 140 taskq_ent_t ent; 141 } zv_request_task_t; 142 143 static zv_request_task_t * 144 zv_request_task_create(zv_request_t zvr) 145 { 146 zv_request_task_t *task; 147 task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); 148 taskq_init_ent(&task->ent); 149 task->zvr = zvr; 150 return (task); 151 } 152 153 static void 154 zv_request_task_free(zv_request_task_t *task) 155 { 156 kmem_free(task, sizeof (*task)); 157 } 158 159 /* 160 * This is called when a new block multiqueue request comes in. 
A request 161 * contains one or more BIOs. 162 */ 163 static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, 164 const struct blk_mq_queue_data *bd) 165 { 166 struct request *rq = bd->rq; 167 zvol_state_t *zv = rq->q->queuedata; 168 169 /* Tell the kernel that we are starting to process this request */ 170 blk_mq_start_request(rq); 171 172 if (blk_rq_is_passthrough(rq)) { 173 /* Skip non filesystem request */ 174 blk_mq_end_request(rq, BLK_STS_IOERR); 175 return (BLK_STS_IOERR); 176 } 177 178 zvol_request_impl(zv, NULL, rq, 0); 179 180 /* Acknowledge to the kernel that we got this request */ 181 return (BLK_STS_OK); 182 } 183 184 static struct blk_mq_ops zvol_blk_mq_queue_ops = { 185 .queue_rq = zvol_mq_queue_rq, 186 }; 187 188 /* Initialize our blk-mq struct */ 189 static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) 190 { 191 struct zvol_state_os *zso = zv->zv_zso; 192 193 memset(&zso->tag_set, 0, sizeof (zso->tag_set)); 194 195 /* Initialize tag set. */ 196 zso->tag_set.ops = &zvol_blk_mq_queue_ops; 197 zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads; 198 zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; 199 zso->tag_set.numa_node = NUMA_NO_NODE; 200 zso->tag_set.cmd_size = 0; 201 202 /* 203 * We need BLK_MQ_F_BLOCKING here since we do blocking calls in 204 * zvol_request_impl() 205 */ 206 zso->tag_set.flags = BLK_MQ_F_BLOCKING; 207 208 #ifdef BLK_MQ_F_SHOULD_MERGE 209 /* 210 * Linux 6.14 removed BLK_MQ_F_SHOULD_MERGE and made it implicit. 211 * For older kernels, we set it. 212 */ 213 zso->tag_set.flags |= BLK_MQ_F_SHOULD_MERGE; 214 #endif 215 216 zso->tag_set.driver_data = zv; 217 218 return (blk_mq_alloc_tag_set(&zso->tag_set)); 219 } 220 221 /* 222 * Given a path, return TRUE if path is a ZVOL. 
 */
boolean_t
zvol_os_is_zvol(const char *path)
{
	dev_t dev = 0;

	/* A path that does not resolve to a block device is not a zvol. */
	if (vdev_lookup_bdev(path, &dev) != 0)
		return (B_FALSE);

	/* zvols are identified by their block device major number. */
	if (MAJOR(dev) == zvol_major)
		return (B_TRUE);

	return (B_FALSE);
}

/*
 * Perform a write (and/or flush) described by 'zvr' and complete the I/O.
 *
 * zv_suspend_lock is released here on every path without being taken in
 * this function, so the caller must already hold it.  The bio or request is
 * always finished through zvol_end_io() before returning.
 */
static void
zvol_write(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	int error = 0;
	zfs_uio_t uio;
	zvol_state_t *zv = zvr->zv;
	struct request_queue *q;
	struct gendisk *disk;
	unsigned long start_time = 0;
	boolean_t acct = B_FALSE;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	q = zv->zv_zso->zvo_queue;
	disk = zv->zv_zso->zvo_disk;

	/* bio marked as FLUSH need to flush before write */
	if (io_is_flush(bio, rq))
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	/* Some requests are just for flush and nothing else. */
	if (io_size(bio, rq) == 0) {
		rw_exit(&zv->zv_suspend_lock);
		zvol_end_io(bio, rq, 0);
		return;
	}

	zfs_uio_bvec_init(&uio, bio, rq);

	/* Remember the starting residual so bytes written can be computed. */
	ssize_t start_resid = uio.uio_resid;

	/*
	 * With use_blk_mq, accounting is done by blk_mq_start_request()
	 * and blk_mq_end_request(), so we can skip it here.
	 */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct) {
			start_time = blk_generic_start_io_acct(q, disk, WRITE,
			    bio);
		}
	}

	/* FUA requests and sync=always datasets commit the ZIL below. */
	boolean_t sync =
	    io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_WRITER);

	uint64_t volsize = zv->zv_volsize;
	/* One transaction per chunk of at most DMU_MAX_ACCESS / 2 bytes. */
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio.uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);

		/* This will only fail for ENOSPC */
		error = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0) {
			zvol_log_write(zv, tx, off, bytes, sync);
		}
		/* The assigned tx is committed whether or not the write failed. */
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);

	/* Account for whatever was consumed, even after a partial failure. */
	int64_t nwritten = start_resid - uio.uio_resid;
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	task_io_account_write(nwritten);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
	}

	/* 'error' is a positive errno here; zvol_end_io() is given it negated. */
	zvol_end_io(bio, rq, -error);
}

/* Taskq callback: run zvol_write() on the queued request, then free it. */
static void
zvol_write_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_write(&task->zvr);
	zv_request_task_free(task);
}

/*
 * Perform a discard / secure erase described by 'zvr' and complete the I/O.
 *
 * zv_suspend_lock is released here on every path without being taken in
 * this function, so the caller must already hold it.  A range that ends
 * beyond the volume fails with EIO; a range that becomes empty after
 * alignment completes successfully without freeing anything.
 */
static void
zvol_discard(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	zvol_state_t *zv = zvr->zv;
	uint64_t start = io_offset(bio, rq);
	uint64_t size = io_size(bio, rq);
	uint64_t end = start + size;
	boolean_t sync;
	int error = 0;
	dmu_tx_t *tx;
	struct request_queue *q = zv->zv_zso->zvo_queue;
	struct gendisk *disk = zv->zv_zso->zvo_disk;
	unsigned long start_time = 0;
	boolean_t acct = B_FALSE;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	/* Discards are accounted as WRITE I/O on the bio path. */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct) {
			start_time = blk_generic_start_io_acct(q, disk, WRITE,
			    bio);
		}
	}

	sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	if (end > zv->zv_volsize) {
		error = SET_ERROR(EIO);
		goto unlock;
	}

	/*
	 * Align the request to volume block boundaries when a secure erase is
	 * not required.  This will prevent dnode_free_range() from zeroing out
	 * the unaligned parts which is slow (read-modify-write) and useless
	 * since we are not freeing any space by doing so.
	 */
	if (!io_is_secure_erase(bio, rq)) {
		start = P2ROUNDUP(start, zv->zv_volblocksize);
		end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
		size = end - start;
	}

	/* Nothing left to free (e.g. the request was smaller than a block). */
	if (start >= end)
		goto unlock;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    start, size, RL_WRITER);

	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		/* Log the truncate in this tx, then free the range itself. */
		zvol_log_truncate(zv, tx, start, size);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset,
		    ZVOL_OBJ, start, size);
	}
	zfs_rangelock_exit(lr);

	if (error == 0 && sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

unlock:
	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, WRITE, bio,
		    start_time);
	}

	/* 'error' is a positive errno here; zvol_end_io() is given it negated. */
	zvol_end_io(bio, rq, -error);
}

/* Taskq callback: run zvol_discard() on the queued request, then free it. */
static void
zvol_discard_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_discard(&task->zvr);
	zv_request_task_free(task);
}

/*
 * Perform a read described by 'zvr' and complete the I/O.
 *
 * zv_suspend_lock is released here without being taken in this function,
 * so the caller must already hold it.  The bio or request is always
 * finished through zvol_end_io() before returning.
 */
static void
zvol_read(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	int error = 0;
	zfs_uio_t uio;
	boolean_t acct = B_FALSE;
	zvol_state_t *zv = zvr->zv;
	struct request_queue *q;
	struct gendisk *disk;
	unsigned long start_time = 0;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);

	zfs_uio_bvec_init(&uio, bio, rq);

	q = zv->zv_zso->zvo_queue;
	disk = zv->zv_zso->zvo_disk;

	/* Remember the starting residual so bytes read can be computed. */
	ssize_t start_resid = uio.uio_resid;

	/*
	 * When blk-mq is being used, accounting is done by
	 * blk_mq_start_request() and blk_mq_end_request().
	 */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct)
			start_time = blk_generic_start_io_acct(q, disk, READ,
			    bio);
	}

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_READER);

	uint64_t volsize = zv->zv_volsize;

	/* Read in chunks of at most DMU_MAX_ACCESS / 2 bytes. */
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio.uio_loffset)
			bytes = volsize - uio.uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	/* Account for whatever was transferred, even after a failure. */
	int64_t nread = start_resid - uio.uio_resid;
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	task_io_account_read(nread);

	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, READ, bio, start_time);
	}

	/* 'error' is a positive errno here; zvol_end_io() is given it negated. */
	zvol_end_io(bio, rq, -error);
}

/* Taskq callback: run zvol_read() on the queued request, then free it. */
static void
zvol_read_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_read(&task->zvr);
	zv_request_task_free(task);
}


/*
 * Process a BIO or
request 512 * 513 * Either 'bio' or 'rq' should be set depending on if we are processing a 514 * bio or a request (both should not be set). 515 * 516 * force_sync: Set to 0 to defer processing to a background taskq 517 * Set to 1 to process data synchronously 518 */ 519 static void 520 zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, 521 boolean_t force_sync) 522 { 523 fstrans_cookie_t cookie = spl_fstrans_mark(); 524 uint64_t offset = io_offset(bio, rq); 525 uint64_t size = io_size(bio, rq); 526 int rw = io_data_dir(bio, rq); 527 528 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { 529 zvol_end_io(bio, rq, -SET_ERROR(ENXIO)); 530 goto out; 531 } 532 533 if (zvol_request_sync || zv->zv_threading == B_FALSE) 534 force_sync = 1; 535 536 zv_request_t zvr = { 537 .zv = zv, 538 .bio = bio, 539 .rq = rq, 540 }; 541 542 if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { 543 printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", 544 zv->zv_zso->zvo_disk->disk_name, 545 (long long unsigned)offset, 546 (long unsigned)size); 547 548 zvol_end_io(bio, rq, -SET_ERROR(EIO)); 549 goto out; 550 } 551 552 zv_request_task_t *task; 553 zv_taskq_t *ztqs = &zvol_taskqs; 554 uint_t blk_mq_hw_queue = 0; 555 uint_t tq_idx; 556 uint_t taskq_hash; 557 if (rq) 558 #ifdef HAVE_BLK_MQ_RQ_HCTX 559 blk_mq_hw_queue = rq->mq_hctx->queue_num; 560 #else 561 blk_mq_hw_queue = 562 rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num; 563 #endif 564 taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, 565 blk_mq_hw_queue); 566 tq_idx = taskq_hash % ztqs->tqs_cnt; 567 568 if (rw == WRITE) { 569 if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { 570 zvol_end_io(bio, rq, -SET_ERROR(EROFS)); 571 goto out; 572 } 573 574 /* 575 * Prevents the zvol from being suspended, or the ZIL being 576 * concurrently opened. Will be released after the i/o 577 * completes. 
578 */ 579 rw_enter(&zv->zv_suspend_lock, RW_READER); 580 581 /* 582 * Open a ZIL if this is the first time we have written to this 583 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather 584 * than zv_state_lock so that we don't need to acquire an 585 * additional lock in this path. 586 */ 587 if (zv->zv_zilog == NULL) { 588 rw_exit(&zv->zv_suspend_lock); 589 rw_enter(&zv->zv_suspend_lock, RW_WRITER); 590 if (zv->zv_zilog == NULL) { 591 zv->zv_zilog = zil_open(zv->zv_objset, 592 zvol_get_data, &zv->zv_kstat.dk_zil_sums); 593 zv->zv_flags |= ZVOL_WRITTEN_TO; 594 /* replay / destroy done in zvol_create_minor */ 595 VERIFY0((zv->zv_zilog->zl_header->zh_flags & 596 ZIL_REPLAY_NEEDED)); 597 } 598 rw_downgrade(&zv->zv_suspend_lock); 599 } 600 601 /* 602 * We don't want this thread to be blocked waiting for i/o to 603 * complete, so we instead wait from a taskq callback. The 604 * i/o may be a ZIL write (via zil_commit()), or a read of an 605 * indirect block, or a read of a data block (if this is a 606 * partial-block write). We will indicate that the i/o is 607 * complete by calling END_IO() from the taskq callback. 608 * 609 * This design allows the calling thread to continue and 610 * initiate more concurrent operations by calling 611 * zvol_request() again. There are typically only a small 612 * number of threads available to call zvol_request() (e.g. 613 * one per iSCSI target), so keeping the latency of 614 * zvol_request() low is important for performance. 615 * 616 * The zvol_request_sync module parameter allows this 617 * behavior to be altered, for performance evaluation 618 * purposes. If the callback blocks, setting 619 * zvol_request_sync=1 will result in much worse performance. 620 * 621 * We can have up to zvol_threads concurrent i/o's being 622 * processed for all zvols on the system. This is typically 623 * a vast improvement over the zvol_request_sync=1 behavior 624 * of one i/o at a time per zvol. 
However, an even better 625 * design would be for zvol_request() to initiate the zio 626 * directly, and then be notified by the zio_done callback, 627 * which would call END_IO(). Unfortunately, the DMU/ZIL 628 * interfaces lack this functionality (they block waiting for 629 * the i/o to complete). 630 */ 631 if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { 632 if (force_sync) { 633 zvol_discard(&zvr); 634 } else { 635 task = zv_request_task_create(zvr); 636 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 637 zvol_discard_task, task, 0, &task->ent); 638 } 639 } else { 640 if (force_sync) { 641 zvol_write(&zvr); 642 } else { 643 task = zv_request_task_create(zvr); 644 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 645 zvol_write_task, task, 0, &task->ent); 646 } 647 } 648 } else { 649 /* 650 * The SCST driver, and possibly others, may issue READ I/Os 651 * with a length of zero bytes. These empty I/Os contain no 652 * data and require no additional handling. 653 */ 654 if (size == 0) { 655 zvol_end_io(bio, rq, 0); 656 goto out; 657 } 658 659 rw_enter(&zv->zv_suspend_lock, RW_READER); 660 661 /* See comment in WRITE case above. 
*/ 662 if (force_sync) { 663 zvol_read(&zvr); 664 } else { 665 task = zv_request_task_create(zvr); 666 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 667 zvol_read_task, task, 0, &task->ent); 668 } 669 } 670 671 out: 672 spl_fstrans_unmark(cookie); 673 } 674 675 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 676 #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID 677 static void 678 zvol_submit_bio(struct bio *bio) 679 #else 680 static blk_qc_t 681 zvol_submit_bio(struct bio *bio) 682 #endif 683 #else 684 static MAKE_REQUEST_FN_RET 685 zvol_request(struct request_queue *q, struct bio *bio) 686 #endif 687 { 688 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 689 #if defined(HAVE_BIO_BDEV_DISK) 690 struct request_queue *q = bio->bi_bdev->bd_disk->queue; 691 #else 692 struct request_queue *q = bio->bi_disk->queue; 693 #endif 694 #endif 695 zvol_state_t *zv = q->queuedata; 696 697 zvol_request_impl(zv, bio, NULL, 0); 698 #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ 699 defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ 700 !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) 701 return (BLK_QC_T_NONE); 702 #endif 703 } 704 705 static int 706 #ifdef HAVE_BLK_MODE_T 707 zvol_open(struct gendisk *disk, blk_mode_t flag) 708 #else 709 zvol_open(struct block_device *bdev, fmode_t flag) 710 #endif 711 { 712 zvol_state_t *zv; 713 int error = 0; 714 boolean_t drop_suspend = B_FALSE; 715 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 716 hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); 717 hrtime_t start = gethrtime(); 718 719 retry: 720 #endif 721 rw_enter(&zvol_state_lock, RW_READER); 722 /* 723 * Obtain a copy of private_data under the zvol_state_lock to make 724 * sure that either the result of zvol free code path setting 725 * disk->private_data to NULL is observed, or zvol_os_free() 726 * is not called on this zv because of the positive zv_open_count. 
727 */ 728 #ifdef HAVE_BLK_MODE_T 729 zv = disk->private_data; 730 #else 731 zv = bdev->bd_disk->private_data; 732 #endif 733 if (zv == NULL) { 734 rw_exit(&zvol_state_lock); 735 return (-SET_ERROR(ENXIO)); 736 } 737 738 mutex_enter(&zv->zv_state_lock); 739 740 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { 741 mutex_exit(&zv->zv_state_lock); 742 rw_exit(&zvol_state_lock); 743 return (-SET_ERROR(ENXIO)); 744 } 745 746 /* 747 * Make sure zvol is not suspended during first open 748 * (hold zv_suspend_lock) and respect proper lock acquisition 749 * ordering - zv_suspend_lock before zv_state_lock 750 */ 751 if (zv->zv_open_count == 0) { 752 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { 753 mutex_exit(&zv->zv_state_lock); 754 rw_enter(&zv->zv_suspend_lock, RW_READER); 755 mutex_enter(&zv->zv_state_lock); 756 /* check to see if zv_suspend_lock is needed */ 757 if (zv->zv_open_count != 0) { 758 rw_exit(&zv->zv_suspend_lock); 759 } else { 760 drop_suspend = B_TRUE; 761 } 762 } else { 763 drop_suspend = B_TRUE; 764 } 765 } 766 rw_exit(&zvol_state_lock); 767 768 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 769 770 if (zv->zv_open_count == 0) { 771 boolean_t drop_namespace = B_FALSE; 772 773 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); 774 775 /* 776 * In all other call paths the spa_namespace_lock is taken 777 * before the bdev->bd_mutex lock. However, on open(2) 778 * the __blkdev_get() function calls fops->open() with the 779 * bdev->bd_mutex lock held. This can result in a deadlock 780 * when zvols from one pool are used as vdevs in another. 781 * 782 * To prevent a lock inversion deadlock we preemptively 783 * take the spa_namespace_lock. Normally the lock will not 784 * be contended and this is safe because spa_open_common() 785 * handles the case where the caller already holds the 786 * spa_namespace_lock. 
787 * 788 * When the lock cannot be aquired after multiple retries 789 * this must be the vdev on zvol deadlock case and we have 790 * no choice but to return an error. For 5.12 and older 791 * kernels returning -ERESTARTSYS will result in the 792 * bdev->bd_mutex being dropped, then reacquired, and 793 * fops->open() being called again. This process can be 794 * repeated safely until both locks are acquired. For 5.13 795 * and newer the -ERESTARTSYS retry logic was removed from 796 * the kernel so the only option is to return the error for 797 * the caller to handle it. 798 */ 799 if (!mutex_owned(&spa_namespace_lock)) { 800 if (!mutex_tryenter(&spa_namespace_lock)) { 801 mutex_exit(&zv->zv_state_lock); 802 rw_exit(&zv->zv_suspend_lock); 803 drop_suspend = B_FALSE; 804 805 #ifdef HAVE_BLKDEV_GET_ERESTARTSYS 806 schedule(); 807 return (-SET_ERROR(ERESTARTSYS)); 808 #else 809 if ((gethrtime() - start) > timeout) 810 return (-SET_ERROR(ERESTARTSYS)); 811 812 schedule_timeout_interruptible( 813 MSEC_TO_TICK(10)); 814 goto retry; 815 #endif 816 } else { 817 drop_namespace = B_TRUE; 818 } 819 } 820 821 error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); 822 823 if (drop_namespace) 824 mutex_exit(&spa_namespace_lock); 825 } 826 827 if (error == 0) { 828 if ((blk_mode_is_open_write(flag)) && 829 (zv->zv_flags & ZVOL_RDONLY)) { 830 if (zv->zv_open_count == 0) 831 zvol_last_close(zv); 832 833 error = -SET_ERROR(EROFS); 834 } else { 835 zv->zv_open_count++; 836 } 837 } 838 839 mutex_exit(&zv->zv_state_lock); 840 if (drop_suspend) 841 rw_exit(&zv->zv_suspend_lock); 842 843 if (error == 0) 844 #ifdef HAVE_BLK_MODE_T 845 disk_check_media_change(disk); 846 #else 847 zfs_check_media_change(bdev); 848 #endif 849 850 return (error); 851 } 852 853 static void 854 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG 855 zvol_release(struct gendisk *disk) 856 #else 857 zvol_release(struct gendisk *disk, fmode_t unused) 858 #endif 859 { 860 #if 
!defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) 861 (void) unused; 862 #endif 863 zvol_state_t *zv; 864 boolean_t drop_suspend = B_TRUE; 865 866 rw_enter(&zvol_state_lock, RW_READER); 867 zv = disk->private_data; 868 869 mutex_enter(&zv->zv_state_lock); 870 ASSERT3U(zv->zv_open_count, >, 0); 871 /* 872 * make sure zvol is not suspended during last close 873 * (hold zv_suspend_lock) and respect proper lock acquisition 874 * ordering - zv_suspend_lock before zv_state_lock 875 */ 876 if (zv->zv_open_count == 1) { 877 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { 878 mutex_exit(&zv->zv_state_lock); 879 rw_enter(&zv->zv_suspend_lock, RW_READER); 880 mutex_enter(&zv->zv_state_lock); 881 /* check to see if zv_suspend_lock is needed */ 882 if (zv->zv_open_count != 1) { 883 rw_exit(&zv->zv_suspend_lock); 884 drop_suspend = B_FALSE; 885 } 886 } 887 } else { 888 drop_suspend = B_FALSE; 889 } 890 rw_exit(&zvol_state_lock); 891 892 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 893 894 zv->zv_open_count--; 895 if (zv->zv_open_count == 0) { 896 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); 897 zvol_last_close(zv); 898 } 899 900 mutex_exit(&zv->zv_state_lock); 901 902 if (drop_suspend) 903 rw_exit(&zv->zv_suspend_lock); 904 } 905 906 static int 907 zvol_ioctl(struct block_device *bdev, fmode_t mode, 908 unsigned int cmd, unsigned long arg) 909 { 910 zvol_state_t *zv = bdev->bd_disk->private_data; 911 int error = 0; 912 913 ASSERT3U(zv->zv_open_count, >, 0); 914 915 switch (cmd) { 916 case BLKFLSBUF: 917 #ifdef HAVE_FSYNC_BDEV 918 fsync_bdev(bdev); 919 #elif defined(HAVE_SYNC_BLOCKDEV) 920 sync_blockdev(bdev); 921 #else 922 #error "Neither fsync_bdev() nor sync_blockdev() found" 923 #endif 924 invalidate_bdev(bdev); 925 rw_enter(&zv->zv_suspend_lock, RW_READER); 926 927 if (!(zv->zv_flags & ZVOL_RDONLY)) 928 txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); 929 930 rw_exit(&zv->zv_suspend_lock); 931 break; 932 933 case BLKZNAME: 934 mutex_enter(&zv->zv_state_lock); 935 error = 
copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); 936 mutex_exit(&zv->zv_state_lock); 937 break; 938 939 default: 940 error = -ENOTTY; 941 break; 942 } 943 944 return (SET_ERROR(error)); 945 } 946 947 #ifdef CONFIG_COMPAT 948 static int 949 zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, 950 unsigned cmd, unsigned long arg) 951 { 952 return (zvol_ioctl(bdev, mode, cmd, arg)); 953 } 954 #else 955 #define zvol_compat_ioctl NULL 956 #endif 957 958 static unsigned int 959 zvol_check_events(struct gendisk *disk, unsigned int clearing) 960 { 961 unsigned int mask = 0; 962 963 rw_enter(&zvol_state_lock, RW_READER); 964 965 zvol_state_t *zv = disk->private_data; 966 if (zv != NULL) { 967 mutex_enter(&zv->zv_state_lock); 968 mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; 969 zv->zv_changed = 0; 970 mutex_exit(&zv->zv_state_lock); 971 } 972 973 rw_exit(&zvol_state_lock); 974 975 return (mask); 976 } 977 978 static int 979 zvol_revalidate_disk(struct gendisk *disk) 980 { 981 rw_enter(&zvol_state_lock, RW_READER); 982 983 zvol_state_t *zv = disk->private_data; 984 if (zv != NULL) { 985 mutex_enter(&zv->zv_state_lock); 986 set_capacity(zv->zv_zso->zvo_disk, 987 zv->zv_volsize >> SECTOR_BITS); 988 mutex_exit(&zv->zv_state_lock); 989 } 990 991 rw_exit(&zvol_state_lock); 992 993 return (0); 994 } 995 996 int 997 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) 998 { 999 struct gendisk *disk = zv->zv_zso->zvo_disk; 1000 1001 #if defined(HAVE_REVALIDATE_DISK_SIZE) 1002 revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); 1003 #elif defined(HAVE_REVALIDATE_DISK) 1004 revalidate_disk(disk); 1005 #else 1006 zvol_revalidate_disk(disk); 1007 #endif 1008 return (0); 1009 } 1010 1011 void 1012 zvol_os_clear_private(zvol_state_t *zv) 1013 { 1014 /* 1015 * Cleared while holding zvol_state_lock as a writer 1016 * which will prevent zvol_open() from opening it. 
1017 */ 1018 zv->zv_zso->zvo_disk->private_data = NULL; 1019 } 1020 1021 /* 1022 * Provide a simple virtual geometry for legacy compatibility. For devices 1023 * smaller than 1 MiB a small head and sector count is used to allow very 1024 * tiny devices. For devices over 1 Mib a standard head and sector count 1025 * is used to keep the cylinders count reasonable. 1026 */ 1027 static int 1028 zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) 1029 { 1030 zvol_state_t *zv = bdev->bd_disk->private_data; 1031 sector_t sectors; 1032 1033 ASSERT3U(zv->zv_open_count, >, 0); 1034 1035 sectors = get_capacity(zv->zv_zso->zvo_disk); 1036 1037 if (sectors > 2048) { 1038 geo->heads = 16; 1039 geo->sectors = 63; 1040 } else { 1041 geo->heads = 2; 1042 geo->sectors = 4; 1043 } 1044 1045 geo->start = 0; 1046 geo->cylinders = sectors / (geo->heads * geo->sectors); 1047 1048 return (0); 1049 } 1050 1051 /* 1052 * Why have two separate block_device_operations structs? 1053 * 1054 * Normally we'd just have one, and assign 'submit_bio' as needed. However, 1055 * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we 1056 * can't just change submit_bio dynamically at runtime. So just create two 1057 * separate structs to get around this. 
 */
/* Operations for zvols served through blk-mq: no submit_bio hook. */
static const struct block_device_operations zvol_ops_blk_mq = {
	.open			= zvol_open,
	.release		= zvol_release,
	.ioctl			= zvol_ioctl,
	.compat_ioctl		= zvol_compat_ioctl,
	.check_events		= zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk	= zvol_revalidate_disk,
#endif
	.getgeo			= zvol_getgeo,
	.owner			= THIS_MODULE,
};

/*
 * Operations for zvols served through the bio path.  Identical to the
 * above except that submit_bio is set when the kernel has that hook in
 * block_device_operations.
 */
static const struct block_device_operations zvol_ops = {
	.open			= zvol_open,
	.release		= zvol_release,
	.ioctl			= zvol_ioctl,
	.compat_ioctl		= zvol_compat_ioctl,
	.check_events		= zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk	= zvol_revalidate_disk,
#endif
	.getgeo			= zvol_getgeo,
	.owner			= THIS_MODULE,
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	.submit_bio		= zvol_submit_bio,
#endif
};

/*
 * Since 6.9, Linux has been removing queue limit setters in favour of an
 * initial queue_limits struct applied when the device is open. Since 6.11,
 * queue_limits is being extended to allow more things to be applied when the
 * device is open. Setters are also being removed for this.
 *
 * For OpenZFS, this means that depending on kernel version, some options may
 * be set up before the device is open, and some applied to an open device
 * (queue) after the fact.
 *
 * We manage this complexity by having our own limits struct,
 * zvol_queue_limits_t, in which we carry any queue config that we're
 * interested in setting. This structure is the same on all kernels.
 *
 * These limits are then applied to the queue at device open time by the most
 * appropriate method for the kernel.
 *
 * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of
 * blk_alloc_disk() exists).
This converts our limits struct to a proper Linux 1107 * struct queue_limits, and passes it in. Any fields added in later kernels are 1108 * (obviously) not set up here. 1109 * 1110 * zvol_queue_limits_apply() is called on all kernel versions after the queue 1111 * is created, and applies any remaining config. Before 6.9 that will be 1112 * everything, via setter methods. After 6.9 that will be whatever couldn't be 1113 * put into struct queue_limits. (This implies that zvol_queue_limits_apply() 1114 * will always be a no-op on the latest kernel we support). 1115 */ 1116 typedef struct zvol_queue_limits { 1117 unsigned int zql_max_hw_sectors; 1118 unsigned short zql_max_segments; 1119 unsigned int zql_max_segment_size; 1120 unsigned int zql_io_opt; 1121 unsigned int zql_physical_block_size; 1122 unsigned int zql_max_discard_sectors; 1123 unsigned int zql_discard_granularity; 1124 } zvol_queue_limits_t; 1125 1126 static void 1127 zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv, 1128 boolean_t use_blk_mq) 1129 { 1130 limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9; 1131 1132 if (use_blk_mq) { 1133 /* 1134 * IO requests can be really big (1MB). When an IO request 1135 * comes in, it is passed off to zvol_read() or zvol_write() 1136 * in a new thread, where it is chunked up into 'volblocksize' 1137 * sized pieces and processed. So for example, if the request 1138 * is a 1MB write and your volblocksize is 128k, one zvol_write 1139 * thread will take that request and sequentially do ten 128k 1140 * IOs. This is due to the fact that the thread needs to lock 1141 * each volblocksize sized block. So you might be wondering: 1142 * "instead of passing the whole 1MB request to one thread, 1143 * why not pass ten individual 128k chunks to ten threads and 1144 * process the whole write in parallel?" 
The short answer is 1145 * that there's a sweet spot number of chunks that balances 1146 * the greater parallelism with the added overhead of more 1147 * threads. The sweet spot can be different depending on if you 1148 * have a read or write heavy workload. Writes typically want 1149 * high chunk counts while reads typically want lower ones. On 1150 * a test pool with 6 NVMe drives in a 3x 2-disk mirror 1151 * configuration, with volblocksize=8k, the sweet spot for good 1152 * sequential reads and writes was at 8 chunks. 1153 */ 1154 1155 /* 1156 * Below we tell the kernel how big we want our requests 1157 * to be. You would think that blk_queue_io_opt() would be 1158 * used to do this since it is used to "set optimal request 1159 * size for the queue", but that doesn't seem to do 1160 * anything - the kernel still gives you huge requests 1161 * with tons of little PAGE_SIZE segments contained within it. 1162 * 1163 * Knowing that the kernel will just give you PAGE_SIZE segments 1164 * no matter what, you can say "ok, I want PAGE_SIZE byte 1165 * segments, and I want 'N' of them per request", where N is 1166 * the correct number of segments for the volblocksize and 1167 * number of chunks you want. 1168 */ 1169 if (zvol_blk_mq_blocks_per_thread != 0) { 1170 unsigned int chunks; 1171 chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); 1172 1173 limits->zql_max_segment_size = PAGE_SIZE; 1174 limits->zql_max_segments = 1175 (zv->zv_volblocksize * chunks) / PAGE_SIZE; 1176 } else { 1177 /* 1178 * Special case: zvol_blk_mq_blocks_per_thread = 0 1179 * Max everything out. 
1180 */ 1181 limits->zql_max_segments = UINT16_MAX; 1182 limits->zql_max_segment_size = UINT_MAX; 1183 } 1184 } else { 1185 limits->zql_max_segments = UINT16_MAX; 1186 limits->zql_max_segment_size = UINT_MAX; 1187 } 1188 1189 limits->zql_io_opt = DMU_MAX_ACCESS / 2; 1190 1191 limits->zql_physical_block_size = zv->zv_volblocksize; 1192 limits->zql_max_discard_sectors = 1193 (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9; 1194 limits->zql_discard_granularity = zv->zv_volblocksize; 1195 } 1196 1197 #ifdef HAVE_BLK_ALLOC_DISK_2ARG 1198 static void 1199 zvol_queue_limits_convert(zvol_queue_limits_t *limits, 1200 struct queue_limits *qlimits) 1201 { 1202 memset(qlimits, 0, sizeof (struct queue_limits)); 1203 qlimits->max_hw_sectors = limits->zql_max_hw_sectors; 1204 qlimits->max_segments = limits->zql_max_segments; 1205 qlimits->max_segment_size = limits->zql_max_segment_size; 1206 qlimits->io_opt = limits->zql_io_opt; 1207 qlimits->physical_block_size = limits->zql_physical_block_size; 1208 qlimits->max_discard_sectors = limits->zql_max_discard_sectors; 1209 qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors; 1210 qlimits->discard_granularity = limits->zql_discard_granularity; 1211 #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES 1212 qlimits->features = 1213 BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT; 1214 #endif 1215 } 1216 #endif 1217 1218 static void 1219 zvol_queue_limits_apply(zvol_queue_limits_t *limits, 1220 struct request_queue *queue) 1221 { 1222 #ifndef HAVE_BLK_ALLOC_DISK_2ARG 1223 blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors); 1224 blk_queue_max_segments(queue, limits->zql_max_segments); 1225 blk_queue_max_segment_size(queue, limits->zql_max_segment_size); 1226 blk_queue_io_opt(queue, limits->zql_io_opt); 1227 blk_queue_physical_block_size(queue, limits->zql_physical_block_size); 1228 blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors); 1229 blk_queue_discard_granularity(queue, 
limits->zql_discard_granularity); 1230 #endif 1231 #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES 1232 blk_queue_set_write_cache(queue, B_TRUE); 1233 blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue); 1234 #endif 1235 } 1236 1237 static int 1238 zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) 1239 { 1240 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) 1241 #if defined(HAVE_BLK_ALLOC_DISK) 1242 zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); 1243 if (zso->zvo_disk == NULL) 1244 return (1); 1245 1246 zso->zvo_disk->minors = ZVOL_MINORS; 1247 zso->zvo_queue = zso->zvo_disk->queue; 1248 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) 1249 struct queue_limits qlimits; 1250 zvol_queue_limits_convert(limits, &qlimits); 1251 struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE); 1252 if (IS_ERR(disk)) { 1253 zso->zvo_disk = NULL; 1254 return (1); 1255 } 1256 1257 zso->zvo_disk = disk; 1258 zso->zvo_disk->minors = ZVOL_MINORS; 1259 zso->zvo_queue = zso->zvo_disk->queue; 1260 1261 #else 1262 zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); 1263 if (zso->zvo_queue == NULL) 1264 return (1); 1265 1266 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1267 if (zso->zvo_disk == NULL) { 1268 blk_cleanup_queue(zso->zvo_queue); 1269 return (1); 1270 } 1271 1272 zso->zvo_disk->queue = zso->zvo_queue; 1273 #endif /* HAVE_BLK_ALLOC_DISK */ 1274 #else 1275 zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); 1276 if (zso->zvo_queue == NULL) 1277 return (1); 1278 1279 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1280 if (zso->zvo_disk == NULL) { 1281 blk_cleanup_queue(zso->zvo_queue); 1282 return (1); 1283 } 1284 1285 zso->zvo_disk->queue = zso->zvo_queue; 1286 #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ 1287 1288 zvol_queue_limits_apply(limits, zso->zvo_queue); 1289 1290 return (0); 1291 1292 } 1293 1294 static int 1295 zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) 1296 { 1297 struct zvol_state_os *zso = zv->zv_zso; 1298 1299 
/* Allocate our blk-mq tag_set */ 1300 if (zvol_blk_mq_alloc_tag_set(zv) != 0) 1301 return (1); 1302 1303 #if defined(HAVE_BLK_ALLOC_DISK) 1304 zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); 1305 if (zso->zvo_disk == NULL) { 1306 blk_mq_free_tag_set(&zso->tag_set); 1307 return (1); 1308 } 1309 zso->zvo_queue = zso->zvo_disk->queue; 1310 zso->zvo_disk->minors = ZVOL_MINORS; 1311 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) 1312 struct queue_limits qlimits; 1313 zvol_queue_limits_convert(limits, &qlimits); 1314 struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv); 1315 if (IS_ERR(disk)) { 1316 zso->zvo_disk = NULL; 1317 blk_mq_free_tag_set(&zso->tag_set); 1318 return (1); 1319 } 1320 1321 zso->zvo_disk = disk; 1322 zso->zvo_queue = zso->zvo_disk->queue; 1323 zso->zvo_disk->minors = ZVOL_MINORS; 1324 #else 1325 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1326 if (zso->zvo_disk == NULL) { 1327 blk_cleanup_queue(zso->zvo_queue); 1328 blk_mq_free_tag_set(&zso->tag_set); 1329 return (1); 1330 } 1331 /* Allocate queue */ 1332 zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); 1333 if (IS_ERR(zso->zvo_queue)) { 1334 blk_mq_free_tag_set(&zso->tag_set); 1335 return (1); 1336 } 1337 1338 /* Our queue is now created, assign it to our disk */ 1339 zso->zvo_disk->queue = zso->zvo_queue; 1340 #endif 1341 1342 zvol_queue_limits_apply(limits, zso->zvo_queue); 1343 1344 return (0); 1345 } 1346 1347 /* 1348 * Allocate memory for a new zvol_state_t and setup the required 1349 * request queue and generic disk structures for the block device. 
1350 */ 1351 static zvol_state_t * 1352 zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize) 1353 { 1354 zvol_state_t *zv; 1355 struct zvol_state_os *zso; 1356 uint64_t volmode; 1357 int ret; 1358 1359 if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) 1360 return (NULL); 1361 1362 if (volmode == ZFS_VOLMODE_DEFAULT) 1363 volmode = zvol_volmode; 1364 1365 if (volmode == ZFS_VOLMODE_NONE) 1366 return (NULL); 1367 1368 zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); 1369 zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); 1370 zv->zv_zso = zso; 1371 zv->zv_volmode = volmode; 1372 zv->zv_volblocksize = volblocksize; 1373 1374 list_link_init(&zv->zv_next); 1375 mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); 1376 cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); 1377 1378 zv->zv_zso->use_blk_mq = zvol_use_blk_mq; 1379 1380 zvol_queue_limits_t limits; 1381 zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq); 1382 1383 /* 1384 * The block layer has 3 interfaces for getting BIOs: 1385 * 1386 * 1. blk-mq request queues (new) 1387 * 2. submit_bio() (oldest) 1388 * 3. regular request queues (old). 1389 * 1390 * Each of those interfaces has two permutations: 1391 * 1392 * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates 1393 * both the disk and its queue (5.14 kernel or newer) 1394 * 1395 * b) We don't have blk_*alloc_disk(), and have to allocate the 1396 * disk and the queue separately. (5.13 kernel or older) 1397 */ 1398 if (zv->zv_zso->use_blk_mq) { 1399 ret = zvol_alloc_blk_mq(zv, &limits); 1400 zso->zvo_disk->fops = &zvol_ops_blk_mq; 1401 } else { 1402 ret = zvol_alloc_non_blk_mq(zso, &limits); 1403 zso->zvo_disk->fops = &zvol_ops; 1404 } 1405 if (ret != 0) 1406 goto out_kmem; 1407 1408 /* Limit read-ahead to a single page to prevent over-prefetching. 
*/ 1409 blk_queue_set_read_ahead(zso->zvo_queue, 1); 1410 1411 if (!zv->zv_zso->use_blk_mq) { 1412 /* Disable write merging in favor of the ZIO pipeline. */ 1413 blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); 1414 } 1415 1416 zso->zvo_queue->queuedata = zv; 1417 zso->zvo_dev = dev; 1418 zv->zv_open_count = 0; 1419 strlcpy(zv->zv_name, name, sizeof (zv->zv_name)); 1420 1421 zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); 1422 rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); 1423 1424 zso->zvo_disk->major = zvol_major; 1425 zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; 1426 1427 /* 1428 * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. 1429 * This is accomplished by limiting the number of minors for the 1430 * device to one and explicitly disabling partition scanning. 1431 */ 1432 if (volmode == ZFS_VOLMODE_DEV) { 1433 zso->zvo_disk->minors = 1; 1434 zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT; 1435 zso->zvo_disk->flags |= GENHD_FL_NO_PART; 1436 } 1437 1438 zso->zvo_disk->first_minor = (dev & MINORMASK); 1439 zso->zvo_disk->private_data = zv; 1440 snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", 1441 ZVOL_DEV_NAME, (dev & MINORMASK)); 1442 1443 return (zv); 1444 1445 out_kmem: 1446 kmem_free(zso, sizeof (struct zvol_state_os)); 1447 kmem_free(zv, sizeof (zvol_state_t)); 1448 return (NULL); 1449 } 1450 1451 /* 1452 * Cleanup then free a zvol_state_t which was created by zvol_alloc(). 1453 * At this time, the structure is not opened by anyone, is taken off 1454 * the zvol_state_list, and has its private data set to NULL. 1455 * The zvol_state_lock is dropped. 1456 * 1457 * This function may take many milliseconds to complete (e.g. we've seen 1458 * it take over 256ms), due to the calls to "blk_cleanup_queue" and 1459 * "del_gendisk". Thus, consumers need to be careful to account for this 1460 * latency when calling this function. 
1461 */ 1462 void 1463 zvol_os_free(zvol_state_t *zv) 1464 { 1465 1466 ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); 1467 ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); 1468 ASSERT0(zv->zv_open_count); 1469 ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); 1470 1471 rw_destroy(&zv->zv_suspend_lock); 1472 zfs_rangelock_fini(&zv->zv_rangelock); 1473 1474 del_gendisk(zv->zv_zso->zvo_disk); 1475 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ 1476 (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) 1477 #if defined(HAVE_BLK_CLEANUP_DISK) 1478 blk_cleanup_disk(zv->zv_zso->zvo_disk); 1479 #else 1480 put_disk(zv->zv_zso->zvo_disk); 1481 #endif 1482 #else 1483 blk_cleanup_queue(zv->zv_zso->zvo_queue); 1484 put_disk(zv->zv_zso->zvo_disk); 1485 #endif 1486 1487 if (zv->zv_zso->use_blk_mq) 1488 blk_mq_free_tag_set(&zv->zv_zso->tag_set); 1489 1490 ida_simple_remove(&zvol_ida, 1491 MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); 1492 1493 cv_destroy(&zv->zv_removing_cv); 1494 mutex_destroy(&zv->zv_state_lock); 1495 dataset_kstats_destroy(&zv->zv_kstat); 1496 1497 kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); 1498 kmem_free(zv, sizeof (zvol_state_t)); 1499 } 1500 1501 void 1502 zvol_wait_close(zvol_state_t *zv) 1503 { 1504 } 1505 1506 struct add_disk_work { 1507 struct delayed_work work; 1508 struct gendisk *disk; 1509 int error; 1510 }; 1511 1512 static int 1513 __zvol_os_add_disk(struct gendisk *disk) 1514 { 1515 int error = 0; 1516 #ifdef HAVE_ADD_DISK_RET 1517 error = add_disk(disk); 1518 #else 1519 add_disk(disk); 1520 #endif 1521 return (error); 1522 } 1523 1524 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) 1525 static void 1526 zvol_os_add_disk_work(struct work_struct *work) 1527 { 1528 struct add_disk_work *add_disk_work; 1529 add_disk_work = container_of(work, struct add_disk_work, work.work); 1530 add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk); 1531 } 1532 #endif 1533 1534 /* 1535 * SPECIAL CASE: 1536 * 1537 * This 
 * function basically calls add_disk() from a workqueue.   You may be
 * thinking: why not just call add_disk() directly?
 *
 * When you call add_disk(), the zvol appears to the world.  When this happens,
 * the kernel calls disk_scan_partitions() on the zvol, which behaves
 * differently on the 6.9+ kernels:
 *
 *	- 6.8 and older kernels -
 *	disk_scan_partitions()
 *		handle = bdev_open_by_dev()
 *			zvol_open()
 *		bdev_release(handle);
 *			zvol_release()
 *
 *
 *	- 6.9+ kernels -
 *	disk_scan_partitions()
 *		file = bdev_file_open_by_dev()
 *			zvol_open()
 *		fput(file)
 *	< wait for return to userspace >
 *			zvol_release()
 *
 * The difference is that the bdev_release() from the 6.8 kernel is synchronous
 * while the fput() from the 6.9 kernel is async.  Or more specifically it's
 * async that has to wait until we return to userspace (since it adds the fput
 * into the caller's work queue with the TWA_RESUME flag set).  This is not the
 * behavior we want, since we want to do things like create+destroy a zvol
 * within a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release
 * the reference to the zvol while we're in the IOCTL, which can't wait until
 * we return to userspace.
 *
 * We can get around this since fput() has a special codepath for when it's
 * running in a kernel thread or interrupt.  In those cases, it just puts the
 * fput into the system workqueue, which we can force to run with
 * __flush_workqueue().  That is why we call add_disk() from a workqueue - so
 * it runs from a kernel thread and "tricks" the fput() codepaths.
 *
 * Note that __flush_workqueue() is slowly getting deprecated.
This may be ok 1576 * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via 1577 * fput) to happen, which it eventually, naturally, will from the system_wq 1578 * without us explicitly calling __flush_workqueue(). 1579 */ 1580 static int 1581 zvol_os_add_disk(struct gendisk *disk) 1582 { 1583 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) /* 6.9+ kernel */ 1584 struct add_disk_work add_disk_work; 1585 1586 INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work); 1587 add_disk_work.disk = disk; 1588 add_disk_work.error = 0; 1589 1590 /* Use *_delayed_work functions since they're not GPL'd */ 1591 schedule_delayed_work(&add_disk_work.work, 0); 1592 flush_delayed_work(&add_disk_work.work); 1593 1594 __flush_workqueue(system_wq); 1595 return (add_disk_work.error); 1596 #else /* <= 6.8 kernel */ 1597 return (__zvol_os_add_disk(disk)); 1598 #endif 1599 } 1600 1601 /* 1602 * Create a block device minor node and setup the linkage between it 1603 * and the specified volume. Once this function returns the block 1604 * device is live and ready for use. 
1605 */ 1606 int 1607 zvol_os_create_minor(const char *name) 1608 { 1609 zvol_state_t *zv; 1610 objset_t *os; 1611 dmu_object_info_t *doi; 1612 uint64_t volsize; 1613 uint64_t len; 1614 unsigned minor = 0; 1615 int error = 0; 1616 int idx; 1617 uint64_t hash = zvol_name_hash(name); 1618 uint64_t volthreading; 1619 bool replayed_zil = B_FALSE; 1620 1621 if (zvol_inhibit_dev) 1622 return (0); 1623 1624 idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); 1625 if (idx < 0) 1626 return (SET_ERROR(-idx)); 1627 minor = idx << ZVOL_MINOR_BITS; 1628 if (MINOR(minor) != minor) { 1629 /* too many partitions can cause an overflow */ 1630 zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", 1631 name, minor, MINOR(minor)); 1632 ida_simple_remove(&zvol_ida, idx); 1633 return (SET_ERROR(EINVAL)); 1634 } 1635 1636 zv = zvol_find_by_name_hash(name, hash, RW_NONE); 1637 if (zv) { 1638 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1639 mutex_exit(&zv->zv_state_lock); 1640 ida_simple_remove(&zvol_ida, idx); 1641 return (SET_ERROR(EEXIST)); 1642 } 1643 1644 doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); 1645 1646 error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); 1647 if (error) 1648 goto out_doi; 1649 1650 error = dmu_object_info(os, ZVOL_OBJ, doi); 1651 if (error) 1652 goto out_dmu_objset_disown; 1653 1654 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); 1655 if (error) 1656 goto out_dmu_objset_disown; 1657 1658 zv = zvol_alloc(MKDEV(zvol_major, minor), name, 1659 doi->doi_data_block_size); 1660 if (zv == NULL) { 1661 error = SET_ERROR(EAGAIN); 1662 goto out_dmu_objset_disown; 1663 } 1664 zv->zv_hash = hash; 1665 1666 if (dmu_objset_is_snapshot(os)) 1667 zv->zv_flags |= ZVOL_RDONLY; 1668 1669 zv->zv_volsize = volsize; 1670 zv->zv_objset = os; 1671 1672 /* Default */ 1673 zv->zv_threading = B_TRUE; 1674 if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL) 1675 == 0) 1676 zv->zv_threading = volthreading; 1677 
1678 set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); 1679 1680 #ifdef QUEUE_FLAG_DISCARD 1681 blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); 1682 #endif 1683 #ifdef QUEUE_FLAG_NONROT 1684 blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); 1685 #endif 1686 #ifdef QUEUE_FLAG_ADD_RANDOM 1687 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); 1688 #endif 1689 /* This flag was introduced in kernel version 4.12. */ 1690 #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH 1691 blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); 1692 #endif 1693 1694 ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); 1695 error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); 1696 if (error) 1697 goto out_dmu_objset_disown; 1698 ASSERT3P(zv->zv_zilog, ==, NULL); 1699 zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); 1700 if (spa_writeable(dmu_objset_spa(os))) { 1701 if (zil_replay_disable) 1702 replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); 1703 else 1704 replayed_zil = zil_replay(os, zv, zvol_replay_vector); 1705 } 1706 if (replayed_zil) 1707 zil_close(zv->zv_zilog); 1708 zv->zv_zilog = NULL; 1709 1710 /* 1711 * When udev detects the addition of the device it will immediately 1712 * invoke blkid(8) to determine the type of content on the device. 1713 * Prefetching the blocks commonly scanned by blkid(8) will speed 1714 * up this process. 1715 */ 1716 len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); 1717 if (len > 0) { 1718 dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); 1719 dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, 1720 ZIO_PRIORITY_SYNC_READ); 1721 } 1722 1723 zv->zv_objset = NULL; 1724 out_dmu_objset_disown: 1725 dmu_objset_disown(os, B_TRUE, FTAG); 1726 out_doi: 1727 kmem_free(doi, sizeof (dmu_object_info_t)); 1728 1729 /* 1730 * Keep in mind that once add_disk() is called, the zvol is 1731 * announced to the world, and zvol_open()/zvol_release() can 1732 * be called at any time. 
Incidentally, add_disk() itself calls 1733 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() 1734 * directly as well. 1735 */ 1736 if (error == 0) { 1737 rw_enter(&zvol_state_lock, RW_WRITER); 1738 zvol_insert(zv); 1739 rw_exit(&zvol_state_lock); 1740 error = zvol_os_add_disk(zv->zv_zso->zvo_disk); 1741 } else { 1742 ida_simple_remove(&zvol_ida, idx); 1743 } 1744 1745 return (error); 1746 } 1747 1748 void 1749 zvol_os_rename_minor(zvol_state_t *zv, const char *newname) 1750 { 1751 int readonly = get_disk_ro(zv->zv_zso->zvo_disk); 1752 1753 ASSERT(RW_LOCK_HELD(&zvol_state_lock)); 1754 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1755 1756 strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); 1757 1758 /* move to new hashtable entry */ 1759 zv->zv_hash = zvol_name_hash(newname); 1760 hlist_del(&zv->zv_hlink); 1761 hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); 1762 1763 /* 1764 * The block device's read-only state is briefly changed causing 1765 * a KOBJ_CHANGE uevent to be issued. This ensures udev detects 1766 * the name change and fixes the symlinks. This does not change 1767 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never 1768 * changes. This would normally be done using kobject_uevent() but 1769 * that is a GPL-only symbol which is why we need this workaround. 1770 */ 1771 set_disk_ro(zv->zv_zso->zvo_disk, !readonly); 1772 set_disk_ro(zv->zv_zso->zvo_disk, readonly); 1773 1774 dataset_kstats_rename(&zv->zv_kstat, newname); 1775 } 1776 1777 void 1778 zvol_os_set_disk_ro(zvol_state_t *zv, int flags) 1779 { 1780 1781 set_disk_ro(zv->zv_zso->zvo_disk, flags); 1782 } 1783 1784 void 1785 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) 1786 { 1787 1788 set_capacity(zv->zv_zso->zvo_disk, capacity); 1789 } 1790 1791 int 1792 zvol_init(void) 1793 { 1794 int error; 1795 1796 /* 1797 * zvol_threads is the module param the user passes in. 
1798 * 1799 * zvol_actual_threads is what we use internally, since the user can 1800 * pass zvol_thread = 0 to mean "use all the CPUs" (the default). 1801 */ 1802 static unsigned int zvol_actual_threads; 1803 1804 if (zvol_threads == 0) { 1805 /* 1806 * See dde9380a1 for why 32 was chosen here. This should 1807 * probably be refined to be some multiple of the number 1808 * of CPUs. 1809 */ 1810 zvol_actual_threads = MAX(num_online_cpus(), 32); 1811 } else { 1812 zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); 1813 } 1814 1815 /* 1816 * Use atleast 32 zvol_threads but for many core system, 1817 * prefer 6 threads per taskq, but no more taskqs 1818 * than threads in them on large systems. 1819 * 1820 * taskq total 1821 * cpus taskqs threads threads 1822 * ------- ------- ------- ------- 1823 * 1 1 32 32 1824 * 2 1 32 32 1825 * 4 1 32 32 1826 * 8 2 16 32 1827 * 16 3 11 33 1828 * 32 5 7 35 1829 * 64 8 8 64 1830 * 128 11 12 132 1831 * 256 16 16 256 1832 */ 1833 zv_taskq_t *ztqs = &zvol_taskqs; 1834 uint_t num_tqs = MIN(num_online_cpus(), zvol_num_taskqs); 1835 if (num_tqs == 0) { 1836 num_tqs = 1 + num_online_cpus() / 6; 1837 while (num_tqs * num_tqs > zvol_actual_threads) 1838 num_tqs--; 1839 } 1840 uint_t per_tq_thread = zvol_actual_threads / num_tqs; 1841 if (per_tq_thread * num_tqs < zvol_actual_threads) 1842 per_tq_thread++; 1843 ztqs->tqs_cnt = num_tqs; 1844 ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP); 1845 error = register_blkdev(zvol_major, ZVOL_DRIVER); 1846 if (error) { 1847 kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); 1848 ztqs->tqs_taskq = NULL; 1849 printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); 1850 return (error); 1851 } 1852 1853 if (zvol_blk_mq_queue_depth == 0) { 1854 zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; 1855 } else { 1856 zvol_actual_blk_mq_queue_depth = 1857 MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); 1858 } 1859 1860 if (zvol_blk_mq_threads == 0) { 1861 
zvol_blk_mq_actual_threads = num_online_cpus(); 1862 } else { 1863 zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1), 1864 1024); 1865 } 1866 1867 for (uint_t i = 0; i < num_tqs; i++) { 1868 char name[32]; 1869 (void) snprintf(name, sizeof (name), "%s_tq-%u", 1870 ZVOL_DRIVER, i); 1871 ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread, 1872 maxclsyspri, per_tq_thread, INT_MAX, 1873 TASKQ_PREPOPULATE | TASKQ_DYNAMIC); 1874 if (ztqs->tqs_taskq[i] == NULL) { 1875 for (int j = i - 1; j >= 0; j--) 1876 taskq_destroy(ztqs->tqs_taskq[j]); 1877 unregister_blkdev(zvol_major, ZVOL_DRIVER); 1878 kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * 1879 sizeof (taskq_t *)); 1880 ztqs->tqs_taskq = NULL; 1881 return (-ENOMEM); 1882 } 1883 } 1884 1885 zvol_init_impl(); 1886 ida_init(&zvol_ida); 1887 return (0); 1888 } 1889 1890 void 1891 zvol_fini(void) 1892 { 1893 zv_taskq_t *ztqs = &zvol_taskqs; 1894 zvol_fini_impl(); 1895 unregister_blkdev(zvol_major, ZVOL_DRIVER); 1896 1897 if (ztqs->tqs_taskq == NULL) { 1898 ASSERT3U(ztqs->tqs_cnt, ==, 0); 1899 } else { 1900 for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { 1901 ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); 1902 taskq_destroy(ztqs->tqs_taskq[i]); 1903 } 1904 kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * 1905 sizeof (taskq_t *)); 1906 ztqs->tqs_taskq = NULL; 1907 } 1908 1909 ida_destroy(&zvol_ida); 1910 } 1911 1912 module_param(zvol_inhibit_dev, uint, 0644); 1913 MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); 1914 1915 module_param(zvol_major, uint, 0444); 1916 MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); 1917 1918 module_param(zvol_threads, uint, 0444); 1919 MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. 
Set" 1920 "to 0 to use all active CPUs"); 1921 1922 module_param(zvol_request_sync, uint, 0644); 1923 MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); 1924 1925 module_param(zvol_max_discard_blocks, ulong, 0444); 1926 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); 1927 1928 module_param(zvol_num_taskqs, uint, 0444); 1929 MODULE_PARM_DESC(zvol_num_taskqs, "Number of zvol taskqs"); 1930 1931 module_param(zvol_prefetch_bytes, uint, 0644); 1932 MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); 1933 1934 module_param(zvol_volmode, uint, 0644); 1935 MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); 1936 1937 module_param(zvol_blk_mq_queue_depth, uint, 0644); 1938 MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); 1939 1940 module_param(zvol_use_blk_mq, uint, 0644); 1941 MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); 1942 1943 module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); 1944 MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, 1945 "Process volblocksize blocks per thread"); 1946 1947 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 1948 module_param(zvol_open_timeout_ms, uint, 0644); 1949 MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); 1950 #endif 1951