1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2012, 2020 by Delphix. All rights reserved. 23 */ 24 25 #include <sys/dataset_kstats.h> 26 #include <sys/dbuf.h> 27 #include <sys/dmu_traverse.h> 28 #include <sys/dsl_dataset.h> 29 #include <sys/dsl_prop.h> 30 #include <sys/dsl_dir.h> 31 #include <sys/zap.h> 32 #include <sys/zfeature.h> 33 #include <sys/zil_impl.h> 34 #include <sys/dmu_tx.h> 35 #include <sys/zio.h> 36 #include <sys/zfs_rlock.h> 37 #include <sys/spa_impl.h> 38 #include <sys/zvol.h> 39 #include <sys/zvol_impl.h> 40 41 #include <linux/blkdev_compat.h> 42 #include <linux/task_io_accounting_ops.h> 43 44 #ifdef HAVE_BLK_MQ 45 #include <linux/blk-mq.h> 46 #endif 47 48 static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, 49 struct request *rq, boolean_t force_sync); 50 51 static unsigned int zvol_major = ZVOL_MAJOR; 52 static unsigned int zvol_request_sync = 0; 53 static unsigned int zvol_prefetch_bytes = (128 * 1024); 54 static unsigned long zvol_max_discard_blocks = 16384; 55 56 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 57 static unsigned int zvol_open_timeout_ms = 1000; 58 #endif 59 60 static unsigned int zvol_threads = 0; 61 #ifdef HAVE_BLK_MQ 62 static unsigned int zvol_blk_mq_threads = 0; 63 static unsigned int zvol_blk_mq_actual_threads; 64 static boolean_t zvol_use_blk_mq = B_FALSE; 65 66 /* 67 * The maximum number of volblocksize blocks to process per thread. Typically, 68 * write heavy workloads preform better with higher values here, and read 69 * heavy workloads preform better with lower values, but that's not a hard 70 * and fast rule. It's basically a knob to tune between "less overhead with 71 * less parallelism" and "more overhead, but more parallelism". 72 * 73 * '8' was chosen as a reasonable, balanced, default based off of sequential 74 * read and write tests to a zvol in an NVMe pool (with 16 CPUs). 75 */ 76 static unsigned int zvol_blk_mq_blocks_per_thread = 8; 77 #endif 78 79 #ifndef BLKDEV_DEFAULT_RQ 80 /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ 81 #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ 82 #endif 83 84 /* 85 * Finalize our BIO or request. 86 */ 87 #ifdef HAVE_BLK_MQ 88 #define END_IO(zv, bio, rq, error) do { \ 89 if (bio) { \ 90 BIO_END_IO(bio, error); \ 91 } else { \ 92 blk_mq_end_request(rq, errno_to_bi_status(error)); \ 93 } \ 94 } while (0) 95 #else 96 #define END_IO(zv, bio, rq, error) BIO_END_IO(bio, error) 97 #endif 98 99 #ifdef HAVE_BLK_MQ 100 static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; 101 static unsigned int zvol_actual_blk_mq_queue_depth; 102 #endif 103 104 struct zvol_state_os { 105 struct gendisk *zvo_disk; /* generic disk */ 106 struct request_queue *zvo_queue; /* request queue */ 107 dev_t zvo_dev; /* device id */ 108 109 #ifdef HAVE_BLK_MQ 110 struct blk_mq_tag_set tag_set; 111 #endif 112 113 /* Set from the global 'zvol_use_blk_mq' at zvol load */ 114 boolean_t use_blk_mq; 115 }; 116 117 static taskq_t *zvol_taskq; 118 static struct ida zvol_ida; 119 120 typedef struct zv_request_stack { 121 zvol_state_t *zv; 122 struct bio *bio; 123 struct request *rq; 124 } zv_request_t; 125 126 typedef struct zv_work { 127 struct request *rq; 128 struct work_struct work; 129 } zv_work_t; 130 131 typedef struct zv_request_task { 132 zv_request_t zvr; 133 taskq_ent_t ent; 134 } zv_request_task_t; 135 136 static zv_request_task_t * 137 zv_request_task_create(zv_request_t zvr) 138 { 139 zv_request_task_t *task; 140 task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); 141 taskq_init_ent(&task->ent); 142 task->zvr = zvr; 143 return (task); 144 } 145 146 static void 147 zv_request_task_free(zv_request_task_t *task) 148 { 149 kmem_free(task, sizeof (*task)); 150 } 151 152 #ifdef HAVE_BLK_MQ 153 154 /* 155 * This is called when a new block multiqueue request comes in. A request 156 * contains one or more BIOs. 157 */ 158 static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, 159 const struct blk_mq_queue_data *bd) 160 { 161 struct request *rq = bd->rq; 162 zvol_state_t *zv = rq->q->queuedata; 163 164 /* Tell the kernel that we are starting to process this request */ 165 blk_mq_start_request(rq); 166 167 if (blk_rq_is_passthrough(rq)) { 168 /* Skip non filesystem request */ 169 blk_mq_end_request(rq, BLK_STS_IOERR); 170 return (BLK_STS_IOERR); 171 } 172 173 zvol_request_impl(zv, NULL, rq, 0); 174 175 /* Acknowledge to the kernel that we got this request */ 176 return (BLK_STS_OK); 177 } 178 179 static struct blk_mq_ops zvol_blk_mq_queue_ops = { 180 .queue_rq = zvol_mq_queue_rq, 181 }; 182 183 /* Initialize our blk-mq struct */ 184 static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) 185 { 186 struct zvol_state_os *zso = zv->zv_zso; 187 188 memset(&zso->tag_set, 0, sizeof (zso->tag_set)); 189 190 /* Initialize tag set. */ 191 zso->tag_set.ops = &zvol_blk_mq_queue_ops; 192 zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads; 193 zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; 194 zso->tag_set.numa_node = NUMA_NO_NODE; 195 zso->tag_set.cmd_size = 0; 196 197 /* 198 * We need BLK_MQ_F_BLOCKING here since we do blocking calls in 199 * zvol_request_impl() 200 */ 201 zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; 202 zso->tag_set.driver_data = zv; 203 204 return (blk_mq_alloc_tag_set(&zso->tag_set)); 205 } 206 #endif /* HAVE_BLK_MQ */ 207 208 /* 209 * Given a path, return TRUE if path is a ZVOL. 210 */ 211 boolean_t 212 zvol_os_is_zvol(const char *path) 213 { 214 dev_t dev = 0; 215 216 if (vdev_lookup_bdev(path, &dev) != 0) 217 return (B_FALSE); 218 219 if (MAJOR(dev) == zvol_major) 220 return (B_TRUE); 221 222 return (B_FALSE); 223 } 224 225 static void 226 zvol_write(zv_request_t *zvr) 227 { 228 struct bio *bio = zvr->bio; 229 struct request *rq = zvr->rq; 230 int error = 0; 231 zfs_uio_t uio; 232 zvol_state_t *zv = zvr->zv; 233 struct request_queue *q; 234 struct gendisk *disk; 235 unsigned long start_time = 0; 236 boolean_t acct = B_FALSE; 237 238 ASSERT3P(zv, !=, NULL); 239 ASSERT3U(zv->zv_open_count, >, 0); 240 ASSERT3P(zv->zv_zilog, !=, NULL); 241 242 q = zv->zv_zso->zvo_queue; 243 disk = zv->zv_zso->zvo_disk; 244 245 /* bio marked as FLUSH need to flush before write */ 246 if (io_is_flush(bio, rq)) 247 zil_commit(zv->zv_zilog, ZVOL_OBJ); 248 249 /* Some requests are just for flush and nothing else. */ 250 if (io_size(bio, rq) == 0) { 251 rw_exit(&zv->zv_suspend_lock); 252 END_IO(zv, bio, rq, 0); 253 return; 254 } 255 256 zfs_uio_bvec_init(&uio, bio, rq); 257 258 ssize_t start_resid = uio.uio_resid; 259 260 /* 261 * With use_blk_mq, accounting is done by blk_mq_start_request() 262 * and blk_mq_end_request(), so we can skip it here. 263 */ 264 if (bio) { 265 acct = blk_queue_io_stat(q); 266 if (acct) { 267 start_time = blk_generic_start_io_acct(q, disk, WRITE, 268 bio); 269 } 270 } 271 272 boolean_t sync = 273 io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; 274 275 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 276 uio.uio_loffset, uio.uio_resid, RL_WRITER); 277 278 uint64_t volsize = zv->zv_volsize; 279 while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { 280 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); 281 uint64_t off = uio.uio_loffset; 282 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); 283 284 if (bytes > volsize - off) /* don't write past the end */ 285 bytes = volsize - off; 286 287 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); 288 289 /* This will only fail for ENOSPC */ 290 error = dmu_tx_assign(tx, TXG_WAIT); 291 if (error) { 292 dmu_tx_abort(tx); 293 break; 294 } 295 error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); 296 if (error == 0) { 297 zvol_log_write(zv, tx, off, bytes, sync); 298 } 299 dmu_tx_commit(tx); 300 301 if (error) 302 break; 303 } 304 zfs_rangelock_exit(lr); 305 306 int64_t nwritten = start_resid - uio.uio_resid; 307 dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); 308 task_io_account_write(nwritten); 309 310 if (sync) 311 zil_commit(zv->zv_zilog, ZVOL_OBJ); 312 313 rw_exit(&zv->zv_suspend_lock); 314 315 if (bio && acct) { 316 blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); 317 } 318 319 END_IO(zv, bio, rq, -error); 320 } 321 322 static void 323 zvol_write_task(void *arg) 324 { 325 zv_request_task_t *task = arg; 326 zvol_write(&task->zvr); 327 zv_request_task_free(task); 328 } 329 330 static void 331 zvol_discard(zv_request_t *zvr) 332 { 333 struct bio *bio = zvr->bio; 334 struct request *rq = zvr->rq; 335 zvol_state_t *zv = zvr->zv; 336 uint64_t start = io_offset(bio, rq); 337 uint64_t size = io_size(bio, rq); 338 uint64_t end = start + size; 339 boolean_t sync; 340 int error = 0; 341 dmu_tx_t *tx; 342 struct request_queue *q = zv->zv_zso->zvo_queue; 343 struct gendisk *disk = zv->zv_zso->zvo_disk; 344 unsigned long start_time = 0; 345 boolean_t acct = B_FALSE; 346 347 ASSERT3P(zv, !=, NULL); 348 ASSERT3U(zv->zv_open_count, >, 0); 349 ASSERT3P(zv->zv_zilog, !=, NULL); 350 351 if (bio) { 352 acct = blk_queue_io_stat(q); 353 if (acct) { 354 start_time = blk_generic_start_io_acct(q, disk, WRITE, 355 bio); 356 } 357 } 358 359 sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; 360 361 if (end > zv->zv_volsize) { 362 error = SET_ERROR(EIO); 363 goto unlock; 364 } 365 366 /* 367 * Align the request to volume block boundaries when a secure erase is 368 * not required. This will prevent dnode_free_range() from zeroing out 369 * the unaligned parts which is slow (read-modify-write) and useless 370 * since we are not freeing any space by doing so. 371 */ 372 if (!io_is_secure_erase(bio, rq)) { 373 start = P2ROUNDUP(start, zv->zv_volblocksize); 374 end = P2ALIGN(end, zv->zv_volblocksize); 375 size = end - start; 376 } 377 378 if (start >= end) 379 goto unlock; 380 381 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 382 start, size, RL_WRITER); 383 384 tx = dmu_tx_create(zv->zv_objset); 385 dmu_tx_mark_netfree(tx); 386 error = dmu_tx_assign(tx, TXG_WAIT); 387 if (error != 0) { 388 dmu_tx_abort(tx); 389 } else { 390 zvol_log_truncate(zv, tx, start, size, B_TRUE); 391 dmu_tx_commit(tx); 392 error = dmu_free_long_range(zv->zv_objset, 393 ZVOL_OBJ, start, size); 394 } 395 zfs_rangelock_exit(lr); 396 397 if (error == 0 && sync) 398 zil_commit(zv->zv_zilog, ZVOL_OBJ); 399 400 unlock: 401 rw_exit(&zv->zv_suspend_lock); 402 403 if (bio && acct) { 404 blk_generic_end_io_acct(q, disk, WRITE, bio, 405 start_time); 406 } 407 408 END_IO(zv, bio, rq, -error); 409 } 410 411 static void 412 zvol_discard_task(void *arg) 413 { 414 zv_request_task_t *task = arg; 415 zvol_discard(&task->zvr); 416 zv_request_task_free(task); 417 } 418 419 static void 420 zvol_read(zv_request_t *zvr) 421 { 422 struct bio *bio = zvr->bio; 423 struct request *rq = zvr->rq; 424 int error = 0; 425 zfs_uio_t uio; 426 boolean_t acct = B_FALSE; 427 zvol_state_t *zv = zvr->zv; 428 struct request_queue *q; 429 struct gendisk *disk; 430 unsigned long start_time = 0; 431 432 ASSERT3P(zv, !=, NULL); 433 ASSERT3U(zv->zv_open_count, >, 0); 434 435 zfs_uio_bvec_init(&uio, bio, rq); 436 437 q = zv->zv_zso->zvo_queue; 438 disk = zv->zv_zso->zvo_disk; 439 440 ssize_t start_resid = uio.uio_resid; 441 442 /* 443 * When blk-mq is being used, accounting is done by 444 * blk_mq_start_request() and blk_mq_end_request(). 445 */ 446 if (bio) { 447 acct = blk_queue_io_stat(q); 448 if (acct) 449 start_time = blk_generic_start_io_acct(q, disk, READ, 450 bio); 451 } 452 453 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 454 uio.uio_loffset, uio.uio_resid, RL_READER); 455 456 uint64_t volsize = zv->zv_volsize; 457 458 while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { 459 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); 460 461 /* don't read past the end */ 462 if (bytes > volsize - uio.uio_loffset) 463 bytes = volsize - uio.uio_loffset; 464 465 error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); 466 if (error) { 467 /* convert checksum errors into IO errors */ 468 if (error == ECKSUM) 469 error = SET_ERROR(EIO); 470 break; 471 } 472 } 473 zfs_rangelock_exit(lr); 474 475 int64_t nread = start_resid - uio.uio_resid; 476 dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); 477 task_io_account_read(nread); 478 479 rw_exit(&zv->zv_suspend_lock); 480 481 if (bio && acct) { 482 blk_generic_end_io_acct(q, disk, READ, bio, start_time); 483 } 484 485 END_IO(zv, bio, rq, -error); 486 } 487 488 static void 489 zvol_read_task(void *arg) 490 { 491 zv_request_task_t *task = arg; 492 zvol_read(&task->zvr); 493 zv_request_task_free(task); 494 } 495 496 497 /* 498 * Process a BIO or request 499 * 500 * Either 'bio' or 'rq' should be set depending on if we are processing a 501 * bio or a request (both should not be set). 502 * 503 * force_sync: Set to 0 to defer processing to a background taskq 504 * Set to 1 to process data synchronously 505 */ 506 static void 507 zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, 508 boolean_t force_sync) 509 { 510 fstrans_cookie_t cookie = spl_fstrans_mark(); 511 uint64_t offset = io_offset(bio, rq); 512 uint64_t size = io_size(bio, rq); 513 int rw = io_data_dir(bio, rq); 514 515 if (zvol_request_sync) 516 force_sync = 1; 517 518 zv_request_t zvr = { 519 .zv = zv, 520 .bio = bio, 521 .rq = rq, 522 }; 523 524 if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { 525 printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", 526 zv->zv_zso->zvo_disk->disk_name, 527 (long long unsigned)offset, 528 (long unsigned)size); 529 530 END_IO(zv, bio, rq, -SET_ERROR(EIO)); 531 goto out; 532 } 533 534 zv_request_task_t *task; 535 536 if (rw == WRITE) { 537 if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { 538 END_IO(zv, bio, rq, -SET_ERROR(EROFS)); 539 goto out; 540 } 541 542 /* 543 * Prevents the zvol from being suspended, or the ZIL being 544 * concurrently opened. Will be released after the i/o 545 * completes. 546 */ 547 rw_enter(&zv->zv_suspend_lock, RW_READER); 548 549 /* 550 * Open a ZIL if this is the first time we have written to this 551 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather 552 * than zv_state_lock so that we don't need to acquire an 553 * additional lock in this path. 554 */ 555 if (zv->zv_zilog == NULL) { 556 rw_exit(&zv->zv_suspend_lock); 557 rw_enter(&zv->zv_suspend_lock, RW_WRITER); 558 if (zv->zv_zilog == NULL) { 559 zv->zv_zilog = zil_open(zv->zv_objset, 560 zvol_get_data, &zv->zv_kstat.dk_zil_sums); 561 zv->zv_flags |= ZVOL_WRITTEN_TO; 562 /* replay / destroy done in zvol_create_minor */ 563 VERIFY0((zv->zv_zilog->zl_header->zh_flags & 564 ZIL_REPLAY_NEEDED)); 565 } 566 rw_downgrade(&zv->zv_suspend_lock); 567 } 568 569 /* 570 * We don't want this thread to be blocked waiting for i/o to 571 * complete, so we instead wait from a taskq callback. The 572 * i/o may be a ZIL write (via zil_commit()), or a read of an 573 * indirect block, or a read of a data block (if this is a 574 * partial-block write). We will indicate that the i/o is 575 * complete by calling END_IO() from the taskq callback. 576 * 577 * This design allows the calling thread to continue and 578 * initiate more concurrent operations by calling 579 * zvol_request() again. There are typically only a small 580 * number of threads available to call zvol_request() (e.g. 581 * one per iSCSI target), so keeping the latency of 582 * zvol_request() low is important for performance. 583 * 584 * The zvol_request_sync module parameter allows this 585 * behavior to be altered, for performance evaluation 586 * purposes. If the callback blocks, setting 587 * zvol_request_sync=1 will result in much worse performance. 588 * 589 * We can have up to zvol_threads concurrent i/o's being 590 * processed for all zvols on the system. This is typically 591 * a vast improvement over the zvol_request_sync=1 behavior 592 * of one i/o at a time per zvol. However, an even better 593 * design would be for zvol_request() to initiate the zio 594 * directly, and then be notified by the zio_done callback, 595 * which would call END_IO(). Unfortunately, the DMU/ZIL 596 * interfaces lack this functionality (they block waiting for 597 * the i/o to complete). 598 */ 599 if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { 600 if (force_sync) { 601 zvol_discard(&zvr); 602 } else { 603 task = zv_request_task_create(zvr); 604 taskq_dispatch_ent(zvol_taskq, 605 zvol_discard_task, task, 0, &task->ent); 606 } 607 } else { 608 if (force_sync) { 609 zvol_write(&zvr); 610 } else { 611 task = zv_request_task_create(zvr); 612 taskq_dispatch_ent(zvol_taskq, 613 zvol_write_task, task, 0, &task->ent); 614 } 615 } 616 } else { 617 /* 618 * The SCST driver, and possibly others, may issue READ I/Os 619 * with a length of zero bytes. These empty I/Os contain no 620 * data and require no additional handling. 621 */ 622 if (size == 0) { 623 END_IO(zv, bio, rq, 0); 624 goto out; 625 } 626 627 rw_enter(&zv->zv_suspend_lock, RW_READER); 628 629 /* See comment in WRITE case above. */ 630 if (force_sync) { 631 zvol_read(&zvr); 632 } else { 633 task = zv_request_task_create(zvr); 634 taskq_dispatch_ent(zvol_taskq, 635 zvol_read_task, task, 0, &task->ent); 636 } 637 } 638 639 out: 640 spl_fstrans_unmark(cookie); 641 } 642 643 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 644 #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID 645 static void 646 zvol_submit_bio(struct bio *bio) 647 #else 648 static blk_qc_t 649 zvol_submit_bio(struct bio *bio) 650 #endif 651 #else 652 static MAKE_REQUEST_FN_RET 653 zvol_request(struct request_queue *q, struct bio *bio) 654 #endif 655 { 656 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 657 #if defined(HAVE_BIO_BDEV_DISK) 658 struct request_queue *q = bio->bi_bdev->bd_disk->queue; 659 #else 660 struct request_queue *q = bio->bi_disk->queue; 661 #endif 662 #endif 663 zvol_state_t *zv = q->queuedata; 664 665 zvol_request_impl(zv, bio, NULL, 0); 666 #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ 667 defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ 668 !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) 669 return (BLK_QC_T_NONE); 670 #endif 671 } 672 673 static int 674 #ifdef HAVE_BLK_MODE_T 675 zvol_open(struct gendisk *disk, blk_mode_t flag) 676 #else 677 zvol_open(struct block_device *bdev, fmode_t flag) 678 #endif 679 { 680 zvol_state_t *zv; 681 int error = 0; 682 boolean_t drop_suspend = B_FALSE; 683 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 684 hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); 685 hrtime_t start = gethrtime(); 686 687 retry: 688 #endif 689 rw_enter(&zvol_state_lock, RW_READER); 690 /* 691 * Obtain a copy of private_data under the zvol_state_lock to make 692 * sure that either the result of zvol free code path setting 693 * disk->private_data to NULL is observed, or zvol_os_free() 694 * is not called on this zv because of the positive zv_open_count. 695 */ 696 #ifdef HAVE_BLK_MODE_T 697 zv = disk->private_data; 698 #else 699 zv = bdev->bd_disk->private_data; 700 #endif 701 if (zv == NULL) { 702 rw_exit(&zvol_state_lock); 703 return (SET_ERROR(-ENXIO)); 704 } 705 706 mutex_enter(&zv->zv_state_lock); 707 /* 708 * Make sure zvol is not suspended during first open 709 * (hold zv_suspend_lock) and respect proper lock acquisition 710 * ordering - zv_suspend_lock before zv_state_lock 711 */ 712 if (zv->zv_open_count == 0) { 713 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { 714 mutex_exit(&zv->zv_state_lock); 715 rw_enter(&zv->zv_suspend_lock, RW_READER); 716 mutex_enter(&zv->zv_state_lock); 717 /* check to see if zv_suspend_lock is needed */ 718 if (zv->zv_open_count != 0) { 719 rw_exit(&zv->zv_suspend_lock); 720 } else { 721 drop_suspend = B_TRUE; 722 } 723 } else { 724 drop_suspend = B_TRUE; 725 } 726 } 727 rw_exit(&zvol_state_lock); 728 729 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 730 731 if (zv->zv_open_count == 0) { 732 boolean_t drop_namespace = B_FALSE; 733 734 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); 735 736 /* 737 * In all other call paths the spa_namespace_lock is taken 738 * before the bdev->bd_mutex lock. However, on open(2) 739 * the __blkdev_get() function calls fops->open() with the 740 * bdev->bd_mutex lock held. This can result in a deadlock 741 * when zvols from one pool are used as vdevs in another. 742 * 743 * To prevent a lock inversion deadlock we preemptively 744 * take the spa_namespace_lock. Normally the lock will not 745 * be contended and this is safe because spa_open_common() 746 * handles the case where the caller already holds the 747 * spa_namespace_lock. 748 * 749 * When the lock cannot be aquired after multiple retries 750 * this must be the vdev on zvol deadlock case and we have 751 * no choice but to return an error. For 5.12 and older 752 * kernels returning -ERESTARTSYS will result in the 753 * bdev->bd_mutex being dropped, then reacquired, and 754 * fops->open() being called again. This process can be 755 * repeated safely until both locks are acquired. For 5.13 756 * and newer the -ERESTARTSYS retry logic was removed from 757 * the kernel so the only option is to return the error for 758 * the caller to handle it. 759 */ 760 if (!mutex_owned(&spa_namespace_lock)) { 761 if (!mutex_tryenter(&spa_namespace_lock)) { 762 mutex_exit(&zv->zv_state_lock); 763 rw_exit(&zv->zv_suspend_lock); 764 765 #ifdef HAVE_BLKDEV_GET_ERESTARTSYS 766 schedule(); 767 return (SET_ERROR(-ERESTARTSYS)); 768 #else 769 if ((gethrtime() - start) > timeout) 770 return (SET_ERROR(-ERESTARTSYS)); 771 772 schedule_timeout(MSEC_TO_TICK(10)); 773 goto retry; 774 #endif 775 } else { 776 drop_namespace = B_TRUE; 777 } 778 } 779 780 error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); 781 782 if (drop_namespace) 783 mutex_exit(&spa_namespace_lock); 784 } 785 786 if (error == 0) { 787 if ((blk_mode_is_open_write(flag)) && 788 (zv->zv_flags & ZVOL_RDONLY)) { 789 if (zv->zv_open_count == 0) 790 zvol_last_close(zv); 791 792 error = SET_ERROR(-EROFS); 793 } else { 794 zv->zv_open_count++; 795 } 796 } 797 798 mutex_exit(&zv->zv_state_lock); 799 if (drop_suspend) 800 rw_exit(&zv->zv_suspend_lock); 801 802 if (error == 0) 803 #ifdef HAVE_BLK_MODE_T 804 disk_check_media_change(disk); 805 #else 806 zfs_check_media_change(bdev); 807 #endif 808 809 return (error); 810 } 811 812 static void 813 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG 814 zvol_release(struct gendisk *disk) 815 #else 816 zvol_release(struct gendisk *disk, fmode_t unused) 817 #endif 818 { 819 #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) 820 (void) unused; 821 #endif 822 zvol_state_t *zv; 823 boolean_t drop_suspend = B_TRUE; 824 825 rw_enter(&zvol_state_lock, RW_READER); 826 zv = disk->private_data; 827 828 mutex_enter(&zv->zv_state_lock); 829 ASSERT3U(zv->zv_open_count, >, 0); 830 /* 831 * make sure zvol is not suspended during last close 832 * (hold zv_suspend_lock) and respect proper lock acquisition 833 * ordering - zv_suspend_lock before zv_state_lock 834 */ 835 if (zv->zv_open_count == 1) { 836 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { 837 mutex_exit(&zv->zv_state_lock); 838 rw_enter(&zv->zv_suspend_lock, RW_READER); 839 mutex_enter(&zv->zv_state_lock); 840 /* check to see if zv_suspend_lock is needed */ 841 if (zv->zv_open_count != 1) { 842 rw_exit(&zv->zv_suspend_lock); 843 drop_suspend = B_FALSE; 844 } 845 } 846 } else { 847 drop_suspend = B_FALSE; 848 } 849 rw_exit(&zvol_state_lock); 850 851 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 852 853 zv->zv_open_count--; 854 if (zv->zv_open_count == 0) { 855 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); 856 zvol_last_close(zv); 857 } 858 859 mutex_exit(&zv->zv_state_lock); 860 861 if (drop_suspend) 862 rw_exit(&zv->zv_suspend_lock); 863 } 864 865 static int 866 zvol_ioctl(struct block_device *bdev, fmode_t mode, 867 unsigned int cmd, unsigned long arg) 868 { 869 zvol_state_t *zv = bdev->bd_disk->private_data; 870 int error = 0; 871 872 ASSERT3U(zv->zv_open_count, >, 0); 873 874 switch (cmd) { 875 case BLKFLSBUF: 876 fsync_bdev(bdev); 877 invalidate_bdev(bdev); 878 rw_enter(&zv->zv_suspend_lock, RW_READER); 879 880 if (!(zv->zv_flags & ZVOL_RDONLY)) 881 txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); 882 883 rw_exit(&zv->zv_suspend_lock); 884 break; 885 886 case BLKZNAME: 887 mutex_enter(&zv->zv_state_lock); 888 error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); 889 mutex_exit(&zv->zv_state_lock); 890 break; 891 892 default: 893 error = -ENOTTY; 894 break; 895 } 896 897 return (SET_ERROR(error)); 898 } 899 900 #ifdef CONFIG_COMPAT 901 static int 902 zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, 903 unsigned cmd, unsigned long arg) 904 { 905 return (zvol_ioctl(bdev, mode, cmd, arg)); 906 } 907 #else 908 #define zvol_compat_ioctl NULL 909 #endif 910 911 static unsigned int 912 zvol_check_events(struct gendisk *disk, unsigned int clearing) 913 { 914 unsigned int mask = 0; 915 916 rw_enter(&zvol_state_lock, RW_READER); 917 918 zvol_state_t *zv = disk->private_data; 919 if (zv != NULL) { 920 mutex_enter(&zv->zv_state_lock); 921 mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; 922 zv->zv_changed = 0; 923 mutex_exit(&zv->zv_state_lock); 924 } 925 926 rw_exit(&zvol_state_lock); 927 928 return (mask); 929 } 930 931 static int 932 zvol_revalidate_disk(struct gendisk *disk) 933 { 934 rw_enter(&zvol_state_lock, RW_READER); 935 936 zvol_state_t *zv = disk->private_data; 937 if (zv != NULL) { 938 mutex_enter(&zv->zv_state_lock); 939 set_capacity(zv->zv_zso->zvo_disk, 940 zv->zv_volsize >> SECTOR_BITS); 941 mutex_exit(&zv->zv_state_lock); 942 } 943 944 rw_exit(&zvol_state_lock); 945 946 return (0); 947 } 948 949 int 950 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) 951 { 952 struct gendisk *disk = zv->zv_zso->zvo_disk; 953 954 #if defined(HAVE_REVALIDATE_DISK_SIZE) 955 revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); 956 #elif defined(HAVE_REVALIDATE_DISK) 957 revalidate_disk(disk); 958 #else 959 zvol_revalidate_disk(disk); 960 #endif 961 return (0); 962 } 963 964 void 965 zvol_os_clear_private(zvol_state_t *zv) 966 { 967 /* 968 * Cleared while holding zvol_state_lock as a writer 969 * which will prevent zvol_open() from opening it. 970 */ 971 zv->zv_zso->zvo_disk->private_data = NULL; 972 } 973 974 /* 975 * Provide a simple virtual geometry for legacy compatibility. For devices 976 * smaller than 1 MiB a small head and sector count is used to allow very 977 * tiny devices. For devices over 1 Mib a standard head and sector count 978 * is used to keep the cylinders count reasonable. 979 */ 980 static int 981 zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) 982 { 983 zvol_state_t *zv = bdev->bd_disk->private_data; 984 sector_t sectors; 985 986 ASSERT3U(zv->zv_open_count, >, 0); 987 988 sectors = get_capacity(zv->zv_zso->zvo_disk); 989 990 if (sectors > 2048) { 991 geo->heads = 16; 992 geo->sectors = 63; 993 } else { 994 geo->heads = 2; 995 geo->sectors = 4; 996 } 997 998 geo->start = 0; 999 geo->cylinders = sectors / (geo->heads * geo->sectors); 1000 1001 return (0); 1002 } 1003 1004 /* 1005 * Why have two separate block_device_operations structs? 1006 * 1007 * Normally we'd just have one, and assign 'submit_bio' as needed. However, 1008 * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we 1009 * can't just change submit_bio dynamically at runtime. So just create two 1010 * separate structs to get around this. 1011 */ 1012 static const struct block_device_operations zvol_ops_blk_mq = { 1013 .open = zvol_open, 1014 .release = zvol_release, 1015 .ioctl = zvol_ioctl, 1016 .compat_ioctl = zvol_compat_ioctl, 1017 .check_events = zvol_check_events, 1018 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK 1019 .revalidate_disk = zvol_revalidate_disk, 1020 #endif 1021 .getgeo = zvol_getgeo, 1022 .owner = THIS_MODULE, 1023 }; 1024 1025 static const struct block_device_operations zvol_ops = { 1026 .open = zvol_open, 1027 .release = zvol_release, 1028 .ioctl = zvol_ioctl, 1029 .compat_ioctl = zvol_compat_ioctl, 1030 .check_events = zvol_check_events, 1031 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK 1032 .revalidate_disk = zvol_revalidate_disk, 1033 #endif 1034 .getgeo = zvol_getgeo, 1035 .owner = THIS_MODULE, 1036 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 1037 .submit_bio = zvol_submit_bio, 1038 #endif 1039 }; 1040 1041 static int 1042 zvol_alloc_non_blk_mq(struct zvol_state_os *zso) 1043 { 1044 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) 1045 #if defined(HAVE_BLK_ALLOC_DISK) 1046 zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); 1047 if (zso->zvo_disk == NULL) 1048 return (1); 1049 1050 zso->zvo_disk->minors = ZVOL_MINORS; 1051 zso->zvo_queue = zso->zvo_disk->queue; 1052 #else 1053 zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); 1054 if (zso->zvo_queue == NULL) 1055 return (1); 1056 1057 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1058 if (zso->zvo_disk == NULL) { 1059 blk_cleanup_queue(zso->zvo_queue); 1060 return (1); 1061 } 1062 1063 zso->zvo_disk->queue = zso->zvo_queue; 1064 #endif /* HAVE_BLK_ALLOC_DISK */ 1065 #else 1066 zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); 1067 if (zso->zvo_queue == NULL) 1068 return (1); 1069 1070 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1071 if (zso->zvo_disk == NULL) { 1072 blk_cleanup_queue(zso->zvo_queue); 1073 return (1); 1074 } 1075 1076 zso->zvo_disk->queue = zso->zvo_queue; 1077 #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ 1078 return (0); 1079 1080 } 1081 1082 static int 1083 zvol_alloc_blk_mq(zvol_state_t *zv) 1084 { 1085 #ifdef HAVE_BLK_MQ 1086 struct zvol_state_os *zso = zv->zv_zso; 1087 1088 /* Allocate our blk-mq tag_set */ 1089 if (zvol_blk_mq_alloc_tag_set(zv) != 0) 1090 return (1); 1091 1092 #if defined(HAVE_BLK_ALLOC_DISK) 1093 zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); 1094 if (zso->zvo_disk == NULL) { 1095 blk_mq_free_tag_set(&zso->tag_set); 1096 return (1); 1097 } 1098 zso->zvo_queue = zso->zvo_disk->queue; 1099 zso->zvo_disk->minors = ZVOL_MINORS; 1100 #else 1101 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1102 if (zso->zvo_disk == NULL) { 1103 blk_cleanup_queue(zso->zvo_queue); 1104 blk_mq_free_tag_set(&zso->tag_set); 1105 return (1); 1106 } 1107 /* Allocate queue */ 1108 zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); 1109 if (IS_ERR(zso->zvo_queue)) { 1110 blk_mq_free_tag_set(&zso->tag_set); 1111 return (1); 1112 } 1113 1114 /* Our queue is now created, assign it to our disk */ 1115 zso->zvo_disk->queue = zso->zvo_queue; 1116 1117 #endif 1118 #endif 1119 return (0); 1120 } 1121 1122 /* 1123 * Allocate memory for a new zvol_state_t and setup the required 1124 * request queue and generic disk structures for the block device. 1125 */ 1126 static zvol_state_t * 1127 zvol_alloc(dev_t dev, const char *name) 1128 { 1129 zvol_state_t *zv; 1130 struct zvol_state_os *zso; 1131 uint64_t volmode; 1132 int ret; 1133 1134 if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) 1135 return (NULL); 1136 1137 if (volmode == ZFS_VOLMODE_DEFAULT) 1138 volmode = zvol_volmode; 1139 1140 if (volmode == ZFS_VOLMODE_NONE) 1141 return (NULL); 1142 1143 zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); 1144 zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); 1145 zv->zv_zso = zso; 1146 zv->zv_volmode = volmode; 1147 1148 list_link_init(&zv->zv_next); 1149 mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); 1150 1151 #ifdef HAVE_BLK_MQ 1152 zv->zv_zso->use_blk_mq = zvol_use_blk_mq; 1153 #endif 1154 1155 /* 1156 * The block layer has 3 interfaces for getting BIOs: 1157 * 1158 * 1. blk-mq request queues (new) 1159 * 2. submit_bio() (oldest) 1160 * 3. regular request queues (old). 1161 * 1162 * Each of those interfaces has two permutations: 1163 * 1164 * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates 1165 * both the disk and its queue (5.14 kernel or newer) 1166 * 1167 * b) We don't have blk_*alloc_disk(), and have to allocate the 1168 * disk and the queue separately. (5.13 kernel or older) 1169 */ 1170 if (zv->zv_zso->use_blk_mq) { 1171 ret = zvol_alloc_blk_mq(zv); 1172 zso->zvo_disk->fops = &zvol_ops_blk_mq; 1173 } else { 1174 ret = zvol_alloc_non_blk_mq(zso); 1175 zso->zvo_disk->fops = &zvol_ops; 1176 } 1177 if (ret != 0) 1178 goto out_kmem; 1179 1180 blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE); 1181 1182 /* Limit read-ahead to a single page to prevent over-prefetching. */ 1183 blk_queue_set_read_ahead(zso->zvo_queue, 1); 1184 1185 if (!zv->zv_zso->use_blk_mq) { 1186 /* Disable write merging in favor of the ZIO pipeline. */ 1187 blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); 1188 } 1189 1190 /* Enable /proc/diskstats */ 1191 blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue); 1192 1193 zso->zvo_queue->queuedata = zv; 1194 zso->zvo_dev = dev; 1195 zv->zv_open_count = 0; 1196 strlcpy(zv->zv_name, name, MAXNAMELEN); 1197 1198 zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); 1199 rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); 1200 1201 zso->zvo_disk->major = zvol_major; 1202 zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; 1203 1204 /* 1205 * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. 1206 * This is accomplished by limiting the number of minors for the 1207 * device to one and explicitly disabling partition scanning. 1208 */ 1209 if (volmode == ZFS_VOLMODE_DEV) { 1210 zso->zvo_disk->minors = 1; 1211 zso->zvo_disk->flags &= ~ZFS_GENHD_FL_EXT_DEVT; 1212 zso->zvo_disk->flags |= ZFS_GENHD_FL_NO_PART; 1213 } 1214 1215 zso->zvo_disk->first_minor = (dev & MINORMASK); 1216 zso->zvo_disk->private_data = zv; 1217 snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", 1218 ZVOL_DEV_NAME, (dev & MINORMASK)); 1219 1220 return (zv); 1221 1222 out_kmem: 1223 kmem_free(zso, sizeof (struct zvol_state_os)); 1224 kmem_free(zv, sizeof (zvol_state_t)); 1225 return (NULL); 1226 } 1227 1228 /* 1229 * Cleanup then free a zvol_state_t which was created by zvol_alloc(). 1230 * At this time, the structure is not opened by anyone, is taken off 1231 * the zvol_state_list, and has its private data set to NULL. 1232 * The zvol_state_lock is dropped. 1233 * 1234 * This function may take many milliseconds to complete (e.g. we've seen 1235 * it take over 256ms), due to the calls to "blk_cleanup_queue" and 1236 * "del_gendisk". Thus, consumers need to be careful to account for this 1237 * latency when calling this function. 1238 */ 1239 void 1240 zvol_os_free(zvol_state_t *zv) 1241 { 1242 1243 ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); 1244 ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); 1245 ASSERT0(zv->zv_open_count); 1246 ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); 1247 1248 rw_destroy(&zv->zv_suspend_lock); 1249 zfs_rangelock_fini(&zv->zv_rangelock); 1250 1251 del_gendisk(zv->zv_zso->zvo_disk); 1252 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ 1253 defined(HAVE_BLK_ALLOC_DISK) 1254 #if defined(HAVE_BLK_CLEANUP_DISK) 1255 blk_cleanup_disk(zv->zv_zso->zvo_disk); 1256 #else 1257 put_disk(zv->zv_zso->zvo_disk); 1258 #endif 1259 #else 1260 blk_cleanup_queue(zv->zv_zso->zvo_queue); 1261 put_disk(zv->zv_zso->zvo_disk); 1262 #endif 1263 1264 #ifdef HAVE_BLK_MQ 1265 if (zv->zv_zso->use_blk_mq) 1266 blk_mq_free_tag_set(&zv->zv_zso->tag_set); 1267 #endif 1268 1269 ida_simple_remove(&zvol_ida, 1270 MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); 1271 1272 mutex_destroy(&zv->zv_state_lock); 1273 dataset_kstats_destroy(&zv->zv_kstat); 1274 1275 kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); 1276 kmem_free(zv, sizeof (zvol_state_t)); 1277 } 1278 1279 void 1280 zvol_wait_close(zvol_state_t *zv) 1281 { 1282 } 1283 1284 /* 1285 * Create a block device minor node and setup the linkage between it 1286 * and the specified volume. Once this function returns the block 1287 * device is live and ready for use. 1288 */ 1289 int 1290 zvol_os_create_minor(const char *name) 1291 { 1292 zvol_state_t *zv; 1293 objset_t *os; 1294 dmu_object_info_t *doi; 1295 uint64_t volsize; 1296 uint64_t len; 1297 unsigned minor = 0; 1298 int error = 0; 1299 int idx; 1300 uint64_t hash = zvol_name_hash(name); 1301 bool replayed_zil = B_FALSE; 1302 1303 if (zvol_inhibit_dev) 1304 return (0); 1305 1306 idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); 1307 if (idx < 0) 1308 return (SET_ERROR(-idx)); 1309 minor = idx << ZVOL_MINOR_BITS; 1310 1311 zv = zvol_find_by_name_hash(name, hash, RW_NONE); 1312 if (zv) { 1313 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1314 mutex_exit(&zv->zv_state_lock); 1315 ida_simple_remove(&zvol_ida, idx); 1316 return (SET_ERROR(EEXIST)); 1317 } 1318 1319 doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); 1320 1321 error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); 1322 if (error) 1323 goto out_doi; 1324 1325 error = dmu_object_info(os, ZVOL_OBJ, doi); 1326 if (error) 1327 goto out_dmu_objset_disown; 1328 1329 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); 1330 if (error) 1331 goto out_dmu_objset_disown; 1332 1333 zv = zvol_alloc(MKDEV(zvol_major, minor), name); 1334 if (zv == NULL) { 1335 error = SET_ERROR(EAGAIN); 1336 goto out_dmu_objset_disown; 1337 } 1338 zv->zv_hash = hash; 1339 1340 if (dmu_objset_is_snapshot(os)) 1341 zv->zv_flags |= ZVOL_RDONLY; 1342 1343 zv->zv_volblocksize = doi->doi_data_block_size; 1344 zv->zv_volsize = volsize; 1345 zv->zv_objset = os; 1346 1347 set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); 1348 1349 blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue, 1350 (DMU_MAX_ACCESS / 4) >> 9); 1351 1352 if (zv->zv_zso->use_blk_mq) { 1353 /* 1354 * IO requests can be really big (1MB). When an IO request 1355 * comes in, it is passed off to zvol_read() or zvol_write() 1356 * in a new thread, where it is chunked up into 'volblocksize' 1357 * sized pieces and processed. So for example, if the request 1358 * is a 1MB write and your volblocksize is 128k, one zvol_write 1359 * thread will take that request and sequentially do ten 128k 1360 * IOs. This is due to the fact that the thread needs to lock 1361 * each volblocksize sized block. So you might be wondering: 1362 * "instead of passing the whole 1MB request to one thread, 1363 * why not pass ten individual 128k chunks to ten threads and 1364 * process the whole write in parallel?" The short answer is 1365 * that there's a sweet spot number of chunks that balances 1366 * the greater parallelism with the added overhead of more 1367 * threads. The sweet spot can be different depending on if you 1368 * have a read or write heavy workload. Writes typically want 1369 * high chunk counts while reads typically want lower ones. On 1370 * a test pool with 6 NVMe drives in a 3x 2-disk mirror 1371 * configuration, with volblocksize=8k, the sweet spot for good 1372 * sequential reads and writes was at 8 chunks. 1373 */ 1374 1375 /* 1376 * Below we tell the kernel how big we want our requests 1377 * to be. You would think that blk_queue_io_opt() would be 1378 * used to do this since it is used to "set optimal request 1379 * size for the queue", but that doesn't seem to do 1380 * anything - the kernel still gives you huge requests 1381 * with tons of little PAGE_SIZE segments contained within it. 1382 * 1383 * Knowing that the kernel will just give you PAGE_SIZE segments 1384 * no matter what, you can say "ok, I want PAGE_SIZE byte 1385 * segments, and I want 'N' of them per request", where N is 1386 * the correct number of segments for the volblocksize and 1387 * number of chunks you want. 1388 */ 1389 #ifdef HAVE_BLK_MQ 1390 if (zvol_blk_mq_blocks_per_thread != 0) { 1391 unsigned int chunks; 1392 chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); 1393 1394 blk_queue_max_segment_size(zv->zv_zso->zvo_queue, 1395 PAGE_SIZE); 1396 blk_queue_max_segments(zv->zv_zso->zvo_queue, 1397 (zv->zv_volblocksize * chunks) / PAGE_SIZE); 1398 } else { 1399 /* 1400 * Special case: zvol_blk_mq_blocks_per_thread = 0 1401 * Max everything out. 1402 */ 1403 blk_queue_max_segments(zv->zv_zso->zvo_queue, 1404 UINT16_MAX); 1405 blk_queue_max_segment_size(zv->zv_zso->zvo_queue, 1406 UINT_MAX); 1407 } 1408 #endif 1409 } else { 1410 blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); 1411 blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); 1412 } 1413 1414 blk_queue_physical_block_size(zv->zv_zso->zvo_queue, 1415 zv->zv_volblocksize); 1416 blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize); 1417 blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue, 1418 (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); 1419 blk_queue_discard_granularity(zv->zv_zso->zvo_queue, 1420 zv->zv_volblocksize); 1421 #ifdef QUEUE_FLAG_DISCARD 1422 blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); 1423 #endif 1424 #ifdef QUEUE_FLAG_NONROT 1425 blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); 1426 #endif 1427 #ifdef QUEUE_FLAG_ADD_RANDOM 1428 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); 1429 #endif 1430 /* This flag was introduced in kernel version 4.12. */ 1431 #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH 1432 blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); 1433 #endif 1434 1435 ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); 1436 error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); 1437 if (error) 1438 goto out_dmu_objset_disown; 1439 ASSERT3P(zv->zv_zilog, ==, NULL); 1440 zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); 1441 if (spa_writeable(dmu_objset_spa(os))) { 1442 if (zil_replay_disable) 1443 replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); 1444 else 1445 replayed_zil = zil_replay(os, zv, zvol_replay_vector); 1446 } 1447 if (replayed_zil) 1448 zil_close(zv->zv_zilog); 1449 zv->zv_zilog = NULL; 1450 1451 /* 1452 * When udev detects the addition of the device it will immediately 1453 * invoke blkid(8) to determine the type of content on the device. 1454 * Prefetching the blocks commonly scanned by blkid(8) will speed 1455 * up this process. 1456 */ 1457 len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); 1458 if (len > 0) { 1459 dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); 1460 dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, 1461 ZIO_PRIORITY_SYNC_READ); 1462 } 1463 1464 zv->zv_objset = NULL; 1465 out_dmu_objset_disown: 1466 dmu_objset_disown(os, B_TRUE, FTAG); 1467 out_doi: 1468 kmem_free(doi, sizeof (dmu_object_info_t)); 1469 1470 /* 1471 * Keep in mind that once add_disk() is called, the zvol is 1472 * announced to the world, and zvol_open()/zvol_release() can 1473 * be called at any time. Incidentally, add_disk() itself calls 1474 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() 1475 * directly as well. 1476 */ 1477 if (error == 0) { 1478 rw_enter(&zvol_state_lock, RW_WRITER); 1479 zvol_insert(zv); 1480 rw_exit(&zvol_state_lock); 1481 #ifdef HAVE_ADD_DISK_RET 1482 error = add_disk(zv->zv_zso->zvo_disk); 1483 #else 1484 add_disk(zv->zv_zso->zvo_disk); 1485 #endif 1486 } else { 1487 ida_simple_remove(&zvol_ida, idx); 1488 } 1489 1490 return (error); 1491 } 1492 1493 void 1494 zvol_os_rename_minor(zvol_state_t *zv, const char *newname) 1495 { 1496 int readonly = get_disk_ro(zv->zv_zso->zvo_disk); 1497 1498 ASSERT(RW_LOCK_HELD(&zvol_state_lock)); 1499 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1500 1501 strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); 1502 1503 /* move to new hashtable entry */ 1504 zv->zv_hash = zvol_name_hash(zv->zv_name); 1505 hlist_del(&zv->zv_hlink); 1506 hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); 1507 1508 /* 1509 * The block device's read-only state is briefly changed causing 1510 * a KOBJ_CHANGE uevent to be issued. This ensures udev detects 1511 * the name change and fixes the symlinks. This does not change 1512 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never 1513 * changes. This would normally be done using kobject_uevent() but 1514 * that is a GPL-only symbol which is why we need this workaround. 1515 */ 1516 set_disk_ro(zv->zv_zso->zvo_disk, !readonly); 1517 set_disk_ro(zv->zv_zso->zvo_disk, readonly); 1518 } 1519 1520 void 1521 zvol_os_set_disk_ro(zvol_state_t *zv, int flags) 1522 { 1523 1524 set_disk_ro(zv->zv_zso->zvo_disk, flags); 1525 } 1526 1527 void 1528 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) 1529 { 1530 1531 set_capacity(zv->zv_zso->zvo_disk, capacity); 1532 } 1533 1534 int 1535 zvol_init(void) 1536 { 1537 int error; 1538 1539 /* 1540 * zvol_threads is the module param the user passes in. 1541 * 1542 * zvol_actual_threads is what we use internally, since the user can 1543 * pass zvol_thread = 0 to mean "use all the CPUs" (the default). 1544 */ 1545 static unsigned int zvol_actual_threads; 1546 1547 if (zvol_threads == 0) { 1548 /* 1549 * See dde9380a1 for why 32 was chosen here. This should 1550 * probably be refined to be some multiple of the number 1551 * of CPUs. 1552 */ 1553 zvol_actual_threads = MAX(num_online_cpus(), 32); 1554 } else { 1555 zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); 1556 } 1557 1558 error = register_blkdev(zvol_major, ZVOL_DRIVER); 1559 if (error) { 1560 printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); 1561 return (error); 1562 } 1563 1564 #ifdef HAVE_BLK_MQ 1565 if (zvol_blk_mq_queue_depth == 0) { 1566 zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; 1567 } else { 1568 zvol_actual_blk_mq_queue_depth = 1569 MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); 1570 } 1571 1572 if (zvol_blk_mq_threads == 0) { 1573 zvol_blk_mq_actual_threads = num_online_cpus(); 1574 } else { 1575 zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1), 1576 1024); 1577 } 1578 #endif 1579 zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri, 1580 zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); 1581 if (zvol_taskq == NULL) { 1582 unregister_blkdev(zvol_major, ZVOL_DRIVER); 1583 return (-ENOMEM); 1584 } 1585 1586 zvol_init_impl(); 1587 ida_init(&zvol_ida); 1588 return (0); 1589 } 1590 1591 void 1592 zvol_fini(void) 1593 { 1594 zvol_fini_impl(); 1595 unregister_blkdev(zvol_major, ZVOL_DRIVER); 1596 taskq_destroy(zvol_taskq); 1597 ida_destroy(&zvol_ida); 1598 } 1599 1600 /* BEGIN CSTYLED */ 1601 module_param(zvol_inhibit_dev, uint, 0644); 1602 MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); 1603 1604 module_param(zvol_major, uint, 0444); 1605 MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); 1606 1607 module_param(zvol_threads, uint, 0444); 1608 MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set" 1609 "to 0 to use all active CPUs"); 1610 1611 module_param(zvol_request_sync, uint, 0644); 1612 MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); 1613 1614 module_param(zvol_max_discard_blocks, ulong, 0444); 1615 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); 1616 1617 module_param(zvol_prefetch_bytes, uint, 0644); 1618 MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); 1619 1620 module_param(zvol_volmode, uint, 0644); 1621 MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); 1622 1623 #ifdef HAVE_BLK_MQ 1624 module_param(zvol_blk_mq_queue_depth, uint, 0644); 1625 MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); 1626 1627 module_param(zvol_use_blk_mq, uint, 0644); 1628 MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); 1629 1630 module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); 1631 MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, 1632 "Process volblocksize blocks per thread"); 1633 #endif 1634 1635 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 1636 module_param(zvol_open_timeout_ms, uint, 0644); 1637 MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); 1638 #endif 1639 1640 /* END CSTYLED */ 1641