1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 2012, 2020 by Delphix. All rights reserved. 24 * Copyright (c) 2024, Rob Norris <robn@despairlabs.com> 25 * Copyright (c) 2024, 2025, Klara, Inc. 26 */ 27 28 #include <sys/dataset_kstats.h> 29 #include <sys/dbuf.h> 30 #include <sys/dmu_traverse.h> 31 #include <sys/dsl_dataset.h> 32 #include <sys/dsl_prop.h> 33 #include <sys/dsl_dir.h> 34 #include <sys/zap.h> 35 #include <sys/zfeature.h> 36 #include <sys/zil_impl.h> 37 #include <sys/dmu_tx.h> 38 #include <sys/zio.h> 39 #include <sys/zfs_rlock.h> 40 #include <sys/spa_impl.h> 41 #include <sys/zvol.h> 42 #include <sys/zvol_impl.h> 43 #include <cityhash.h> 44 45 #include <linux/blkdev_compat.h> 46 #include <linux/task_io_accounting_ops.h> 47 #include <linux/workqueue.h> 48 #include <linux/blk-mq.h> 49 50 static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, 51 struct request *rq, boolean_t force_sync); 52 53 static unsigned int zvol_major = ZVOL_MAJOR; 54 static unsigned long zvol_max_discard_blocks = 16384; 55 56 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 57 static unsigned int zvol_open_timeout_ms = 1000; 58 #endif 59 60 static unsigned int zvol_blk_mq_threads = 0; 61 static unsigned int zvol_blk_mq_actual_threads; 62 static boolean_t zvol_use_blk_mq = B_FALSE; 63 64 /* 65 * The maximum number of volblocksize blocks to process per thread. Typically, 66 * write heavy workloads preform better with higher values here, and read 67 * heavy workloads preform better with lower values, but that's not a hard 68 * and fast rule. It's basically a knob to tune between "less overhead with 69 * less parallelism" and "more overhead, but more parallelism". 70 * 71 * '8' was chosen as a reasonable, balanced, default based off of sequential 72 * read and write tests to a zvol in an NVMe pool (with 16 CPUs). 73 */ 74 static unsigned int zvol_blk_mq_blocks_per_thread = 8; 75 76 #ifndef BLKDEV_DEFAULT_RQ 77 /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ 78 #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ 79 #endif 80 81 /* 82 * Finalize our BIO or request. 83 */ 84 static inline void 85 zvol_end_io(struct bio *bio, struct request *rq, int error) 86 { 87 ASSERT3U(error, >=, 0); 88 if (bio) { 89 bio->bi_status = errno_to_bi_status(error); 90 bio_endio(bio); 91 } else { 92 blk_mq_end_request(rq, errno_to_bi_status(error)); 93 } 94 } 95 96 static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; 97 static unsigned int zvol_actual_blk_mq_queue_depth; 98 99 struct zvol_state_os { 100 struct gendisk *zvo_disk; /* generic disk */ 101 struct request_queue *zvo_queue; /* request queue */ 102 dev_t zvo_dev; /* device id */ 103 104 struct blk_mq_tag_set tag_set; 105 106 /* Set from the global 'zvol_use_blk_mq' at zvol load */ 107 boolean_t use_blk_mq; 108 }; 109 110 static struct ida zvol_ida; 111 112 /* 113 * This is called when a new block multiqueue request comes in. A request 114 * contains one or more BIOs. 115 */ 116 static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, 117 const struct blk_mq_queue_data *bd) 118 { 119 struct request *rq = bd->rq; 120 zvol_state_t *zv = rq->q->queuedata; 121 122 /* Tell the kernel that we are starting to process this request */ 123 blk_mq_start_request(rq); 124 125 if (blk_rq_is_passthrough(rq)) { 126 /* Skip non filesystem request */ 127 blk_mq_end_request(rq, BLK_STS_IOERR); 128 return (BLK_STS_IOERR); 129 } 130 131 zvol_request_impl(zv, NULL, rq, 0); 132 133 /* Acknowledge to the kernel that we got this request */ 134 return (BLK_STS_OK); 135 } 136 137 static struct blk_mq_ops zvol_blk_mq_queue_ops = { 138 .queue_rq = zvol_mq_queue_rq, 139 }; 140 141 /* Initialize our blk-mq struct */ 142 static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) 143 { 144 struct zvol_state_os *zso = zv->zv_zso; 145 146 memset(&zso->tag_set, 0, sizeof (zso->tag_set)); 147 148 /* Initialize tag set. */ 149 zso->tag_set.ops = &zvol_blk_mq_queue_ops; 150 zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads; 151 zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; 152 zso->tag_set.numa_node = NUMA_NO_NODE; 153 zso->tag_set.cmd_size = 0; 154 155 /* 156 * We need BLK_MQ_F_BLOCKING here since we do blocking calls in 157 * zvol_request_impl() 158 */ 159 zso->tag_set.flags = BLK_MQ_F_BLOCKING; 160 161 #ifdef BLK_MQ_F_SHOULD_MERGE 162 /* 163 * Linux 6.14 removed BLK_MQ_F_SHOULD_MERGE and made it implicit. 164 * For older kernels, we set it. 165 */ 166 zso->tag_set.flags |= BLK_MQ_F_SHOULD_MERGE; 167 #endif 168 169 zso->tag_set.driver_data = zv; 170 171 return (blk_mq_alloc_tag_set(&zso->tag_set)); 172 } 173 174 /* 175 * Given a path, return TRUE if path is a ZVOL. 176 */ 177 boolean_t 178 zvol_os_is_zvol(const char *path) 179 { 180 dev_t dev = 0; 181 182 if (vdev_lookup_bdev(path, &dev) != 0) 183 return (B_FALSE); 184 185 if (MAJOR(dev) == zvol_major) 186 return (B_TRUE); 187 188 return (B_FALSE); 189 } 190 191 static void 192 zvol_write(zv_request_t *zvr) 193 { 194 struct bio *bio = zvr->bio; 195 struct request *rq = zvr->rq; 196 int error = 0; 197 zfs_uio_t uio; 198 zvol_state_t *zv = zvr->zv; 199 struct request_queue *q; 200 struct gendisk *disk; 201 unsigned long start_time = 0; 202 boolean_t acct = B_FALSE; 203 204 ASSERT3P(zv, !=, NULL); 205 ASSERT3U(zv->zv_open_count, >, 0); 206 ASSERT3P(zv->zv_zilog, !=, NULL); 207 208 q = zv->zv_zso->zvo_queue; 209 disk = zv->zv_zso->zvo_disk; 210 211 /* bio marked as FLUSH need to flush before write */ 212 if (io_is_flush(bio, rq)) { 213 error = zil_commit(zv->zv_zilog, ZVOL_OBJ); 214 if (error != 0) { 215 rw_exit(&zv->zv_suspend_lock); 216 zvol_end_io(bio, rq, -error); 217 return; 218 } 219 } 220 221 /* Some requests are just for flush and nothing else. */ 222 if (io_size(bio, rq) == 0) { 223 rw_exit(&zv->zv_suspend_lock); 224 zvol_end_io(bio, rq, 0); 225 return; 226 } 227 228 zfs_uio_bvec_init(&uio, bio, rq); 229 230 ssize_t start_resid = uio.uio_resid; 231 232 /* 233 * With use_blk_mq, accounting is done by blk_mq_start_request() 234 * and blk_mq_end_request(), so we can skip it here. 235 */ 236 if (bio) { 237 acct = blk_queue_io_stat(q); 238 if (acct) { 239 start_time = blk_generic_start_io_acct(q, disk, WRITE, 240 bio); 241 } 242 } 243 244 boolean_t sync = 245 io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; 246 247 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 248 uio.uio_loffset, uio.uio_resid, RL_WRITER); 249 250 uint64_t volsize = zv->zv_volsize; 251 while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { 252 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); 253 uint64_t off = uio.uio_loffset; 254 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); 255 256 if (bytes > volsize - off) /* don't write past the end */ 257 bytes = volsize - off; 258 259 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); 260 261 /* This will only fail for ENOSPC */ 262 error = dmu_tx_assign(tx, DMU_TX_WAIT); 263 if (error) { 264 dmu_tx_abort(tx); 265 break; 266 } 267 error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx, 268 DMU_READ_PREFETCH); 269 if (error == 0) { 270 zvol_log_write(zv, tx, off, bytes, sync); 271 } 272 dmu_tx_commit(tx); 273 274 if (error) 275 break; 276 } 277 zfs_rangelock_exit(lr); 278 279 int64_t nwritten = start_resid - uio.uio_resid; 280 dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); 281 task_io_account_write(nwritten); 282 283 if (error == 0 && sync) 284 error = zil_commit(zv->zv_zilog, ZVOL_OBJ); 285 286 rw_exit(&zv->zv_suspend_lock); 287 288 if (bio && acct) { 289 blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); 290 } 291 292 zvol_end_io(bio, rq, error); 293 } 294 295 static void 296 zvol_write_task(void *arg) 297 { 298 zv_request_task_t *task = arg; 299 zvol_write(&task->zvr); 300 zv_request_task_free(task); 301 } 302 303 static void 304 zvol_discard(zv_request_t *zvr) 305 { 306 struct bio *bio = zvr->bio; 307 struct request *rq = zvr->rq; 308 zvol_state_t *zv = zvr->zv; 309 uint64_t start = io_offset(bio, rq); 310 uint64_t size = io_size(bio, rq); 311 uint64_t end = start + size; 312 boolean_t sync; 313 int error = 0; 314 dmu_tx_t *tx; 315 struct request_queue *q = zv->zv_zso->zvo_queue; 316 struct gendisk *disk = zv->zv_zso->zvo_disk; 317 unsigned long start_time = 0; 318 boolean_t acct = B_FALSE; 319 320 ASSERT3P(zv, !=, NULL); 321 ASSERT3U(zv->zv_open_count, >, 0); 322 ASSERT3P(zv->zv_zilog, !=, NULL); 323 324 if (bio) { 325 acct = blk_queue_io_stat(q); 326 if (acct) { 327 start_time = blk_generic_start_io_acct(q, disk, WRITE, 328 bio); 329 } 330 } 331 332 sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; 333 334 if (end > zv->zv_volsize) { 335 error = SET_ERROR(EIO); 336 goto unlock; 337 } 338 339 /* 340 * Align the request to volume block boundaries when a secure erase is 341 * not required. This will prevent dnode_free_range() from zeroing out 342 * the unaligned parts which is slow (read-modify-write) and useless 343 * since we are not freeing any space by doing so. 344 */ 345 if (!io_is_secure_erase(bio, rq)) { 346 start = P2ROUNDUP(start, zv->zv_volblocksize); 347 end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); 348 size = end - start; 349 } 350 351 if (start >= end) 352 goto unlock; 353 354 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 355 start, size, RL_WRITER); 356 357 tx = dmu_tx_create(zv->zv_objset); 358 dmu_tx_mark_netfree(tx); 359 error = dmu_tx_assign(tx, DMU_TX_WAIT); 360 if (error != 0) { 361 dmu_tx_abort(tx); 362 } else { 363 zvol_log_truncate(zv, tx, start, size); 364 dmu_tx_commit(tx); 365 error = dmu_free_long_range(zv->zv_objset, 366 ZVOL_OBJ, start, size); 367 } 368 zfs_rangelock_exit(lr); 369 370 if (error == 0 && sync) 371 error = zil_commit(zv->zv_zilog, ZVOL_OBJ); 372 373 unlock: 374 rw_exit(&zv->zv_suspend_lock); 375 376 if (bio && acct) { 377 blk_generic_end_io_acct(q, disk, WRITE, bio, 378 start_time); 379 } 380 381 zvol_end_io(bio, rq, error); 382 } 383 384 static void 385 zvol_discard_task(void *arg) 386 { 387 zv_request_task_t *task = arg; 388 zvol_discard(&task->zvr); 389 zv_request_task_free(task); 390 } 391 392 static void 393 zvol_read(zv_request_t *zvr) 394 { 395 struct bio *bio = zvr->bio; 396 struct request *rq = zvr->rq; 397 int error = 0; 398 zfs_uio_t uio; 399 boolean_t acct = B_FALSE; 400 zvol_state_t *zv = zvr->zv; 401 struct request_queue *q; 402 struct gendisk *disk; 403 unsigned long start_time = 0; 404 405 ASSERT3P(zv, !=, NULL); 406 ASSERT3U(zv->zv_open_count, >, 0); 407 408 zfs_uio_bvec_init(&uio, bio, rq); 409 410 q = zv->zv_zso->zvo_queue; 411 disk = zv->zv_zso->zvo_disk; 412 413 ssize_t start_resid = uio.uio_resid; 414 415 /* 416 * When blk-mq is being used, accounting is done by 417 * blk_mq_start_request() and blk_mq_end_request(). 418 */ 419 if (bio) { 420 acct = blk_queue_io_stat(q); 421 if (acct) 422 start_time = blk_generic_start_io_acct(q, disk, READ, 423 bio); 424 } 425 426 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 427 uio.uio_loffset, uio.uio_resid, RL_READER); 428 429 uint64_t volsize = zv->zv_volsize; 430 431 while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { 432 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); 433 434 /* don't read past the end */ 435 if (bytes > volsize - uio.uio_loffset) 436 bytes = volsize - uio.uio_loffset; 437 438 error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes, 439 DMU_READ_PREFETCH); 440 if (error) { 441 /* convert checksum errors into IO errors */ 442 if (error == ECKSUM) 443 error = SET_ERROR(EIO); 444 break; 445 } 446 } 447 zfs_rangelock_exit(lr); 448 449 int64_t nread = start_resid - uio.uio_resid; 450 dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); 451 task_io_account_read(nread); 452 453 rw_exit(&zv->zv_suspend_lock); 454 455 if (bio && acct) { 456 blk_generic_end_io_acct(q, disk, READ, bio, start_time); 457 } 458 459 zvol_end_io(bio, rq, error); 460 } 461 462 static void 463 zvol_read_task(void *arg) 464 { 465 zv_request_task_t *task = arg; 466 zvol_read(&task->zvr); 467 zv_request_task_free(task); 468 } 469 470 471 /* 472 * Process a BIO or request 473 * 474 * Either 'bio' or 'rq' should be set depending on if we are processing a 475 * bio or a request (both should not be set). 476 * 477 * force_sync: Set to 0 to defer processing to a background taskq 478 * Set to 1 to process data synchronously 479 */ 480 static void 481 zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, 482 boolean_t force_sync) 483 { 484 fstrans_cookie_t cookie = spl_fstrans_mark(); 485 uint64_t offset = io_offset(bio, rq); 486 uint64_t size = io_size(bio, rq); 487 int rw = io_data_dir(bio, rq); 488 489 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { 490 zvol_end_io(bio, rq, SET_ERROR(ENXIO)); 491 goto out; 492 } 493 494 if (zvol_request_sync || zv->zv_threading == B_FALSE) 495 force_sync = 1; 496 497 zv_request_t zvr = { 498 .zv = zv, 499 .bio = bio, 500 .rq = rq, 501 }; 502 503 if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { 504 printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", 505 zv->zv_zso->zvo_disk->disk_name, 506 (long long unsigned)offset, 507 (long unsigned)size); 508 509 zvol_end_io(bio, rq, SET_ERROR(EIO)); 510 goto out; 511 } 512 513 zv_request_task_t *task; 514 zv_taskq_t *ztqs = &zvol_taskqs; 515 uint_t blk_mq_hw_queue = 0; 516 uint_t tq_idx; 517 uint_t taskq_hash; 518 if (rq) 519 #ifdef HAVE_BLK_MQ_RQ_HCTX 520 blk_mq_hw_queue = rq->mq_hctx->queue_num; 521 #else 522 blk_mq_hw_queue = rq->q->queue_hw_ctx[ 523 rq->q->mq_map[raw_smp_processor_id()]]->queue_num; 524 #endif 525 taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, 526 blk_mq_hw_queue); 527 tq_idx = taskq_hash % ztqs->tqs_cnt; 528 529 if (rw == WRITE) { 530 if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { 531 zvol_end_io(bio, rq, SET_ERROR(EROFS)); 532 goto out; 533 } 534 535 /* 536 * Prevents the zvol from being suspended, or the ZIL being 537 * concurrently opened. Will be released after the i/o 538 * completes. 539 */ 540 rw_enter(&zv->zv_suspend_lock, RW_READER); 541 542 /* 543 * Open a ZIL if this is the first time we have written to this 544 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather 545 * than zv_state_lock so that we don't need to acquire an 546 * additional lock in this path. 547 */ 548 if (zv->zv_zilog == NULL) { 549 rw_exit(&zv->zv_suspend_lock); 550 rw_enter(&zv->zv_suspend_lock, RW_WRITER); 551 if (zv->zv_zilog == NULL) { 552 zv->zv_zilog = zil_open(zv->zv_objset, 553 zvol_get_data, &zv->zv_kstat.dk_zil_sums); 554 zv->zv_flags |= ZVOL_WRITTEN_TO; 555 /* replay / destroy done in zvol_create_minor */ 556 VERIFY0((zv->zv_zilog->zl_header->zh_flags & 557 ZIL_REPLAY_NEEDED)); 558 } 559 rw_downgrade(&zv->zv_suspend_lock); 560 } 561 562 /* 563 * We don't want this thread to be blocked waiting for i/o to 564 * complete, so we instead wait from a taskq callback. The 565 * i/o may be a ZIL write (via zil_commit()), or a read of an 566 * indirect block, or a read of a data block (if this is a 567 * partial-block write). We will indicate that the i/o is 568 * complete by calling END_IO() from the taskq callback. 569 * 570 * This design allows the calling thread to continue and 571 * initiate more concurrent operations by calling 572 * zvol_request() again. There are typically only a small 573 * number of threads available to call zvol_request() (e.g. 574 * one per iSCSI target), so keeping the latency of 575 * zvol_request() low is important for performance. 576 * 577 * The zvol_request_sync module parameter allows this 578 * behavior to be altered, for performance evaluation 579 * purposes. If the callback blocks, setting 580 * zvol_request_sync=1 will result in much worse performance. 581 * 582 * We can have up to zvol_threads concurrent i/o's being 583 * processed for all zvols on the system. This is typically 584 * a vast improvement over the zvol_request_sync=1 behavior 585 * of one i/o at a time per zvol. However, an even better 586 * design would be for zvol_request() to initiate the zio 587 * directly, and then be notified by the zio_done callback, 588 * which would call END_IO(). Unfortunately, the DMU/ZIL 589 * interfaces lack this functionality (they block waiting for 590 * the i/o to complete). 591 */ 592 if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { 593 if (force_sync) { 594 zvol_discard(&zvr); 595 } else { 596 task = zv_request_task_create(zvr); 597 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 598 zvol_discard_task, task, 0, &task->ent); 599 } 600 } else { 601 if (force_sync) { 602 zvol_write(&zvr); 603 } else { 604 task = zv_request_task_create(zvr); 605 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 606 zvol_write_task, task, 0, &task->ent); 607 } 608 } 609 } else { 610 /* 611 * The SCST driver, and possibly others, may issue READ I/Os 612 * with a length of zero bytes. These empty I/Os contain no 613 * data and require no additional handling. 614 */ 615 if (size == 0) { 616 zvol_end_io(bio, rq, 0); 617 goto out; 618 } 619 620 rw_enter(&zv->zv_suspend_lock, RW_READER); 621 622 /* See comment in WRITE case above. */ 623 if (force_sync) { 624 zvol_read(&zvr); 625 } else { 626 task = zv_request_task_create(zvr); 627 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 628 zvol_read_task, task, 0, &task->ent); 629 } 630 } 631 632 out: 633 spl_fstrans_unmark(cookie); 634 } 635 636 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 637 #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID 638 static void 639 zvol_submit_bio(struct bio *bio) 640 #else 641 static blk_qc_t 642 zvol_submit_bio(struct bio *bio) 643 #endif 644 #else 645 static MAKE_REQUEST_FN_RET 646 zvol_request(struct request_queue *q, struct bio *bio) 647 #endif 648 { 649 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 650 #if defined(HAVE_BIO_BDEV_DISK) 651 struct request_queue *q = bio->bi_bdev->bd_disk->queue; 652 #else 653 struct request_queue *q = bio->bi_disk->queue; 654 #endif 655 #endif 656 zvol_state_t *zv = q->queuedata; 657 658 zvol_request_impl(zv, bio, NULL, 0); 659 #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ 660 defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ 661 !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) 662 return (BLK_QC_T_NONE); 663 #endif 664 } 665 666 static int 667 #ifdef HAVE_BLK_MODE_T 668 zvol_open(struct gendisk *disk, blk_mode_t flag) 669 #else 670 zvol_open(struct block_device *bdev, fmode_t flag) 671 #endif 672 { 673 zvol_state_t *zv; 674 int error = 0; 675 boolean_t drop_suspend = B_FALSE; 676 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 677 hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); 678 hrtime_t start = gethrtime(); 679 680 retry: 681 #endif 682 683 #ifdef HAVE_BLK_MODE_T 684 zv = atomic_load_ptr(&disk->private_data); 685 #else 686 zv = atomic_load_ptr(&bdev->bd_disk->private_data); 687 #endif 688 if (zv == NULL) { 689 return (-SET_ERROR(ENXIO)); 690 } 691 692 mutex_enter(&zv->zv_state_lock); 693 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { 694 mutex_exit(&zv->zv_state_lock); 695 return (-SET_ERROR(ENXIO)); 696 } 697 698 /* 699 * Make sure zvol is not suspended during first open 700 * (hold zv_suspend_lock) and respect proper lock acquisition 701 * ordering - zv_suspend_lock before zv_state_lock 702 */ 703 if (zv->zv_open_count == 0) { 704 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { 705 mutex_exit(&zv->zv_state_lock); 706 707 /* 708 * Removal may happen while the locks are down, so 709 * we can't trust zv any longer; we have to start over. 710 */ 711 #ifdef HAVE_BLK_MODE_T 712 zv = atomic_load_ptr(&disk->private_data); 713 #else 714 zv = atomic_load_ptr(&bdev->bd_disk->private_data); 715 #endif 716 if (zv == NULL) 717 return (-SET_ERROR(ENXIO)); 718 719 rw_enter(&zv->zv_suspend_lock, RW_READER); 720 mutex_enter(&zv->zv_state_lock); 721 722 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { 723 mutex_exit(&zv->zv_state_lock); 724 rw_exit(&zv->zv_suspend_lock); 725 return (-SET_ERROR(ENXIO)); 726 } 727 728 /* check to see if zv_suspend_lock is needed */ 729 if (zv->zv_open_count != 0) { 730 rw_exit(&zv->zv_suspend_lock); 731 } else { 732 drop_suspend = B_TRUE; 733 } 734 } else { 735 drop_suspend = B_TRUE; 736 } 737 } 738 739 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 740 741 if (zv->zv_open_count == 0) { 742 boolean_t drop_namespace = B_FALSE; 743 744 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); 745 746 /* 747 * In all other call paths the spa_namespace_lock is taken 748 * before the bdev->bd_mutex lock. However, on open(2) 749 * the __blkdev_get() function calls fops->open() with the 750 * bdev->bd_mutex lock held. This can result in a deadlock 751 * when zvols from one pool are used as vdevs in another. 752 * 753 * To prevent a lock inversion deadlock we preemptively 754 * take the spa_namespace_lock. Normally the lock will not 755 * be contended and this is safe because spa_open_common() 756 * handles the case where the caller already holds the 757 * spa_namespace_lock. 758 * 759 * When the lock cannot be aquired after multiple retries 760 * this must be the vdev on zvol deadlock case and we have 761 * no choice but to return an error. For 5.12 and older 762 * kernels returning -ERESTARTSYS will result in the 763 * bdev->bd_mutex being dropped, then reacquired, and 764 * fops->open() being called again. This process can be 765 * repeated safely until both locks are acquired. For 5.13 766 * and newer the -ERESTARTSYS retry logic was removed from 767 * the kernel so the only option is to return the error for 768 * the caller to handle it. 769 */ 770 if (!mutex_owned(&spa_namespace_lock)) { 771 if (!mutex_tryenter(&spa_namespace_lock)) { 772 mutex_exit(&zv->zv_state_lock); 773 rw_exit(&zv->zv_suspend_lock); 774 drop_suspend = B_FALSE; 775 776 #ifdef HAVE_BLKDEV_GET_ERESTARTSYS 777 schedule(); 778 return (-SET_ERROR(ERESTARTSYS)); 779 #else 780 if ((gethrtime() - start) > timeout) 781 return (-SET_ERROR(ERESTARTSYS)); 782 783 schedule_timeout_interruptible( 784 MSEC_TO_TICK(10)); 785 goto retry; 786 #endif 787 } else { 788 drop_namespace = B_TRUE; 789 } 790 } 791 792 error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); 793 794 if (drop_namespace) 795 mutex_exit(&spa_namespace_lock); 796 } 797 798 if (error == 0) { 799 if ((blk_mode_is_open_write(flag)) && 800 (zv->zv_flags & ZVOL_RDONLY)) { 801 if (zv->zv_open_count == 0) 802 zvol_last_close(zv); 803 804 error = -SET_ERROR(EROFS); 805 } else { 806 zv->zv_open_count++; 807 } 808 } 809 810 mutex_exit(&zv->zv_state_lock); 811 if (drop_suspend) 812 rw_exit(&zv->zv_suspend_lock); 813 814 if (error == 0) 815 #ifdef HAVE_BLK_MODE_T 816 disk_check_media_change(disk); 817 #else 818 zfs_check_media_change(bdev); 819 #endif 820 821 return (error); 822 } 823 824 static void 825 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG 826 zvol_release(struct gendisk *disk) 827 #else 828 zvol_release(struct gendisk *disk, fmode_t unused) 829 #endif 830 { 831 #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) 832 (void) unused; 833 #endif 834 boolean_t drop_suspend = B_TRUE; 835 836 zvol_state_t *zv = atomic_load_ptr(&disk->private_data); 837 if (zv == NULL) 838 return; 839 840 mutex_enter(&zv->zv_state_lock); 841 ASSERT3U(zv->zv_open_count, >, 0); 842 /* 843 * make sure zvol is not suspended during last close 844 * (hold zv_suspend_lock) and respect proper lock acquisition 845 * ordering - zv_suspend_lock before zv_state_lock 846 */ 847 if (zv->zv_open_count == 1) { 848 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { 849 mutex_exit(&zv->zv_state_lock); 850 rw_enter(&zv->zv_suspend_lock, RW_READER); 851 mutex_enter(&zv->zv_state_lock); 852 853 /* 854 * Unlike in zvol_open(), we don't check if removal 855 * started here, because we might be one of the openers 856 * that needs to be thrown out! If we're the last, we 857 * need to call zvol_last_close() below to finish 858 * cleanup. So, no special treatment for us. 859 */ 860 861 /* check to see if zv_suspend_lock is needed */ 862 if (zv->zv_open_count != 1) { 863 rw_exit(&zv->zv_suspend_lock); 864 drop_suspend = B_FALSE; 865 } 866 } 867 } else { 868 drop_suspend = B_FALSE; 869 } 870 871 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 872 873 zv->zv_open_count--; 874 if (zv->zv_open_count == 0) { 875 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); 876 zvol_last_close(zv); 877 } 878 879 mutex_exit(&zv->zv_state_lock); 880 881 if (drop_suspend) 882 rw_exit(&zv->zv_suspend_lock); 883 } 884 885 static int 886 zvol_ioctl(struct block_device *bdev, fmode_t mode, 887 unsigned int cmd, unsigned long arg) 888 { 889 int error = 0; 890 891 zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data); 892 ASSERT3P(zv, !=, NULL); 893 ASSERT3U(zv->zv_open_count, >, 0); 894 895 switch (cmd) { 896 case BLKFLSBUF: 897 #ifdef HAVE_FSYNC_BDEV 898 fsync_bdev(bdev); 899 #elif defined(HAVE_SYNC_BLOCKDEV) 900 sync_blockdev(bdev); 901 #else 902 #error "Neither fsync_bdev() nor sync_blockdev() found" 903 #endif 904 invalidate_bdev(bdev); 905 rw_enter(&zv->zv_suspend_lock, RW_READER); 906 907 if (!(zv->zv_flags & ZVOL_RDONLY)) 908 txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); 909 910 rw_exit(&zv->zv_suspend_lock); 911 break; 912 913 case BLKZNAME: 914 mutex_enter(&zv->zv_state_lock); 915 error = -copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); 916 mutex_exit(&zv->zv_state_lock); 917 if (error) 918 error = SET_ERROR(error); 919 break; 920 921 default: 922 error = SET_ERROR(ENOTTY); 923 break; 924 } 925 926 return (-error); 927 } 928 929 #ifdef CONFIG_COMPAT 930 static int 931 zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, 932 unsigned cmd, unsigned long arg) 933 { 934 return (zvol_ioctl(bdev, mode, cmd, arg)); 935 } 936 #else 937 #define zvol_compat_ioctl NULL 938 #endif 939 940 static unsigned int 941 zvol_check_events(struct gendisk *disk, unsigned int clearing) 942 { 943 unsigned int mask = 0; 944 945 zvol_state_t *zv = atomic_load_ptr(&disk->private_data); 946 947 if (zv != NULL) { 948 mutex_enter(&zv->zv_state_lock); 949 mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; 950 zv->zv_changed = 0; 951 mutex_exit(&zv->zv_state_lock); 952 } 953 954 return (mask); 955 } 956 957 static int 958 zvol_revalidate_disk(struct gendisk *disk) 959 { 960 zvol_state_t *zv = atomic_load_ptr(&disk->private_data); 961 962 if (zv != NULL) { 963 mutex_enter(&zv->zv_state_lock); 964 set_capacity(zv->zv_zso->zvo_disk, 965 zv->zv_volsize >> SECTOR_BITS); 966 mutex_exit(&zv->zv_state_lock); 967 } 968 969 return (0); 970 } 971 972 int 973 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) 974 { 975 struct gendisk *disk = zv->zv_zso->zvo_disk; 976 977 #if defined(HAVE_REVALIDATE_DISK_SIZE) 978 revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); 979 #elif defined(HAVE_REVALIDATE_DISK) 980 revalidate_disk(disk); 981 #else 982 zvol_revalidate_disk(disk); 983 #endif 984 return (0); 985 } 986 987 /* 988 * Provide a simple virtual geometry for legacy compatibility. For devices 989 * smaller than 1 MiB a small head and sector count is used to allow very 990 * tiny devices. For devices over 1 Mib a standard head and sector count 991 * is used to keep the cylinders count reasonable. 992 */ 993 static int 994 zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) 995 { 996 sector_t sectors; 997 998 zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data); 999 ASSERT3P(zv, !=, NULL); 1000 ASSERT3U(zv->zv_open_count, >, 0); 1001 1002 sectors = get_capacity(zv->zv_zso->zvo_disk); 1003 1004 if (sectors > 2048) { 1005 geo->heads = 16; 1006 geo->sectors = 63; 1007 } else { 1008 geo->heads = 2; 1009 geo->sectors = 4; 1010 } 1011 1012 geo->start = 0; 1013 geo->cylinders = sectors / (geo->heads * geo->sectors); 1014 1015 return (0); 1016 } 1017 1018 /* 1019 * Why have two separate block_device_operations structs? 1020 * 1021 * Normally we'd just have one, and assign 'submit_bio' as needed. However, 1022 * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we 1023 * can't just change submit_bio dynamically at runtime. So just create two 1024 * separate structs to get around this. 1025 */ 1026 static const struct block_device_operations zvol_ops_blk_mq = { 1027 .open = zvol_open, 1028 .release = zvol_release, 1029 .ioctl = zvol_ioctl, 1030 .compat_ioctl = zvol_compat_ioctl, 1031 .check_events = zvol_check_events, 1032 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK 1033 .revalidate_disk = zvol_revalidate_disk, 1034 #endif 1035 .getgeo = zvol_getgeo, 1036 .owner = THIS_MODULE, 1037 }; 1038 1039 static const struct block_device_operations zvol_ops = { 1040 .open = zvol_open, 1041 .release = zvol_release, 1042 .ioctl = zvol_ioctl, 1043 .compat_ioctl = zvol_compat_ioctl, 1044 .check_events = zvol_check_events, 1045 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK 1046 .revalidate_disk = zvol_revalidate_disk, 1047 #endif 1048 .getgeo = zvol_getgeo, 1049 .owner = THIS_MODULE, 1050 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 1051 .submit_bio = zvol_submit_bio, 1052 #endif 1053 }; 1054 1055 /* 1056 * Since 6.9, Linux has been removing queue limit setters in favour of an 1057 * initial queue_limits struct applied when the device is open. Since 6.11, 1058 * queue_limits is being extended to allow more things to be applied when the 1059 * device is open. Setters are also being removed for this. 1060 * 1061 * For OpenZFS, this means that depending on kernel version, some options may 1062 * be set up before the device is open, and some applied to an open device 1063 * (queue) after the fact. 1064 * 1065 * We manage this complexity by having our own limits struct, 1066 * zvol_queue_limits_t, in which we carry any queue config that we're 1067 * interested in setting. This structure is the same on all kernels. 1068 * 1069 * These limits are then applied to the queue at device open time by the most 1070 * appropriate method for the kernel. 1071 * 1072 * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of 1073 * blk_alloc_disk() exists). This converts our limits struct to a proper Linux 1074 * struct queue_limits, and passes it in. Any fields added in later kernels are 1075 * (obviously) not set up here. 1076 * 1077 * zvol_queue_limits_apply() is called on all kernel versions after the queue 1078 * is created, and applies any remaining config. Before 6.9 that will be 1079 * everything, via setter methods. After 6.9 that will be whatever couldn't be 1080 * put into struct queue_limits. (This implies that zvol_queue_limits_apply() 1081 * will always be a no-op on the latest kernel we support). 1082 */ 1083 typedef struct zvol_queue_limits { 1084 unsigned int zql_max_hw_sectors; 1085 unsigned short zql_max_segments; 1086 unsigned int zql_max_segment_size; 1087 unsigned int zql_io_opt; 1088 unsigned int zql_physical_block_size; 1089 unsigned int zql_max_discard_sectors; 1090 unsigned int zql_discard_granularity; 1091 } zvol_queue_limits_t; 1092 1093 static void 1094 zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv, 1095 boolean_t use_blk_mq) 1096 { 1097 limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9; 1098 1099 if (use_blk_mq) { 1100 /* 1101 * IO requests can be really big (1MB). When an IO request 1102 * comes in, it is passed off to zvol_read() or zvol_write() 1103 * in a new thread, where it is chunked up into 'volblocksize' 1104 * sized pieces and processed. So for example, if the request 1105 * is a 1MB write and your volblocksize is 128k, one zvol_write 1106 * thread will take that request and sequentially do ten 128k 1107 * IOs. This is due to the fact that the thread needs to lock 1108 * each volblocksize sized block. So you might be wondering: 1109 * "instead of passing the whole 1MB request to one thread, 1110 * why not pass ten individual 128k chunks to ten threads and 1111 * process the whole write in parallel?" The short answer is 1112 * that there's a sweet spot number of chunks that balances 1113 * the greater parallelism with the added overhead of more 1114 * threads. The sweet spot can be different depending on if you 1115 * have a read or write heavy workload. Writes typically want 1116 * high chunk counts while reads typically want lower ones. On 1117 * a test pool with 6 NVMe drives in a 3x 2-disk mirror 1118 * configuration, with volblocksize=8k, the sweet spot for good 1119 * sequential reads and writes was at 8 chunks. 1120 */ 1121 1122 /* 1123 * Below we tell the kernel how big we want our requests 1124 * to be. You would think that blk_queue_io_opt() would be 1125 * used to do this since it is used to "set optimal request 1126 * size for the queue", but that doesn't seem to do 1127 * anything - the kernel still gives you huge requests 1128 * with tons of little PAGE_SIZE segments contained within it. 1129 * 1130 * Knowing that the kernel will just give you PAGE_SIZE segments 1131 * no matter what, you can say "ok, I want PAGE_SIZE byte 1132 * segments, and I want 'N' of them per request", where N is 1133 * the correct number of segments for the volblocksize and 1134 * number of chunks you want. 1135 */ 1136 if (zvol_blk_mq_blocks_per_thread != 0) { 1137 unsigned int chunks; 1138 chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); 1139 1140 limits->zql_max_segment_size = PAGE_SIZE; 1141 limits->zql_max_segments = 1142 (zv->zv_volblocksize * chunks) / PAGE_SIZE; 1143 } else { 1144 /* 1145 * Special case: zvol_blk_mq_blocks_per_thread = 0 1146 * Max everything out. 1147 */ 1148 limits->zql_max_segments = UINT16_MAX; 1149 limits->zql_max_segment_size = UINT_MAX; 1150 } 1151 } else { 1152 limits->zql_max_segments = UINT16_MAX; 1153 limits->zql_max_segment_size = UINT_MAX; 1154 } 1155 1156 limits->zql_io_opt = DMU_MAX_ACCESS / 2; 1157 1158 limits->zql_physical_block_size = zv->zv_volblocksize; 1159 limits->zql_max_discard_sectors = 1160 (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9; 1161 limits->zql_discard_granularity = zv->zv_volblocksize; 1162 } 1163 1164 #ifdef HAVE_BLK_ALLOC_DISK_2ARG 1165 static void 1166 zvol_queue_limits_convert(zvol_queue_limits_t *limits, 1167 struct queue_limits *qlimits) 1168 { 1169 memset(qlimits, 0, sizeof (struct queue_limits)); 1170 qlimits->max_hw_sectors = limits->zql_max_hw_sectors; 1171 qlimits->max_segments = limits->zql_max_segments; 1172 qlimits->max_segment_size = limits->zql_max_segment_size; 1173 qlimits->io_opt = limits->zql_io_opt; 1174 qlimits->physical_block_size = limits->zql_physical_block_size; 1175 qlimits->max_discard_sectors = limits->zql_max_discard_sectors; 1176 qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors; 1177 qlimits->discard_granularity = limits->zql_discard_granularity; 1178 #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES 1179 qlimits->features = 1180 BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT; 1181 #endif 1182 } 1183 #endif 1184 1185 static void 1186 zvol_queue_limits_apply(zvol_queue_limits_t *limits, 1187 struct request_queue *queue) 1188 { 1189 #ifndef HAVE_BLK_ALLOC_DISK_2ARG 1190 blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors); 1191 blk_queue_max_segments(queue, limits->zql_max_segments); 1192 blk_queue_max_segment_size(queue, limits->zql_max_segment_size); 1193 blk_queue_io_opt(queue, limits->zql_io_opt); 1194 blk_queue_physical_block_size(queue, limits->zql_physical_block_size); 1195 blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors); 1196 blk_queue_discard_granularity(queue, limits->zql_discard_granularity); 1197 #endif 1198 #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES 1199 blk_queue_set_write_cache(queue, B_TRUE); 1200 blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue); 1201 #endif 1202 } 1203 1204 static int 1205 zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) 1206 { 1207 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) 1208 #if defined(HAVE_BLK_ALLOC_DISK) 1209 zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); 1210 if (zso->zvo_disk == NULL) 1211 return (1); 1212 1213 zso->zvo_disk->minors = ZVOL_MINORS; 1214 zso->zvo_queue = zso->zvo_disk->queue; 1215 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) 1216 struct queue_limits qlimits; 1217 zvol_queue_limits_convert(limits, &qlimits); 1218 struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE); 1219 if (IS_ERR(disk)) { 1220 zso->zvo_disk = NULL; 1221 return (1); 1222 } 1223 1224 zso->zvo_disk = disk; 1225 zso->zvo_disk->minors = ZVOL_MINORS; 1226 zso->zvo_queue = zso->zvo_disk->queue; 1227 1228 #else 1229 zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); 1230 if (zso->zvo_queue == NULL) 1231 return (1); 1232 1233 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1234 if (zso->zvo_disk == NULL) { 1235 blk_cleanup_queue(zso->zvo_queue); 1236 return (1); 1237 } 1238 1239 zso->zvo_disk->queue = zso->zvo_queue; 1240 #endif /* HAVE_BLK_ALLOC_DISK */ 1241 #else 1242 zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); 1243 if (zso->zvo_queue == NULL) 1244 return (1); 1245 1246 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1247 if (zso->zvo_disk == NULL) { 1248 blk_cleanup_queue(zso->zvo_queue); 1249 return (1); 1250 } 1251 1252 zso->zvo_disk->queue = zso->zvo_queue; 1253 #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ 1254 1255 zvol_queue_limits_apply(limits, zso->zvo_queue); 1256 1257 return (0); 1258 1259 } 1260 1261 static int 1262 zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) 1263 { 1264 struct zvol_state_os *zso = zv->zv_zso; 1265 1266 /* Allocate our blk-mq tag_set */ 1267 if (zvol_blk_mq_alloc_tag_set(zv) != 0) 1268 return (1); 1269 1270 #if defined(HAVE_BLK_ALLOC_DISK) 1271 zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); 1272 if (zso->zvo_disk == NULL) { 1273 blk_mq_free_tag_set(&zso->tag_set); 1274 return (1); 1275 } 1276 zso->zvo_queue = zso->zvo_disk->queue; 1277 zso->zvo_disk->minors = ZVOL_MINORS; 1278 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) 1279 struct queue_limits qlimits; 1280 zvol_queue_limits_convert(limits, &qlimits); 1281 struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv); 1282 if (IS_ERR(disk)) { 1283 zso->zvo_disk = NULL; 1284 blk_mq_free_tag_set(&zso->tag_set); 1285 return (1); 1286 } 1287 1288 zso->zvo_disk = disk; 1289 zso->zvo_queue = zso->zvo_disk->queue; 1290 zso->zvo_disk->minors = ZVOL_MINORS; 1291 #else 1292 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1293 if (zso->zvo_disk == NULL) { 1294 blk_cleanup_queue(zso->zvo_queue); 1295 blk_mq_free_tag_set(&zso->tag_set); 1296 return (1); 1297 } 1298 /* Allocate queue */ 1299 zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); 1300 if (IS_ERR(zso->zvo_queue)) { 1301 blk_mq_free_tag_set(&zso->tag_set); 1302 return (1); 1303 } 1304 1305 /* Our queue is now created, assign it to our disk */ 1306 zso->zvo_disk->queue = zso->zvo_queue; 1307 #endif 1308 1309 zvol_queue_limits_apply(limits, zso->zvo_queue); 1310 1311 return (0); 1312 } 1313 1314 /* 1315 * Allocate memory for a new zvol_state_t and setup the required 1316 * request queue and generic disk structures for the block device. 1317 */ 1318 static int 1319 zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize, 1320 zvol_state_t **zvp) 1321 { 1322 zvol_state_t *zv; 1323 struct zvol_state_os *zso; 1324 uint64_t volmode; 1325 int ret; 1326 1327 ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL); 1328 if (ret) 1329 return (ret); 1330 1331 if (volmode == ZFS_VOLMODE_DEFAULT) 1332 volmode = zvol_volmode; 1333 1334 if (volmode == ZFS_VOLMODE_NONE) 1335 return (0); 1336 1337 zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); 1338 zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); 1339 zv->zv_zso = zso; 1340 zv->zv_volmode = volmode; 1341 zv->zv_volsize = volsize; 1342 zv->zv_volblocksize = volblocksize; 1343 1344 list_link_init(&zv->zv_next); 1345 mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); 1346 cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); 1347 1348 zv->zv_zso->use_blk_mq = zvol_use_blk_mq; 1349 1350 zvol_queue_limits_t limits; 1351 zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq); 1352 1353 /* 1354 * The block layer has 3 interfaces for getting BIOs: 1355 * 1356 * 1. blk-mq request queues (new) 1357 * 2. submit_bio() (oldest) 1358 * 3. regular request queues (old). 1359 * 1360 * Each of those interfaces has two permutations: 1361 * 1362 * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates 1363 * both the disk and its queue (5.14 kernel or newer) 1364 * 1365 * b) We don't have blk_*alloc_disk(), and have to allocate the 1366 * disk and the queue separately. (5.13 kernel or older) 1367 */ 1368 if (zv->zv_zso->use_blk_mq) { 1369 ret = zvol_alloc_blk_mq(zv, &limits); 1370 if (ret != 0) 1371 goto out_kmem; 1372 zso->zvo_disk->fops = &zvol_ops_blk_mq; 1373 } else { 1374 ret = zvol_alloc_non_blk_mq(zso, &limits); 1375 if (ret != 0) 1376 goto out_kmem; 1377 zso->zvo_disk->fops = &zvol_ops; 1378 } 1379 1380 /* Limit read-ahead to a single page to prevent over-prefetching. */ 1381 blk_queue_set_read_ahead(zso->zvo_queue, 1); 1382 1383 if (!zv->zv_zso->use_blk_mq) { 1384 /* Disable write merging in favor of the ZIO pipeline. */ 1385 blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); 1386 } 1387 1388 zso->zvo_queue->queuedata = zv; 1389 zso->zvo_dev = dev; 1390 zv->zv_open_count = 0; 1391 strlcpy(zv->zv_name, name, sizeof (zv->zv_name)); 1392 1393 zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); 1394 rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); 1395 1396 zso->zvo_disk->major = zvol_major; 1397 zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; 1398 1399 /* 1400 * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. 1401 * This is accomplished by limiting the number of minors for the 1402 * device to one and explicitly disabling partition scanning. 1403 */ 1404 if (volmode == ZFS_VOLMODE_DEV) { 1405 zso->zvo_disk->minors = 1; 1406 zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT; 1407 zso->zvo_disk->flags |= GENHD_FL_NO_PART; 1408 } 1409 1410 zso->zvo_disk->first_minor = (dev & MINORMASK); 1411 zso->zvo_disk->private_data = zv; 1412 snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", 1413 ZVOL_DEV_NAME, (dev & MINORMASK)); 1414 1415 *zvp = zv; 1416 return (ret); 1417 1418 out_kmem: 1419 kmem_free(zso, sizeof (struct zvol_state_os)); 1420 kmem_free(zv, sizeof (zvol_state_t)); 1421 return (ret); 1422 } 1423 1424 void 1425 zvol_os_remove_minor(zvol_state_t *zv) 1426 { 1427 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1428 ASSERT0(zv->zv_open_count); 1429 ASSERT0(atomic_read(&zv->zv_suspend_ref)); 1430 ASSERT(zv->zv_flags & ZVOL_REMOVING); 1431 1432 struct zvol_state_os *zso = zv->zv_zso; 1433 zv->zv_zso = NULL; 1434 1435 /* Clearing private_data will make new callers return immediately. */ 1436 atomic_store_ptr(&zso->zvo_disk->private_data, NULL); 1437 1438 /* 1439 * Drop the state lock before calling del_gendisk(). There may be 1440 * callers waiting to acquire it, but del_gendisk() will block until 1441 * they exit, which would deadlock. 1442 */ 1443 mutex_exit(&zv->zv_state_lock); 1444 1445 del_gendisk(zso->zvo_disk); 1446 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ 1447 (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) 1448 #if defined(HAVE_BLK_CLEANUP_DISK) 1449 blk_cleanup_disk(zso->zvo_disk); 1450 #else 1451 put_disk(zso->zvo_disk); 1452 #endif 1453 #else 1454 blk_cleanup_queue(zso->zvo_queue); 1455 put_disk(zso->zvo_disk); 1456 #endif 1457 1458 if (zso->use_blk_mq) 1459 blk_mq_free_tag_set(&zso->tag_set); 1460 1461 ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS); 1462 1463 kmem_free(zso, sizeof (struct zvol_state_os)); 1464 1465 mutex_enter(&zv->zv_state_lock); 1466 } 1467 1468 void 1469 zvol_os_free(zvol_state_t *zv) 1470 { 1471 1472 ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); 1473 ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); 1474 ASSERT0(zv->zv_open_count); 1475 ASSERT0P(zv->zv_zso); 1476 1477 ASSERT0P(zv->zv_objset); 1478 ASSERT0P(zv->zv_zilog); 1479 ASSERT0P(zv->zv_dn); 1480 1481 rw_destroy(&zv->zv_suspend_lock); 1482 zfs_rangelock_fini(&zv->zv_rangelock); 1483 1484 cv_destroy(&zv->zv_removing_cv); 1485 mutex_destroy(&zv->zv_state_lock); 1486 dataset_kstats_destroy(&zv->zv_kstat); 1487 1488 kmem_free(zv, sizeof (zvol_state_t)); 1489 } 1490 1491 void 1492 zvol_wait_close(zvol_state_t *zv) 1493 { 1494 } 1495 1496 struct add_disk_work { 1497 struct delayed_work work; 1498 struct gendisk *disk; 1499 int error; 1500 }; 1501 1502 static int 1503 __zvol_os_add_disk(struct gendisk *disk) 1504 { 1505 int error = 0; 1506 #ifdef HAVE_ADD_DISK_RET 1507 error = -add_disk(disk); 1508 if (error) 1509 error = SET_ERROR(error); 1510 #else 1511 add_disk(disk); 1512 #endif 1513 return (error); 1514 } 1515 1516 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) 1517 static void 1518 zvol_os_add_disk_work(struct work_struct *work) 1519 { 1520 struct add_disk_work *add_disk_work; 1521 add_disk_work = container_of(work, struct add_disk_work, work.work); 1522 add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk); 1523 } 1524 #endif 1525 1526 /* 1527 * SPECIAL CASE: 1528 * 1529 * This function basically calls add_disk() from a workqueue. You may be 1530 * thinking: why not just call add_disk() directly? 1531 * 1532 * When you call add_disk(), the zvol appears to the world. When this happens, 1533 * the kernel calls disk_scan_partitions() on the zvol, which behaves 1534 * differently on the 6.9+ kernels: 1535 * 1536 * - 6.8 and older kernels - 1537 * disk_scan_partitions() 1538 * handle = bdev_open_by_dev( 1539 * zvol_open() 1540 * bdev_release(handle); 1541 * zvol_release() 1542 * 1543 * 1544 * - 6.9+ kernels - 1545 * disk_scan_partitions() 1546 * file = bdev_file_open_by_dev() 1547 * zvol_open() 1548 * fput(file) 1549 * < wait for return to userspace > 1550 * zvol_release() 1551 * 1552 * The difference is that the bdev_release() from the 6.8 kernel is synchronous 1553 * while the fput() from the 6.9 kernel is async. Or more specifically it's 1554 * async that has to wait until we return to userspace (since it adds the fput 1555 * into the caller's work queue with the TWA_RESUME flag set). This is not the 1556 * behavior we want, since we want do things like create+destroy a zvol within 1557 * a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release the 1558 * reference to the zvol while we're in the IOCTL, which can't wait until we 1559 * return to userspace. 1560 * 1561 * We can get around this since fput() has a special codepath for when it's 1562 * running in a kernel thread or interrupt. In those cases, it just puts the 1563 * fput into the system workqueue, which we can force to run with 1564 * __flush_workqueue(). That is why we call add_disk() from a workqueue - so it 1565 * run from a kernel thread and "tricks" the fput() codepaths. 1566 * 1567 * Note that __flush_workqueue() is slowly getting deprecated. This may be ok 1568 * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via 1569 * fput) to happen, which it eventually, naturally, will from the system_wq 1570 * without us explicitly calling __flush_workqueue(). 1571 */ 1572 static int 1573 zvol_os_add_disk(struct gendisk *disk) 1574 { 1575 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) /* 6.9+ kernel */ 1576 struct add_disk_work add_disk_work; 1577 1578 INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work); 1579 add_disk_work.disk = disk; 1580 add_disk_work.error = 0; 1581 1582 /* Use *_delayed_work functions since they're not GPL'd */ 1583 schedule_delayed_work(&add_disk_work.work, 0); 1584 flush_delayed_work(&add_disk_work.work); 1585 1586 __flush_workqueue(system_wq); 1587 return (add_disk_work.error); 1588 #else /* <= 6.8 kernel */ 1589 return (__zvol_os_add_disk(disk)); 1590 #endif 1591 } 1592 1593 /* 1594 * Create a block device minor node and setup the linkage between it 1595 * and the specified volume. Once this function returns the block 1596 * device is live and ready for use. 1597 */ 1598 int 1599 zvol_os_create_minor(const char *name) 1600 { 1601 zvol_state_t *zv = NULL; 1602 objset_t *os; 1603 dmu_object_info_t *doi; 1604 uint64_t volsize; 1605 uint64_t len; 1606 unsigned minor = 0; 1607 int error = 0; 1608 int idx; 1609 uint64_t hash = zvol_name_hash(name); 1610 uint64_t volthreading; 1611 bool replayed_zil = B_FALSE; 1612 1613 if (zvol_inhibit_dev) 1614 return (0); 1615 1616 idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); 1617 if (idx < 0) 1618 return (SET_ERROR(-idx)); 1619 minor = idx << ZVOL_MINOR_BITS; 1620 if (MINOR(minor) != minor) { 1621 /* too many partitions can cause an overflow */ 1622 zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", 1623 name, minor, MINOR(minor)); 1624 ida_simple_remove(&zvol_ida, idx); 1625 return (SET_ERROR(EINVAL)); 1626 } 1627 1628 zv = zvol_find_by_name_hash(name, hash, RW_NONE); 1629 if (zv) { 1630 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1631 mutex_exit(&zv->zv_state_lock); 1632 ida_simple_remove(&zvol_ida, idx); 1633 return (SET_ERROR(EEXIST)); 1634 } 1635 1636 doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); 1637 1638 error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); 1639 if (error) 1640 goto out_doi; 1641 1642 error = dmu_object_info(os, ZVOL_OBJ, doi); 1643 if (error) 1644 goto out_dmu_objset_disown; 1645 1646 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); 1647 if (error) 1648 goto out_dmu_objset_disown; 1649 1650 error = zvol_alloc(MKDEV(zvol_major, minor), name, 1651 volsize, doi->doi_data_block_size, &zv); 1652 if (error || zv == NULL) 1653 goto out_dmu_objset_disown; 1654 1655 zv->zv_hash = hash; 1656 1657 if (dmu_objset_is_snapshot(os)) 1658 zv->zv_flags |= ZVOL_RDONLY; 1659 1660 zv->zv_objset = os; 1661 1662 /* Default */ 1663 zv->zv_threading = B_TRUE; 1664 if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL) 1665 == 0) 1666 zv->zv_threading = volthreading; 1667 1668 set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); 1669 1670 #ifdef QUEUE_FLAG_DISCARD 1671 blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); 1672 #endif 1673 #ifdef QUEUE_FLAG_NONROT 1674 blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); 1675 #endif 1676 #ifdef QUEUE_FLAG_ADD_RANDOM 1677 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); 1678 #endif 1679 /* This flag was introduced in kernel version 4.12. */ 1680 #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH 1681 blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); 1682 #endif 1683 1684 ASSERT0P(zv->zv_kstat.dk_kstats); 1685 error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); 1686 if (error) 1687 goto out_dmu_objset_disown; 1688 ASSERT0P(zv->zv_zilog); 1689 zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); 1690 if (spa_writeable(dmu_objset_spa(os))) { 1691 if (zil_replay_disable) 1692 replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); 1693 else 1694 replayed_zil = zil_replay(os, zv, zvol_replay_vector); 1695 } 1696 if (replayed_zil) 1697 zil_close(zv->zv_zilog); 1698 zv->zv_zilog = NULL; 1699 1700 /* 1701 * When udev detects the addition of the device it will immediately 1702 * invoke blkid(8) to determine the type of content on the device. 1703 * Prefetching the blocks commonly scanned by blkid(8) will speed 1704 * up this process. 1705 */ 1706 len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); 1707 if (len > 0) { 1708 dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); 1709 dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, 1710 ZIO_PRIORITY_SYNC_READ); 1711 } 1712 1713 zv->zv_objset = NULL; 1714 out_dmu_objset_disown: 1715 dmu_objset_disown(os, B_TRUE, FTAG); 1716 out_doi: 1717 kmem_free(doi, sizeof (dmu_object_info_t)); 1718 1719 /* 1720 * Keep in mind that once add_disk() is called, the zvol is 1721 * announced to the world, and zvol_open()/zvol_release() can 1722 * be called at any time. Incidentally, add_disk() itself calls 1723 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() 1724 * directly as well. 1725 */ 1726 if (error == 0 && zv) { 1727 rw_enter(&zvol_state_lock, RW_WRITER); 1728 zvol_insert(zv); 1729 rw_exit(&zvol_state_lock); 1730 error = zvol_os_add_disk(zv->zv_zso->zvo_disk); 1731 } else { 1732 ida_simple_remove(&zvol_ida, idx); 1733 } 1734 1735 return (error); 1736 } 1737 1738 int 1739 zvol_os_rename_minor(zvol_state_t *zv, const char *newname) 1740 { 1741 int readonly = get_disk_ro(zv->zv_zso->zvo_disk); 1742 1743 ASSERT(RW_LOCK_HELD(&zvol_state_lock)); 1744 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1745 1746 strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); 1747 1748 /* move to new hashtable entry */ 1749 zv->zv_hash = zvol_name_hash(newname); 1750 hlist_del(&zv->zv_hlink); 1751 hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); 1752 1753 /* 1754 * The block device's read-only state is briefly changed causing 1755 * a KOBJ_CHANGE uevent to be issued. This ensures udev detects 1756 * the name change and fixes the symlinks. This does not change 1757 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never 1758 * changes. This would normally be done using kobject_uevent() but 1759 * that is a GPL-only symbol which is why we need this workaround. 1760 */ 1761 set_disk_ro(zv->zv_zso->zvo_disk, !readonly); 1762 set_disk_ro(zv->zv_zso->zvo_disk, readonly); 1763 1764 dataset_kstats_rename(&zv->zv_kstat, newname); 1765 1766 return (0); 1767 } 1768 1769 void 1770 zvol_os_set_disk_ro(zvol_state_t *zv, int flags) 1771 { 1772 1773 set_disk_ro(zv->zv_zso->zvo_disk, flags); 1774 } 1775 1776 void 1777 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) 1778 { 1779 1780 set_capacity(zv->zv_zso->zvo_disk, capacity); 1781 } 1782 1783 int 1784 zvol_init(void) 1785 { 1786 int error; 1787 1788 error = zvol_init_impl(); 1789 if (error) { 1790 printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error); 1791 return (error); 1792 } 1793 1794 error = -register_blkdev(zvol_major, ZVOL_DRIVER); 1795 if (error) { 1796 printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); 1797 return (SET_ERROR(error)); 1798 } 1799 1800 if (zvol_blk_mq_queue_depth == 0) { 1801 zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; 1802 } else { 1803 zvol_actual_blk_mq_queue_depth = 1804 MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); 1805 } 1806 1807 if (zvol_blk_mq_threads == 0) { 1808 zvol_blk_mq_actual_threads = num_online_cpus(); 1809 } else { 1810 zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1), 1811 1024); 1812 } 1813 1814 ida_init(&zvol_ida); 1815 return (0); 1816 } 1817 1818 void 1819 zvol_fini(void) 1820 { 1821 unregister_blkdev(zvol_major, ZVOL_DRIVER); 1822 1823 zvol_fini_impl(); 1824 1825 ida_destroy(&zvol_ida); 1826 } 1827 1828 module_param(zvol_major, uint, 0444); 1829 MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); 1830 1831 module_param(zvol_max_discard_blocks, ulong, 0444); 1832 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); 1833 1834 module_param(zvol_blk_mq_queue_depth, uint, 0644); 1835 MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); 1836 1837 module_param(zvol_use_blk_mq, uint, 0644); 1838 MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); 1839 1840 module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); 1841 MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, 1842 "Process volblocksize blocks per thread"); 1843 1844 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 1845 module_param(zvol_open_timeout_ms, uint, 0644); 1846 MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); 1847 #endif 1848