1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 2012, 2020 by Delphix. All rights reserved. 24 * Copyright (c) 2024, Rob Norris <robn@despairlabs.com> 25 * Copyright (c) 2024, Klara, Inc. 26 */ 27 28 #include <sys/dataset_kstats.h> 29 #include <sys/dbuf.h> 30 #include <sys/dmu_traverse.h> 31 #include <sys/dsl_dataset.h> 32 #include <sys/dsl_prop.h> 33 #include <sys/dsl_dir.h> 34 #include <sys/zap.h> 35 #include <sys/zfeature.h> 36 #include <sys/zil_impl.h> 37 #include <sys/dmu_tx.h> 38 #include <sys/zio.h> 39 #include <sys/zfs_rlock.h> 40 #include <sys/spa_impl.h> 41 #include <sys/zvol.h> 42 #include <sys/zvol_impl.h> 43 #include <cityhash.h> 44 45 #include <linux/blkdev_compat.h> 46 #include <linux/task_io_accounting_ops.h> 47 #include <linux/workqueue.h> 48 #include <linux/blk-mq.h> 49 50 static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, 51 struct request *rq, boolean_t force_sync); 52 53 static unsigned int zvol_major = ZVOL_MAJOR; 54 static unsigned long zvol_max_discard_blocks = 16384; 55 56 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 57 static unsigned int zvol_open_timeout_ms = 1000; 58 #endif 59 60 static unsigned int zvol_blk_mq_threads = 0; 61 static unsigned int zvol_blk_mq_actual_threads; 62 static boolean_t zvol_use_blk_mq = B_FALSE; 63 64 /* 65 * The maximum number of volblocksize blocks to process per thread. Typically, 66 * write heavy workloads preform better with higher values here, and read 67 * heavy workloads preform better with lower values, but that's not a hard 68 * and fast rule. It's basically a knob to tune between "less overhead with 69 * less parallelism" and "more overhead, but more parallelism". 70 * 71 * '8' was chosen as a reasonable, balanced, default based off of sequential 72 * read and write tests to a zvol in an NVMe pool (with 16 CPUs). 73 */ 74 static unsigned int zvol_blk_mq_blocks_per_thread = 8; 75 76 #ifndef BLKDEV_DEFAULT_RQ 77 /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ 78 #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ 79 #endif 80 81 /* 82 * Finalize our BIO or request. 83 */ 84 static inline void 85 zvol_end_io(struct bio *bio, struct request *rq, int error) 86 { 87 ASSERT3U(error, >=, 0); 88 if (bio) { 89 bio->bi_status = errno_to_bi_status(error); 90 bio_endio(bio); 91 } else { 92 blk_mq_end_request(rq, errno_to_bi_status(error)); 93 } 94 } 95 96 static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; 97 static unsigned int zvol_actual_blk_mq_queue_depth; 98 99 struct zvol_state_os { 100 struct gendisk *zvo_disk; /* generic disk */ 101 struct request_queue *zvo_queue; /* request queue */ 102 dev_t zvo_dev; /* device id */ 103 104 struct blk_mq_tag_set tag_set; 105 106 /* Set from the global 'zvol_use_blk_mq' at zvol load */ 107 boolean_t use_blk_mq; 108 }; 109 110 static struct ida zvol_ida; 111 112 /* 113 * This is called when a new block multiqueue request comes in. A request 114 * contains one or more BIOs. 115 */ 116 static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, 117 const struct blk_mq_queue_data *bd) 118 { 119 struct request *rq = bd->rq; 120 zvol_state_t *zv = rq->q->queuedata; 121 122 /* Tell the kernel that we are starting to process this request */ 123 blk_mq_start_request(rq); 124 125 if (blk_rq_is_passthrough(rq)) { 126 /* Skip non filesystem request */ 127 blk_mq_end_request(rq, BLK_STS_IOERR); 128 return (BLK_STS_IOERR); 129 } 130 131 zvol_request_impl(zv, NULL, rq, 0); 132 133 /* Acknowledge to the kernel that we got this request */ 134 return (BLK_STS_OK); 135 } 136 137 static struct blk_mq_ops zvol_blk_mq_queue_ops = { 138 .queue_rq = zvol_mq_queue_rq, 139 }; 140 141 /* Initialize our blk-mq struct */ 142 static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) 143 { 144 struct zvol_state_os *zso = zv->zv_zso; 145 146 memset(&zso->tag_set, 0, sizeof (zso->tag_set)); 147 148 /* Initialize tag set. */ 149 zso->tag_set.ops = &zvol_blk_mq_queue_ops; 150 zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads; 151 zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; 152 zso->tag_set.numa_node = NUMA_NO_NODE; 153 zso->tag_set.cmd_size = 0; 154 155 /* 156 * We need BLK_MQ_F_BLOCKING here since we do blocking calls in 157 * zvol_request_impl() 158 */ 159 zso->tag_set.flags = BLK_MQ_F_BLOCKING; 160 161 #ifdef BLK_MQ_F_SHOULD_MERGE 162 /* 163 * Linux 6.14 removed BLK_MQ_F_SHOULD_MERGE and made it implicit. 164 * For older kernels, we set it. 165 */ 166 zso->tag_set.flags |= BLK_MQ_F_SHOULD_MERGE; 167 #endif 168 169 zso->tag_set.driver_data = zv; 170 171 return (blk_mq_alloc_tag_set(&zso->tag_set)); 172 } 173 174 /* 175 * Given a path, return TRUE if path is a ZVOL. 176 */ 177 boolean_t 178 zvol_os_is_zvol(const char *path) 179 { 180 dev_t dev = 0; 181 182 if (vdev_lookup_bdev(path, &dev) != 0) 183 return (B_FALSE); 184 185 if (MAJOR(dev) == zvol_major) 186 return (B_TRUE); 187 188 return (B_FALSE); 189 } 190 191 static void 192 zvol_write(zv_request_t *zvr) 193 { 194 struct bio *bio = zvr->bio; 195 struct request *rq = zvr->rq; 196 int error = 0; 197 zfs_uio_t uio; 198 zvol_state_t *zv = zvr->zv; 199 struct request_queue *q; 200 struct gendisk *disk; 201 unsigned long start_time = 0; 202 boolean_t acct = B_FALSE; 203 204 ASSERT3P(zv, !=, NULL); 205 ASSERT3U(zv->zv_open_count, >, 0); 206 ASSERT3P(zv->zv_zilog, !=, NULL); 207 208 q = zv->zv_zso->zvo_queue; 209 disk = zv->zv_zso->zvo_disk; 210 211 /* bio marked as FLUSH need to flush before write */ 212 if (io_is_flush(bio, rq)) { 213 error = zil_commit(zv->zv_zilog, ZVOL_OBJ); 214 if (error != 0) { 215 rw_exit(&zv->zv_suspend_lock); 216 zvol_end_io(bio, rq, -error); 217 return; 218 } 219 } 220 221 /* Some requests are just for flush and nothing else. */ 222 if (io_size(bio, rq) == 0) { 223 rw_exit(&zv->zv_suspend_lock); 224 zvol_end_io(bio, rq, 0); 225 return; 226 } 227 228 zfs_uio_bvec_init(&uio, bio, rq); 229 230 ssize_t start_resid = uio.uio_resid; 231 232 /* 233 * With use_blk_mq, accounting is done by blk_mq_start_request() 234 * and blk_mq_end_request(), so we can skip it here. 235 */ 236 if (bio) { 237 acct = blk_queue_io_stat(q); 238 if (acct) { 239 start_time = blk_generic_start_io_acct(q, disk, WRITE, 240 bio); 241 } 242 } 243 244 boolean_t sync = 245 io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; 246 247 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 248 uio.uio_loffset, uio.uio_resid, RL_WRITER); 249 250 uint64_t volsize = zv->zv_volsize; 251 while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { 252 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); 253 uint64_t off = uio.uio_loffset; 254 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); 255 256 if (bytes > volsize - off) /* don't write past the end */ 257 bytes = volsize - off; 258 259 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); 260 261 /* This will only fail for ENOSPC */ 262 error = dmu_tx_assign(tx, DMU_TX_WAIT); 263 if (error) { 264 dmu_tx_abort(tx); 265 break; 266 } 267 error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx, 268 DMU_READ_PREFETCH); 269 if (error == 0) { 270 zvol_log_write(zv, tx, off, bytes, sync); 271 } 272 dmu_tx_commit(tx); 273 274 if (error) 275 break; 276 } 277 zfs_rangelock_exit(lr); 278 279 int64_t nwritten = start_resid - uio.uio_resid; 280 dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); 281 task_io_account_write(nwritten); 282 283 if (error == 0 && sync) 284 error = zil_commit(zv->zv_zilog, ZVOL_OBJ); 285 286 rw_exit(&zv->zv_suspend_lock); 287 288 if (bio && acct) { 289 blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); 290 } 291 292 zvol_end_io(bio, rq, error); 293 } 294 295 static void 296 zvol_write_task(void *arg) 297 { 298 zv_request_task_t *task = arg; 299 zvol_write(&task->zvr); 300 zv_request_task_free(task); 301 } 302 303 static void 304 zvol_discard(zv_request_t *zvr) 305 { 306 struct bio *bio = zvr->bio; 307 struct request *rq = zvr->rq; 308 zvol_state_t *zv = zvr->zv; 309 uint64_t start = io_offset(bio, rq); 310 uint64_t size = io_size(bio, rq); 311 uint64_t end = start + size; 312 boolean_t sync; 313 int error = 0; 314 dmu_tx_t *tx; 315 struct request_queue *q = zv->zv_zso->zvo_queue; 316 struct gendisk *disk = zv->zv_zso->zvo_disk; 317 unsigned long start_time = 0; 318 boolean_t acct = B_FALSE; 319 320 ASSERT3P(zv, !=, NULL); 321 ASSERT3U(zv->zv_open_count, >, 0); 322 ASSERT3P(zv->zv_zilog, !=, NULL); 323 324 if (bio) { 325 acct = blk_queue_io_stat(q); 326 if (acct) { 327 start_time = blk_generic_start_io_acct(q, disk, WRITE, 328 bio); 329 } 330 } 331 332 sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; 333 334 if (end > zv->zv_volsize) { 335 error = SET_ERROR(EIO); 336 goto unlock; 337 } 338 339 /* 340 * Align the request to volume block boundaries when a secure erase is 341 * not required. This will prevent dnode_free_range() from zeroing out 342 * the unaligned parts which is slow (read-modify-write) and useless 343 * since we are not freeing any space by doing so. 344 */ 345 if (!io_is_secure_erase(bio, rq)) { 346 start = P2ROUNDUP(start, zv->zv_volblocksize); 347 end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); 348 size = end - start; 349 } 350 351 if (start >= end) 352 goto unlock; 353 354 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 355 start, size, RL_WRITER); 356 357 tx = dmu_tx_create(zv->zv_objset); 358 dmu_tx_mark_netfree(tx); 359 error = dmu_tx_assign(tx, DMU_TX_WAIT); 360 if (error != 0) { 361 dmu_tx_abort(tx); 362 } else { 363 zvol_log_truncate(zv, tx, start, size); 364 dmu_tx_commit(tx); 365 error = dmu_free_long_range(zv->zv_objset, 366 ZVOL_OBJ, start, size); 367 } 368 zfs_rangelock_exit(lr); 369 370 if (error == 0 && sync) 371 error = zil_commit(zv->zv_zilog, ZVOL_OBJ); 372 373 unlock: 374 rw_exit(&zv->zv_suspend_lock); 375 376 if (bio && acct) { 377 blk_generic_end_io_acct(q, disk, WRITE, bio, 378 start_time); 379 } 380 381 zvol_end_io(bio, rq, error); 382 } 383 384 static void 385 zvol_discard_task(void *arg) 386 { 387 zv_request_task_t *task = arg; 388 zvol_discard(&task->zvr); 389 zv_request_task_free(task); 390 } 391 392 static void 393 zvol_read(zv_request_t *zvr) 394 { 395 struct bio *bio = zvr->bio; 396 struct request *rq = zvr->rq; 397 int error = 0; 398 zfs_uio_t uio; 399 boolean_t acct = B_FALSE; 400 zvol_state_t *zv = zvr->zv; 401 struct request_queue *q; 402 struct gendisk *disk; 403 unsigned long start_time = 0; 404 405 ASSERT3P(zv, !=, NULL); 406 ASSERT3U(zv->zv_open_count, >, 0); 407 408 zfs_uio_bvec_init(&uio, bio, rq); 409 410 q = zv->zv_zso->zvo_queue; 411 disk = zv->zv_zso->zvo_disk; 412 413 ssize_t start_resid = uio.uio_resid; 414 415 /* 416 * When blk-mq is being used, accounting is done by 417 * blk_mq_start_request() and blk_mq_end_request(). 418 */ 419 if (bio) { 420 acct = blk_queue_io_stat(q); 421 if (acct) 422 start_time = blk_generic_start_io_acct(q, disk, READ, 423 bio); 424 } 425 426 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 427 uio.uio_loffset, uio.uio_resid, RL_READER); 428 429 uint64_t volsize = zv->zv_volsize; 430 431 while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { 432 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); 433 434 /* don't read past the end */ 435 if (bytes > volsize - uio.uio_loffset) 436 bytes = volsize - uio.uio_loffset; 437 438 error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes, 439 DMU_READ_PREFETCH); 440 if (error) { 441 /* convert checksum errors into IO errors */ 442 if (error == ECKSUM) 443 error = SET_ERROR(EIO); 444 break; 445 } 446 } 447 zfs_rangelock_exit(lr); 448 449 int64_t nread = start_resid - uio.uio_resid; 450 dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); 451 task_io_account_read(nread); 452 453 rw_exit(&zv->zv_suspend_lock); 454 455 if (bio && acct) { 456 blk_generic_end_io_acct(q, disk, READ, bio, start_time); 457 } 458 459 zvol_end_io(bio, rq, error); 460 } 461 462 static void 463 zvol_read_task(void *arg) 464 { 465 zv_request_task_t *task = arg; 466 zvol_read(&task->zvr); 467 zv_request_task_free(task); 468 } 469 470 471 /* 472 * Process a BIO or request 473 * 474 * Either 'bio' or 'rq' should be set depending on if we are processing a 475 * bio or a request (both should not be set). 476 * 477 * force_sync: Set to 0 to defer processing to a background taskq 478 * Set to 1 to process data synchronously 479 */ 480 static void 481 zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, 482 boolean_t force_sync) 483 { 484 fstrans_cookie_t cookie = spl_fstrans_mark(); 485 uint64_t offset = io_offset(bio, rq); 486 uint64_t size = io_size(bio, rq); 487 int rw = io_data_dir(bio, rq); 488 489 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { 490 zvol_end_io(bio, rq, SET_ERROR(ENXIO)); 491 goto out; 492 } 493 494 if (zvol_request_sync || zv->zv_threading == B_FALSE) 495 force_sync = 1; 496 497 zv_request_t zvr = { 498 .zv = zv, 499 .bio = bio, 500 .rq = rq, 501 }; 502 503 if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { 504 printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", 505 zv->zv_zso->zvo_disk->disk_name, 506 (long long unsigned)offset, 507 (long unsigned)size); 508 509 zvol_end_io(bio, rq, SET_ERROR(EIO)); 510 goto out; 511 } 512 513 zv_request_task_t *task; 514 zv_taskq_t *ztqs = &zvol_taskqs; 515 uint_t blk_mq_hw_queue = 0; 516 uint_t tq_idx; 517 uint_t taskq_hash; 518 if (rq) 519 #ifdef HAVE_BLK_MQ_RQ_HCTX 520 blk_mq_hw_queue = rq->mq_hctx->queue_num; 521 #else 522 blk_mq_hw_queue = rq->q->queue_hw_ctx[ 523 rq->q->mq_map[raw_smp_processor_id()]]->queue_num; 524 #endif 525 taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, 526 blk_mq_hw_queue); 527 tq_idx = taskq_hash % ztqs->tqs_cnt; 528 529 if (rw == WRITE) { 530 if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { 531 zvol_end_io(bio, rq, SET_ERROR(EROFS)); 532 goto out; 533 } 534 535 /* 536 * Prevents the zvol from being suspended, or the ZIL being 537 * concurrently opened. Will be released after the i/o 538 * completes. 539 */ 540 rw_enter(&zv->zv_suspend_lock, RW_READER); 541 542 /* 543 * Open a ZIL if this is the first time we have written to this 544 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather 545 * than zv_state_lock so that we don't need to acquire an 546 * additional lock in this path. 547 */ 548 if (zv->zv_zilog == NULL) { 549 rw_exit(&zv->zv_suspend_lock); 550 rw_enter(&zv->zv_suspend_lock, RW_WRITER); 551 if (zv->zv_zilog == NULL) { 552 zv->zv_zilog = zil_open(zv->zv_objset, 553 zvol_get_data, &zv->zv_kstat.dk_zil_sums); 554 zv->zv_flags |= ZVOL_WRITTEN_TO; 555 /* replay / destroy done in zvol_create_minor */ 556 VERIFY0((zv->zv_zilog->zl_header->zh_flags & 557 ZIL_REPLAY_NEEDED)); 558 } 559 rw_downgrade(&zv->zv_suspend_lock); 560 } 561 562 /* 563 * We don't want this thread to be blocked waiting for i/o to 564 * complete, so we instead wait from a taskq callback. The 565 * i/o may be a ZIL write (via zil_commit()), or a read of an 566 * indirect block, or a read of a data block (if this is a 567 * partial-block write). We will indicate that the i/o is 568 * complete by calling END_IO() from the taskq callback. 569 * 570 * This design allows the calling thread to continue and 571 * initiate more concurrent operations by calling 572 * zvol_request() again. There are typically only a small 573 * number of threads available to call zvol_request() (e.g. 574 * one per iSCSI target), so keeping the latency of 575 * zvol_request() low is important for performance. 576 * 577 * The zvol_request_sync module parameter allows this 578 * behavior to be altered, for performance evaluation 579 * purposes. If the callback blocks, setting 580 * zvol_request_sync=1 will result in much worse performance. 581 * 582 * We can have up to zvol_threads concurrent i/o's being 583 * processed for all zvols on the system. This is typically 584 * a vast improvement over the zvol_request_sync=1 behavior 585 * of one i/o at a time per zvol. However, an even better 586 * design would be for zvol_request() to initiate the zio 587 * directly, and then be notified by the zio_done callback, 588 * which would call END_IO(). Unfortunately, the DMU/ZIL 589 * interfaces lack this functionality (they block waiting for 590 * the i/o to complete). 591 */ 592 if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { 593 if (force_sync) { 594 zvol_discard(&zvr); 595 } else { 596 task = zv_request_task_create(zvr); 597 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 598 zvol_discard_task, task, 0, &task->ent); 599 } 600 } else { 601 if (force_sync) { 602 zvol_write(&zvr); 603 } else { 604 task = zv_request_task_create(zvr); 605 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 606 zvol_write_task, task, 0, &task->ent); 607 } 608 } 609 } else { 610 /* 611 * The SCST driver, and possibly others, may issue READ I/Os 612 * with a length of zero bytes. These empty I/Os contain no 613 * data and require no additional handling. 614 */ 615 if (size == 0) { 616 zvol_end_io(bio, rq, 0); 617 goto out; 618 } 619 620 rw_enter(&zv->zv_suspend_lock, RW_READER); 621 622 /* See comment in WRITE case above. */ 623 if (force_sync) { 624 zvol_read(&zvr); 625 } else { 626 task = zv_request_task_create(zvr); 627 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 628 zvol_read_task, task, 0, &task->ent); 629 } 630 } 631 632 out: 633 spl_fstrans_unmark(cookie); 634 } 635 636 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 637 #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID 638 static void 639 zvol_submit_bio(struct bio *bio) 640 #else 641 static blk_qc_t 642 zvol_submit_bio(struct bio *bio) 643 #endif 644 #else 645 static MAKE_REQUEST_FN_RET 646 zvol_request(struct request_queue *q, struct bio *bio) 647 #endif 648 { 649 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 650 #if defined(HAVE_BIO_BDEV_DISK) 651 struct request_queue *q = bio->bi_bdev->bd_disk->queue; 652 #else 653 struct request_queue *q = bio->bi_disk->queue; 654 #endif 655 #endif 656 zvol_state_t *zv = q->queuedata; 657 658 zvol_request_impl(zv, bio, NULL, 0); 659 #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ 660 defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ 661 !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) 662 return (BLK_QC_T_NONE); 663 #endif 664 } 665 666 static int 667 #ifdef HAVE_BLK_MODE_T 668 zvol_open(struct gendisk *disk, blk_mode_t flag) 669 #else 670 zvol_open(struct block_device *bdev, fmode_t flag) 671 #endif 672 { 673 zvol_state_t *zv; 674 int error = 0; 675 boolean_t drop_suspend = B_FALSE; 676 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 677 hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); 678 hrtime_t start = gethrtime(); 679 680 retry: 681 #endif 682 rw_enter(&zvol_state_lock, RW_READER); 683 /* 684 * Obtain a copy of private_data under the zvol_state_lock to make 685 * sure that either the result of zvol free code path setting 686 * disk->private_data to NULL is observed, or zvol_os_free() 687 * is not called on this zv because of the positive zv_open_count. 688 */ 689 #ifdef HAVE_BLK_MODE_T 690 zv = disk->private_data; 691 #else 692 zv = bdev->bd_disk->private_data; 693 #endif 694 if (zv == NULL) { 695 rw_exit(&zvol_state_lock); 696 return (-SET_ERROR(ENXIO)); 697 } 698 699 mutex_enter(&zv->zv_state_lock); 700 701 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { 702 mutex_exit(&zv->zv_state_lock); 703 rw_exit(&zvol_state_lock); 704 return (-SET_ERROR(ENXIO)); 705 } 706 707 /* 708 * Make sure zvol is not suspended during first open 709 * (hold zv_suspend_lock) and respect proper lock acquisition 710 * ordering - zv_suspend_lock before zv_state_lock 711 */ 712 if (zv->zv_open_count == 0) { 713 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { 714 mutex_exit(&zv->zv_state_lock); 715 rw_enter(&zv->zv_suspend_lock, RW_READER); 716 mutex_enter(&zv->zv_state_lock); 717 /* check to see if zv_suspend_lock is needed */ 718 if (zv->zv_open_count != 0) { 719 rw_exit(&zv->zv_suspend_lock); 720 } else { 721 drop_suspend = B_TRUE; 722 } 723 } else { 724 drop_suspend = B_TRUE; 725 } 726 } 727 rw_exit(&zvol_state_lock); 728 729 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 730 731 if (zv->zv_open_count == 0) { 732 boolean_t drop_namespace = B_FALSE; 733 734 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); 735 736 /* 737 * In all other call paths the spa_namespace_lock is taken 738 * before the bdev->bd_mutex lock. However, on open(2) 739 * the __blkdev_get() function calls fops->open() with the 740 * bdev->bd_mutex lock held. This can result in a deadlock 741 * when zvols from one pool are used as vdevs in another. 742 * 743 * To prevent a lock inversion deadlock we preemptively 744 * take the spa_namespace_lock. Normally the lock will not 745 * be contended and this is safe because spa_open_common() 746 * handles the case where the caller already holds the 747 * spa_namespace_lock. 748 * 749 * When the lock cannot be aquired after multiple retries 750 * this must be the vdev on zvol deadlock case and we have 751 * no choice but to return an error. For 5.12 and older 752 * kernels returning -ERESTARTSYS will result in the 753 * bdev->bd_mutex being dropped, then reacquired, and 754 * fops->open() being called again. This process can be 755 * repeated safely until both locks are acquired. For 5.13 756 * and newer the -ERESTARTSYS retry logic was removed from 757 * the kernel so the only option is to return the error for 758 * the caller to handle it. 759 */ 760 if (!mutex_owned(&spa_namespace_lock)) { 761 if (!mutex_tryenter(&spa_namespace_lock)) { 762 mutex_exit(&zv->zv_state_lock); 763 rw_exit(&zv->zv_suspend_lock); 764 drop_suspend = B_FALSE; 765 766 #ifdef HAVE_BLKDEV_GET_ERESTARTSYS 767 schedule(); 768 return (-SET_ERROR(ERESTARTSYS)); 769 #else 770 if ((gethrtime() - start) > timeout) 771 return (-SET_ERROR(ERESTARTSYS)); 772 773 schedule_timeout_interruptible( 774 MSEC_TO_TICK(10)); 775 goto retry; 776 #endif 777 } else { 778 drop_namespace = B_TRUE; 779 } 780 } 781 782 error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); 783 784 if (drop_namespace) 785 mutex_exit(&spa_namespace_lock); 786 } 787 788 if (error == 0) { 789 if ((blk_mode_is_open_write(flag)) && 790 (zv->zv_flags & ZVOL_RDONLY)) { 791 if (zv->zv_open_count == 0) 792 zvol_last_close(zv); 793 794 error = -SET_ERROR(EROFS); 795 } else { 796 zv->zv_open_count++; 797 } 798 } 799 800 mutex_exit(&zv->zv_state_lock); 801 if (drop_suspend) 802 rw_exit(&zv->zv_suspend_lock); 803 804 if (error == 0) 805 #ifdef HAVE_BLK_MODE_T 806 disk_check_media_change(disk); 807 #else 808 zfs_check_media_change(bdev); 809 #endif 810 811 return (error); 812 } 813 814 static void 815 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG 816 zvol_release(struct gendisk *disk) 817 #else 818 zvol_release(struct gendisk *disk, fmode_t unused) 819 #endif 820 { 821 #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) 822 (void) unused; 823 #endif 824 zvol_state_t *zv; 825 boolean_t drop_suspend = B_TRUE; 826 827 rw_enter(&zvol_state_lock, RW_READER); 828 zv = disk->private_data; 829 830 mutex_enter(&zv->zv_state_lock); 831 ASSERT3U(zv->zv_open_count, >, 0); 832 /* 833 * make sure zvol is not suspended during last close 834 * (hold zv_suspend_lock) and respect proper lock acquisition 835 * ordering - zv_suspend_lock before zv_state_lock 836 */ 837 if (zv->zv_open_count == 1) { 838 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { 839 mutex_exit(&zv->zv_state_lock); 840 rw_enter(&zv->zv_suspend_lock, RW_READER); 841 mutex_enter(&zv->zv_state_lock); 842 /* check to see if zv_suspend_lock is needed */ 843 if (zv->zv_open_count != 1) { 844 rw_exit(&zv->zv_suspend_lock); 845 drop_suspend = B_FALSE; 846 } 847 } 848 } else { 849 drop_suspend = B_FALSE; 850 } 851 rw_exit(&zvol_state_lock); 852 853 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 854 855 zv->zv_open_count--; 856 if (zv->zv_open_count == 0) { 857 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); 858 zvol_last_close(zv); 859 } 860 861 mutex_exit(&zv->zv_state_lock); 862 863 if (drop_suspend) 864 rw_exit(&zv->zv_suspend_lock); 865 } 866 867 static int 868 zvol_ioctl(struct block_device *bdev, fmode_t mode, 869 unsigned int cmd, unsigned long arg) 870 { 871 zvol_state_t *zv = bdev->bd_disk->private_data; 872 int error = 0; 873 874 ASSERT3U(zv->zv_open_count, >, 0); 875 876 switch (cmd) { 877 case BLKFLSBUF: 878 #ifdef HAVE_FSYNC_BDEV 879 fsync_bdev(bdev); 880 #elif defined(HAVE_SYNC_BLOCKDEV) 881 sync_blockdev(bdev); 882 #else 883 #error "Neither fsync_bdev() nor sync_blockdev() found" 884 #endif 885 invalidate_bdev(bdev); 886 rw_enter(&zv->zv_suspend_lock, RW_READER); 887 888 if (!(zv->zv_flags & ZVOL_RDONLY)) 889 txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); 890 891 rw_exit(&zv->zv_suspend_lock); 892 break; 893 894 case BLKZNAME: 895 mutex_enter(&zv->zv_state_lock); 896 error = -copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); 897 mutex_exit(&zv->zv_state_lock); 898 if (error) 899 error = SET_ERROR(error); 900 break; 901 902 default: 903 error = SET_ERROR(ENOTTY); 904 break; 905 } 906 907 return (-error); 908 } 909 910 #ifdef CONFIG_COMPAT 911 static int 912 zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, 913 unsigned cmd, unsigned long arg) 914 { 915 return (zvol_ioctl(bdev, mode, cmd, arg)); 916 } 917 #else 918 #define zvol_compat_ioctl NULL 919 #endif 920 921 static unsigned int 922 zvol_check_events(struct gendisk *disk, unsigned int clearing) 923 { 924 unsigned int mask = 0; 925 926 rw_enter(&zvol_state_lock, RW_READER); 927 928 zvol_state_t *zv = disk->private_data; 929 if (zv != NULL) { 930 mutex_enter(&zv->zv_state_lock); 931 mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; 932 zv->zv_changed = 0; 933 mutex_exit(&zv->zv_state_lock); 934 } 935 936 rw_exit(&zvol_state_lock); 937 938 return (mask); 939 } 940 941 static int 942 zvol_revalidate_disk(struct gendisk *disk) 943 { 944 rw_enter(&zvol_state_lock, RW_READER); 945 946 zvol_state_t *zv = disk->private_data; 947 if (zv != NULL) { 948 mutex_enter(&zv->zv_state_lock); 949 set_capacity(zv->zv_zso->zvo_disk, 950 zv->zv_volsize >> SECTOR_BITS); 951 mutex_exit(&zv->zv_state_lock); 952 } 953 954 rw_exit(&zvol_state_lock); 955 956 return (0); 957 } 958 959 int 960 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) 961 { 962 struct gendisk *disk = zv->zv_zso->zvo_disk; 963 964 #if defined(HAVE_REVALIDATE_DISK_SIZE) 965 revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); 966 #elif defined(HAVE_REVALIDATE_DISK) 967 revalidate_disk(disk); 968 #else 969 zvol_revalidate_disk(disk); 970 #endif 971 return (0); 972 } 973 974 void 975 zvol_os_clear_private(zvol_state_t *zv) 976 { 977 /* 978 * Cleared while holding zvol_state_lock as a writer 979 * which will prevent zvol_open() from opening it. 980 */ 981 zv->zv_zso->zvo_disk->private_data = NULL; 982 } 983 984 /* 985 * Provide a simple virtual geometry for legacy compatibility. For devices 986 * smaller than 1 MiB a small head and sector count is used to allow very 987 * tiny devices. For devices over 1 Mib a standard head and sector count 988 * is used to keep the cylinders count reasonable. 989 */ 990 static int 991 zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) 992 { 993 zvol_state_t *zv = bdev->bd_disk->private_data; 994 sector_t sectors; 995 996 ASSERT3U(zv->zv_open_count, >, 0); 997 998 sectors = get_capacity(zv->zv_zso->zvo_disk); 999 1000 if (sectors > 2048) { 1001 geo->heads = 16; 1002 geo->sectors = 63; 1003 } else { 1004 geo->heads = 2; 1005 geo->sectors = 4; 1006 } 1007 1008 geo->start = 0; 1009 geo->cylinders = sectors / (geo->heads * geo->sectors); 1010 1011 return (0); 1012 } 1013 1014 /* 1015 * Why have two separate block_device_operations structs? 1016 * 1017 * Normally we'd just have one, and assign 'submit_bio' as needed. However, 1018 * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we 1019 * can't just change submit_bio dynamically at runtime. So just create two 1020 * separate structs to get around this. 1021 */ 1022 static const struct block_device_operations zvol_ops_blk_mq = { 1023 .open = zvol_open, 1024 .release = zvol_release, 1025 .ioctl = zvol_ioctl, 1026 .compat_ioctl = zvol_compat_ioctl, 1027 .check_events = zvol_check_events, 1028 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK 1029 .revalidate_disk = zvol_revalidate_disk, 1030 #endif 1031 .getgeo = zvol_getgeo, 1032 .owner = THIS_MODULE, 1033 }; 1034 1035 static const struct block_device_operations zvol_ops = { 1036 .open = zvol_open, 1037 .release = zvol_release, 1038 .ioctl = zvol_ioctl, 1039 .compat_ioctl = zvol_compat_ioctl, 1040 .check_events = zvol_check_events, 1041 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK 1042 .revalidate_disk = zvol_revalidate_disk, 1043 #endif 1044 .getgeo = zvol_getgeo, 1045 .owner = THIS_MODULE, 1046 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 1047 .submit_bio = zvol_submit_bio, 1048 #endif 1049 }; 1050 1051 /* 1052 * Since 6.9, Linux has been removing queue limit setters in favour of an 1053 * initial queue_limits struct applied when the device is open. Since 6.11, 1054 * queue_limits is being extended to allow more things to be applied when the 1055 * device is open. Setters are also being removed for this. 1056 * 1057 * For OpenZFS, this means that depending on kernel version, some options may 1058 * be set up before the device is open, and some applied to an open device 1059 * (queue) after the fact. 1060 * 1061 * We manage this complexity by having our own limits struct, 1062 * zvol_queue_limits_t, in which we carry any queue config that we're 1063 * interested in setting. This structure is the same on all kernels. 1064 * 1065 * These limits are then applied to the queue at device open time by the most 1066 * appropriate method for the kernel. 1067 * 1068 * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of 1069 * blk_alloc_disk() exists). This converts our limits struct to a proper Linux 1070 * struct queue_limits, and passes it in. Any fields added in later kernels are 1071 * (obviously) not set up here. 1072 * 1073 * zvol_queue_limits_apply() is called on all kernel versions after the queue 1074 * is created, and applies any remaining config. Before 6.9 that will be 1075 * everything, via setter methods. After 6.9 that will be whatever couldn't be 1076 * put into struct queue_limits. (This implies that zvol_queue_limits_apply() 1077 * will always be a no-op on the latest kernel we support). 1078 */ 1079 typedef struct zvol_queue_limits { 1080 unsigned int zql_max_hw_sectors; 1081 unsigned short zql_max_segments; 1082 unsigned int zql_max_segment_size; 1083 unsigned int zql_io_opt; 1084 unsigned int zql_physical_block_size; 1085 unsigned int zql_max_discard_sectors; 1086 unsigned int zql_discard_granularity; 1087 } zvol_queue_limits_t; 1088 1089 static void 1090 zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv, 1091 boolean_t use_blk_mq) 1092 { 1093 limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9; 1094 1095 if (use_blk_mq) { 1096 /* 1097 * IO requests can be really big (1MB). When an IO request 1098 * comes in, it is passed off to zvol_read() or zvol_write() 1099 * in a new thread, where it is chunked up into 'volblocksize' 1100 * sized pieces and processed. So for example, if the request 1101 * is a 1MB write and your volblocksize is 128k, one zvol_write 1102 * thread will take that request and sequentially do ten 128k 1103 * IOs. This is due to the fact that the thread needs to lock 1104 * each volblocksize sized block. So you might be wondering: 1105 * "instead of passing the whole 1MB request to one thread, 1106 * why not pass ten individual 128k chunks to ten threads and 1107 * process the whole write in parallel?" The short answer is 1108 * that there's a sweet spot number of chunks that balances 1109 * the greater parallelism with the added overhead of more 1110 * threads. The sweet spot can be different depending on if you 1111 * have a read or write heavy workload. Writes typically want 1112 * high chunk counts while reads typically want lower ones. On 1113 * a test pool with 6 NVMe drives in a 3x 2-disk mirror 1114 * configuration, with volblocksize=8k, the sweet spot for good 1115 * sequential reads and writes was at 8 chunks. 1116 */ 1117 1118 /* 1119 * Below we tell the kernel how big we want our requests 1120 * to be. You would think that blk_queue_io_opt() would be 1121 * used to do this since it is used to "set optimal request 1122 * size for the queue", but that doesn't seem to do 1123 * anything - the kernel still gives you huge requests 1124 * with tons of little PAGE_SIZE segments contained within it. 1125 * 1126 * Knowing that the kernel will just give you PAGE_SIZE segments 1127 * no matter what, you can say "ok, I want PAGE_SIZE byte 1128 * segments, and I want 'N' of them per request", where N is 1129 * the correct number of segments for the volblocksize and 1130 * number of chunks you want. 1131 */ 1132 if (zvol_blk_mq_blocks_per_thread != 0) { 1133 unsigned int chunks; 1134 chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); 1135 1136 limits->zql_max_segment_size = PAGE_SIZE; 1137 limits->zql_max_segments = 1138 (zv->zv_volblocksize * chunks) / PAGE_SIZE; 1139 } else { 1140 /* 1141 * Special case: zvol_blk_mq_blocks_per_thread = 0 1142 * Max everything out. 1143 */ 1144 limits->zql_max_segments = UINT16_MAX; 1145 limits->zql_max_segment_size = UINT_MAX; 1146 } 1147 } else { 1148 limits->zql_max_segments = UINT16_MAX; 1149 limits->zql_max_segment_size = UINT_MAX; 1150 } 1151 1152 limits->zql_io_opt = DMU_MAX_ACCESS / 2; 1153 1154 limits->zql_physical_block_size = zv->zv_volblocksize; 1155 limits->zql_max_discard_sectors = 1156 (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9; 1157 limits->zql_discard_granularity = zv->zv_volblocksize; 1158 } 1159 1160 #ifdef HAVE_BLK_ALLOC_DISK_2ARG 1161 static void 1162 zvol_queue_limits_convert(zvol_queue_limits_t *limits, 1163 struct queue_limits *qlimits) 1164 { 1165 memset(qlimits, 0, sizeof (struct queue_limits)); 1166 qlimits->max_hw_sectors = limits->zql_max_hw_sectors; 1167 qlimits->max_segments = limits->zql_max_segments; 1168 qlimits->max_segment_size = limits->zql_max_segment_size; 1169 qlimits->io_opt = limits->zql_io_opt; 1170 qlimits->physical_block_size = limits->zql_physical_block_size; 1171 qlimits->max_discard_sectors = limits->zql_max_discard_sectors; 1172 qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors; 1173 qlimits->discard_granularity = limits->zql_discard_granularity; 1174 #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES 1175 qlimits->features = 1176 BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT; 1177 #endif 1178 } 1179 #endif 1180 1181 static void 1182 zvol_queue_limits_apply(zvol_queue_limits_t *limits, 1183 struct request_queue *queue) 1184 { 1185 #ifndef HAVE_BLK_ALLOC_DISK_2ARG 1186 blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors); 1187 blk_queue_max_segments(queue, limits->zql_max_segments); 1188 blk_queue_max_segment_size(queue, limits->zql_max_segment_size); 1189 blk_queue_io_opt(queue, limits->zql_io_opt); 1190 blk_queue_physical_block_size(queue, limits->zql_physical_block_size); 1191 blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors); 1192 blk_queue_discard_granularity(queue, limits->zql_discard_granularity); 1193 #endif 1194 #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES 1195 blk_queue_set_write_cache(queue, B_TRUE); 1196 blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue); 1197 #endif 1198 } 1199 1200 static int 1201 zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) 1202 { 1203 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) 1204 #if defined(HAVE_BLK_ALLOC_DISK) 1205 zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); 1206 if (zso->zvo_disk == NULL) 1207 return (1); 1208 1209 zso->zvo_disk->minors = ZVOL_MINORS; 1210 zso->zvo_queue = zso->zvo_disk->queue; 1211 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) 1212 struct queue_limits qlimits; 1213 zvol_queue_limits_convert(limits, &qlimits); 1214 struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE); 1215 if (IS_ERR(disk)) { 1216 zso->zvo_disk = NULL; 1217 return (1); 1218 } 1219 1220 zso->zvo_disk = disk; 1221 zso->zvo_disk->minors = ZVOL_MINORS; 1222 zso->zvo_queue = zso->zvo_disk->queue; 1223 1224 #else 1225 zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); 1226 if (zso->zvo_queue == NULL) 1227 return (1); 1228 1229 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1230 if (zso->zvo_disk == NULL) { 1231 blk_cleanup_queue(zso->zvo_queue); 1232 return (1); 1233 } 1234 1235 zso->zvo_disk->queue = zso->zvo_queue; 1236 #endif /* HAVE_BLK_ALLOC_DISK */ 1237 #else 1238 zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); 1239 if (zso->zvo_queue == NULL) 1240 return (1); 1241 1242 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1243 if (zso->zvo_disk == NULL) { 1244 blk_cleanup_queue(zso->zvo_queue); 1245 return (1); 1246 } 1247 1248 zso->zvo_disk->queue = zso->zvo_queue; 1249 #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ 1250 1251 zvol_queue_limits_apply(limits, zso->zvo_queue); 1252 1253 return (0); 1254 1255 } 1256 1257 static int 1258 zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) 1259 { 1260 struct zvol_state_os *zso = zv->zv_zso; 1261 1262 /* Allocate our blk-mq tag_set */ 1263 if (zvol_blk_mq_alloc_tag_set(zv) != 0) 1264 return (1); 1265 1266 #if defined(HAVE_BLK_ALLOC_DISK) 1267 zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); 1268 if (zso->zvo_disk == NULL) { 1269 blk_mq_free_tag_set(&zso->tag_set); 1270 return (1); 1271 } 1272 zso->zvo_queue = zso->zvo_disk->queue; 1273 zso->zvo_disk->minors = ZVOL_MINORS; 1274 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) 1275 struct queue_limits qlimits; 1276 zvol_queue_limits_convert(limits, &qlimits); 1277 struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv); 1278 if (IS_ERR(disk)) { 1279 zso->zvo_disk = NULL; 1280 blk_mq_free_tag_set(&zso->tag_set); 1281 return (1); 1282 } 1283 1284 zso->zvo_disk = disk; 1285 zso->zvo_queue = zso->zvo_disk->queue; 1286 zso->zvo_disk->minors = ZVOL_MINORS; 1287 #else 1288 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1289 if (zso->zvo_disk == NULL) { 1290 blk_cleanup_queue(zso->zvo_queue); 1291 blk_mq_free_tag_set(&zso->tag_set); 1292 return (1); 1293 } 1294 /* Allocate queue */ 1295 zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); 1296 if (IS_ERR(zso->zvo_queue)) { 1297 blk_mq_free_tag_set(&zso->tag_set); 1298 return (1); 1299 } 1300 1301 /* Our queue is now created, assign it to our disk */ 1302 zso->zvo_disk->queue = zso->zvo_queue; 1303 #endif 1304 1305 zvol_queue_limits_apply(limits, zso->zvo_queue); 1306 1307 return (0); 1308 } 1309 1310 /* 1311 * Allocate memory for a new zvol_state_t and setup the required 1312 * request queue and generic disk structures for the block device. 1313 */ 1314 static int 1315 zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize, 1316 zvol_state_t **zvp) 1317 { 1318 zvol_state_t *zv; 1319 struct zvol_state_os *zso; 1320 uint64_t volmode; 1321 int ret; 1322 1323 ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL); 1324 if (ret) 1325 return (ret); 1326 1327 if (volmode == ZFS_VOLMODE_DEFAULT) 1328 volmode = zvol_volmode; 1329 1330 if (volmode == ZFS_VOLMODE_NONE) 1331 return (0); 1332 1333 zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); 1334 zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); 1335 zv->zv_zso = zso; 1336 zv->zv_volmode = volmode; 1337 zv->zv_volsize = volsize; 1338 zv->zv_volblocksize = volblocksize; 1339 1340 list_link_init(&zv->zv_next); 1341 mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); 1342 cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); 1343 1344 zv->zv_zso->use_blk_mq = zvol_use_blk_mq; 1345 1346 zvol_queue_limits_t limits; 1347 zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq); 1348 1349 /* 1350 * The block layer has 3 interfaces for getting BIOs: 1351 * 1352 * 1. blk-mq request queues (new) 1353 * 2. submit_bio() (oldest) 1354 * 3. regular request queues (old). 1355 * 1356 * Each of those interfaces has two permutations: 1357 * 1358 * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates 1359 * both the disk and its queue (5.14 kernel or newer) 1360 * 1361 * b) We don't have blk_*alloc_disk(), and have to allocate the 1362 * disk and the queue separately. (5.13 kernel or older) 1363 */ 1364 if (zv->zv_zso->use_blk_mq) { 1365 ret = zvol_alloc_blk_mq(zv, &limits); 1366 if (ret != 0) 1367 goto out_kmem; 1368 zso->zvo_disk->fops = &zvol_ops_blk_mq; 1369 } else { 1370 ret = zvol_alloc_non_blk_mq(zso, &limits); 1371 if (ret != 0) 1372 goto out_kmem; 1373 zso->zvo_disk->fops = &zvol_ops; 1374 } 1375 1376 /* Limit read-ahead to a single page to prevent over-prefetching. */ 1377 blk_queue_set_read_ahead(zso->zvo_queue, 1); 1378 1379 if (!zv->zv_zso->use_blk_mq) { 1380 /* Disable write merging in favor of the ZIO pipeline. */ 1381 blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); 1382 } 1383 1384 zso->zvo_queue->queuedata = zv; 1385 zso->zvo_dev = dev; 1386 zv->zv_open_count = 0; 1387 strlcpy(zv->zv_name, name, sizeof (zv->zv_name)); 1388 1389 zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); 1390 rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); 1391 1392 zso->zvo_disk->major = zvol_major; 1393 zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; 1394 1395 /* 1396 * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. 1397 * This is accomplished by limiting the number of minors for the 1398 * device to one and explicitly disabling partition scanning. 1399 */ 1400 if (volmode == ZFS_VOLMODE_DEV) { 1401 zso->zvo_disk->minors = 1; 1402 zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT; 1403 zso->zvo_disk->flags |= GENHD_FL_NO_PART; 1404 } 1405 1406 zso->zvo_disk->first_minor = (dev & MINORMASK); 1407 zso->zvo_disk->private_data = zv; 1408 snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", 1409 ZVOL_DEV_NAME, (dev & MINORMASK)); 1410 1411 *zvp = zv; 1412 return (ret); 1413 1414 out_kmem: 1415 kmem_free(zso, sizeof (struct zvol_state_os)); 1416 kmem_free(zv, sizeof (zvol_state_t)); 1417 return (ret); 1418 } 1419 1420 /* 1421 * Cleanup then free a zvol_state_t which was created by zvol_alloc(). 1422 * At this time, the structure is not opened by anyone, is taken off 1423 * the zvol_state_list, and has its private data set to NULL. 1424 * The zvol_state_lock is dropped. 1425 * 1426 * This function may take many milliseconds to complete (e.g. we've seen 1427 * it take over 256ms), due to the calls to "blk_cleanup_queue" and 1428 * "del_gendisk". Thus, consumers need to be careful to account for this 1429 * latency when calling this function. 1430 */ 1431 void 1432 zvol_os_free(zvol_state_t *zv) 1433 { 1434 1435 ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); 1436 ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); 1437 ASSERT0(zv->zv_open_count); 1438 ASSERT0P(zv->zv_zso->zvo_disk->private_data); 1439 1440 rw_destroy(&zv->zv_suspend_lock); 1441 zfs_rangelock_fini(&zv->zv_rangelock); 1442 1443 del_gendisk(zv->zv_zso->zvo_disk); 1444 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ 1445 (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) 1446 #if defined(HAVE_BLK_CLEANUP_DISK) 1447 blk_cleanup_disk(zv->zv_zso->zvo_disk); 1448 #else 1449 put_disk(zv->zv_zso->zvo_disk); 1450 #endif 1451 #else 1452 blk_cleanup_queue(zv->zv_zso->zvo_queue); 1453 put_disk(zv->zv_zso->zvo_disk); 1454 #endif 1455 1456 if (zv->zv_zso->use_blk_mq) 1457 blk_mq_free_tag_set(&zv->zv_zso->tag_set); 1458 1459 ida_simple_remove(&zvol_ida, 1460 MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); 1461 1462 cv_destroy(&zv->zv_removing_cv); 1463 mutex_destroy(&zv->zv_state_lock); 1464 dataset_kstats_destroy(&zv->zv_kstat); 1465 1466 kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); 1467 kmem_free(zv, sizeof (zvol_state_t)); 1468 } 1469 1470 void 1471 zvol_wait_close(zvol_state_t *zv) 1472 { 1473 } 1474 1475 struct add_disk_work { 1476 struct delayed_work work; 1477 struct gendisk *disk; 1478 int error; 1479 }; 1480 1481 static int 1482 __zvol_os_add_disk(struct gendisk *disk) 1483 { 1484 int error = 0; 1485 #ifdef HAVE_ADD_DISK_RET 1486 error = -add_disk(disk); 1487 if (error) 1488 error = SET_ERROR(error); 1489 #else 1490 add_disk(disk); 1491 #endif 1492 return (error); 1493 } 1494 1495 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) 1496 static void 1497 zvol_os_add_disk_work(struct work_struct *work) 1498 { 1499 struct add_disk_work *add_disk_work; 1500 add_disk_work = container_of(work, struct add_disk_work, work.work); 1501 add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk); 1502 } 1503 #endif 1504 1505 /* 1506 * SPECIAL CASE: 1507 * 1508 * This function basically calls add_disk() from a workqueue. You may be 1509 * thinking: why not just call add_disk() directly? 1510 * 1511 * When you call add_disk(), the zvol appears to the world. When this happens, 1512 * the kernel calls disk_scan_partitions() on the zvol, which behaves 1513 * differently on the 6.9+ kernels: 1514 * 1515 * - 6.8 and older kernels - 1516 * disk_scan_partitions() 1517 * handle = bdev_open_by_dev( 1518 * zvol_open() 1519 * bdev_release(handle); 1520 * zvol_release() 1521 * 1522 * 1523 * - 6.9+ kernels - 1524 * disk_scan_partitions() 1525 * file = bdev_file_open_by_dev() 1526 * zvol_open() 1527 * fput(file) 1528 * < wait for return to userspace > 1529 * zvol_release() 1530 * 1531 * The difference is that the bdev_release() from the 6.8 kernel is synchronous 1532 * while the fput() from the 6.9 kernel is async. Or more specifically it's 1533 * async that has to wait until we return to userspace (since it adds the fput 1534 * into the caller's work queue with the TWA_RESUME flag set). This is not the 1535 * behavior we want, since we want do things like create+destroy a zvol within 1536 * a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release the 1537 * reference to the zvol while we're in the IOCTL, which can't wait until we 1538 * return to userspace. 1539 * 1540 * We can get around this since fput() has a special codepath for when it's 1541 * running in a kernel thread or interrupt. In those cases, it just puts the 1542 * fput into the system workqueue, which we can force to run with 1543 * __flush_workqueue(). That is why we call add_disk() from a workqueue - so it 1544 * run from a kernel thread and "tricks" the fput() codepaths. 1545 * 1546 * Note that __flush_workqueue() is slowly getting deprecated. This may be ok 1547 * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via 1548 * fput) to happen, which it eventually, naturally, will from the system_wq 1549 * without us explicitly calling __flush_workqueue(). 1550 */ 1551 static int 1552 zvol_os_add_disk(struct gendisk *disk) 1553 { 1554 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) /* 6.9+ kernel */ 1555 struct add_disk_work add_disk_work; 1556 1557 INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work); 1558 add_disk_work.disk = disk; 1559 add_disk_work.error = 0; 1560 1561 /* Use *_delayed_work functions since they're not GPL'd */ 1562 schedule_delayed_work(&add_disk_work.work, 0); 1563 flush_delayed_work(&add_disk_work.work); 1564 1565 __flush_workqueue(system_wq); 1566 return (add_disk_work.error); 1567 #else /* <= 6.8 kernel */ 1568 return (__zvol_os_add_disk(disk)); 1569 #endif 1570 } 1571 1572 /* 1573 * Create a block device minor node and setup the linkage between it 1574 * and the specified volume. Once this function returns the block 1575 * device is live and ready for use. 1576 */ 1577 int 1578 zvol_os_create_minor(const char *name) 1579 { 1580 zvol_state_t *zv = NULL; 1581 objset_t *os; 1582 dmu_object_info_t *doi; 1583 uint64_t volsize; 1584 uint64_t len; 1585 unsigned minor = 0; 1586 int error = 0; 1587 int idx; 1588 uint64_t hash = zvol_name_hash(name); 1589 uint64_t volthreading; 1590 bool replayed_zil = B_FALSE; 1591 1592 if (zvol_inhibit_dev) 1593 return (0); 1594 1595 idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); 1596 if (idx < 0) 1597 return (SET_ERROR(-idx)); 1598 minor = idx << ZVOL_MINOR_BITS; 1599 if (MINOR(minor) != minor) { 1600 /* too many partitions can cause an overflow */ 1601 zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", 1602 name, minor, MINOR(minor)); 1603 ida_simple_remove(&zvol_ida, idx); 1604 return (SET_ERROR(EINVAL)); 1605 } 1606 1607 zv = zvol_find_by_name_hash(name, hash, RW_NONE); 1608 if (zv) { 1609 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1610 mutex_exit(&zv->zv_state_lock); 1611 ida_simple_remove(&zvol_ida, idx); 1612 return (SET_ERROR(EEXIST)); 1613 } 1614 1615 doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); 1616 1617 error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); 1618 if (error) 1619 goto out_doi; 1620 1621 error = dmu_object_info(os, ZVOL_OBJ, doi); 1622 if (error) 1623 goto out_dmu_objset_disown; 1624 1625 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); 1626 if (error) 1627 goto out_dmu_objset_disown; 1628 1629 error = zvol_alloc(MKDEV(zvol_major, minor), name, 1630 volsize, doi->doi_data_block_size, &zv); 1631 if (error || zv == NULL) 1632 goto out_dmu_objset_disown; 1633 1634 zv->zv_hash = hash; 1635 1636 if (dmu_objset_is_snapshot(os)) 1637 zv->zv_flags |= ZVOL_RDONLY; 1638 1639 zv->zv_objset = os; 1640 1641 /* Default */ 1642 zv->zv_threading = B_TRUE; 1643 if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL) 1644 == 0) 1645 zv->zv_threading = volthreading; 1646 1647 set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); 1648 1649 #ifdef QUEUE_FLAG_DISCARD 1650 blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); 1651 #endif 1652 #ifdef QUEUE_FLAG_NONROT 1653 blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); 1654 #endif 1655 #ifdef QUEUE_FLAG_ADD_RANDOM 1656 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); 1657 #endif 1658 /* This flag was introduced in kernel version 4.12. */ 1659 #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH 1660 blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); 1661 #endif 1662 1663 ASSERT0P(zv->zv_kstat.dk_kstats); 1664 error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); 1665 if (error) 1666 goto out_dmu_objset_disown; 1667 ASSERT0P(zv->zv_zilog); 1668 zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); 1669 if (spa_writeable(dmu_objset_spa(os))) { 1670 if (zil_replay_disable) 1671 replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); 1672 else 1673 replayed_zil = zil_replay(os, zv, zvol_replay_vector); 1674 } 1675 if (replayed_zil) 1676 zil_close(zv->zv_zilog); 1677 zv->zv_zilog = NULL; 1678 1679 /* 1680 * When udev detects the addition of the device it will immediately 1681 * invoke blkid(8) to determine the type of content on the device. 1682 * Prefetching the blocks commonly scanned by blkid(8) will speed 1683 * up this process. 1684 */ 1685 len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); 1686 if (len > 0) { 1687 dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); 1688 dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, 1689 ZIO_PRIORITY_SYNC_READ); 1690 } 1691 1692 zv->zv_objset = NULL; 1693 out_dmu_objset_disown: 1694 dmu_objset_disown(os, B_TRUE, FTAG); 1695 out_doi: 1696 kmem_free(doi, sizeof (dmu_object_info_t)); 1697 1698 /* 1699 * Keep in mind that once add_disk() is called, the zvol is 1700 * announced to the world, and zvol_open()/zvol_release() can 1701 * be called at any time. Incidentally, add_disk() itself calls 1702 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() 1703 * directly as well. 1704 */ 1705 if (error == 0 && zv) { 1706 rw_enter(&zvol_state_lock, RW_WRITER); 1707 zvol_insert(zv); 1708 rw_exit(&zvol_state_lock); 1709 error = zvol_os_add_disk(zv->zv_zso->zvo_disk); 1710 } else { 1711 ida_simple_remove(&zvol_ida, idx); 1712 } 1713 1714 return (error); 1715 } 1716 1717 int 1718 zvol_os_rename_minor(zvol_state_t *zv, const char *newname) 1719 { 1720 int readonly = get_disk_ro(zv->zv_zso->zvo_disk); 1721 1722 ASSERT(RW_LOCK_HELD(&zvol_state_lock)); 1723 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1724 1725 strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); 1726 1727 /* move to new hashtable entry */ 1728 zv->zv_hash = zvol_name_hash(newname); 1729 hlist_del(&zv->zv_hlink); 1730 hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); 1731 1732 /* 1733 * The block device's read-only state is briefly changed causing 1734 * a KOBJ_CHANGE uevent to be issued. This ensures udev detects 1735 * the name change and fixes the symlinks. This does not change 1736 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never 1737 * changes. This would normally be done using kobject_uevent() but 1738 * that is a GPL-only symbol which is why we need this workaround. 1739 */ 1740 set_disk_ro(zv->zv_zso->zvo_disk, !readonly); 1741 set_disk_ro(zv->zv_zso->zvo_disk, readonly); 1742 1743 dataset_kstats_rename(&zv->zv_kstat, newname); 1744 1745 return (0); 1746 } 1747 1748 void 1749 zvol_os_set_disk_ro(zvol_state_t *zv, int flags) 1750 { 1751 1752 set_disk_ro(zv->zv_zso->zvo_disk, flags); 1753 } 1754 1755 void 1756 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) 1757 { 1758 1759 set_capacity(zv->zv_zso->zvo_disk, capacity); 1760 } 1761 1762 int 1763 zvol_init(void) 1764 { 1765 int error; 1766 1767 error = zvol_init_impl(); 1768 if (error) { 1769 printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error); 1770 return (error); 1771 } 1772 1773 error = -register_blkdev(zvol_major, ZVOL_DRIVER); 1774 if (error) { 1775 printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); 1776 return (SET_ERROR(error)); 1777 } 1778 1779 if (zvol_blk_mq_queue_depth == 0) { 1780 zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; 1781 } else { 1782 zvol_actual_blk_mq_queue_depth = 1783 MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); 1784 } 1785 1786 if (zvol_blk_mq_threads == 0) { 1787 zvol_blk_mq_actual_threads = num_online_cpus(); 1788 } else { 1789 zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1), 1790 1024); 1791 } 1792 1793 ida_init(&zvol_ida); 1794 return (0); 1795 } 1796 1797 void 1798 zvol_fini(void) 1799 { 1800 unregister_blkdev(zvol_major, ZVOL_DRIVER); 1801 1802 zvol_fini_impl(); 1803 1804 ida_destroy(&zvol_ida); 1805 } 1806 1807 module_param(zvol_major, uint, 0444); 1808 MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); 1809 1810 module_param(zvol_max_discard_blocks, ulong, 0444); 1811 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); 1812 1813 module_param(zvol_blk_mq_queue_depth, uint, 0644); 1814 MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); 1815 1816 module_param(zvol_use_blk_mq, uint, 0644); 1817 MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); 1818 1819 module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); 1820 MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, 1821 "Process volblocksize blocks per thread"); 1822 1823 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 1824 module_param(zvol_open_timeout_ms, uint, 0644); 1825 MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); 1826 #endif 1827