// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
 * Copyright (c) 2024, Klara, Inc.
 */

#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
#include <cityhash.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/workqueue.h>
#include <linux/blk-mq.h>

static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
    struct request *rq, boolean_t force_sync);

static unsigned int zvol_major = ZVOL_MAJOR;
static unsigned int zvol_prefetch_bytes = (128 * 1024);
static unsigned long zvol_max_discard_blocks = 16384;

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static unsigned int zvol_open_timeout_ms = 1000;
#endif

static unsigned int zvol_blk_mq_threads = 0;
static unsigned int zvol_blk_mq_actual_threads;
static boolean_t zvol_use_blk_mq = B_FALSE;

/*
 * The maximum number of volblocksize blocks to process per thread. Typically,
 * write heavy workloads perform better with higher values here, and read
 * heavy workloads perform better with lower values, but that's not a hard
 * and fast rule. It's basically a knob to tune between "less overhead with
 * less parallelism" and "more overhead, but more parallelism".
 *
 * '8' was chosen as a reasonable, balanced, default based on sequential
 * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
 */
static unsigned int zvol_blk_mq_blocks_per_thread = 8;

#ifndef BLKDEV_DEFAULT_RQ
/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
#define	BLKDEV_DEFAULT_RQ	BLKDEV_MAX_RQ
#endif

/*
 * Finalize our BIO or request.
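 *
 * Exactly one of 'bio' or 'rq' is set, depending on whether the I/O came in
 * via the bio path or the blk-mq path. Callers pass 0 on success or a
 * negative errno on failure.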
 */
static inline void
zvol_end_io(struct bio *bio, struct request *rq, int error)
{
	if (bio) {
		bio->bi_status = errno_to_bi_status(-error);
		bio_endio(bio);
	} else {
		blk_mq_end_request(rq, errno_to_bi_status(error));
	}
}

static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
static unsigned int zvol_actual_blk_mq_queue_depth;

struct zvol_state_os {
	struct gendisk *zvo_disk;	/* generic disk */
	struct request_queue *zvo_queue;	/* request queue */
	dev_t zvo_dev;			/* device id */

	struct blk_mq_tag_set tag_set;

	/* Set from the global 'zvol_use_blk_mq' at zvol load */
	boolean_t use_blk_mq;
};

static struct ida zvol_ida;

/*
 * This is called when a new block multiqueue request comes in. A request
 * contains one or more BIOs.
 */
static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
    const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	zvol_state_t *zv = rq->q->queuedata;

	/* Tell the kernel that we are starting to process this request */
	blk_mq_start_request(rq);

	if (blk_rq_is_passthrough(rq)) {
		/* Skip non filesystem request */
		blk_mq_end_request(rq, BLK_STS_IOERR);
		return (BLK_STS_IOERR);
	}

	zvol_request_impl(zv, NULL, rq, 0);

	/* Acknowledge to the kernel that we got this request */
	return (BLK_STS_OK);
}

static struct blk_mq_ops zvol_blk_mq_queue_ops = {
	.queue_rq = zvol_mq_queue_rq,
};

/* Initialize our blk-mq struct */
static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
{
	struct zvol_state_os *zso = zv->zv_zso;

	memset(&zso->tag_set, 0, sizeof (zso->tag_set));

	/* Initialize tag set. */
	zso->tag_set.ops = &zvol_blk_mq_queue_ops;
	zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads;
	zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth;
	zso->tag_set.numa_node = NUMA_NO_NODE;
	zso->tag_set.cmd_size = 0;

	/*
	 * We need BLK_MQ_F_BLOCKING here since we do blocking calls in
	 * zvol_request_impl()
	 */
	zso->tag_set.flags = BLK_MQ_F_BLOCKING;

#ifdef BLK_MQ_F_SHOULD_MERGE
	/*
	 * Linux 6.14 removed BLK_MQ_F_SHOULD_MERGE and made it implicit.
	 * For older kernels, we set it.
	 */
	zso->tag_set.flags |= BLK_MQ_F_SHOULD_MERGE;
#endif

	zso->tag_set.driver_data = zv;

	return (blk_mq_alloc_tag_set(&zso->tag_set));
}

/*
 * Given a path, return TRUE if path is a ZVOL.
 */
boolean_t
zvol_os_is_zvol(const char *path)
{
	dev_t dev = 0;

	if (vdev_lookup_bdev(path, &dev) != 0)
		return (B_FALSE);

	if (MAJOR(dev) == zvol_major)
		return (B_TRUE);

	return (B_FALSE);
}

static void
zvol_write(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	int error = 0;
	zfs_uio_t uio;
	zvol_state_t *zv = zvr->zv;
	struct request_queue *q;
	struct gendisk *disk;
	unsigned long start_time = 0;
	boolean_t acct = B_FALSE;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	q = zv->zv_zso->zvo_queue;
	disk = zv->zv_zso->zvo_disk;

	/* A bio marked as FLUSH needs to flush before the write */
	if (io_is_flush(bio, rq))
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	/* Some requests are just for flush and nothing else.
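	 * The zil_commit() above already covered those, so they can be
	 * completed immediately.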
	 */
	if (io_size(bio, rq) == 0) {
		rw_exit(&zv->zv_suspend_lock);
		zvol_end_io(bio, rq, 0);
		return;
	}

	zfs_uio_bvec_init(&uio, bio, rq);

	ssize_t start_resid = uio.uio_resid;

	/*
	 * With use_blk_mq, accounting is done by blk_mq_start_request()
	 * and blk_mq_end_request(), so we can skip it here.
	 */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct) {
			start_time = blk_generic_start_io_acct(q, disk, WRITE,
			    bio);
		}
	}

	boolean_t sync =
	    io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_WRITER);

	uint64_t volsize = zv->zv_volsize;
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio.uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);

		/* This will only fail for ENOSPC */
		error = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
		    DMU_READ_PREFETCH);
		if (error == 0) {
			zvol_log_write(zv, tx, off, bytes, sync);
		}
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);

	int64_t nwritten = start_resid - uio.uio_resid;
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	task_io_account_write(nwritten);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
	}

	zvol_end_io(bio, rq, -error);
}

static void
zvol_write_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_write(&task->zvr);
	zv_request_task_free(task);
}

static void
zvol_discard(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	zvol_state_t *zv = zvr->zv;
	uint64_t start = io_offset(bio, rq);
	uint64_t size = io_size(bio, rq);
	uint64_t end = start + size;
	boolean_t sync;
	int error = 0;
	dmu_tx_t *tx;
	struct request_queue *q = zv->zv_zso->zvo_queue;
	struct gendisk *disk = zv->zv_zso->zvo_disk;
	unsigned long start_time = 0;
	boolean_t acct = B_FALSE;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct) {
			start_time = blk_generic_start_io_acct(q, disk, WRITE,
			    bio);
		}
	}

	sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	if (end > zv->zv_volsize) {
		error = SET_ERROR(EIO);
		goto unlock;
	}

	/*
	 * Align the request to volume block boundaries when a secure erase is
	 * not required. This will prevent dnode_free_range() from zeroing out
	 * the unaligned parts which is slow (read-modify-write) and useless
	 * since we are not freeing any space by doing so.
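	 * The unaligned head and tail of the range are simply left intact.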
	 */
	if (!io_is_secure_erase(bio, rq)) {
		start = P2ROUNDUP(start, zv->zv_volblocksize);
		end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t);
		size = end - start;
	}

	if (start >= end)
		goto unlock;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    start, size, RL_WRITER);

	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, DMU_TX_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		zvol_log_truncate(zv, tx, start, size);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset,
		    ZVOL_OBJ, start, size);
	}
	zfs_rangelock_exit(lr);

	if (error == 0 && sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

unlock:
	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, WRITE, bio,
		    start_time);
	}

	zvol_end_io(bio, rq, -error);
}

static void
zvol_discard_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_discard(&task->zvr);
	zv_request_task_free(task);
}

static void
zvol_read(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	int error = 0;
	zfs_uio_t uio;
	boolean_t acct = B_FALSE;
	zvol_state_t *zv = zvr->zv;
	struct request_queue *q;
	struct gendisk *disk;
	unsigned long start_time = 0;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);

	zfs_uio_bvec_init(&uio, bio, rq);

	q = zv->zv_zso->zvo_queue;
	disk = zv->zv_zso->zvo_disk;

	ssize_t start_resid = uio.uio_resid;

	/*
	 * When blk-mq is being used, accounting is done by
	 * blk_mq_start_request() and blk_mq_end_request().
	 */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct)
			start_time = blk_generic_start_io_acct(q, disk, READ,
			    bio);
	}

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_READER);

	uint64_t volsize = zv->zv_volsize;

	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio.uio_loffset)
			bytes = volsize - uio.uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes,
		    DMU_READ_PREFETCH);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	int64_t nread = start_resid - uio.uio_resid;
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	task_io_account_read(nread);

	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, READ, bio, start_time);
	}

	zvol_end_io(bio, rq, -error);
}

static void
zvol_read_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_read(&task->zvr);
	zv_request_task_free(task);
}


/*
 * Process a BIO or request
 *
 * Either 'bio' or 'rq' should be set depending on whether we are processing a
 * bio or a request (both should not be set).
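 * The blk-mq path passes a request ('rq'); the submit_bio/make_request path
 * passes a bio.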
470 * 471 * force_sync: Set to 0 to defer processing to a background taskq 472 * Set to 1 to process data synchronously 473 */ 474 static void 475 zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, 476 boolean_t force_sync) 477 { 478 fstrans_cookie_t cookie = spl_fstrans_mark(); 479 uint64_t offset = io_offset(bio, rq); 480 uint64_t size = io_size(bio, rq); 481 int rw = io_data_dir(bio, rq); 482 483 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { 484 zvol_end_io(bio, rq, -SET_ERROR(ENXIO)); 485 goto out; 486 } 487 488 if (zvol_request_sync || zv->zv_threading == B_FALSE) 489 force_sync = 1; 490 491 zv_request_t zvr = { 492 .zv = zv, 493 .bio = bio, 494 .rq = rq, 495 }; 496 497 if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { 498 printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", 499 zv->zv_zso->zvo_disk->disk_name, 500 (long long unsigned)offset, 501 (long unsigned)size); 502 503 zvol_end_io(bio, rq, -SET_ERROR(EIO)); 504 goto out; 505 } 506 507 zv_request_task_t *task; 508 zv_taskq_t *ztqs = &zvol_taskqs; 509 uint_t blk_mq_hw_queue = 0; 510 uint_t tq_idx; 511 uint_t taskq_hash; 512 if (rq) 513 #ifdef HAVE_BLK_MQ_RQ_HCTX 514 blk_mq_hw_queue = rq->mq_hctx->queue_num; 515 #else 516 blk_mq_hw_queue = 517 rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num; 518 #endif 519 taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, 520 blk_mq_hw_queue); 521 tq_idx = taskq_hash % ztqs->tqs_cnt; 522 523 if (rw == WRITE) { 524 if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { 525 zvol_end_io(bio, rq, -SET_ERROR(EROFS)); 526 goto out; 527 } 528 529 /* 530 * Prevents the zvol from being suspended, or the ZIL being 531 * concurrently opened. Will be released after the i/o 532 * completes. 533 */ 534 rw_enter(&zv->zv_suspend_lock, RW_READER); 535 536 /* 537 * Open a ZIL if this is the first time we have written to this 538 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather 539 * than zv_state_lock so that we don't need to acquire an 540 * additional lock in this path. 541 */ 542 if (zv->zv_zilog == NULL) { 543 rw_exit(&zv->zv_suspend_lock); 544 rw_enter(&zv->zv_suspend_lock, RW_WRITER); 545 if (zv->zv_zilog == NULL) { 546 zv->zv_zilog = zil_open(zv->zv_objset, 547 zvol_get_data, &zv->zv_kstat.dk_zil_sums); 548 zv->zv_flags |= ZVOL_WRITTEN_TO; 549 /* replay / destroy done in zvol_create_minor */ 550 VERIFY0((zv->zv_zilog->zl_header->zh_flags & 551 ZIL_REPLAY_NEEDED)); 552 } 553 rw_downgrade(&zv->zv_suspend_lock); 554 } 555 556 /* 557 * We don't want this thread to be blocked waiting for i/o to 558 * complete, so we instead wait from a taskq callback. The 559 * i/o may be a ZIL write (via zil_commit()), or a read of an 560 * indirect block, or a read of a data block (if this is a 561 * partial-block write). We will indicate that the i/o is 562 * complete by calling END_IO() from the taskq callback. 563 * 564 * This design allows the calling thread to continue and 565 * initiate more concurrent operations by calling 566 * zvol_request() again. There are typically only a small 567 * number of threads available to call zvol_request() (e.g. 568 * one per iSCSI target), so keeping the latency of 569 * zvol_request() low is important for performance. 570 * 571 * The zvol_request_sync module parameter allows this 572 * behavior to be altered, for performance evaluation 573 * purposes. If the callback blocks, setting 574 * zvol_request_sync=1 will result in much worse performance. 
575 * 576 * We can have up to zvol_threads concurrent i/o's being 577 * processed for all zvols on the system. This is typically 578 * a vast improvement over the zvol_request_sync=1 behavior 579 * of one i/o at a time per zvol. However, an even better 580 * design would be for zvol_request() to initiate the zio 581 * directly, and then be notified by the zio_done callback, 582 * which would call END_IO(). Unfortunately, the DMU/ZIL 583 * interfaces lack this functionality (they block waiting for 584 * the i/o to complete). 585 */ 586 if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { 587 if (force_sync) { 588 zvol_discard(&zvr); 589 } else { 590 task = zv_request_task_create(zvr); 591 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 592 zvol_discard_task, task, 0, &task->ent); 593 } 594 } else { 595 if (force_sync) { 596 zvol_write(&zvr); 597 } else { 598 task = zv_request_task_create(zvr); 599 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 600 zvol_write_task, task, 0, &task->ent); 601 } 602 } 603 } else { 604 /* 605 * The SCST driver, and possibly others, may issue READ I/Os 606 * with a length of zero bytes. These empty I/Os contain no 607 * data and require no additional handling. 608 */ 609 if (size == 0) { 610 zvol_end_io(bio, rq, 0); 611 goto out; 612 } 613 614 rw_enter(&zv->zv_suspend_lock, RW_READER); 615 616 /* See comment in WRITE case above. */ 617 if (force_sync) { 618 zvol_read(&zvr); 619 } else { 620 task = zv_request_task_create(zvr); 621 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 622 zvol_read_task, task, 0, &task->ent); 623 } 624 } 625 626 out: 627 spl_fstrans_unmark(cookie); 628 } 629 630 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 631 #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID 632 static void 633 zvol_submit_bio(struct bio *bio) 634 #else 635 static blk_qc_t 636 zvol_submit_bio(struct bio *bio) 637 #endif 638 #else 639 static MAKE_REQUEST_FN_RET 640 zvol_request(struct request_queue *q, struct bio *bio) 641 #endif 642 { 643 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 644 #if defined(HAVE_BIO_BDEV_DISK) 645 struct request_queue *q = bio->bi_bdev->bd_disk->queue; 646 #else 647 struct request_queue *q = bio->bi_disk->queue; 648 #endif 649 #endif 650 zvol_state_t *zv = q->queuedata; 651 652 zvol_request_impl(zv, bio, NULL, 0); 653 #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ 654 defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ 655 !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) 656 return (BLK_QC_T_NONE); 657 #endif 658 } 659 660 static int 661 #ifdef HAVE_BLK_MODE_T 662 zvol_open(struct gendisk *disk, blk_mode_t flag) 663 #else 664 zvol_open(struct block_device *bdev, fmode_t flag) 665 #endif 666 { 667 zvol_state_t *zv; 668 int error = 0; 669 boolean_t drop_suspend = B_FALSE; 670 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 671 hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); 672 hrtime_t start = gethrtime(); 673 674 retry: 675 #endif 676 rw_enter(&zvol_state_lock, RW_READER); 677 /* 678 * Obtain a copy of private_data under the zvol_state_lock to make 679 * sure that either the result of zvol free code path setting 680 * disk->private_data to NULL is observed, or zvol_os_free() 681 * is not called on this zv because of the positive zv_open_count. 
	 */
#ifdef HAVE_BLK_MODE_T
	zv = disk->private_data;
#else
	zv = bdev->bd_disk->private_data;
#endif
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (-SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);

	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
		mutex_exit(&zv->zv_state_lock);
		rw_exit(&zvol_state_lock);
		return (-SET_ERROR(ENXIO));
	}

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
			} else {
				drop_suspend = B_TRUE;
			}
		} else {
			drop_suspend = B_TRUE;
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * In all other call paths the spa_namespace_lock is taken
		 * before the bdev->bd_mutex lock. However, on open(2)
		 * the __blkdev_get() function calls fops->open() with the
		 * bdev->bd_mutex lock held. This can result in a deadlock
		 * when zvols from one pool are used as vdevs in another.
		 *
		 * To prevent a lock inversion deadlock we preemptively
		 * take the spa_namespace_lock. Normally the lock will not
		 * be contended and this is safe because spa_open_common()
		 * handles the case where the caller already holds the
		 * spa_namespace_lock.
		 *
		 * When the lock cannot be acquired after multiple retries
		 * this must be the vdev on zvol deadlock case and we have
		 * no choice but to return an error. For 5.12 and older
		 * kernels returning -ERESTARTSYS will result in the
		 * bdev->bd_mutex being dropped, then reacquired, and
		 * fops->open() being called again. This process can be
		 * repeated safely until both locks are acquired. For 5.13
		 * and newer the -ERESTARTSYS retry logic was removed from
		 * the kernel so the only option is to return the error for
		 * the caller to handle it.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;

#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
				schedule();
				return (-SET_ERROR(ERESTARTSYS));
#else
				if ((gethrtime() - start) > timeout)
					return (-SET_ERROR(ERESTARTSYS));

				schedule_timeout_interruptible(
				    MSEC_TO_TICK(10));
				goto retry;
#endif
			} else {
				drop_namespace = B_TRUE;
			}
		}

		error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));

		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
	}

	if (error == 0) {
		if ((blk_mode_is_open_write(flag)) &&
		    (zv->zv_flags & ZVOL_RDONLY)) {
			if (zv->zv_open_count == 0)
				zvol_last_close(zv);

			error = -SET_ERROR(EROFS);
		} else {
			zv->zv_open_count++;
		}
	}

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);

	if (error == 0)
#ifdef HAVE_BLK_MODE_T
		disk_check_media_change(disk);
#else
		zfs_check_media_change(bdev);
#endif

	return (error);
}

static void
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
zvol_release(struct gendisk *disk)
#else
zvol_release(struct gendisk *disk, fmode_t unused)
#endif
{
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
	(void) unused;
#endif
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	zv = disk->private_data;

	mutex_enter(&zv->zv_state_lock);
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	zv->zv_open_count--;
	if (zv->zv_open_count == 0) {
		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
}

static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned int cmd, unsigned long arg)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	int error = 0;

	ASSERT3U(zv->zv_open_count, >, 0);

	switch (cmd) {
	case BLKFLSBUF:
#ifdef HAVE_FSYNC_BDEV
		fsync_bdev(bdev);
#elif defined(HAVE_SYNC_BLOCKDEV)
		sync_blockdev(bdev);
#else
#error "Neither fsync_bdev() nor sync_blockdev() found"
#endif
		invalidate_bdev(bdev);
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		if (!(zv->zv_flags & ZVOL_RDONLY))
			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);

		rw_exit(&zv->zv_suspend_lock);
		break;

	case BLKZNAME:
		mutex_enter(&zv->zv_state_lock);
		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
		mutex_exit(&zv->zv_state_lock);
		break;

	default:
		error = -ENOTTY;
		break;
	}

	return (SET_ERROR(error));
}

#ifdef CONFIG_COMPAT
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	return (zvol_ioctl(bdev, mode, cmd, arg));
}
#else
#define	zvol_compat_ioctl	NULL
#endif

static unsigned int
zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
	unsigned int mask = 0;

	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
		zv->zv_changed = 0;
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (mask);
}

static int
zvol_revalidate_disk(struct gendisk *disk)
{
	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		set_capacity(zv->zv_zso->zvo_disk,
		    zv->zv_volsize >> SECTOR_BITS);
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (0);
}

int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	struct gendisk *disk = zv->zv_zso->zvo_disk;

#if defined(HAVE_REVALIDATE_DISK_SIZE)
	revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
#elif defined(HAVE_REVALIDATE_DISK)
	revalidate_disk(disk);
#else
	zvol_revalidate_disk(disk);
#endif
	return (0);
}

void
zvol_os_clear_private(zvol_state_t *zv)
{
	/*
	 * Cleared while holding zvol_state_lock as a writer
	 * which will prevent zvol_open() from opening it.
	 */
	zv->zv_zso->zvo_disk->private_data = NULL;
}

/*
 * Provide a simple virtual geometry for legacy compatibility. For devices
 * smaller than 1 MiB a small head and sector count is used to allow very
 * tiny devices. For devices over 1 MiB a standard head and sector count
 * is used to keep the cylinders count reasonable.
 */
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	sector_t sectors;

	ASSERT3U(zv->zv_open_count, >, 0);

	sectors = get_capacity(zv->zv_zso->zvo_disk);

	if (sectors > 2048) {
		geo->heads = 16;
		geo->sectors = 63;
	} else {
		geo->heads = 2;
		geo->sectors = 4;
	}

	geo->start = 0;
	geo->cylinders = sectors / (geo->heads * geo->sectors);

	return (0);
}

/*
 * Why have two separate block_device_operations structs?
 *
 * Normally we'd just have one, and assign 'submit_bio' as needed. However,
 * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
 * can't just change submit_bio dynamically at runtime. So just create two
 * separate structs to get around this.
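 *
 * zvol_ops_blk_mq simply omits the submit_bio entry point, since blk-mq
 * provides its own request entry point via zvol_mq_queue_rq().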
 */
static const struct block_device_operations zvol_ops_blk_mq = {
	.open = zvol_open,
	.release = zvol_release,
	.ioctl = zvol_ioctl,
	.compat_ioctl = zvol_compat_ioctl,
	.check_events = zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk = zvol_revalidate_disk,
#endif
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
};

static const struct block_device_operations zvol_ops = {
	.open = zvol_open,
	.release = zvol_release,
	.ioctl = zvol_ioctl,
	.compat_ioctl = zvol_compat_ioctl,
	.check_events = zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk = zvol_revalidate_disk,
#endif
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	.submit_bio = zvol_submit_bio,
#endif
};

/*
 * Since 6.9, Linux has been removing queue limit setters in favour of an
 * initial queue_limits struct applied when the device is open. Since 6.11,
 * queue_limits is being extended to allow more things to be applied when the
 * device is open. Setters are also being removed for this.
 *
 * For OpenZFS, this means that depending on kernel version, some options may
 * be set up before the device is open, and some applied to an open device
 * (queue) after the fact.
 *
 * We manage this complexity by having our own limits struct,
 * zvol_queue_limits_t, in which we carry any queue config that we're
 * interested in setting. This structure is the same on all kernels.
 *
 * These limits are then applied to the queue at device open time by the most
 * appropriate method for the kernel.
 *
 * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of
 * blk_alloc_disk() exists). This converts our limits struct to a proper Linux
 * struct queue_limits, and passes it in. Any fields added in later kernels are
 * (obviously) not set up here.
 *
 * zvol_queue_limits_apply() is called on all kernel versions after the queue
 * is created, and applies any remaining config. Before 6.9 that will be
 * everything, via setter methods. After 6.9 that will be whatever couldn't be
 * put into struct queue_limits. (This implies that zvol_queue_limits_apply()
 * will always be a no-op on the latest kernel we support).
 */
typedef struct zvol_queue_limits {
	unsigned int zql_max_hw_sectors;
	unsigned short zql_max_segments;
	unsigned int zql_max_segment_size;
	unsigned int zql_io_opt;
	unsigned int zql_physical_block_size;
	unsigned int zql_max_discard_sectors;
	unsigned int zql_discard_granularity;
} zvol_queue_limits_t;

static void
zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
    boolean_t use_blk_mq)
{
	limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9;

	if (use_blk_mq) {
		/*
		 * IO requests can be really big (1MB). When an IO request
		 * comes in, it is passed off to zvol_read() or zvol_write()
		 * in a new thread, where it is chunked up into 'volblocksize'
		 * sized pieces and processed. So for example, if the request
		 * is a 1MB write and your volblocksize is 128k, one zvol_write
		 * thread will take that request and sequentially do eight 128k
		 * IOs. This is due to the fact that the thread needs to lock
		 * each volblocksize sized block.
		 * So you might be wondering:
		 * "instead of passing the whole 1MB request to one thread,
		 * why not pass eight individual 128k chunks to eight threads
		 * and process the whole write in parallel?" The short answer
		 * is that there's a sweet spot number of chunks that balances
		 * the greater parallelism with the added overhead of more
		 * threads. The sweet spot can be different depending on if you
		 * have a read or write heavy workload. Writes typically want
		 * high chunk counts while reads typically want lower ones. On
		 * a test pool with 6 NVMe drives in a 3x 2-disk mirror
		 * configuration, with volblocksize=8k, the sweet spot for good
		 * sequential reads and writes was at 8 chunks.
		 */

		/*
		 * Below we tell the kernel how big we want our requests
		 * to be. You would think that blk_queue_io_opt() would be
		 * used to do this since it is used to "set optimal request
		 * size for the queue", but that doesn't seem to do
		 * anything - the kernel still gives you huge requests
		 * with tons of little PAGE_SIZE segments contained within it.
		 *
		 * Knowing that the kernel will just give you PAGE_SIZE segments
		 * no matter what, you can say "ok, I want PAGE_SIZE byte
		 * segments, and I want 'N' of them per request", where N is
		 * the correct number of segments for the volblocksize and
		 * number of chunks you want.
		 */
		if (zvol_blk_mq_blocks_per_thread != 0) {
			unsigned int chunks;
			chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);

			limits->zql_max_segment_size = PAGE_SIZE;
			limits->zql_max_segments =
			    (zv->zv_volblocksize * chunks) / PAGE_SIZE;
		} else {
			/*
			 * Special case: zvol_blk_mq_blocks_per_thread = 0
			 * Max everything out.
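			 * No artificial cap is placed on the segment
			 * count or segment size.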
			 */
			limits->zql_max_segments = UINT16_MAX;
			limits->zql_max_segment_size = UINT_MAX;
		}
	} else {
		limits->zql_max_segments = UINT16_MAX;
		limits->zql_max_segment_size = UINT_MAX;
	}

	limits->zql_io_opt = DMU_MAX_ACCESS / 2;

	limits->zql_physical_block_size = zv->zv_volblocksize;
	limits->zql_max_discard_sectors =
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9;
	limits->zql_discard_granularity = zv->zv_volblocksize;
}

#ifdef HAVE_BLK_ALLOC_DISK_2ARG
static void
zvol_queue_limits_convert(zvol_queue_limits_t *limits,
    struct queue_limits *qlimits)
{
	memset(qlimits, 0, sizeof (struct queue_limits));
	qlimits->max_hw_sectors = limits->zql_max_hw_sectors;
	qlimits->max_segments = limits->zql_max_segments;
	qlimits->max_segment_size = limits->zql_max_segment_size;
	qlimits->io_opt = limits->zql_io_opt;
	qlimits->physical_block_size = limits->zql_physical_block_size;
	qlimits->max_discard_sectors = limits->zql_max_discard_sectors;
	qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors;
	qlimits->discard_granularity = limits->zql_discard_granularity;
#ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
	qlimits->features =
	    BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT;
#endif
}
#endif

static void
zvol_queue_limits_apply(zvol_queue_limits_t *limits,
    struct request_queue *queue)
{
#ifndef HAVE_BLK_ALLOC_DISK_2ARG
	blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);
	blk_queue_max_segments(queue, limits->zql_max_segments);
	blk_queue_max_segment_size(queue, limits->zql_max_segment_size);
	blk_queue_io_opt(queue, limits->zql_io_opt);
	blk_queue_physical_block_size(queue, limits->zql_physical_block_size);
	blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors);
	blk_queue_discard_granularity(queue, limits->zql_discard_granularity);
#endif
#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
	blk_queue_set_write_cache(queue, B_TRUE);
	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue);
#endif
}

static int
zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
{
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
#if defined(HAVE_BLK_ALLOC_DISK)
	zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
	if (zso->zvo_disk == NULL)
		return (1);

	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
	struct queue_limits qlimits;
	zvol_queue_limits_convert(limits, &qlimits);
	struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
	if (IS_ERR(disk)) {
		zso->zvo_disk = NULL;
		return (1);
	}

	zso->zvo_disk = disk;
	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;

#else
	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)
		return (1);

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		return (1);
	}

	zso->zvo_disk->queue = zso->zvo_queue;
#endif /* HAVE_BLK_ALLOC_DISK */
#else
	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)
		return (1);

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		return (1);
	}

	zso->zvo_disk->queue = zso->zvo_queue;
#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */

	zvol_queue_limits_apply(limits, zso->zvo_queue);

	return (0);

}

static int
zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
{
	struct zvol_state_os *zso = zv->zv_zso;

	/* Allocate our blk-mq tag_set */
	if (zvol_blk_mq_alloc_tag_set(zv) != 0)
		return (1);

#if defined(HAVE_BLK_ALLOC_DISK)
	zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
	if (zso->zvo_disk == NULL) {
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}
	zso->zvo_queue = zso->zvo_disk->queue;
	zso->zvo_disk->minors = ZVOL_MINORS;
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
	struct queue_limits qlimits;
	zvol_queue_limits_convert(limits, &qlimits);
	struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv);
	if (IS_ERR(disk)) {
		zso->zvo_disk = NULL;
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}

	zso->zvo_disk = disk;
	zso->zvo_queue = zso->zvo_disk->queue;
	zso->zvo_disk->minors = ZVOL_MINORS;
#else
	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}
	/* Allocate queue */
	zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
	if (IS_ERR(zso->zvo_queue)) {
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}

	/* Our queue is now created, assign it to our disk */
	zso->zvo_disk->queue = zso->zvo_queue;
#endif

	zvol_queue_limits_apply(limits, zso->zvo_queue);

	return (0);
}

/*
 * Allocate memory for a new zvol_state_t and setup the required
 * request queue and generic disk structures for the block device.
 */
static zvol_state_t *
zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
{
	zvol_state_t *zv;
	struct zvol_state_os *zso;
	uint64_t volmode;
	int ret;

	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
		return (NULL);

	if (volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;

	if (volmode == ZFS_VOLMODE_NONE)
		return (NULL);

	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_zso = zso;
	zv->zv_volmode = volmode;
	zv->zv_volblocksize = volblocksize;

	list_link_init(&zv->zv_next);
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);

	zv->zv_zso->use_blk_mq = zvol_use_blk_mq;

	zvol_queue_limits_t limits;
	zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq);

	/*
	 * The block layer has 3 interfaces for getting BIOs:
	 *
	 * 1. blk-mq request queues (new)
	 * 2. submit_bio() (oldest)
	 * 3. regular request queues (old).
	 *
	 * Each of those interfaces has two permutations:
	 *
	 * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates
	 *    both the disk and its queue (5.14 kernel or newer)
	 *
	 * b) We don't have blk_*alloc_disk(), and have to allocate the
	 *    disk and the queue separately.
	 *    (5.13 kernel or older)
	 */
	if (zv->zv_zso->use_blk_mq) {
		ret = zvol_alloc_blk_mq(zv, &limits);
		if (ret != 0)
			goto out_kmem;
		zso->zvo_disk->fops = &zvol_ops_blk_mq;
	} else {
		ret = zvol_alloc_non_blk_mq(zso, &limits);
		if (ret != 0)
			goto out_kmem;
		zso->zvo_disk->fops = &zvol_ops;
	}

	/* Limit read-ahead to a single page to prevent over-prefetching. */
	blk_queue_set_read_ahead(zso->zvo_queue, 1);

	if (!zv->zv_zso->use_blk_mq) {
		/* Disable write merging in favor of the ZIO pipeline. */
		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
	}

	zso->zvo_queue->queuedata = zv;
	zso->zvo_dev = dev;
	zv->zv_open_count = 0;
	strlcpy(zv->zv_name, name, sizeof (zv->zv_name));

	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);

	zso->zvo_disk->major = zvol_major;
	zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;

	/*
	 * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices.
	 * This is accomplished by limiting the number of minors for the
	 * device to one and explicitly disabling partition scanning.
	 */
	if (volmode == ZFS_VOLMODE_DEV) {
		zso->zvo_disk->minors = 1;
		zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
		zso->zvo_disk->flags |= GENHD_FL_NO_PART;
	}

	zso->zvo_disk->first_minor = (dev & MINORMASK);
	zso->zvo_disk->private_data = zv;
	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
	    ZVOL_DEV_NAME, (dev & MINORMASK));

	return (zv);

out_kmem:
	kmem_free(zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	return (NULL);
}

/*
 * Cleanup then free a zvol_state_t which was created by zvol_alloc().
 * At this time, the structure is not opened by anyone, is taken off
 * the zvol_state_list, and has its private data set to NULL.
 * The zvol_state_lock is dropped.
 *
 * This function may take many milliseconds to complete (e.g. we've seen
 * it take over 256ms), due to the calls to "blk_cleanup_queue" and
 * "del_gendisk". Thus, consumers need to be careful to account for this
 * latency when calling this function.
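 *
 * The gendisk and queue are also torn down here, using whichever of
 * blk_cleanup_disk(), put_disk(), or blk_cleanup_queue() the running kernel
 * provides.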
 */
void
zvol_os_free(zvol_state_t *zv)
{

	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);
	ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	del_gendisk(zv->zv_zso->zvo_disk);
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
	(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
#if defined(HAVE_BLK_CLEANUP_DISK)
	blk_cleanup_disk(zv->zv_zso->zvo_disk);
#else
	put_disk(zv->zv_zso->zvo_disk);
#endif
#else
	blk_cleanup_queue(zv->zv_zso->zvo_queue);
	put_disk(zv->zv_zso->zvo_disk);
#endif

	if (zv->zv_zso->use_blk_mq)
		blk_mq_free_tag_set(&zv->zv_zso->tag_set);

	ida_simple_remove(&zvol_ida,
	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);

	cv_destroy(&zv->zv_removing_cv);
	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);

	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
}

void
zvol_wait_close(zvol_state_t *zv)
{
}

struct add_disk_work {
	struct delayed_work work;
	struct gendisk *disk;
	int error;
};

static int
__zvol_os_add_disk(struct gendisk *disk)
{
	int error = 0;
#ifdef HAVE_ADD_DISK_RET
	error = add_disk(disk);
#else
	add_disk(disk);
#endif
	return (error);
}

#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
static void
zvol_os_add_disk_work(struct work_struct *work)
{
	struct add_disk_work *add_disk_work;
	add_disk_work = container_of(work, struct add_disk_work, work.work);
	add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk);
}
#endif

/*
 * SPECIAL CASE:
 *
 * This function basically calls add_disk() from a workqueue. You may be
 * thinking: why not just call add_disk() directly?
 *
 * When you call add_disk(), the zvol appears to the world. When this happens,
 * the kernel calls disk_scan_partitions() on the zvol, which behaves
 * differently on the 6.9+ kernels:
 *
 * - 6.8 and older kernels -
 * disk_scan_partitions()
 *	handle = bdev_open_by_dev(
 *		zvol_open()
 *	bdev_release(handle);
 *		zvol_release()
 *
 *
 * - 6.9+ kernels -
 * disk_scan_partitions()
 *	file = bdev_file_open_by_dev()
 *		zvol_open()
 *	fput(file)
 *	< wait for return to userspace >
 *		zvol_release()
 *
 * The difference is that the bdev_release() from the 6.8 kernel is synchronous
 * while the fput() from the 6.9 kernel is async. Or more specifically it's
 * async that has to wait until we return to userspace (since it adds the fput
 * into the caller's work queue with the TWA_RESUME flag set). This is not the
 * behavior we want, since we want to do things like create+destroy a zvol
 * within a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release
 * the reference to the zvol while we're in the IOCTL, which can't wait until
 * we return to userspace.
 *
 * We can get around this since fput() has a special codepath for when it's
 * running in a kernel thread or interrupt.
 * In those cases, it just puts the fput into the system workqueue, which we
 * can force to run with __flush_workqueue(). That is why we call add_disk()
 * from a workqueue - so it runs from a kernel thread and "tricks" the fput()
 * codepaths.
 *
 * Note that __flush_workqueue() is slowly getting deprecated. This may be ok
 * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via
 * fput) to happen, which it eventually, naturally, will from the system_wq
 * without us explicitly calling __flush_workqueue().
 */
static int
zvol_os_add_disk(struct gendisk *disk)
{
#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)	/* 6.9+ kernel */
	struct add_disk_work add_disk_work;

	INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work);
	add_disk_work.disk = disk;
	add_disk_work.error = 0;

	/* Use *_delayed_work functions since they're not GPL'd */
	schedule_delayed_work(&add_disk_work.work, 0);
	flush_delayed_work(&add_disk_work.work);

	__flush_workqueue(system_wq);
	return (add_disk_work.error);
#else	/* <= 6.8 kernel */
	return (__zvol_os_add_disk(disk));
#endif
}

/*
 * Create a block device minor node and setup the linkage between it
 * and the specified volume. Once this function returns the block
 * device is live and ready for use.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t len;
	unsigned minor = 0;
	int error = 0;
	int idx;
	uint64_t hash = zvol_name_hash(name);
	uint64_t volthreading;
	bool replayed_zil = B_FALSE;

	if (zvol_inhibit_dev)
		return (0);

	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
	if (idx < 0)
		return (SET_ERROR(-idx));
	minor = idx << ZVOL_MINOR_BITS;
	if (MINOR(minor) != minor) {
		/* too many partitions can cause an overflow */
		zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
		    name, minor, MINOR(minor));
		ida_simple_remove(&zvol_ida, idx);
		return (SET_ERROR(EINVAL));
	}

	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
	if (zv) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		ida_simple_remove(&zvol_ida, idx);
		return (SET_ERROR(EEXIST));
	}

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	zv = zvol_alloc(MKDEV(zvol_major, minor), name,
	    doi->doi_data_block_size);
	if (zv == NULL) {
		error = SET_ERROR(EAGAIN);
		goto out_dmu_objset_disown;
	}
	zv->zv_hash = hash;

	if (dmu_objset_is_snapshot(os))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	/* Default */
	zv->zv_threading = B_TRUE;
	if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL)
	    == 0)
		zv->zv_threading = volthreading;

	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);

#ifdef QUEUE_FLAG_DISCARD
	blk_queue_flag_set(QUEUE_FLAG_DISCARD,
	    zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_NONROT
	blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_ADD_RANDOM
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
#endif
	/* This flag was introduced in kernel version 4.12. */
#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif

	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
		else
			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
	}
	if (replayed_zil)
		zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	/*
	 * When udev detects the addition of the device it will immediately
	 * invoke blkid(8) to determine the type of content on the device.
	 * Prefetching the blocks commonly scanned by blkid(8) will speed
	 * up this process.
	 */
	len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
	if (len > 0) {
		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
		    ZIO_PRIORITY_SYNC_READ);
	}

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));

	/*
	 * Keep in mind that once add_disk() is called, the zvol is
	 * announced to the world, and zvol_open()/zvol_release() can
	 * be called at any time. Incidentally, add_disk() itself calls
	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
	 * directly as well.
	 */
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		rw_exit(&zvol_state_lock);
		error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
	} else {
		ida_simple_remove(&zvol_ida, idx);
	}

	return (error);
}

void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	int readonly = get_disk_ro(zv->zv_zso->zvo_disk);

	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));

	/* move to new hashtable entry */
	zv->zv_hash = zvol_name_hash(newname);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	/*
	 * The block device's read-only state is briefly changed causing
	 * a KOBJ_CHANGE uevent to be issued. This ensures udev detects
	 * the name change and fixes the symlinks. This does not change
	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
	 * changes. This would normally be done using kobject_uevent() but
	 * that is a GPL-only symbol which is why we need this workaround.
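	 * Toggling the read-only state twice leaves it unchanged while still
	 * generating the uevents.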
	 */
	set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
	set_disk_ro(zv->zv_zso->zvo_disk, readonly);

	dataset_kstats_rename(&zv->zv_kstat, newname);
}

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{

	set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{

	set_capacity(zv->zv_zso->zvo_disk, capacity);
}

int
zvol_init(void)
{
	int error;

	error = zvol_init_impl();
	if (error) {
		printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error);
		return (error);
	}

	error = register_blkdev(zvol_major, ZVOL_DRIVER);
	if (error) {
		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
		return (error);
	}

	if (zvol_blk_mq_queue_depth == 0) {
		zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
	} else {
		zvol_actual_blk_mq_queue_depth =
		    MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
	}

	if (zvol_blk_mq_threads == 0) {
		zvol_blk_mq_actual_threads = num_online_cpus();
	} else {
		zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
		    1024);
	}

	ida_init(&zvol_ida);
	return (0);
}

void
zvol_fini(void)
{
	unregister_blkdev(zvol_major, ZVOL_DRIVER);

	zvol_fini_impl();

	ida_destroy(&zvol_ida);
}

module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");

module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

module_param(zvol_prefetch_bytes, uint, 0644);
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");

module_param(zvol_volmode, uint, 0644);
MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");

module_param(zvol_blk_mq_queue_depth, uint, 0644);
MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");

module_param(zvol_use_blk_mq, uint, 0644);
MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");

module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
    "Process volblocksize blocks per thread");

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
module_param(zvol_open_timeout_ms, uint, 0644);
MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
#endif