// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2024, Rob Norris <robn@despairlabs.com>
 * Copyright (c) 2024, Klara, Inc.
 */

#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>
#include <cityhash.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/workqueue.h>
#include <linux/blk-mq.h>

static void zvol_request_impl(zvol_state_t *zv, struct bio *bio,
    struct request *rq, boolean_t force_sync);

static unsigned int zvol_major = ZVOL_MAJOR;
static unsigned long zvol_max_discard_blocks = 16384;

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static unsigned int zvol_open_timeout_ms = 1000;
#endif

static unsigned int zvol_blk_mq_threads = 0;
static unsigned int zvol_blk_mq_actual_threads;
static boolean_t zvol_use_blk_mq = B_FALSE;

/*
 * The maximum number of volblocksize blocks to process per thread. Typically,
 * write heavy workloads perform better with higher values here, and read
 * heavy workloads perform better with lower values, but that's not a hard
 * and fast rule. It's basically a knob to tune between "less overhead with
 * less parallelism" and "more overhead, but more parallelism".
 *
 * '8' was chosen as a reasonable, balanced, default based off of sequential
 * read and write tests to a zvol in an NVMe pool (with 16 CPUs).
 */
static unsigned int zvol_blk_mq_blocks_per_thread = 8;

#ifndef BLKDEV_DEFAULT_RQ
/* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */
#define	BLKDEV_DEFAULT_RQ	BLKDEV_MAX_RQ
#endif

/*
 * Finalize our BIO or request.
 */
static inline void
zvol_end_io(struct bio *bio, struct request *rq, int error)
{
	if (bio) {
		bio->bi_status = errno_to_bi_status(-error);
		bio_endio(bio);
	} else {
		blk_mq_end_request(rq, errno_to_bi_status(error));
	}
}

static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
static unsigned int zvol_actual_blk_mq_queue_depth;

struct zvol_state_os {
	struct gendisk *zvo_disk;		/* generic disk */
	struct request_queue *zvo_queue;	/* request queue */
	dev_t zvo_dev;				/* device id */

	struct blk_mq_tag_set tag_set;

	/* Set from the global 'zvol_use_blk_mq' at zvol load */
	boolean_t use_blk_mq;
};

static struct ida zvol_ida;

/*
 * This is called when a new block multiqueue request comes in. A request
 * contains one or more BIOs.
 */
static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
    const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	zvol_state_t *zv = rq->q->queuedata;

	/* Tell the kernel that we are starting to process this request */
	blk_mq_start_request(rq);

	if (blk_rq_is_passthrough(rq)) {
		/* Skip non-filesystem requests */
		blk_mq_end_request(rq, BLK_STS_IOERR);
		return (BLK_STS_IOERR);
	}

	zvol_request_impl(zv, NULL, rq, 0);

	/* Acknowledge to the kernel that we got this request */
	return (BLK_STS_OK);
}

static struct blk_mq_ops zvol_blk_mq_queue_ops = {
	.queue_rq = zvol_mq_queue_rq,
};

/* Initialize our blk-mq struct */
static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv)
{
	struct zvol_state_os *zso = zv->zv_zso;

	memset(&zso->tag_set, 0, sizeof (zso->tag_set));

	/* Initialize tag set. */
	zso->tag_set.ops = &zvol_blk_mq_queue_ops;
	zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads;
	zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth;
	zso->tag_set.numa_node = NUMA_NO_NODE;
	zso->tag_set.cmd_size = 0;

	/*
	 * We need BLK_MQ_F_BLOCKING here since we do blocking calls in
	 * zvol_request_impl()
	 */
	zso->tag_set.flags = BLK_MQ_F_BLOCKING;

#ifdef BLK_MQ_F_SHOULD_MERGE
	/*
	 * Linux 6.14 removed BLK_MQ_F_SHOULD_MERGE and made it implicit.
	 * For older kernels, we set it.
	 */
	zso->tag_set.flags |= BLK_MQ_F_SHOULD_MERGE;
#endif

	zso->tag_set.driver_data = zv;

	return (blk_mq_alloc_tag_set(&zso->tag_set));
}

/*
 * Given a path, return TRUE if path is a ZVOL.
 */
boolean_t
zvol_os_is_zvol(const char *path)
{
	dev_t dev = 0;

	if (vdev_lookup_bdev(path, &dev) != 0)
		return (B_FALSE);

	if (MAJOR(dev) == zvol_major)
		return (B_TRUE);

	return (B_FALSE);
}

static void
zvol_write(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	int error = 0;
	zfs_uio_t uio;
	zvol_state_t *zv = zvr->zv;
	struct request_queue *q;
	struct gendisk *disk;
	unsigned long start_time = 0;
	boolean_t acct = B_FALSE;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	q = zv->zv_zso->zvo_queue;
	disk = zv->zv_zso->zvo_disk;

	/* A bio marked as FLUSH needs to flush before the write */
	if (io_is_flush(bio, rq))
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	/* Some requests are just for flush and nothing else. */
	if (io_size(bio, rq) == 0) {
		rw_exit(&zv->zv_suspend_lock);
		zvol_end_io(bio, rq, 0);
		return;
	}

	zfs_uio_bvec_init(&uio, bio, rq);

	ssize_t start_resid = uio.uio_resid;

	/*
	 * With use_blk_mq, accounting is done by blk_mq_start_request()
	 * and blk_mq_end_request(), so we can skip it here.
	 */
	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct) {
			start_time = blk_generic_start_io_acct(q, disk, WRITE,
			    bio);
		}
	}

	boolean_t sync =
	    io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_WRITER);

	uint64_t volsize = zv->zv_volsize;
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio.uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);

		/* This will only fail for ENOSPC */
		error = dmu_tx_assign(tx, DMU_TX_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx,
		    DMU_READ_PREFETCH);
		if (error == 0) {
			zvol_log_write(zv, tx, off, bytes, sync);
		}
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);

	int64_t nwritten = start_resid - uio.uio_resid;
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	task_io_account_write(nwritten);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	rw_exit(&zv->zv_suspend_lock);

	if (bio && acct) {
		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);
	}

	zvol_end_io(bio, rq, -error);
}

static void
zvol_write_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_write(&task->zvr);
	zv_request_task_free(task);
}

static void
zvol_discard(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	struct request *rq = zvr->rq;
	zvol_state_t *zv = zvr->zv;
	uint64_t start = io_offset(bio, rq);
	uint64_t size = io_size(bio, rq);
	uint64_t end = start + size;
	boolean_t sync;
	int error = 0;
	dmu_tx_t *tx;
	struct request_queue *q = zv->zv_zso->zvo_queue;
	struct gendisk *disk = zv->zv_zso->zvo_disk;
	unsigned long start_time = 0;
	boolean_t acct = B_FALSE;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	if (bio) {
		acct = blk_queue_io_stat(q);
		if (acct) {
			start_time = blk_generic_start_io_acct(q, disk, WRITE,
			    bio);
		}
	}

	sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	if (end > zv->zv_volsize) {
		error = SET_ERROR(EIO);
		goto unlock;
	}

	/*
	 * Align the request to volume block boundaries when a secure erase is
	 * not required. This will prevent dnode_free_range() from zeroing out
	 * the unaligned parts which is slow (read-modify-write) and useless
	 * since we are not freeing any space by doing so.
337 */ 338 if (!io_is_secure_erase(bio, rq)) { 339 start = P2ROUNDUP(start, zv->zv_volblocksize); 340 end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); 341 size = end - start; 342 } 343 344 if (start >= end) 345 goto unlock; 346 347 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 348 start, size, RL_WRITER); 349 350 tx = dmu_tx_create(zv->zv_objset); 351 dmu_tx_mark_netfree(tx); 352 error = dmu_tx_assign(tx, DMU_TX_WAIT); 353 if (error != 0) { 354 dmu_tx_abort(tx); 355 } else { 356 zvol_log_truncate(zv, tx, start, size); 357 dmu_tx_commit(tx); 358 error = dmu_free_long_range(zv->zv_objset, 359 ZVOL_OBJ, start, size); 360 } 361 zfs_rangelock_exit(lr); 362 363 if (error == 0 && sync) 364 zil_commit(zv->zv_zilog, ZVOL_OBJ); 365 366 unlock: 367 rw_exit(&zv->zv_suspend_lock); 368 369 if (bio && acct) { 370 blk_generic_end_io_acct(q, disk, WRITE, bio, 371 start_time); 372 } 373 374 zvol_end_io(bio, rq, -error); 375 } 376 377 static void 378 zvol_discard_task(void *arg) 379 { 380 zv_request_task_t *task = arg; 381 zvol_discard(&task->zvr); 382 zv_request_task_free(task); 383 } 384 385 static void 386 zvol_read(zv_request_t *zvr) 387 { 388 struct bio *bio = zvr->bio; 389 struct request *rq = zvr->rq; 390 int error = 0; 391 zfs_uio_t uio; 392 boolean_t acct = B_FALSE; 393 zvol_state_t *zv = zvr->zv; 394 struct request_queue *q; 395 struct gendisk *disk; 396 unsigned long start_time = 0; 397 398 ASSERT3P(zv, !=, NULL); 399 ASSERT3U(zv->zv_open_count, >, 0); 400 401 zfs_uio_bvec_init(&uio, bio, rq); 402 403 q = zv->zv_zso->zvo_queue; 404 disk = zv->zv_zso->zvo_disk; 405 406 ssize_t start_resid = uio.uio_resid; 407 408 /* 409 * When blk-mq is being used, accounting is done by 410 * blk_mq_start_request() and blk_mq_end_request(). 411 */ 412 if (bio) { 413 acct = blk_queue_io_stat(q); 414 if (acct) 415 start_time = blk_generic_start_io_acct(q, disk, READ, 416 bio); 417 } 418 419 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 420 uio.uio_loffset, uio.uio_resid, RL_READER); 421 422 uint64_t volsize = zv->zv_volsize; 423 424 while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { 425 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); 426 427 /* don't read past the end */ 428 if (bytes > volsize - uio.uio_loffset) 429 bytes = volsize - uio.uio_loffset; 430 431 error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes, 432 DMU_READ_PREFETCH); 433 if (error) { 434 /* convert checksum errors into IO errors */ 435 if (error == ECKSUM) 436 error = SET_ERROR(EIO); 437 break; 438 } 439 } 440 zfs_rangelock_exit(lr); 441 442 int64_t nread = start_resid - uio.uio_resid; 443 dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); 444 task_io_account_read(nread); 445 446 rw_exit(&zv->zv_suspend_lock); 447 448 if (bio && acct) { 449 blk_generic_end_io_acct(q, disk, READ, bio, start_time); 450 } 451 452 zvol_end_io(bio, rq, -error); 453 } 454 455 static void 456 zvol_read_task(void *arg) 457 { 458 zv_request_task_t *task = arg; 459 zvol_read(&task->zvr); 460 zv_request_task_free(task); 461 } 462 463 464 /* 465 * Process a BIO or request 466 * 467 * Either 'bio' or 'rq' should be set depending on if we are processing a 468 * bio or a request (both should not be set). 
 *
 * force_sync:	Set to 0 to defer processing to a background taskq
 *		Set to 1 to process data synchronously
 */
static void
zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq,
    boolean_t force_sync)
{
	fstrans_cookie_t cookie = spl_fstrans_mark();
	uint64_t offset = io_offset(bio, rq);
	uint64_t size = io_size(bio, rq);
	int rw = io_data_dir(bio, rq);

	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
		zvol_end_io(bio, rq, -SET_ERROR(ENXIO));
		goto out;
	}

	if (zvol_request_sync || zv->zv_threading == B_FALSE)
		force_sync = 1;

	zv_request_t zvr = {
		.zv = zv,
		.bio = bio,
		.rq = rq,
	};

	if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) {
		printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n",
		    zv->zv_zso->zvo_disk->disk_name,
		    (long long unsigned)offset,
		    (long unsigned)size);

		zvol_end_io(bio, rq, -SET_ERROR(EIO));
		goto out;
	}

	zv_request_task_t *task;
	zv_taskq_t *ztqs = &zvol_taskqs;
	uint_t blk_mq_hw_queue = 0;
	uint_t tq_idx;
	uint_t taskq_hash;
	if (rq)
#ifdef HAVE_BLK_MQ_RQ_HCTX
		blk_mq_hw_queue = rq->mq_hctx->queue_num;
#else
		blk_mq_hw_queue =
		    rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num;
#endif
	taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT,
	    blk_mq_hw_queue);
	tq_idx = taskq_hash % ztqs->tqs_cnt;

	if (rw == WRITE) {
		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
			zvol_end_io(bio, rq, -SET_ERROR(EROFS));
			goto out;
		}

		/*
		 * Prevents the zvol from being suspended, or the ZIL being
		 * concurrently opened. Will be released after the i/o
		 * completes.
		 */
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/*
		 * Open a ZIL if this is the first time we have written to this
		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
		 * than zv_state_lock so that we don't need to acquire an
		 * additional lock in this path.
		 */
		if (zv->zv_zilog == NULL) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
			if (zv->zv_zilog == NULL) {
				zv->zv_zilog = zil_open(zv->zv_objset,
				    zvol_get_data, &zv->zv_kstat.dk_zil_sums);
				zv->zv_flags |= ZVOL_WRITTEN_TO;
				/* replay / destroy done in zvol_create_minor */
				VERIFY0((zv->zv_zilog->zl_header->zh_flags &
				    ZIL_REPLAY_NEEDED));
			}
			rw_downgrade(&zv->zv_suspend_lock);
		}

		/*
		 * We don't want this thread to be blocked waiting for i/o to
		 * complete, so we instead wait from a taskq callback. The
		 * i/o may be a ZIL write (via zil_commit()), or a read of an
		 * indirect block, or a read of a data block (if this is a
		 * partial-block write). We will indicate that the i/o is
		 * complete by calling END_IO() from the taskq callback.
		 *
		 * This design allows the calling thread to continue and
		 * initiate more concurrent operations by calling
		 * zvol_request() again. There are typically only a small
		 * number of threads available to call zvol_request() (e.g.
		 * one per iSCSI target), so keeping the latency of
		 * zvol_request() low is important for performance.
		 *
		 * The zvol_request_sync module parameter allows this
		 * behavior to be altered, for performance evaluation
		 * purposes. If the callback blocks, setting
		 * zvol_request_sync=1 will result in much worse performance.
		 *
		 * We can have up to zvol_threads concurrent i/o's being
		 * processed for all zvols on the system. This is typically
		 * a vast improvement over the zvol_request_sync=1 behavior
		 * of one i/o at a time per zvol. However, an even better
		 * design would be for zvol_request() to initiate the zio
		 * directly, and then be notified by the zio_done callback,
		 * which would call END_IO(). Unfortunately, the DMU/ZIL
		 * interfaces lack this functionality (they block waiting for
		 * the i/o to complete).
		 */
		if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) {
			if (force_sync) {
				zvol_discard(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
				    zvol_discard_task, task, 0, &task->ent);
			}
		} else {
			if (force_sync) {
				zvol_write(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
				    zvol_write_task, task, 0, &task->ent);
			}
		}
	} else {
		/*
		 * The SCST driver, and possibly others, may issue READ I/Os
		 * with a length of zero bytes. These empty I/Os contain no
		 * data and require no additional handling.
		 */
		if (size == 0) {
			zvol_end_io(bio, rq, 0);
			goto out;
		}

		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/* See comment in WRITE case above. */
		if (force_sync) {
			zvol_read(&zvr);
		} else {
			task = zv_request_task_create(zvr);
			taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx],
			    zvol_read_task, task, 0, &task->ent);
		}
	}

out:
	spl_fstrans_unmark(cookie);
}

#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
static void
zvol_submit_bio(struct bio *bio)
#else
static blk_qc_t
zvol_submit_bio(struct bio *bio)
#endif
#else
static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio)
#endif
{
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
#endif
	zvol_state_t *zv = q->queuedata;

	zvol_request_impl(zv, bio, NULL, 0);
#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
	defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
	!defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)
	return (BLK_QC_T_NONE);
#endif
}

static int
#ifdef HAVE_BLK_MODE_T
zvol_open(struct gendisk *disk, blk_mode_t flag)
#else
zvol_open(struct block_device *bdev, fmode_t flag)
#endif
{
	zvol_state_t *zv;
	int error = 0;
	boolean_t drop_suspend = B_FALSE;
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
	hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
	hrtime_t start = gethrtime();

retry:
#endif
	rw_enter(&zvol_state_lock, RW_READER);
	/*
	 * Obtain a copy of private_data under the zvol_state_lock to make
	 * sure that either the result of zvol free code path setting
	 * disk->private_data to NULL is observed, or zvol_os_free()
	 * is not called on this zv because of the positive zv_open_count.
	 */
#ifdef HAVE_BLK_MODE_T
	zv = disk->private_data;
#else
	zv = bdev->bd_disk->private_data;
#endif
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (-SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);

	if (unlikely(zv->zv_flags & ZVOL_REMOVING)) {
		mutex_exit(&zv->zv_state_lock);
		rw_exit(&zvol_state_lock);
		return (-SET_ERROR(ENXIO));
	}

	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
			} else {
				drop_suspend = B_TRUE;
			}
		} else {
			drop_suspend = B_TRUE;
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * In all other call paths the spa_namespace_lock is taken
		 * before the bdev->bd_mutex lock. However, on open(2)
		 * the __blkdev_get() function calls fops->open() with the
		 * bdev->bd_mutex lock held. This can result in a deadlock
		 * when zvols from one pool are used as vdevs in another.
		 *
		 * To prevent a lock inversion deadlock we preemptively
		 * take the spa_namespace_lock. Normally the lock will not
		 * be contended and this is safe because spa_open_common()
		 * handles the case where the caller already holds the
		 * spa_namespace_lock.
		 *
		 * When the lock cannot be acquired after multiple retries
		 * this must be the vdev on zvol deadlock case and we have
		 * no choice but to return an error. For 5.12 and older
		 * kernels returning -ERESTARTSYS will result in the
		 * bdev->bd_mutex being dropped, then reacquired, and
		 * fops->open() being called again. This process can be
		 * repeated safely until both locks are acquired. For 5.13
		 * and newer the -ERESTARTSYS retry logic was removed from
		 * the kernel so the only option is to return the error for
		 * the caller to handle it.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;

#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
				schedule();
				return (-SET_ERROR(ERESTARTSYS));
#else
				if ((gethrtime() - start) > timeout)
					return (-SET_ERROR(ERESTARTSYS));

				schedule_timeout_interruptible(
				    MSEC_TO_TICK(10));
				goto retry;
#endif
			} else {
				drop_namespace = B_TRUE;
			}
		}

		error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag)));

		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
	}

	if (error == 0) {
		if ((blk_mode_is_open_write(flag)) &&
		    (zv->zv_flags & ZVOL_RDONLY)) {
			if (zv->zv_open_count == 0)
				zvol_last_close(zv);

			error = -SET_ERROR(EROFS);
		} else {
			zv->zv_open_count++;
		}
	}

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);

	if (error == 0)
#ifdef HAVE_BLK_MODE_T
		disk_check_media_change(disk);
#else
		zfs_check_media_change(bdev);
#endif

	return (error);
}

static void
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG
zvol_release(struct gendisk *disk)
#else
zvol_release(struct gendisk *disk, fmode_t unused)
#endif
{
#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG)
	(void) unused;
#endif
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	zv = disk->private_data;

	mutex_enter(&zv->zv_state_lock);
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	zv->zv_open_count--;
	if (zv->zv_open_count == 0) {
		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
}

static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned int cmd, unsigned long arg)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	int error = 0;

	ASSERT3U(zv->zv_open_count, >, 0);

	switch (cmd) {
	case BLKFLSBUF:
#ifdef HAVE_FSYNC_BDEV
		fsync_bdev(bdev);
#elif defined(HAVE_SYNC_BLOCKDEV)
		sync_blockdev(bdev);
#else
#error "Neither fsync_bdev() nor sync_blockdev() found"
#endif
		invalidate_bdev(bdev);
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		if (!(zv->zv_flags & ZVOL_RDONLY))
			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);

		rw_exit(&zv->zv_suspend_lock);
		break;

	case BLKZNAME:
		mutex_enter(&zv->zv_state_lock);
		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
		mutex_exit(&zv->zv_state_lock);
		break;

	default:
		error = -ENOTTY;
		break;
	}

	return (SET_ERROR(error));
}

#ifdef CONFIG_COMPAT
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	return (zvol_ioctl(bdev, mode, cmd, arg));
}
#else
#define	zvol_compat_ioctl	NULL
#endif

static unsigned int
zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
	unsigned int mask = 0;

	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
		zv->zv_changed = 0;
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (mask);
}

static int
zvol_revalidate_disk(struct gendisk *disk)
{
	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		set_capacity(zv->zv_zso->zvo_disk,
		    zv->zv_volsize >> SECTOR_BITS);
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (0);
}

int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	struct gendisk *disk = zv->zv_zso->zvo_disk;

#if defined(HAVE_REVALIDATE_DISK_SIZE)
	revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
#elif defined(HAVE_REVALIDATE_DISK)
	revalidate_disk(disk);
#else
	zvol_revalidate_disk(disk);
#endif
	return (0);
}

void
zvol_os_clear_private(zvol_state_t *zv)
{
	/*
	 * Cleared while holding zvol_state_lock as a writer
	 * which will prevent zvol_open() from opening it.
	 */
	zv->zv_zso->zvo_disk->private_data = NULL;
}

/*
 * Provide a simple virtual geometry for legacy compatibility. For devices
 * smaller than 1 MiB a small head and sector count is used to allow very
 * tiny devices. For devices over 1 MiB a standard head and sector count
 * is used to keep the cylinders count reasonable.
 */
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	sector_t sectors;

	ASSERT3U(zv->zv_open_count, >, 0);

	sectors = get_capacity(zv->zv_zso->zvo_disk);

	if (sectors > 2048) {
		geo->heads = 16;
		geo->sectors = 63;
	} else {
		geo->heads = 2;
		geo->sectors = 4;
	}

	geo->start = 0;
	geo->cylinders = sectors / (geo->heads * geo->sectors);

	return (0);
}

/*
 * Why have two separate block_device_operations structs?
 *
 * Normally we'd just have one, and assign 'submit_bio' as needed. However,
 * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we
 * can't just change submit_bio dynamically at runtime. So just create two
 * separate structs to get around this.
 */
static const struct block_device_operations zvol_ops_blk_mq = {
	.open = zvol_open,
	.release = zvol_release,
	.ioctl = zvol_ioctl,
	.compat_ioctl = zvol_compat_ioctl,
	.check_events = zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk = zvol_revalidate_disk,
#endif
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
};

static const struct block_device_operations zvol_ops = {
	.open = zvol_open,
	.release = zvol_release,
	.ioctl = zvol_ioctl,
	.compat_ioctl = zvol_compat_ioctl,
	.check_events = zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk = zvol_revalidate_disk,
#endif
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	.submit_bio = zvol_submit_bio,
#endif
};

/*
 * Since 6.9, Linux has been removing queue limit setters in favour of an
 * initial queue_limits struct applied when the device is open. Since 6.11,
 * queue_limits is being extended to allow more things to be applied when the
 * device is open. Setters are also being removed for this.
 *
 * For OpenZFS, this means that depending on kernel version, some options may
 * be set up before the device is open, and some applied to an open device
 * (queue) after the fact.
 *
 * We manage this complexity by having our own limits struct,
 * zvol_queue_limits_t, in which we carry any queue config that we're
 * interested in setting. This structure is the same on all kernels.
 *
 * These limits are then applied to the queue at device open time by the most
 * appropriate method for the kernel.
 *
 * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of
 * blk_alloc_disk() exists). This converts our limits struct to a proper Linux
 * struct queue_limits, and passes it in. Any fields added in later kernels are
 * (obviously) not set up here.
 *
 * zvol_queue_limits_apply() is called on all kernel versions after the queue
 * is created, and applies any remaining config. Before 6.9 that will be
 * everything, via setter methods. After 6.9 that will be whatever couldn't be
 * put into struct queue_limits. (This implies that zvol_queue_limits_apply()
 * will always be a no-op on the latest kernel we support).
 */
typedef struct zvol_queue_limits {
	unsigned int	zql_max_hw_sectors;
	unsigned short	zql_max_segments;
	unsigned int	zql_max_segment_size;
	unsigned int	zql_io_opt;
	unsigned int	zql_physical_block_size;
	unsigned int	zql_max_discard_sectors;
	unsigned int	zql_discard_granularity;
} zvol_queue_limits_t;

static void
zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv,
    boolean_t use_blk_mq)
{
	limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9;

	if (use_blk_mq) {
		/*
		 * IO requests can be really big (1MB). When an IO request
		 * comes in, it is passed off to zvol_read() or zvol_write()
		 * in a new thread, where it is chunked up into 'volblocksize'
		 * sized pieces and processed. So for example, if the request
		 * is a 1MB write and your volblocksize is 128k, one zvol_write
		 * thread will take that request and sequentially do eight 128k
		 * IOs. This is due to the fact that the thread needs to lock
		 * each volblocksize sized block. So you might be wondering:
		 * "instead of passing the whole 1MB request to one thread,
		 * why not pass eight individual 128k chunks to eight threads
		 * and process the whole write in parallel?" The short answer
		 * is that there's a sweet spot number of chunks that balances
		 * the greater parallelism with the added overhead of more
		 * threads. The sweet spot can be different depending on
		 * whether you have a read or write heavy workload. Writes
		 * typically want high chunk counts while reads typically want
		 * lower ones. On a test pool with 6 NVMe drives in a
		 * 3x 2-disk mirror configuration, with volblocksize=8k, the
		 * sweet spot for good sequential reads and writes was at
		 * 8 chunks.
		 */

		/*
		 * Below we tell the kernel how big we want our requests
		 * to be. You would think that blk_queue_io_opt() would be
		 * used to do this since it is used to "set optimal request
		 * size for the queue", but that doesn't seem to do
		 * anything - the kernel still gives you huge requests
		 * with tons of little PAGE_SIZE segments contained within
		 * them.
		 *
		 * Knowing that the kernel will just give you PAGE_SIZE segments
		 * no matter what, you can say "ok, I want PAGE_SIZE byte
		 * segments, and I want 'N' of them per request", where N is
		 * the correct number of segments for the volblocksize and
		 * number of chunks you want.
		 */
		if (zvol_blk_mq_blocks_per_thread != 0) {
			unsigned int chunks;
			chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX);

			limits->zql_max_segment_size = PAGE_SIZE;
			limits->zql_max_segments =
			    (zv->zv_volblocksize * chunks) / PAGE_SIZE;
		} else {
			/*
			 * Special case: zvol_blk_mq_blocks_per_thread = 0
			 * Max everything out.
			 */
			limits->zql_max_segments = UINT16_MAX;
			limits->zql_max_segment_size = UINT_MAX;
		}
	} else {
		limits->zql_max_segments = UINT16_MAX;
		limits->zql_max_segment_size = UINT_MAX;
	}

	limits->zql_io_opt = DMU_MAX_ACCESS / 2;

	limits->zql_physical_block_size = zv->zv_volblocksize;
	limits->zql_max_discard_sectors =
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9;
	limits->zql_discard_granularity = zv->zv_volblocksize;
}

#ifdef HAVE_BLK_ALLOC_DISK_2ARG
static void
zvol_queue_limits_convert(zvol_queue_limits_t *limits,
    struct queue_limits *qlimits)
{
	memset(qlimits, 0, sizeof (struct queue_limits));
	qlimits->max_hw_sectors = limits->zql_max_hw_sectors;
	qlimits->max_segments = limits->zql_max_segments;
	qlimits->max_segment_size = limits->zql_max_segment_size;
	qlimits->io_opt = limits->zql_io_opt;
	qlimits->physical_block_size = limits->zql_physical_block_size;
	qlimits->max_discard_sectors = limits->zql_max_discard_sectors;
	qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors;
	qlimits->discard_granularity = limits->zql_discard_granularity;
#ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
	qlimits->features =
	    BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT;
#endif
}
#endif

static void
zvol_queue_limits_apply(zvol_queue_limits_t *limits,
    struct request_queue *queue)
{
#ifndef HAVE_BLK_ALLOC_DISK_2ARG
	blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors);
	blk_queue_max_segments(queue, limits->zql_max_segments);
	blk_queue_max_segment_size(queue, limits->zql_max_segment_size);
	blk_queue_io_opt(queue, limits->zql_io_opt);
	blk_queue_physical_block_size(queue, limits->zql_physical_block_size);
	blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors);
	blk_queue_discard_granularity(queue, limits->zql_discard_granularity);
#endif
#ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES
	blk_queue_set_write_cache(queue, B_TRUE);
	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue);
#endif
}

static int
zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits)
{
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
#if defined(HAVE_BLK_ALLOC_DISK)
	zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
	if (zso->zvo_disk == NULL)
		return (1);

	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
	struct queue_limits qlimits;
	zvol_queue_limits_convert(limits, &qlimits);
	struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE);
	if (IS_ERR(disk)) {
		zso->zvo_disk = NULL;
		return (1);
	}

	zso->zvo_disk = disk;
	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;

#else
	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)
		return (1);

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		return (1);
	}

	zso->zvo_disk->queue = zso->zvo_queue;
#endif /* HAVE_BLK_ALLOC_DISK */
#else
	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)
		return (1);

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		return (1);
	}

	zso->zvo_disk->queue = zso->zvo_queue;
#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */

	zvol_queue_limits_apply(limits, zso->zvo_queue);

	return (0);

}

static int
zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits)
{
	struct zvol_state_os *zso = zv->zv_zso;

	/* Allocate our blk-mq tag_set */
	if (zvol_blk_mq_alloc_tag_set(zv) != 0)
		return (1);

#if defined(HAVE_BLK_ALLOC_DISK)
	zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv);
	if (zso->zvo_disk == NULL) {
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}
	zso->zvo_queue = zso->zvo_disk->queue;
	zso->zvo_disk->minors = ZVOL_MINORS;
#elif defined(HAVE_BLK_ALLOC_DISK_2ARG)
	struct queue_limits qlimits;
	zvol_queue_limits_convert(limits, &qlimits);
	struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv);
	if (IS_ERR(disk)) {
		zso->zvo_disk = NULL;
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}

	zso->zvo_disk = disk;
	zso->zvo_queue = zso->zvo_disk->queue;
	zso->zvo_disk->minors = ZVOL_MINORS;
#else
	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}
	/* Allocate queue */
	zso->zvo_queue = blk_mq_init_queue(&zso->tag_set);
	if (IS_ERR(zso->zvo_queue)) {
		blk_mq_free_tag_set(&zso->tag_set);
		return (1);
	}

	/* Our queue is now created, assign it to our disk */
	zso->zvo_disk->queue = zso->zvo_queue;
#endif

	zvol_queue_limits_apply(limits, zso->zvo_queue);

	return (0);
}

/*
 * Allocate memory for a new zvol_state_t and set up the required
 * request queue and generic disk structures for the block device.
 */
static zvol_state_t *
zvol_alloc(dev_t dev, const char *name, uint64_t volblocksize)
{
	zvol_state_t *zv;
	struct zvol_state_os *zso;
	uint64_t volmode;
	int ret;

	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
		return (NULL);

	if (volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;

	if (volmode == ZFS_VOLMODE_NONE)
		return (NULL);

	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_zso = zso;
	zv->zv_volmode = volmode;
	zv->zv_volblocksize = volblocksize;

	list_link_init(&zv->zv_next);
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL);

	zv->zv_zso->use_blk_mq = zvol_use_blk_mq;

	zvol_queue_limits_t limits;
	zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq);

	/*
	 * The block layer has 3 interfaces for getting BIOs:
	 *
	 * 1. blk-mq request queues (new)
	 * 2. submit_bio() (oldest)
	 * 3. regular request queues (old).
	 *
	 * Each of those interfaces has two permutations:
	 *
	 * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates
	 *    both the disk and its queue (5.14 kernel or newer)
	 *
	 * b) We don't have blk_*alloc_disk(), and have to allocate the
	 *    disk and the queue separately. (5.13 kernel or older)
	 */
	if (zv->zv_zso->use_blk_mq) {
		ret = zvol_alloc_blk_mq(zv, &limits);
		if (ret != 0)
			goto out_kmem;
		zso->zvo_disk->fops = &zvol_ops_blk_mq;
	} else {
		ret = zvol_alloc_non_blk_mq(zso, &limits);
		if (ret != 0)
			goto out_kmem;
		zso->zvo_disk->fops = &zvol_ops;
	}

	/* Limit read-ahead to a single page to prevent over-prefetching. */
	blk_queue_set_read_ahead(zso->zvo_queue, 1);

	if (!zv->zv_zso->use_blk_mq) {
		/* Disable write merging in favor of the ZIO pipeline. */
		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);
	}

	zso->zvo_queue->queuedata = zv;
	zso->zvo_dev = dev;
	zv->zv_open_count = 0;
	strlcpy(zv->zv_name, name, sizeof (zv->zv_name));

	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);

	zso->zvo_disk->major = zvol_major;
	zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;

	/*
	 * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices.
	 * This is accomplished by limiting the number of minors for the
	 * device to one and explicitly disabling partition scanning.
	 */
	if (volmode == ZFS_VOLMODE_DEV) {
		zso->zvo_disk->minors = 1;
		zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
		zso->zvo_disk->flags |= GENHD_FL_NO_PART;
	}

	zso->zvo_disk->first_minor = (dev & MINORMASK);
	zso->zvo_disk->private_data = zv;
	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
	    ZVOL_DEV_NAME, (dev & MINORMASK));

	return (zv);

out_kmem:
	kmem_free(zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	return (NULL);
}

/*
 * Clean up then free a zvol_state_t which was created by zvol_alloc().
 * At this time, the structure is not opened by anyone, is taken off
 * the zvol_state_list, and has its private data set to NULL.
 * The zvol_state_lock is dropped.
 *
 * This function may take many milliseconds to complete (e.g. we've seen
 * it take over 256ms), due to the calls to "blk_cleanup_queue" and
 * "del_gendisk". Thus, consumers need to be careful to account for this
 * latency when calling this function.
 */
void
zvol_os_free(zvol_state_t *zv)
{

	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);
	ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	del_gendisk(zv->zv_zso->zvo_disk);
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
	(defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG))
#if defined(HAVE_BLK_CLEANUP_DISK)
	blk_cleanup_disk(zv->zv_zso->zvo_disk);
#else
	put_disk(zv->zv_zso->zvo_disk);
#endif
#else
	blk_cleanup_queue(zv->zv_zso->zvo_queue);
	put_disk(zv->zv_zso->zvo_disk);
#endif

	if (zv->zv_zso->use_blk_mq)
		blk_mq_free_tag_set(&zv->zv_zso->tag_set);

	ida_simple_remove(&zvol_ida,
	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);

	cv_destroy(&zv->zv_removing_cv);
	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);

	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
}

void
zvol_wait_close(zvol_state_t *zv)
{
}

struct add_disk_work {
	struct delayed_work work;
	struct gendisk *disk;
	int error;
};

static int
__zvol_os_add_disk(struct gendisk *disk)
{
	int error = 0;
#ifdef HAVE_ADD_DISK_RET
	error = add_disk(disk);
#else
	add_disk(disk);
#endif
	return (error);
}

#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)
static void
zvol_os_add_disk_work(struct work_struct *work)
{
	struct add_disk_work *add_disk_work;
	add_disk_work = container_of(work, struct add_disk_work, work.work);
	add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk);
}
#endif

/*
 * SPECIAL CASE:
 *
 * This function basically calls add_disk() from a workqueue. You may be
 * thinking: why not just call add_disk() directly?
 *
 * When you call add_disk(), the zvol appears to the world. When this happens,
 * the kernel calls disk_scan_partitions() on the zvol, which behaves
 * differently on the 6.9+ kernels:
 *
 * - 6.8 and older kernels -
 * disk_scan_partitions()
 *	handle = bdev_open_by_dev()
 *		zvol_open()
 *	bdev_release(handle);
 *		zvol_release()
 *
 *
 * - 6.9+ kernels -
 * disk_scan_partitions()
 *	file = bdev_file_open_by_dev()
 *		zvol_open()
 *	fput(file)
 *	< wait for return to userspace >
 *		zvol_release()
 *
 * The difference is that the bdev_release() from the 6.8 kernel is synchronous
 * while the fput() from the 6.9 kernel is async. Or more specifically, it's
 * async and has to wait until we return to userspace (since it adds the fput
 * into the caller's work queue with the TWA_RESUME flag set). This is not the
 * behavior we want, since we want to do things like create+destroy a zvol
 * within a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release
 * the reference to the zvol while we're in the IOCTL, which can't wait until
 * we return to userspace.
 *
 * We can get around this since fput() has a special codepath for when it's
 * running in a kernel thread or interrupt. In those cases, it just puts the
 * fput into the system workqueue, which we can force to run with
 * __flush_workqueue(). That is why we call add_disk() from a workqueue - so it
 * runs from a kernel thread and "tricks" the fput() codepaths.
 *
 * Note that __flush_workqueue() is slowly getting deprecated. This may be ok
 * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via
 * fput) to happen, which it eventually, naturally, will from the system_wq
 * without us explicitly calling __flush_workqueue().
 */
static int
zvol_os_add_disk(struct gendisk *disk)
{
#if defined(HAVE_BDEV_FILE_OPEN_BY_PATH)	/* 6.9+ kernel */
	struct add_disk_work add_disk_work;

	INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work);
	add_disk_work.disk = disk;
	add_disk_work.error = 0;

	/* Use *_delayed_work functions since they're not GPL'd */
	schedule_delayed_work(&add_disk_work.work, 0);
	flush_delayed_work(&add_disk_work.work);

	__flush_workqueue(system_wq);
	return (add_disk_work.error);
#else	/* <= 6.8 kernel */
	return (__zvol_os_add_disk(disk));
#endif
}

/*
 * Create a block device minor node and set up the linkage between it
 * and the specified volume. Once this function returns the block
 * device is live and ready for use.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t len;
	unsigned minor = 0;
	int error = 0;
	int idx;
	uint64_t hash = zvol_name_hash(name);
	uint64_t volthreading;
	bool replayed_zil = B_FALSE;

	if (zvol_inhibit_dev)
		return (0);

	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
	if (idx < 0)
		return (SET_ERROR(-idx));
	minor = idx << ZVOL_MINOR_BITS;
	if (MINOR(minor) != minor) {
		/* too many partitions can cause an overflow */
		zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u",
		    name, minor, MINOR(minor));
		ida_simple_remove(&zvol_ida, idx);
		return (SET_ERROR(EINVAL));
	}

	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
	if (zv) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		ida_simple_remove(&zvol_ida, idx);
		return (SET_ERROR(EEXIST));
	}

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	zv = zvol_alloc(MKDEV(zvol_major, minor), name,
	    doi->doi_data_block_size);
	if (zv == NULL) {
		error = SET_ERROR(EAGAIN);
		goto out_dmu_objset_disown;
	}
	zv->zv_hash = hash;

	if (dmu_objset_is_snapshot(os))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	/* Default */
	zv->zv_threading = B_TRUE;
	if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL)
	    == 0)
		zv->zv_threading = volthreading;

	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);

#ifdef QUEUE_FLAG_DISCARD
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_NONROT
	blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_ADD_RANDOM
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
#endif
	/* This flag was introduced in kernel version 4.12. */
#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif

	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);
	if (error)
		goto out_dmu_objset_disown;
	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE);
		else
			replayed_zil = zil_replay(os, zv, zvol_replay_vector);
	}
	if (replayed_zil)
		zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;

	/*
	 * When udev detects the addition of the device it will immediately
	 * invoke blkid(8) to determine the type of content on the device.
	 * Prefetching the blocks commonly scanned by blkid(8) will speed
	 * up this process.
	 */
	len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE);
	if (len > 0) {
		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
		    ZIO_PRIORITY_SYNC_READ);
	}

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));

	/*
	 * Keep in mind that once add_disk() is called, the zvol is
	 * announced to the world, and zvol_open()/zvol_release() can
	 * be called at any time. Incidentally, add_disk() itself calls
	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
	 * directly as well.
	 */
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		rw_exit(&zvol_state_lock);
		error = zvol_os_add_disk(zv->zv_zso->zvo_disk);
	} else {
		ida_simple_remove(&zvol_ida, idx);
	}

	return (error);
}

void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	int readonly = get_disk_ro(zv->zv_zso->zvo_disk);

	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));

	/* move to new hashtable entry */
	zv->zv_hash = zvol_name_hash(newname);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	/*
	 * The block device's read-only state is briefly changed causing
	 * a KOBJ_CHANGE uevent to be issued. This ensures udev detects
	 * the name change and fixes the symlinks. This does not change
	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
	 * changes. This would normally be done using kobject_uevent() but
	 * that is a GPL-only symbol which is why we need this workaround.
	 */
	set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
	set_disk_ro(zv->zv_zso->zvo_disk, readonly);

	dataset_kstats_rename(&zv->zv_kstat, newname);
}

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{

	set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{

	set_capacity(zv->zv_zso->zvo_disk, capacity);
}

int
zvol_init(void)
{
	int error;

	error = zvol_init_impl();
	if (error) {
		printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error);
		return (error);
	}

	error = register_blkdev(zvol_major, ZVOL_DRIVER);
	if (error) {
		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
		return (error);
	}

	if (zvol_blk_mq_queue_depth == 0) {
		zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ;
	} else {
		zvol_actual_blk_mq_queue_depth =
		    MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ);
	}

	if (zvol_blk_mq_threads == 0) {
		zvol_blk_mq_actual_threads = num_online_cpus();
	} else {
		zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1),
		    1024);
	}

	ida_init(&zvol_ida);
	return (0);
}

void
zvol_fini(void)
{
	unregister_blkdev(zvol_major, ZVOL_DRIVER);

	zvol_fini_impl();

	ida_destroy(&zvol_ida);
}

module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");

module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

module_param(zvol_blk_mq_queue_depth, uint, 0644);
MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth");

module_param(zvol_use_blk_mq, uint, 0644);
MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols");

module_param(zvol_blk_mq_blocks_per_thread, uint, 0644);
MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
	"Process volblocksize blocks per thread");

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
module_param(zvol_open_timeout_ms, uint, 0644);
MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
#endif
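
/*
 * Example (a minimal sketch, assuming the zvol code is built into a module
 * named "zfs" that exposes /sys/module/zfs/parameters/): the parameters above
 * are normally given at module load time, e.g.
 *
 *	modprobe zfs zvol_use_blk_mq=1 zvol_blk_mq_blocks_per_thread=8
 *
 * The 0644 parameters can also be written through sysfs later, but note that
 * zvol_init() above derives zvol_actual_blk_mq_queue_depth and
 * zvol_blk_mq_actual_threads only once, so those settings effectively take
 * hold at module load. The 0444 parameters (zvol_major,
 * zvol_max_discard_blocks) are load-time only.
 */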