1 // SPDX-License-Identifier: CDDL-1.0 2 /* 3 * CDDL HEADER START 4 * 5 * The contents of this file are subject to the terms of the 6 * Common Development and Distribution License (the "License"). 7 * You may not use this file except in compliance with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or https://opensource.org/licenses/CDDL-1.0. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright (c) 2012, 2020 by Delphix. All rights reserved. 24 * Copyright (c) 2024, Rob Norris <robn@despairlabs.com> 25 * Copyright (c) 2024, 2025, Klara, Inc. 
26 */ 27 28 #include <sys/dataset_kstats.h> 29 #include <sys/dbuf.h> 30 #include <sys/dmu_traverse.h> 31 #include <sys/dsl_dataset.h> 32 #include <sys/dsl_prop.h> 33 #include <sys/dsl_dir.h> 34 #include <sys/zap.h> 35 #include <sys/zfeature.h> 36 #include <sys/zil_impl.h> 37 #include <sys/dmu_tx.h> 38 #include <sys/zio.h> 39 #include <sys/zfs_rlock.h> 40 #include <sys/spa_impl.h> 41 #include <sys/zvol.h> 42 #include <sys/zvol_impl.h> 43 #include <cityhash.h> 44 45 #include <linux/blkdev_compat.h> 46 #include <linux/task_io_accounting_ops.h> 47 #include <linux/workqueue.h> 48 #include <linux/blk-mq.h> 49 50 static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, 51 struct request *rq, boolean_t force_sync); 52 53 static unsigned int zvol_major = ZVOL_MAJOR; 54 static unsigned long zvol_max_discard_blocks = 16384; 55 56 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 57 static unsigned int zvol_open_timeout_ms = 1000; 58 #endif 59 60 static unsigned int zvol_blk_mq_threads = 0; 61 static unsigned int zvol_blk_mq_actual_threads; 62 static boolean_t zvol_use_blk_mq = B_FALSE; 63 64 /* 65 * The maximum number of volblocksize blocks to process per thread. Typically, 66 * write heavy workloads perform better with higher values here, and read 67 * heavy workloads perform better with lower values, but that's not a hard 68 * and fast rule. It's basically a knob to tune between "less overhead with 69 * less parallelism" and "more overhead, but more parallelism". 70 * 71 * '8' was chosen as a reasonable, balanced, default based off of sequential 72 * read and write tests to a zvol in an NVMe pool (with 16 CPUs). 73 */ 74 static unsigned int zvol_blk_mq_blocks_per_thread = 8; 75 76 #ifndef BLKDEV_DEFAULT_RQ 77 /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ 78 #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ 79 #endif 80 81 /* 82 * Finalize our BIO or request. 
83 */ 84 static inline void 85 zvol_end_io(struct bio *bio, struct request *rq, int error) 86 { 87 ASSERT3U(error, >=, 0); 88 if (bio) { 89 bio->bi_status = errno_to_bi_status(error); 90 bio_endio(bio); 91 } else { 92 blk_mq_end_request(rq, errno_to_bi_status(error)); 93 } 94 } 95 96 static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; 97 static unsigned int zvol_actual_blk_mq_queue_depth; 98 99 struct zvol_state_os { 100 struct gendisk *zvo_disk; /* generic disk */ 101 struct request_queue *zvo_queue; /* request queue */ 102 dev_t zvo_dev; /* device id */ 103 104 struct blk_mq_tag_set tag_set; 105 106 /* Set from the global 'zvol_use_blk_mq' at zvol load */ 107 boolean_t use_blk_mq; 108 }; 109 110 static struct ida zvol_ida; 111 112 /* 113 * This is called when a new block multiqueue request comes in. A request 114 * contains one or more BIOs. 115 */ 116 static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, 117 const struct blk_mq_queue_data *bd) 118 { 119 struct request *rq = bd->rq; 120 zvol_state_t *zv = rq->q->queuedata; 121 122 /* Tell the kernel that we are starting to process this request */ 123 blk_mq_start_request(rq); 124 125 if (blk_rq_is_passthrough(rq)) { 126 /* Skip non filesystem request */ 127 blk_mq_end_request(rq, BLK_STS_IOERR); 128 return (BLK_STS_IOERR); 129 } 130 131 zvol_request_impl(zv, NULL, rq, 0); 132 133 /* Acknowledge to the kernel that we got this request */ 134 return (BLK_STS_OK); 135 } 136 137 static struct blk_mq_ops zvol_blk_mq_queue_ops = { 138 .queue_rq = zvol_mq_queue_rq, 139 }; 140 141 /* Initialize our blk-mq struct */ 142 static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) 143 { 144 struct zvol_state_os *zso = zv->zv_zso; 145 146 memset(&zso->tag_set, 0, sizeof (zso->tag_set)); 147 148 /* Initialize tag set. 
*/ 149 zso->tag_set.ops = &zvol_blk_mq_queue_ops; 150 zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads; 151 zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; 152 zso->tag_set.numa_node = NUMA_NO_NODE; 153 zso->tag_set.cmd_size = 0; 154 155 /* 156 * We need BLK_MQ_F_BLOCKING here since we do blocking calls in 157 * zvol_request_impl() 158 */ 159 zso->tag_set.flags = BLK_MQ_F_BLOCKING; 160 161 #ifdef BLK_MQ_F_SHOULD_MERGE 162 /* 163 * Linux 6.14 removed BLK_MQ_F_SHOULD_MERGE and made it implicit. 164 * For older kernels, we set it. 165 */ 166 zso->tag_set.flags |= BLK_MQ_F_SHOULD_MERGE; 167 #endif 168 169 zso->tag_set.driver_data = zv; 170 171 return (blk_mq_alloc_tag_set(&zso->tag_set)); 172 } 173 174 /* 175 * Given a path, return TRUE if path is a ZVOL. 176 */ 177 boolean_t 178 zvol_os_is_zvol(const char *path) 179 { 180 dev_t dev = 0; 181 182 if (vdev_lookup_bdev(path, &dev) != 0) 183 return (B_FALSE); 184 185 if (MAJOR(dev) == zvol_major) 186 return (B_TRUE); 187 188 return (B_FALSE); 189 } 190 191 static void 192 zvol_write(zv_request_t *zvr) 193 { 194 struct bio *bio = zvr->bio; 195 struct request *rq = zvr->rq; 196 int error = 0; 197 zfs_uio_t uio; 198 zvol_state_t *zv = zvr->zv; 199 struct request_queue *q; 200 struct gendisk *disk; 201 unsigned long start_time = 0; 202 boolean_t acct = B_FALSE; 203 204 ASSERT3P(zv, !=, NULL); 205 ASSERT3U(zv->zv_open_count, >, 0); 206 ASSERT3P(zv->zv_zilog, !=, NULL); 207 208 q = zv->zv_zso->zvo_queue; 209 disk = zv->zv_zso->zvo_disk; 210 211 /* bio marked as FLUSH need to flush before write */ 212 if (io_is_flush(bio, rq)) { 213 error = zil_commit(zv->zv_zilog, ZVOL_OBJ); 214 if (error != 0) { 215 rw_exit(&zv->zv_suspend_lock); 216 zvol_end_io(bio, rq, -error); 217 return; 218 } 219 } 220 221 /* Some requests are just for flush and nothing else. 
*/ 222 if (io_size(bio, rq) == 0) { 223 rw_exit(&zv->zv_suspend_lock); 224 zvol_end_io(bio, rq, 0); 225 return; 226 } 227 228 zfs_uio_bvec_init(&uio, bio, rq); 229 230 ssize_t start_resid = uio.uio_resid; 231 232 /* 233 * With use_blk_mq, accounting is done by blk_mq_start_request() 234 * and blk_mq_end_request(), so we can skip it here. 235 */ 236 if (bio) { 237 acct = blk_queue_io_stat(q); 238 if (acct) { 239 start_time = blk_generic_start_io_acct(q, disk, WRITE, 240 bio); 241 } 242 } 243 244 boolean_t sync = 245 io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; 246 247 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 248 uio.uio_loffset, uio.uio_resid, RL_WRITER); 249 250 uint64_t volsize = zv->zv_volsize; 251 while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { 252 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); 253 uint64_t off = uio.uio_loffset; 254 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); 255 256 if (bytes > volsize - off) /* don't write past the end */ 257 bytes = volsize - off; 258 259 dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); 260 261 /* This will only fail for ENOSPC */ 262 error = dmu_tx_assign(tx, DMU_TX_WAIT); 263 if (error) { 264 dmu_tx_abort(tx); 265 break; 266 } 267 error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx, 268 DMU_READ_PREFETCH); 269 if (error == 0) { 270 zvol_log_write(zv, tx, off, bytes, sync); 271 } 272 dmu_tx_commit(tx); 273 274 if (error) 275 break; 276 } 277 zfs_rangelock_exit(lr); 278 279 int64_t nwritten = start_resid - uio.uio_resid; 280 dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); 281 task_io_account_write(nwritten); 282 283 if (error == 0 && sync) 284 error = zil_commit(zv->zv_zilog, ZVOL_OBJ); 285 286 rw_exit(&zv->zv_suspend_lock); 287 288 if (bio && acct) { 289 blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); 290 } 291 292 zvol_end_io(bio, rq, error); 293 } 294 295 static void 296 zvol_write_task(void *arg) 297 { 298 zv_request_task_t 
*task = arg; 299 zvol_write(&task->zvr); 300 zv_request_task_free(task); 301 } 302 303 static void 304 zvol_discard(zv_request_t *zvr) 305 { 306 struct bio *bio = zvr->bio; 307 struct request *rq = zvr->rq; 308 zvol_state_t *zv = zvr->zv; 309 uint64_t start = io_offset(bio, rq); 310 uint64_t size = io_size(bio, rq); 311 uint64_t end = start + size; 312 boolean_t sync; 313 int error = 0; 314 dmu_tx_t *tx; 315 struct request_queue *q = zv->zv_zso->zvo_queue; 316 struct gendisk *disk = zv->zv_zso->zvo_disk; 317 unsigned long start_time = 0; 318 boolean_t acct = B_FALSE; 319 320 ASSERT3P(zv, !=, NULL); 321 ASSERT3U(zv->zv_open_count, >, 0); 322 ASSERT3P(zv->zv_zilog, !=, NULL); 323 324 if (bio) { 325 acct = blk_queue_io_stat(q); 326 if (acct) { 327 start_time = blk_generic_start_io_acct(q, disk, WRITE, 328 bio); 329 } 330 } 331 332 sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; 333 334 if (end > zv->zv_volsize) { 335 error = SET_ERROR(EIO); 336 goto unlock; 337 } 338 339 /* 340 * Align the request to volume block boundaries when a secure erase is 341 * not required. This will prevent dnode_free_range() from zeroing out 342 * the unaligned parts which is slow (read-modify-write) and useless 343 * since we are not freeing any space by doing so. 
344 */ 345 if (!io_is_secure_erase(bio, rq)) { 346 start = P2ROUNDUP(start, zv->zv_volblocksize); 347 end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); 348 size = end - start; 349 } 350 351 if (start >= end) 352 goto unlock; 353 354 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 355 start, size, RL_WRITER); 356 357 tx = dmu_tx_create(zv->zv_objset); 358 dmu_tx_mark_netfree(tx); 359 error = dmu_tx_assign(tx, DMU_TX_WAIT); 360 if (error != 0) { 361 dmu_tx_abort(tx); 362 } else { 363 zvol_log_truncate(zv, tx, start, size); 364 dmu_tx_commit(tx); 365 error = dmu_free_long_range(zv->zv_objset, 366 ZVOL_OBJ, start, size); 367 } 368 zfs_rangelock_exit(lr); 369 370 if (error == 0 && sync) 371 error = zil_commit(zv->zv_zilog, ZVOL_OBJ); 372 373 unlock: 374 rw_exit(&zv->zv_suspend_lock); 375 376 if (bio && acct) { 377 blk_generic_end_io_acct(q, disk, WRITE, bio, 378 start_time); 379 } 380 381 zvol_end_io(bio, rq, error); 382 } 383 384 static void 385 zvol_discard_task(void *arg) 386 { 387 zv_request_task_t *task = arg; 388 zvol_discard(&task->zvr); 389 zv_request_task_free(task); 390 } 391 392 static void 393 zvol_read(zv_request_t *zvr) 394 { 395 struct bio *bio = zvr->bio; 396 struct request *rq = zvr->rq; 397 int error = 0; 398 zfs_uio_t uio; 399 boolean_t acct = B_FALSE; 400 zvol_state_t *zv = zvr->zv; 401 struct request_queue *q; 402 struct gendisk *disk; 403 unsigned long start_time = 0; 404 405 ASSERT3P(zv, !=, NULL); 406 ASSERT3U(zv->zv_open_count, >, 0); 407 408 zfs_uio_bvec_init(&uio, bio, rq); 409 410 q = zv->zv_zso->zvo_queue; 411 disk = zv->zv_zso->zvo_disk; 412 413 ssize_t start_resid = uio.uio_resid; 414 415 /* 416 * When blk-mq is being used, accounting is done by 417 * blk_mq_start_request() and blk_mq_end_request(). 
418 */ 419 if (bio) { 420 acct = blk_queue_io_stat(q); 421 if (acct) 422 start_time = blk_generic_start_io_acct(q, disk, READ, 423 bio); 424 } 425 426 zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, 427 uio.uio_loffset, uio.uio_resid, RL_READER); 428 429 uint64_t volsize = zv->zv_volsize; 430 431 while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { 432 uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); 433 434 /* don't read past the end */ 435 if (bytes > volsize - uio.uio_loffset) 436 bytes = volsize - uio.uio_loffset; 437 438 error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes, 439 DMU_READ_PREFETCH); 440 if (error) { 441 /* convert checksum errors into IO errors */ 442 if (error == ECKSUM) 443 error = SET_ERROR(EIO); 444 break; 445 } 446 } 447 zfs_rangelock_exit(lr); 448 449 int64_t nread = start_resid - uio.uio_resid; 450 dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); 451 task_io_account_read(nread); 452 453 rw_exit(&zv->zv_suspend_lock); 454 455 if (bio && acct) { 456 blk_generic_end_io_acct(q, disk, READ, bio, start_time); 457 } 458 459 zvol_end_io(bio, rq, error); 460 } 461 462 static void 463 zvol_read_task(void *arg) 464 { 465 zv_request_task_t *task = arg; 466 zvol_read(&task->zvr); 467 zv_request_task_free(task); 468 } 469 470 471 /* 472 * Process a BIO or request 473 * 474 * Either 'bio' or 'rq' should be set depending on if we are processing a 475 * bio or a request (both should not be set). 476 * 477 * force_sync: Set to 0 to defer processing to a background taskq 478 * Set to 1 to process data synchronously 479 */ 480 static void 481 zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, 482 boolean_t force_sync) 483 { 484 fstrans_cookie_t cookie = spl_fstrans_mark(); 485 uint64_t offset = io_offset(bio, rq); 486 uint64_t size = io_size(bio, rq); 487 int rw; 488 489 if (rq != NULL) { 490 /* 491 * Flush & trim requests go down the zvol_write codepath. 
Or 492 * more specifically: 493 * 494 * If request is a write, or if it's op_is_sync() and not a 495 * read, or if it's a flush, or if it's a discard, then send the 496 * request down the write path. 497 */ 498 if (op_is_write(rq->cmd_flags) || 499 (op_is_sync(rq->cmd_flags) && req_op(rq) != REQ_OP_READ) || 500 req_op(rq) == REQ_OP_FLUSH || 501 op_is_discard(rq->cmd_flags)) { 502 rw = WRITE; 503 } else { 504 rw = READ; 505 } 506 } else { 507 rw = bio_data_dir(bio); 508 } 509 510 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { 511 zvol_end_io(bio, rq, SET_ERROR(ENXIO)); 512 goto out; 513 } 514 515 if (zvol_request_sync || zv->zv_threading == B_FALSE) 516 force_sync = 1; 517 518 zv_request_t zvr = { 519 .zv = zv, 520 .bio = bio, 521 .rq = rq, 522 }; 523 524 if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { 525 printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", 526 zv->zv_zso->zvo_disk->disk_name, 527 (long long unsigned)offset, 528 (long unsigned)size); 529 530 zvol_end_io(bio, rq, SET_ERROR(EIO)); 531 goto out; 532 } 533 534 zv_request_task_t *task; 535 zv_taskq_t *ztqs = &zvol_taskqs; 536 uint_t blk_mq_hw_queue = 0; 537 uint_t tq_idx; 538 uint_t taskq_hash; 539 if (rq) 540 #ifdef HAVE_BLK_MQ_RQ_HCTX 541 blk_mq_hw_queue = rq->mq_hctx->queue_num; 542 #else 543 blk_mq_hw_queue = rq->q->queue_hw_ctx[ 544 rq->q->mq_map[raw_smp_processor_id()]]->queue_num; 545 #endif 546 taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, 547 blk_mq_hw_queue); 548 tq_idx = taskq_hash % ztqs->tqs_cnt; 549 550 if (rw == WRITE) { 551 if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { 552 zvol_end_io(bio, rq, SET_ERROR(EROFS)); 553 goto out; 554 } 555 556 /* 557 * Prevents the zvol from being suspended, or the ZIL being 558 * concurrently opened. Will be released after the i/o 559 * completes. 560 */ 561 rw_enter(&zv->zv_suspend_lock, RW_READER); 562 563 /* 564 * Open a ZIL if this is the first time we have written to this 565 * zvol. 
We protect zv->zv_zilog with zv_suspend_lock rather 566 * than zv_state_lock so that we don't need to acquire an 567 * additional lock in this path. 568 */ 569 if (zv->zv_zilog == NULL) { 570 rw_exit(&zv->zv_suspend_lock); 571 rw_enter(&zv->zv_suspend_lock, RW_WRITER); 572 if (zv->zv_zilog == NULL) { 573 zv->zv_zilog = zil_open(zv->zv_objset, 574 zvol_get_data, &zv->zv_kstat.dk_zil_sums); 575 zv->zv_flags |= ZVOL_WRITTEN_TO; 576 /* replay / destroy done in zvol_create_minor */ 577 VERIFY0((zv->zv_zilog->zl_header->zh_flags & 578 ZIL_REPLAY_NEEDED)); 579 } 580 rw_downgrade(&zv->zv_suspend_lock); 581 } 582 583 /* 584 * We don't want this thread to be blocked waiting for i/o to 585 * complete, so we instead wait from a taskq callback. The 586 * i/o may be a ZIL write (via zil_commit()), or a read of an 587 * indirect block, or a read of a data block (if this is a 588 * partial-block write). We will indicate that the i/o is 589 * complete by calling END_IO() from the taskq callback. 590 * 591 * This design allows the calling thread to continue and 592 * initiate more concurrent operations by calling 593 * zvol_request() again. There are typically only a small 594 * number of threads available to call zvol_request() (e.g. 595 * one per iSCSI target), so keeping the latency of 596 * zvol_request() low is important for performance. 597 * 598 * The zvol_request_sync module parameter allows this 599 * behavior to be altered, for performance evaluation 600 * purposes. If the callback blocks, setting 601 * zvol_request_sync=1 will result in much worse performance. 602 * 603 * We can have up to zvol_threads concurrent i/o's being 604 * processed for all zvols on the system. This is typically 605 * a vast improvement over the zvol_request_sync=1 behavior 606 * of one i/o at a time per zvol. However, an even better 607 * design would be for zvol_request() to initiate the zio 608 * directly, and then be notified by the zio_done callback, 609 * which would call END_IO(). 
Unfortunately, the DMU/ZIL 610 * interfaces lack this functionality (they block waiting for 611 * the i/o to complete). 612 */ 613 if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { 614 if (force_sync) { 615 zvol_discard(&zvr); 616 } else { 617 task = zv_request_task_create(zvr); 618 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 619 zvol_discard_task, task, 0, &task->ent); 620 } 621 } else { 622 if (force_sync) { 623 zvol_write(&zvr); 624 } else { 625 task = zv_request_task_create(zvr); 626 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 627 zvol_write_task, task, 0, &task->ent); 628 } 629 } 630 } else { 631 /* 632 * The SCST driver, and possibly others, may issue READ I/Os 633 * with a length of zero bytes. These empty I/Os contain no 634 * data and require no additional handling. 635 */ 636 if (size == 0) { 637 zvol_end_io(bio, rq, 0); 638 goto out; 639 } 640 641 rw_enter(&zv->zv_suspend_lock, RW_READER); 642 643 /* See comment in WRITE case above. */ 644 if (force_sync) { 645 zvol_read(&zvr); 646 } else { 647 task = zv_request_task_create(zvr); 648 taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], 649 zvol_read_task, task, 0, &task->ent); 650 } 651 } 652 653 out: 654 spl_fstrans_unmark(cookie); 655 } 656 657 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 658 #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID 659 static void 660 zvol_submit_bio(struct bio *bio) 661 #else 662 static blk_qc_t 663 zvol_submit_bio(struct bio *bio) 664 #endif 665 #else 666 static MAKE_REQUEST_FN_RET 667 zvol_request(struct request_queue *q, struct bio *bio) 668 #endif 669 { 670 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 671 #if defined(HAVE_BIO_BDEV_DISK) 672 struct request_queue *q = bio->bi_bdev->bd_disk->queue; 673 #else 674 struct request_queue *q = bio->bi_disk->queue; 675 #endif 676 #endif 677 zvol_state_t *zv = q->queuedata; 678 679 zvol_request_impl(zv, bio, NULL, 0); 680 #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ 681 defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ 
682 !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) 683 return (BLK_QC_T_NONE); 684 #endif 685 } 686 687 static int 688 #ifdef HAVE_BLK_MODE_T 689 zvol_open(struct gendisk *disk, blk_mode_t flag) 690 #else 691 zvol_open(struct block_device *bdev, fmode_t flag) 692 #endif 693 { 694 zvol_state_t *zv; 695 int error = 0; 696 boolean_t drop_suspend = B_FALSE; 697 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 698 hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); 699 hrtime_t start = gethrtime(); 700 701 retry: 702 #endif 703 704 #ifdef HAVE_BLK_MODE_T 705 zv = atomic_load_ptr(&disk->private_data); 706 #else 707 zv = atomic_load_ptr(&bdev->bd_disk->private_data); 708 #endif 709 if (zv == NULL) { 710 return (-SET_ERROR(ENXIO)); 711 } 712 713 mutex_enter(&zv->zv_state_lock); 714 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { 715 mutex_exit(&zv->zv_state_lock); 716 return (-SET_ERROR(ENXIO)); 717 } 718 719 /* 720 * Make sure zvol is not suspended during first open 721 * (hold zv_suspend_lock) and respect proper lock acquisition 722 * ordering - zv_suspend_lock before zv_state_lock 723 */ 724 if (zv->zv_open_count == 0) { 725 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { 726 mutex_exit(&zv->zv_state_lock); 727 728 /* 729 * Removal may happen while the locks are down, so 730 * we can't trust zv any longer; we have to start over. 
731 */ 732 #ifdef HAVE_BLK_MODE_T 733 zv = atomic_load_ptr(&disk->private_data); 734 #else 735 zv = atomic_load_ptr(&bdev->bd_disk->private_data); 736 #endif 737 if (zv == NULL) 738 return (-SET_ERROR(ENXIO)); 739 740 rw_enter(&zv->zv_suspend_lock, RW_READER); 741 mutex_enter(&zv->zv_state_lock); 742 743 if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { 744 mutex_exit(&zv->zv_state_lock); 745 rw_exit(&zv->zv_suspend_lock); 746 return (-SET_ERROR(ENXIO)); 747 } 748 749 /* check to see if zv_suspend_lock is needed */ 750 if (zv->zv_open_count != 0) { 751 rw_exit(&zv->zv_suspend_lock); 752 } else { 753 drop_suspend = B_TRUE; 754 } 755 } else { 756 drop_suspend = B_TRUE; 757 } 758 } 759 760 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 761 762 if (zv->zv_open_count == 0) { 763 boolean_t drop_namespace = B_FALSE; 764 765 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); 766 767 /* 768 * In all other call paths the spa_namespace_lock is taken 769 * before the bdev->bd_mutex lock. However, on open(2) 770 * the __blkdev_get() function calls fops->open() with the 771 * bdev->bd_mutex lock held. This can result in a deadlock 772 * when zvols from one pool are used as vdevs in another. 773 * 774 * To prevent a lock inversion deadlock we preemptively 775 * take the spa_namespace_lock. Normally the lock will not 776 * be contended and this is safe because spa_open_common() 777 * handles the case where the caller already holds the 778 * spa_namespace_lock. 779 * 780 * When the lock cannot be aquired after multiple retries 781 * this must be the vdev on zvol deadlock case and we have 782 * no choice but to return an error. For 5.12 and older 783 * kernels returning -ERESTARTSYS will result in the 784 * bdev->bd_mutex being dropped, then reacquired, and 785 * fops->open() being called again. This process can be 786 * repeated safely until both locks are acquired. 
For 5.13 787 * and newer the -ERESTARTSYS retry logic was removed from 788 * the kernel so the only option is to return the error for 789 * the caller to handle it. 790 */ 791 if (!mutex_owned(&spa_namespace_lock)) { 792 if (!mutex_tryenter(&spa_namespace_lock)) { 793 mutex_exit(&zv->zv_state_lock); 794 rw_exit(&zv->zv_suspend_lock); 795 drop_suspend = B_FALSE; 796 797 #ifdef HAVE_BLKDEV_GET_ERESTARTSYS 798 schedule(); 799 return (-SET_ERROR(ERESTARTSYS)); 800 #else 801 if ((gethrtime() - start) > timeout) 802 return (-SET_ERROR(ERESTARTSYS)); 803 804 schedule_timeout_interruptible( 805 MSEC_TO_TICK(10)); 806 goto retry; 807 #endif 808 } else { 809 drop_namespace = B_TRUE; 810 } 811 } 812 813 error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); 814 815 if (drop_namespace) 816 mutex_exit(&spa_namespace_lock); 817 } 818 819 if (error == 0) { 820 if ((blk_mode_is_open_write(flag)) && 821 (zv->zv_flags & ZVOL_RDONLY)) { 822 if (zv->zv_open_count == 0) 823 zvol_last_close(zv); 824 825 error = -SET_ERROR(EROFS); 826 } else { 827 zv->zv_open_count++; 828 } 829 } 830 831 mutex_exit(&zv->zv_state_lock); 832 if (drop_suspend) 833 rw_exit(&zv->zv_suspend_lock); 834 835 if (error == 0) 836 #ifdef HAVE_BLK_MODE_T 837 disk_check_media_change(disk); 838 #else 839 zfs_check_media_change(bdev); 840 #endif 841 842 return (error); 843 } 844 845 static void 846 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG 847 zvol_release(struct gendisk *disk) 848 #else 849 zvol_release(struct gendisk *disk, fmode_t unused) 850 #endif 851 { 852 #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) 853 (void) unused; 854 #endif 855 boolean_t drop_suspend = B_TRUE; 856 857 zvol_state_t *zv = atomic_load_ptr(&disk->private_data); 858 if (zv == NULL) 859 return; 860 861 mutex_enter(&zv->zv_state_lock); 862 ASSERT3U(zv->zv_open_count, >, 0); 863 /* 864 * make sure zvol is not suspended during last close 865 * (hold zv_suspend_lock) and respect proper lock acquisition 866 * ordering - 
zv_suspend_lock before zv_state_lock 867 */ 868 if (zv->zv_open_count == 1) { 869 if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { 870 mutex_exit(&zv->zv_state_lock); 871 rw_enter(&zv->zv_suspend_lock, RW_READER); 872 mutex_enter(&zv->zv_state_lock); 873 874 /* 875 * Unlike in zvol_open(), we don't check if removal 876 * started here, because we might be one of the openers 877 * that needs to be thrown out! If we're the last, we 878 * need to call zvol_last_close() below to finish 879 * cleanup. So, no special treatment for us. 880 */ 881 882 /* check to see if zv_suspend_lock is needed */ 883 if (zv->zv_open_count != 1) { 884 rw_exit(&zv->zv_suspend_lock); 885 drop_suspend = B_FALSE; 886 } 887 } 888 } else { 889 drop_suspend = B_FALSE; 890 } 891 892 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 893 894 zv->zv_open_count--; 895 if (zv->zv_open_count == 0) { 896 ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); 897 zvol_last_close(zv); 898 } 899 900 mutex_exit(&zv->zv_state_lock); 901 902 if (drop_suspend) 903 rw_exit(&zv->zv_suspend_lock); 904 } 905 906 static int 907 zvol_ioctl(struct block_device *bdev, fmode_t mode, 908 unsigned int cmd, unsigned long arg) 909 { 910 int error = 0; 911 912 zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data); 913 ASSERT3P(zv, !=, NULL); 914 ASSERT3U(zv->zv_open_count, >, 0); 915 916 switch (cmd) { 917 case BLKFLSBUF: 918 #ifdef HAVE_FSYNC_BDEV 919 fsync_bdev(bdev); 920 #elif defined(HAVE_SYNC_BLOCKDEV) 921 sync_blockdev(bdev); 922 #else 923 #error "Neither fsync_bdev() nor sync_blockdev() found" 924 #endif 925 invalidate_bdev(bdev); 926 rw_enter(&zv->zv_suspend_lock, RW_READER); 927 928 if (!(zv->zv_flags & ZVOL_RDONLY)) 929 txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); 930 931 rw_exit(&zv->zv_suspend_lock); 932 break; 933 934 case BLKZNAME: 935 mutex_enter(&zv->zv_state_lock); 936 error = -copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); 937 mutex_exit(&zv->zv_state_lock); 938 if (error) 939 error = 
SET_ERROR(error); 940 break; 941 942 default: 943 error = SET_ERROR(ENOTTY); 944 break; 945 } 946 947 return (-error); 948 } 949 950 #ifdef CONFIG_COMPAT 951 static int 952 zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, 953 unsigned cmd, unsigned long arg) 954 { 955 return (zvol_ioctl(bdev, mode, cmd, arg)); 956 } 957 #else 958 #define zvol_compat_ioctl NULL 959 #endif 960 961 static unsigned int 962 zvol_check_events(struct gendisk *disk, unsigned int clearing) 963 { 964 unsigned int mask = 0; 965 966 zvol_state_t *zv = atomic_load_ptr(&disk->private_data); 967 968 if (zv != NULL) { 969 mutex_enter(&zv->zv_state_lock); 970 mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; 971 zv->zv_changed = 0; 972 mutex_exit(&zv->zv_state_lock); 973 } 974 975 return (mask); 976 } 977 978 static int 979 zvol_revalidate_disk(struct gendisk *disk) 980 { 981 zvol_state_t *zv = atomic_load_ptr(&disk->private_data); 982 983 if (zv != NULL) { 984 mutex_enter(&zv->zv_state_lock); 985 set_capacity(zv->zv_zso->zvo_disk, 986 zv->zv_volsize >> SECTOR_BITS); 987 mutex_exit(&zv->zv_state_lock); 988 } 989 990 return (0); 991 } 992 993 int 994 zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) 995 { 996 struct gendisk *disk = zv->zv_zso->zvo_disk; 997 998 #if defined(HAVE_REVALIDATE_DISK_SIZE) 999 revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); 1000 #elif defined(HAVE_REVALIDATE_DISK) 1001 revalidate_disk(disk); 1002 #else 1003 zvol_revalidate_disk(disk); 1004 #endif 1005 return (0); 1006 } 1007 1008 /* 1009 * Provide a simple virtual geometry for legacy compatibility. For devices 1010 * smaller than 1 MiB a small head and sector count is used to allow very 1011 * tiny devices. For devices over 1 Mib a standard head and sector count 1012 * is used to keep the cylinders count reasonable. 
1013 */ 1014 static int 1015 zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) 1016 { 1017 sector_t sectors; 1018 1019 zvol_state_t *zv = atomic_load_ptr(&bdev->bd_disk->private_data); 1020 ASSERT3P(zv, !=, NULL); 1021 ASSERT3U(zv->zv_open_count, >, 0); 1022 1023 sectors = get_capacity(zv->zv_zso->zvo_disk); 1024 1025 if (sectors > 2048) { 1026 geo->heads = 16; 1027 geo->sectors = 63; 1028 } else { 1029 geo->heads = 2; 1030 geo->sectors = 4; 1031 } 1032 1033 geo->start = 0; 1034 geo->cylinders = sectors / (geo->heads * geo->sectors); 1035 1036 return (0); 1037 } 1038 1039 /* 1040 * Why have two separate block_device_operations structs? 1041 * 1042 * Normally we'd just have one, and assign 'submit_bio' as needed. However, 1043 * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we 1044 * can't just change submit_bio dynamically at runtime. So just create two 1045 * separate structs to get around this. 1046 */ 1047 static const struct block_device_operations zvol_ops_blk_mq = { 1048 .open = zvol_open, 1049 .release = zvol_release, 1050 .ioctl = zvol_ioctl, 1051 .compat_ioctl = zvol_compat_ioctl, 1052 .check_events = zvol_check_events, 1053 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK 1054 .revalidate_disk = zvol_revalidate_disk, 1055 #endif 1056 .getgeo = zvol_getgeo, 1057 .owner = THIS_MODULE, 1058 }; 1059 1060 static const struct block_device_operations zvol_ops = { 1061 .open = zvol_open, 1062 .release = zvol_release, 1063 .ioctl = zvol_ioctl, 1064 .compat_ioctl = zvol_compat_ioctl, 1065 .check_events = zvol_check_events, 1066 #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK 1067 .revalidate_disk = zvol_revalidate_disk, 1068 #endif 1069 .getgeo = zvol_getgeo, 1070 .owner = THIS_MODULE, 1071 #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS 1072 .submit_bio = zvol_submit_bio, 1073 #endif 1074 }; 1075 1076 /* 1077 * Since 6.9, Linux has been removing queue limit setters in favour of an 1078 * initial queue_limits struct 
applied when the device is open. Since 6.11, 1079 * queue_limits is being extended to allow more things to be applied when the 1080 * device is open. Setters are also being removed for this. 1081 * 1082 * For OpenZFS, this means that depending on kernel version, some options may 1083 * be set up before the device is open, and some applied to an open device 1084 * (queue) after the fact. 1085 * 1086 * We manage this complexity by having our own limits struct, 1087 * zvol_queue_limits_t, in which we carry any queue config that we're 1088 * interested in setting. This structure is the same on all kernels. 1089 * 1090 * These limits are then applied to the queue at device open time by the most 1091 * appropriate method for the kernel. 1092 * 1093 * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of 1094 * blk_alloc_disk() exists). This converts our limits struct to a proper Linux 1095 * struct queue_limits, and passes it in. Any fields added in later kernels are 1096 * (obviously) not set up here. 1097 * 1098 * zvol_queue_limits_apply() is called on all kernel versions after the queue 1099 * is created, and applies any remaining config. Before 6.9 that will be 1100 * everything, via setter methods. After 6.9 that will be whatever couldn't be 1101 * put into struct queue_limits. (This implies that zvol_queue_limits_apply() 1102 * will always be a no-op on the latest kernel we support). 
1103 */ 1104 typedef struct zvol_queue_limits { 1105 unsigned int zql_max_hw_sectors; 1106 unsigned short zql_max_segments; 1107 unsigned int zql_max_segment_size; 1108 unsigned int zql_io_opt; 1109 unsigned int zql_physical_block_size; 1110 unsigned int zql_max_discard_sectors; 1111 unsigned int zql_discard_granularity; 1112 } zvol_queue_limits_t; 1113 1114 static void 1115 zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv, 1116 boolean_t use_blk_mq) 1117 { 1118 limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9; 1119 1120 if (use_blk_mq) { 1121 /* 1122 * IO requests can be really big (1MB). When an IO request 1123 * comes in, it is passed off to zvol_read() or zvol_write() 1124 * in a new thread, where it is chunked up into 'volblocksize' 1125 * sized pieces and processed. So for example, if the request 1126 * is a 1MB write and your volblocksize is 128k, one zvol_write 1127 * thread will take that request and sequentially do ten 128k 1128 * IOs. This is due to the fact that the thread needs to lock 1129 * each volblocksize sized block. So you might be wondering: 1130 * "instead of passing the whole 1MB request to one thread, 1131 * why not pass ten individual 128k chunks to ten threads and 1132 * process the whole write in parallel?" The short answer is 1133 * that there's a sweet spot number of chunks that balances 1134 * the greater parallelism with the added overhead of more 1135 * threads. The sweet spot can be different depending on if you 1136 * have a read or write heavy workload. Writes typically want 1137 * high chunk counts while reads typically want lower ones. On 1138 * a test pool with 6 NVMe drives in a 3x 2-disk mirror 1139 * configuration, with volblocksize=8k, the sweet spot for good 1140 * sequential reads and writes was at 8 chunks. 1141 */ 1142 1143 /* 1144 * Below we tell the kernel how big we want our requests 1145 * to be. 
You would think that blk_queue_io_opt() would be 1146 * used to do this since it is used to "set optimal request 1147 * size for the queue", but that doesn't seem to do 1148 * anything - the kernel still gives you huge requests 1149 * with tons of little PAGE_SIZE segments contained within it. 1150 * 1151 * Knowing that the kernel will just give you PAGE_SIZE segments 1152 * no matter what, you can say "ok, I want PAGE_SIZE byte 1153 * segments, and I want 'N' of them per request", where N is 1154 * the correct number of segments for the volblocksize and 1155 * number of chunks you want. 1156 */ 1157 if (zvol_blk_mq_blocks_per_thread != 0) { 1158 unsigned int chunks; 1159 chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); 1160 1161 limits->zql_max_segment_size = PAGE_SIZE; 1162 limits->zql_max_segments = 1163 (zv->zv_volblocksize * chunks) / PAGE_SIZE; 1164 } else { 1165 /* 1166 * Special case: zvol_blk_mq_blocks_per_thread = 0 1167 * Max everything out. 1168 */ 1169 limits->zql_max_segments = UINT16_MAX; 1170 limits->zql_max_segment_size = UINT_MAX; 1171 } 1172 } else { 1173 limits->zql_max_segments = UINT16_MAX; 1174 limits->zql_max_segment_size = UINT_MAX; 1175 } 1176 1177 limits->zql_io_opt = DMU_MAX_ACCESS / 2; 1178 1179 limits->zql_physical_block_size = zv->zv_volblocksize; 1180 limits->zql_max_discard_sectors = 1181 (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9; 1182 limits->zql_discard_granularity = zv->zv_volblocksize; 1183 } 1184 1185 #ifdef HAVE_BLK_ALLOC_DISK_2ARG 1186 static void 1187 zvol_queue_limits_convert(zvol_queue_limits_t *limits, 1188 struct queue_limits *qlimits) 1189 { 1190 memset(qlimits, 0, sizeof (struct queue_limits)); 1191 qlimits->max_hw_sectors = limits->zql_max_hw_sectors; 1192 qlimits->max_segments = limits->zql_max_segments; 1193 qlimits->max_segment_size = limits->zql_max_segment_size; 1194 qlimits->io_opt = limits->zql_io_opt; 1195 qlimits->physical_block_size = limits->zql_physical_block_size; 1196 
qlimits->max_discard_sectors = limits->zql_max_discard_sectors; 1197 qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors; 1198 qlimits->discard_granularity = limits->zql_discard_granularity; 1199 #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES 1200 qlimits->features = 1201 BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT; 1202 #endif 1203 } 1204 #endif 1205 1206 static void 1207 zvol_queue_limits_apply(zvol_queue_limits_t *limits, 1208 struct request_queue *queue) 1209 { 1210 #ifndef HAVE_BLK_ALLOC_DISK_2ARG 1211 blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors); 1212 blk_queue_max_segments(queue, limits->zql_max_segments); 1213 blk_queue_max_segment_size(queue, limits->zql_max_segment_size); 1214 blk_queue_io_opt(queue, limits->zql_io_opt); 1215 blk_queue_physical_block_size(queue, limits->zql_physical_block_size); 1216 blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors); 1217 blk_queue_discard_granularity(queue, limits->zql_discard_granularity); 1218 #endif 1219 #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES 1220 blk_queue_set_write_cache(queue, B_TRUE); 1221 blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue); 1222 #endif 1223 } 1224 1225 static int 1226 zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) 1227 { 1228 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) 1229 #if defined(HAVE_BLK_ALLOC_DISK) 1230 zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); 1231 if (zso->zvo_disk == NULL) 1232 return (1); 1233 1234 zso->zvo_disk->minors = ZVOL_MINORS; 1235 zso->zvo_queue = zso->zvo_disk->queue; 1236 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) 1237 struct queue_limits qlimits; 1238 zvol_queue_limits_convert(limits, &qlimits); 1239 struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE); 1240 if (IS_ERR(disk)) { 1241 zso->zvo_disk = NULL; 1242 return (1); 1243 } 1244 1245 zso->zvo_disk = disk; 1246 zso->zvo_disk->minors = ZVOL_MINORS; 1247 zso->zvo_queue = zso->zvo_disk->queue; 1248 1249 #else 1250 
zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); 1251 if (zso->zvo_queue == NULL) 1252 return (1); 1253 1254 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1255 if (zso->zvo_disk == NULL) { 1256 blk_cleanup_queue(zso->zvo_queue); 1257 return (1); 1258 } 1259 1260 zso->zvo_disk->queue = zso->zvo_queue; 1261 #endif /* HAVE_BLK_ALLOC_DISK */ 1262 #else 1263 zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); 1264 if (zso->zvo_queue == NULL) 1265 return (1); 1266 1267 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1268 if (zso->zvo_disk == NULL) { 1269 blk_cleanup_queue(zso->zvo_queue); 1270 return (1); 1271 } 1272 1273 zso->zvo_disk->queue = zso->zvo_queue; 1274 #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ 1275 1276 zvol_queue_limits_apply(limits, zso->zvo_queue); 1277 1278 return (0); 1279 1280 } 1281 1282 static int 1283 zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) 1284 { 1285 struct zvol_state_os *zso = zv->zv_zso; 1286 1287 /* Allocate our blk-mq tag_set */ 1288 if (zvol_blk_mq_alloc_tag_set(zv) != 0) 1289 return (1); 1290 1291 #if defined(HAVE_BLK_ALLOC_DISK) 1292 zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); 1293 if (zso->zvo_disk == NULL) { 1294 blk_mq_free_tag_set(&zso->tag_set); 1295 return (1); 1296 } 1297 zso->zvo_queue = zso->zvo_disk->queue; 1298 zso->zvo_disk->minors = ZVOL_MINORS; 1299 #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) 1300 struct queue_limits qlimits; 1301 zvol_queue_limits_convert(limits, &qlimits); 1302 struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv); 1303 if (IS_ERR(disk)) { 1304 zso->zvo_disk = NULL; 1305 blk_mq_free_tag_set(&zso->tag_set); 1306 return (1); 1307 } 1308 1309 zso->zvo_disk = disk; 1310 zso->zvo_queue = zso->zvo_disk->queue; 1311 zso->zvo_disk->minors = ZVOL_MINORS; 1312 #else 1313 zso->zvo_disk = alloc_disk(ZVOL_MINORS); 1314 if (zso->zvo_disk == NULL) { 1315 blk_cleanup_queue(zso->zvo_queue); 1316 blk_mq_free_tag_set(&zso->tag_set); 1317 return (1); 1318 } 1319 
/* Allocate queue */ 1320 zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); 1321 if (IS_ERR(zso->zvo_queue)) { 1322 blk_mq_free_tag_set(&zso->tag_set); 1323 return (1); 1324 } 1325 1326 /* Our queue is now created, assign it to our disk */ 1327 zso->zvo_disk->queue = zso->zvo_queue; 1328 #endif 1329 1330 zvol_queue_limits_apply(limits, zso->zvo_queue); 1331 1332 return (0); 1333 } 1334 1335 /* 1336 * Allocate memory for a new zvol_state_t and setup the required 1337 * request queue and generic disk structures for the block device. 1338 */ 1339 static int 1340 zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize, 1341 zvol_state_t **zvp) 1342 { 1343 zvol_state_t *zv; 1344 struct zvol_state_os *zso; 1345 uint64_t volmode; 1346 int ret; 1347 1348 ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL); 1349 if (ret) 1350 return (ret); 1351 1352 if (volmode == ZFS_VOLMODE_DEFAULT) 1353 volmode = zvol_volmode; 1354 1355 if (volmode == ZFS_VOLMODE_NONE) 1356 return (0); 1357 1358 zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); 1359 zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); 1360 zv->zv_zso = zso; 1361 zv->zv_volmode = volmode; 1362 zv->zv_volsize = volsize; 1363 zv->zv_volblocksize = volblocksize; 1364 1365 list_link_init(&zv->zv_next); 1366 mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); 1367 cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); 1368 1369 zv->zv_zso->use_blk_mq = zvol_use_blk_mq; 1370 1371 zvol_queue_limits_t limits; 1372 zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq); 1373 1374 /* 1375 * The block layer has 3 interfaces for getting BIOs: 1376 * 1377 * 1. blk-mq request queues (new) 1378 * 2. submit_bio() (oldest) 1379 * 3. regular request queues (old). 
1380 * 1381 * Each of those interfaces has two permutations: 1382 * 1383 * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates 1384 * both the disk and its queue (5.14 kernel or newer) 1385 * 1386 * b) We don't have blk_*alloc_disk(), and have to allocate the 1387 * disk and the queue separately. (5.13 kernel or older) 1388 */ 1389 if (zv->zv_zso->use_blk_mq) { 1390 ret = zvol_alloc_blk_mq(zv, &limits); 1391 if (ret != 0) 1392 goto out_kmem; 1393 zso->zvo_disk->fops = &zvol_ops_blk_mq; 1394 } else { 1395 ret = zvol_alloc_non_blk_mq(zso, &limits); 1396 if (ret != 0) 1397 goto out_kmem; 1398 zso->zvo_disk->fops = &zvol_ops; 1399 } 1400 1401 /* Limit read-ahead to a single page to prevent over-prefetching. */ 1402 blk_queue_set_read_ahead(zso->zvo_queue, 1); 1403 1404 if (!zv->zv_zso->use_blk_mq) { 1405 /* Disable write merging in favor of the ZIO pipeline. */ 1406 blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); 1407 } 1408 1409 zso->zvo_queue->queuedata = zv; 1410 zso->zvo_dev = dev; 1411 zv->zv_open_count = 0; 1412 strlcpy(zv->zv_name, name, sizeof (zv->zv_name)); 1413 1414 zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); 1415 rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); 1416 1417 zso->zvo_disk->major = zvol_major; 1418 zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; 1419 1420 /* 1421 * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. 1422 * This is accomplished by limiting the number of minors for the 1423 * device to one and explicitly disabling partition scanning. 
1424 */ 1425 if (volmode == ZFS_VOLMODE_DEV) { 1426 zso->zvo_disk->minors = 1; 1427 zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT; 1428 zso->zvo_disk->flags |= GENHD_FL_NO_PART; 1429 } 1430 1431 zso->zvo_disk->first_minor = (dev & MINORMASK); 1432 zso->zvo_disk->private_data = zv; 1433 snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", 1434 ZVOL_DEV_NAME, (dev & MINORMASK)); 1435 1436 *zvp = zv; 1437 return (ret); 1438 1439 out_kmem: 1440 kmem_free(zso, sizeof (struct zvol_state_os)); 1441 kmem_free(zv, sizeof (zvol_state_t)); 1442 return (ret); 1443 } 1444 1445 void 1446 zvol_os_remove_minor(zvol_state_t *zv) 1447 { 1448 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1449 ASSERT0(zv->zv_open_count); 1450 ASSERT0(atomic_read(&zv->zv_suspend_ref)); 1451 ASSERT(zv->zv_flags & ZVOL_REMOVING); 1452 1453 struct zvol_state_os *zso = zv->zv_zso; 1454 zv->zv_zso = NULL; 1455 1456 /* Clearing private_data will make new callers return immediately. */ 1457 atomic_store_ptr(&zso->zvo_disk->private_data, NULL); 1458 1459 /* 1460 * Drop the state lock before calling del_gendisk(). There may be 1461 * callers waiting to acquire it, but del_gendisk() will block until 1462 * they exit, which would deadlock. 
1463 */ 1464 mutex_exit(&zv->zv_state_lock); 1465 1466 del_gendisk(zso->zvo_disk); 1467 #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ 1468 (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) 1469 #if defined(HAVE_BLK_CLEANUP_DISK) 1470 blk_cleanup_disk(zso->zvo_disk); 1471 #else 1472 put_disk(zso->zvo_disk); 1473 #endif 1474 #else 1475 blk_cleanup_queue(zso->zvo_queue); 1476 put_disk(zso->zvo_disk); 1477 #endif 1478 1479 if (zso->use_blk_mq) 1480 blk_mq_free_tag_set(&zso->tag_set); 1481 1482 ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS); 1483 1484 kmem_free(zso, sizeof (struct zvol_state_os)); 1485 1486 mutex_enter(&zv->zv_state_lock); 1487 } 1488 1489 void 1490 zvol_os_free(zvol_state_t *zv) 1491 { 1492 1493 ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); 1494 ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); 1495 ASSERT0(zv->zv_open_count); 1496 ASSERT0P(zv->zv_zso); 1497 1498 ASSERT0P(zv->zv_objset); 1499 ASSERT0P(zv->zv_zilog); 1500 ASSERT0P(zv->zv_dn); 1501 1502 rw_destroy(&zv->zv_suspend_lock); 1503 zfs_rangelock_fini(&zv->zv_rangelock); 1504 1505 cv_destroy(&zv->zv_removing_cv); 1506 mutex_destroy(&zv->zv_state_lock); 1507 dataset_kstats_destroy(&zv->zv_kstat); 1508 1509 kmem_free(zv, sizeof (zvol_state_t)); 1510 } 1511 1512 void 1513 zvol_wait_close(zvol_state_t *zv) 1514 { 1515 } 1516 1517 struct add_disk_work { 1518 struct delayed_work work; 1519 struct gendisk *disk; 1520 int error; 1521 }; 1522 1523 static int 1524 __zvol_os_add_disk(struct gendisk *disk) 1525 { 1526 int error = 0; 1527 #ifdef HAVE_ADD_DISK_RET 1528 error = -add_disk(disk); 1529 if (error) 1530 error = SET_ERROR(error); 1531 #else 1532 add_disk(disk); 1533 #endif 1534 return (error); 1535 } 1536 1537 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) 1538 static void 1539 zvol_os_add_disk_work(struct work_struct *work) 1540 { 1541 struct add_disk_work *add_disk_work; 1542 add_disk_work = container_of(work, struct add_disk_work, work.work); 1543 
add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk); 1544 } 1545 #endif 1546 1547 /* 1548 * SPECIAL CASE: 1549 * 1550 * This function basically calls add_disk() from a workqueue. You may be 1551 * thinking: why not just call add_disk() directly? 1552 * 1553 * When you call add_disk(), the zvol appears to the world. When this happens, 1554 * the kernel calls disk_scan_partitions() on the zvol, which behaves 1555 * differently on the 6.9+ kernels: 1556 * 1557 * - 6.8 and older kernels - 1558 * disk_scan_partitions() 1559 * handle = bdev_open_by_dev( 1560 * zvol_open() 1561 * bdev_release(handle); 1562 * zvol_release() 1563 * 1564 * 1565 * - 6.9+ kernels - 1566 * disk_scan_partitions() 1567 * file = bdev_file_open_by_dev() 1568 * zvol_open() 1569 * fput(file) 1570 * < wait for return to userspace > 1571 * zvol_release() 1572 * 1573 * The difference is that the bdev_release() from the 6.8 kernel is synchronous 1574 * while the fput() from the 6.9 kernel is async. Or more specifically it's 1575 * async that has to wait until we return to userspace (since it adds the fput 1576 * into the caller's work queue with the TWA_RESUME flag set). This is not the 1577 * behavior we want, since we want do things like create+destroy a zvol within 1578 * a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release the 1579 * reference to the zvol while we're in the IOCTL, which can't wait until we 1580 * return to userspace. 1581 * 1582 * We can get around this since fput() has a special codepath for when it's 1583 * running in a kernel thread or interrupt. In those cases, it just puts the 1584 * fput into the system workqueue, which we can force to run with 1585 * __flush_workqueue(). That is why we call add_disk() from a workqueue - so it 1586 * run from a kernel thread and "tricks" the fput() codepaths. 1587 * 1588 * Note that __flush_workqueue() is slowly getting deprecated. 
This may be ok 1589 * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via 1590 * fput) to happen, which it eventually, naturally, will from the system_wq 1591 * without us explicitly calling __flush_workqueue(). 1592 */ 1593 static int 1594 zvol_os_add_disk(struct gendisk *disk) 1595 { 1596 #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) /* 6.9+ kernel */ 1597 struct add_disk_work add_disk_work; 1598 1599 INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work); 1600 add_disk_work.disk = disk; 1601 add_disk_work.error = 0; 1602 1603 /* Use *_delayed_work functions since they're not GPL'd */ 1604 schedule_delayed_work(&add_disk_work.work, 0); 1605 flush_delayed_work(&add_disk_work.work); 1606 1607 __flush_workqueue(system_wq); 1608 return (add_disk_work.error); 1609 #else /* <= 6.8 kernel */ 1610 return (__zvol_os_add_disk(disk)); 1611 #endif 1612 } 1613 1614 /* 1615 * Create a block device minor node and setup the linkage between it 1616 * and the specified volume. Once this function returns the block 1617 * device is live and ready for use. 
1618 */ 1619 int 1620 zvol_os_create_minor(const char *name) 1621 { 1622 zvol_state_t *zv = NULL; 1623 objset_t *os; 1624 dmu_object_info_t *doi; 1625 uint64_t volsize; 1626 uint64_t len; 1627 unsigned minor = 0; 1628 int error = 0; 1629 int idx; 1630 uint64_t hash = zvol_name_hash(name); 1631 uint64_t volthreading; 1632 bool replayed_zil = B_FALSE; 1633 1634 if (zvol_inhibit_dev) 1635 return (0); 1636 1637 idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); 1638 if (idx < 0) 1639 return (SET_ERROR(-idx)); 1640 minor = idx << ZVOL_MINOR_BITS; 1641 if (MINOR(minor) != minor) { 1642 /* too many partitions can cause an overflow */ 1643 zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", 1644 name, minor, MINOR(minor)); 1645 ida_simple_remove(&zvol_ida, idx); 1646 return (SET_ERROR(EINVAL)); 1647 } 1648 1649 zv = zvol_find_by_name_hash(name, hash, RW_NONE); 1650 if (zv) { 1651 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1652 mutex_exit(&zv->zv_state_lock); 1653 ida_simple_remove(&zvol_ida, idx); 1654 return (SET_ERROR(EEXIST)); 1655 } 1656 1657 doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); 1658 1659 error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); 1660 if (error) 1661 goto out_doi; 1662 1663 error = dmu_object_info(os, ZVOL_OBJ, doi); 1664 if (error) 1665 goto out_dmu_objset_disown; 1666 1667 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); 1668 if (error) 1669 goto out_dmu_objset_disown; 1670 1671 error = zvol_alloc(MKDEV(zvol_major, minor), name, 1672 volsize, doi->doi_data_block_size, &zv); 1673 if (error || zv == NULL) 1674 goto out_dmu_objset_disown; 1675 1676 zv->zv_hash = hash; 1677 1678 if (dmu_objset_is_snapshot(os)) 1679 zv->zv_flags |= ZVOL_RDONLY; 1680 1681 zv->zv_objset = os; 1682 1683 /* Default */ 1684 zv->zv_threading = B_TRUE; 1685 if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL) 1686 == 0) 1687 zv->zv_threading = volthreading; 1688 1689 
set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); 1690 1691 #ifdef QUEUE_FLAG_DISCARD 1692 blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); 1693 #endif 1694 #ifdef QUEUE_FLAG_NONROT 1695 blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); 1696 #endif 1697 #ifdef QUEUE_FLAG_ADD_RANDOM 1698 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); 1699 #endif 1700 /* This flag was introduced in kernel version 4.12. */ 1701 #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH 1702 blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); 1703 #endif 1704 1705 ASSERT0P(zv->zv_kstat.dk_kstats); 1706 error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); 1707 if (error) 1708 goto out_dmu_objset_disown; 1709 ASSERT0P(zv->zv_zilog); 1710 zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); 1711 if (spa_writeable(dmu_objset_spa(os))) { 1712 if (zil_replay_disable) 1713 replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); 1714 else 1715 replayed_zil = zil_replay(os, zv, zvol_replay_vector); 1716 } 1717 if (replayed_zil) 1718 zil_close(zv->zv_zilog); 1719 zv->zv_zilog = NULL; 1720 1721 /* 1722 * When udev detects the addition of the device it will immediately 1723 * invoke blkid(8) to determine the type of content on the device. 1724 * Prefetching the blocks commonly scanned by blkid(8) will speed 1725 * up this process. 1726 */ 1727 len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); 1728 if (len > 0) { 1729 dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); 1730 dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, 1731 ZIO_PRIORITY_SYNC_READ); 1732 } 1733 1734 zv->zv_objset = NULL; 1735 out_dmu_objset_disown: 1736 dmu_objset_disown(os, B_TRUE, FTAG); 1737 out_doi: 1738 kmem_free(doi, sizeof (dmu_object_info_t)); 1739 1740 /* 1741 * Keep in mind that once add_disk() is called, the zvol is 1742 * announced to the world, and zvol_open()/zvol_release() can 1743 * be called at any time. 
Incidentally, add_disk() itself calls 1744 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() 1745 * directly as well. 1746 */ 1747 if (error == 0 && zv) { 1748 rw_enter(&zvol_state_lock, RW_WRITER); 1749 zvol_insert(zv); 1750 rw_exit(&zvol_state_lock); 1751 error = zvol_os_add_disk(zv->zv_zso->zvo_disk); 1752 } else { 1753 ida_simple_remove(&zvol_ida, idx); 1754 } 1755 1756 return (error); 1757 } 1758 1759 int 1760 zvol_os_rename_minor(zvol_state_t *zv, const char *newname) 1761 { 1762 int readonly = get_disk_ro(zv->zv_zso->zvo_disk); 1763 1764 ASSERT(RW_LOCK_HELD(&zvol_state_lock)); 1765 ASSERT(MUTEX_HELD(&zv->zv_state_lock)); 1766 1767 strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); 1768 1769 /* move to new hashtable entry */ 1770 zv->zv_hash = zvol_name_hash(newname); 1771 hlist_del(&zv->zv_hlink); 1772 hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); 1773 1774 /* 1775 * The block device's read-only state is briefly changed causing 1776 * a KOBJ_CHANGE uevent to be issued. This ensures udev detects 1777 * the name change and fixes the symlinks. This does not change 1778 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never 1779 * changes. This would normally be done using kobject_uevent() but 1780 * that is a GPL-only symbol which is why we need this workaround. 
1781 */ 1782 set_disk_ro(zv->zv_zso->zvo_disk, !readonly); 1783 set_disk_ro(zv->zv_zso->zvo_disk, readonly); 1784 1785 dataset_kstats_rename(&zv->zv_kstat, newname); 1786 1787 return (0); 1788 } 1789 1790 void 1791 zvol_os_set_disk_ro(zvol_state_t *zv, int flags) 1792 { 1793 1794 set_disk_ro(zv->zv_zso->zvo_disk, flags); 1795 } 1796 1797 void 1798 zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) 1799 { 1800 1801 set_capacity(zv->zv_zso->zvo_disk, capacity); 1802 } 1803 1804 int 1805 zvol_init(void) 1806 { 1807 int error; 1808 1809 error = zvol_init_impl(); 1810 if (error) { 1811 printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error); 1812 return (error); 1813 } 1814 1815 error = -register_blkdev(zvol_major, ZVOL_DRIVER); 1816 if (error) { 1817 printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); 1818 return (SET_ERROR(error)); 1819 } 1820 1821 if (zvol_blk_mq_queue_depth == 0) { 1822 zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; 1823 } else { 1824 zvol_actual_blk_mq_queue_depth = 1825 MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); 1826 } 1827 1828 if (zvol_blk_mq_threads == 0) { 1829 zvol_blk_mq_actual_threads = num_online_cpus(); 1830 } else { 1831 zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1), 1832 1024); 1833 } 1834 1835 ida_init(&zvol_ida); 1836 return (0); 1837 } 1838 1839 void 1840 zvol_fini(void) 1841 { 1842 unregister_blkdev(zvol_major, ZVOL_DRIVER); 1843 1844 zvol_fini_impl(); 1845 1846 ida_destroy(&zvol_ida); 1847 } 1848 1849 module_param(zvol_major, uint, 0444); 1850 MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); 1851 1852 module_param(zvol_max_discard_blocks, ulong, 0444); 1853 MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); 1854 1855 module_param(zvol_blk_mq_queue_depth, uint, 0644); 1856 MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); 1857 1858 module_param(zvol_use_blk_mq, uint, 0644); 1859 MODULE_PARM_DESC(zvol_use_blk_mq, "Use the 
blk-mq API for zvols"); 1860 1861 module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); 1862 MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, 1863 "Process volblocksize blocks per thread"); 1864 1865 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS 1866 module_param(zvol_open_timeout_ms, uint, 0644); 1867 MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); 1868 #endif 1869