/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 */

#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>

static unsigned int zvol_major = ZVOL_MAJOR;
static unsigned int zvol_request_sync = 0;
static unsigned int zvol_prefetch_bytes = (128 * 1024);
static unsigned long zvol_max_discard_blocks = 16384;
static unsigned int zvol_threads = 32;

#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static const unsigned int zvol_open_timeout_ms = 1000;
#endif

struct zvol_state_os {
	struct gendisk		*zvo_disk;	/* generic disk */
	struct request_queue	*zvo_queue;	/* request queue */
	dev_t			zvo_dev;	/* device id */
};

taskq_t *zvol_taskq;
static struct ida zvol_ida;

typedef struct zv_request_stack {
	zvol_state_t	*zv;
	struct bio	*bio;
} zv_request_t;

typedef struct zv_request_task {
	zv_request_t	zvr;
	taskq_ent_t	ent;
} zv_request_task_t;

static zv_request_task_t *
zv_request_task_create(zv_request_t zvr)
{
	zv_request_task_t *task;
	task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP);
	taskq_init_ent(&task->ent);
	task->zvr = zvr;
	return (task);
}

static void
zv_request_task_free(zv_request_task_t *task)
{
	kmem_free(task, sizeof (*task));
}

/*
 * Given a path, return TRUE if path is a ZVOL.
 */
boolean_t
zvol_os_is_zvol(const char *path)
{
	dev_t dev = 0;

	if (vdev_lookup_bdev(path, &dev) != 0)
		return (B_FALSE);

	if (MAJOR(dev) == zvol_major)
		return (B_TRUE);

	return (B_FALSE);
}

static void
zvol_write(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_bvec_init(&uio, bio);

	zvol_state_t *zv = zvr->zv;
	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	/* bios marked as FLUSH need to flush before the write */
	if (bio_is_flush(bio))
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	/* Some requests are just for flush and nothing else. */
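	/*
	 * A flush-only bio carries no data, so after the zil_commit() above
	 * there is nothing left to do: drop the suspend lock taken by the
	 * caller and signal completion.
	 */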
	if (uio.uio_resid == 0) {
		rw_exit(&zv->zv_suspend_lock);
		BIO_END_IO(bio, 0);
		return;
	}

	struct request_queue *q = zv->zv_zso->zvo_queue;
	struct gendisk *disk = zv->zv_zso->zvo_disk;
	ssize_t start_resid = uio.uio_resid;
	unsigned long start_time;

	boolean_t acct = blk_queue_io_stat(q);
	if (acct)
		start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);

	boolean_t sync =
	    bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_WRITER);

	uint64_t volsize = zv->zv_volsize;
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio.uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);

		/* This will only fail for ENOSPC */
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0) {
			zvol_log_write(zv, tx, off, bytes, sync);
		}
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);

	int64_t nwritten = start_resid - uio.uio_resid;
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	task_io_account_write(nwritten);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	rw_exit(&zv->zv_suspend_lock);

	if (acct)
		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);

	BIO_END_IO(bio, -error);
}

static void
zvol_write_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_write(&task->zvr);
	zv_request_task_free(task);
}

static void
zvol_discard(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	zvol_state_t *zv = zvr->zv;
	uint64_t start = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	uint64_t end = start + size;
	boolean_t sync;
	int error = 0;
	dmu_tx_t *tx;

	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);
	ASSERT3P(zv->zv_zilog, !=, NULL);

	struct request_queue *q = zv->zv_zso->zvo_queue;
	struct gendisk *disk = zv->zv_zso->zvo_disk;
	unsigned long start_time;

	boolean_t acct = blk_queue_io_stat(q);
	if (acct)
		start_time = blk_generic_start_io_acct(q, disk, WRITE, bio);

	sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	if (end > zv->zv_volsize) {
		error = SET_ERROR(EIO);
		goto unlock;
	}

	/*
	 * Align the request to volume block boundaries when a secure erase is
	 * not required.  This will prevent dnode_free_range() from zeroing out
	 * the unaligned parts which is slow (read-modify-write) and useless
	 * since we are not freeing any space by doing so.
	 */
	if (!bio_is_secure_erase(bio)) {
		start = P2ROUNDUP(start, zv->zv_volblocksize);
		end = P2ALIGN(end, zv->zv_volblocksize);
		size = end - start;
	}

	if (start >= end)
		goto unlock;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    start, size, RL_WRITER);

	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		zvol_log_truncate(zv, tx, start, size, B_TRUE);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset,
		    ZVOL_OBJ, start, size);
	}
	zfs_rangelock_exit(lr);

	if (error == 0 && sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

unlock:
	rw_exit(&zv->zv_suspend_lock);

	if (acct)
		blk_generic_end_io_acct(q, disk, WRITE, bio, start_time);

	BIO_END_IO(bio, -error);
}

static void
zvol_discard_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_discard(&task->zvr);
	zv_request_task_free(task);
}

static void
zvol_read(zv_request_t *zvr)
{
	struct bio *bio = zvr->bio;
	int error = 0;
	zfs_uio_t uio;

	zfs_uio_bvec_init(&uio, bio);

	zvol_state_t *zv = zvr->zv;
	ASSERT3P(zv, !=, NULL);
	ASSERT3U(zv->zv_open_count, >, 0);

	struct request_queue *q = zv->zv_zso->zvo_queue;
	struct gendisk *disk = zv->zv_zso->zvo_disk;
	ssize_t start_resid = uio.uio_resid;
	unsigned long start_time;

	boolean_t acct = blk_queue_io_stat(q);
	if (acct)
		start_time = blk_generic_start_io_acct(q, disk, READ, bio);

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_READER);

	uint64_t volsize = zv->zv_volsize;
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio.uio_loffset)
			bytes = volsize - uio.uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	int64_t nread = start_resid - uio.uio_resid;
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	task_io_account_read(nread);

	rw_exit(&zv->zv_suspend_lock);

	if (acct)
		blk_generic_end_io_acct(q, disk, READ, bio, start_time);

	BIO_END_IO(bio, -error);
}

static void
zvol_read_task(void *arg)
{
	zv_request_task_t *task = arg;
	zvol_read(&task->zvr);
	zv_request_task_free(task);
}

#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID
static void
zvol_submit_bio(struct bio *bio)
#else
static blk_qc_t
zvol_submit_bio(struct bio *bio)
#endif
#else
static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio)
#endif
{
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
#endif
	zvol_state_t *zv = q->queuedata;
	fstrans_cookie_t cookie = spl_fstrans_mark();
	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	int rw = bio_data_dir(bio);

	if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
		printk(KERN_INFO
		    "%s: bad access: offset=%llu, size=%lu\n",
		    zv->zv_zso->zvo_disk->disk_name,
		    (long long unsigned)offset,
		    (long unsigned)size);

		BIO_END_IO(bio, -SET_ERROR(EIO));
		goto out;
	}

	zv_request_t zvr = {
		.zv = zv,
		.bio = bio,
	};
	zv_request_task_t *task;

	if (rw == WRITE) {
		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
			BIO_END_IO(bio, -SET_ERROR(EROFS));
			goto out;
		}

		/*
		 * Prevents the zvol from being suspended, or the ZIL being
		 * concurrently opened.  Will be released after the i/o
		 * completes.
		 */
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/*
		 * Open a ZIL if this is the first time we have written to this
		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
		 * than zv_state_lock so that we don't need to acquire an
		 * additional lock in this path.
		 */
		if (zv->zv_zilog == NULL) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
			if (zv->zv_zilog == NULL) {
				zv->zv_zilog = zil_open(zv->zv_objset,
				    zvol_get_data);
				zv->zv_flags |= ZVOL_WRITTEN_TO;
				/* replay / destroy done in zvol_create_minor */
				VERIFY0((zv->zv_zilog->zl_header->zh_flags &
				    ZIL_REPLAY_NEEDED));
			}
			rw_downgrade(&zv->zv_suspend_lock);
		}

		/*
		 * We don't want this thread to be blocked waiting for i/o to
		 * complete, so we instead wait from a taskq callback. The
		 * i/o may be a ZIL write (via zil_commit()), or a read of an
		 * indirect block, or a read of a data block (if this is a
		 * partial-block write).  We will indicate that the i/o is
		 * complete by calling BIO_END_IO() from the taskq callback.
		 *
		 * This design allows the calling thread to continue and
		 * initiate more concurrent operations by calling
		 * zvol_request() again. There are typically only a small
		 * number of threads available to call zvol_request() (e.g.
		 * one per iSCSI target), so keeping the latency of
		 * zvol_request() low is important for performance.
		 *
		 * The zvol_request_sync module parameter allows this
		 * behavior to be altered, for performance evaluation
		 * purposes.  If the callback blocks, setting
		 * zvol_request_sync=1 will result in much worse performance.
		 *
		 * We can have up to zvol_threads concurrent i/o's being
		 * processed for all zvols on the system.  This is typically
		 * a vast improvement over the zvol_request_sync=1 behavior
		 * of one i/o at a time per zvol.  However, an even better
		 * design would be for zvol_request() to initiate the zio
		 * directly, and then be notified by the zio_done callback,
		 * which would call BIO_END_IO().  Unfortunately, the DMU/ZIL
		 * interfaces lack this functionality (they block waiting for
		 * the i/o to complete).
		 */
		if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
			if (zvol_request_sync) {
				zvol_discard(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(zvol_taskq,
				    zvol_discard_task, task, 0, &task->ent);
			}
		} else {
			if (zvol_request_sync) {
				zvol_write(&zvr);
			} else {
				task = zv_request_task_create(zvr);
				taskq_dispatch_ent(zvol_taskq,
				    zvol_write_task, task, 0, &task->ent);
			}
		}
	} else {
		/*
		 * The SCST driver, and possibly others, may issue READ I/Os
		 * with a length of zero bytes.  These empty I/Os contain no
		 * data and require no additional handling.
		 */
		if (size == 0) {
			BIO_END_IO(bio, 0);
			goto out;
		}

		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/* See comment in WRITE case above. */
		if (zvol_request_sync) {
			zvol_read(&zvr);
		} else {
			task = zv_request_task_create(zvr);
			taskq_dispatch_ent(zvol_taskq,
			    zvol_read_task, task, 0, &task->ent);
		}
	}

out:
	spl_fstrans_unmark(cookie);
#if (defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
	defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)) && \
	!defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID)
	return (BLK_QC_T_NONE);
#endif
}

static int
zvol_open(struct block_device *bdev, fmode_t flag)
{
	zvol_state_t *zv;
	int error = 0;
	boolean_t drop_suspend = B_FALSE;
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
	hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
	hrtime_t start = gethrtime();

retry:
#endif
	rw_enter(&zvol_state_lock, RW_READER);
	/*
	 * Obtain a copy of private_data under the zvol_state_lock to make
	 * sure that either the result of zvol free code path setting
	 * bdev->bd_disk->private_data to NULL is observed, or zvol_os_free()
	 * is not called on this zv because of the positive zv_open_count.
	 */
	zv = bdev->bd_disk->private_data;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(-ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	/*
	 * Make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
			} else {
				drop_suspend = B_TRUE;
			}
		} else {
			drop_suspend = B_TRUE;
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		boolean_t drop_namespace = B_FALSE;

		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));

		/*
		 * In all other call paths the spa_namespace_lock is taken
		 * before the bdev->bd_mutex lock.  However, on open(2)
		 * the __blkdev_get() function calls fops->open() with the
		 * bdev->bd_mutex lock held.  This can result in a deadlock
		 * when zvols from one pool are used as vdevs in another.
		 *
		 * To prevent a lock inversion deadlock we preemptively
		 * take the spa_namespace_lock.  Normally the lock will not
		 * be contended and this is safe because spa_open_common()
		 * handles the case where the caller already holds the
		 * spa_namespace_lock.
		 *
		 * When the lock cannot be acquired after multiple retries
		 * this must be the vdev on zvol deadlock case and we have
		 * no choice but to return an error.  For 5.12 and older
		 * kernels returning -ERESTARTSYS will result in the
		 * bdev->bd_mutex being dropped, then reacquired, and
		 * fops->open() being called again.  This process can be
		 * repeated safely until both locks are acquired.  For 5.13
		 * and newer the -ERESTARTSYS retry logic was removed from
		 * the kernel so the only option is to return the error for
		 * the caller to handle it.
		 */
		if (!mutex_owned(&spa_namespace_lock)) {
			if (!mutex_tryenter(&spa_namespace_lock)) {
				mutex_exit(&zv->zv_state_lock);
				rw_exit(&zv->zv_suspend_lock);

#ifdef HAVE_BLKDEV_GET_ERESTARTSYS
				schedule();
				return (SET_ERROR(-ERESTARTSYS));
#else
				if ((gethrtime() - start) > timeout)
					return (SET_ERROR(-ERESTARTSYS));

				schedule_timeout(MSEC_TO_TICK(10));
				goto retry;
#endif
			} else {
				drop_namespace = B_TRUE;
			}
		}

		error = -zvol_first_open(zv, !(flag & FMODE_WRITE));

		if (drop_namespace)
			mutex_exit(&spa_namespace_lock);
	}

	if (error == 0) {
		if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
			if (zv->zv_open_count == 0)
				zvol_last_close(zv);

			error = SET_ERROR(-EROFS);
		} else {
			zv->zv_open_count++;
		}
	}

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);

	if (error == 0)
		zfs_check_media_change(bdev);

	return (error);
}

static void
zvol_release(struct gendisk *disk, fmode_t mode)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	zv = disk->private_data;

	mutex_enter(&zv->zv_state_lock);
	ASSERT3U(zv->zv_open_count, >, 0);
	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	zv->zv_open_count--;
	if (zv->zv_open_count == 0) {
		ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
}

static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned int cmd, unsigned long arg)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	int error = 0;

	ASSERT3U(zv->zv_open_count, >, 0);

	switch (cmd) {
	case BLKFLSBUF:
		fsync_bdev(bdev);
		invalidate_bdev(bdev);
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		if (!(zv->zv_flags & ZVOL_RDONLY))
			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);

		rw_exit(&zv->zv_suspend_lock);
		break;

	case BLKZNAME:
		mutex_enter(&zv->zv_state_lock);
		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
		mutex_exit(&zv->zv_state_lock);
		break;

	default:
		error = -ENOTTY;
		break;
	}

	return (SET_ERROR(error));
}

#ifdef CONFIG_COMPAT
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	return (zvol_ioctl(bdev, mode, cmd, arg));
}
#else
#define	zvol_compat_ioctl	NULL
#endif

static unsigned int
zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
	unsigned int mask = 0;

	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
		zv->zv_changed = 0;
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (mask);
}

static int
zvol_revalidate_disk(struct gendisk *disk)
{
	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		set_capacity(zv->zv_zso->zvo_disk,
		    zv->zv_volsize >> SECTOR_BITS);
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (0);
}

int
zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	struct gendisk *disk = zv->zv_zso->zvo_disk;

#if defined(HAVE_REVALIDATE_DISK_SIZE)
	revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0);
#elif defined(HAVE_REVALIDATE_DISK)
	revalidate_disk(disk);
#else
	zvol_revalidate_disk(disk);
#endif
	return (0);
}

void
zvol_os_clear_private(zvol_state_t *zv)
{
	/*
	 * Cleared while holding zvol_state_lock as a writer
	 * which will prevent zvol_open() from opening it.
	 */
	zv->zv_zso->zvo_disk->private_data = NULL;
}

/*
 * Provide a simple virtual geometry for legacy compatibility.  For devices
 * smaller than 1 MiB a small head and sector count is used to allow very
 * tiny devices.  For devices over 1 MiB a standard head and sector count
 * is used to keep the cylinders count reasonable.
 */
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	sector_t sectors;

	ASSERT3U(zv->zv_open_count, >, 0);

	sectors = get_capacity(zv->zv_zso->zvo_disk);

	if (sectors > 2048) {
		geo->heads = 16;
		geo->sectors = 63;
	} else {
		geo->heads = 2;
		geo->sectors = 4;
	}

	geo->start = 0;
	geo->cylinders = sectors / (geo->heads * geo->sectors);

	return (0);
}

static const struct block_device_operations zvol_ops = {
	.open			= zvol_open,
	.release		= zvol_release,
	.ioctl			= zvol_ioctl,
	.compat_ioctl		= zvol_compat_ioctl,
	.check_events		= zvol_check_events,
#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK
	.revalidate_disk	= zvol_revalidate_disk,
#endif
	.getgeo			= zvol_getgeo,
	.owner			= THIS_MODULE,
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	.submit_bio		= zvol_submit_bio,
#endif
};

/*
 * Allocate memory for a new zvol_state_t and setup the required
 * request queue and generic disk structures for the block device.
 */
static zvol_state_t *
zvol_alloc(dev_t dev, const char *name)
{
	zvol_state_t *zv;
	struct zvol_state_os *zso;
	uint64_t volmode;

	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
		return (NULL);

	if (volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;

	if (volmode == ZFS_VOLMODE_NONE)
		return (NULL);

	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_zso = zso;
	zv->zv_volmode = volmode;

	list_link_init(&zv->zv_next);
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);

#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
#ifdef HAVE_BLK_ALLOC_DISK
	zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE);
	if (zso->zvo_disk == NULL)
		goto out_kmem;

	zso->zvo_disk->minors = ZVOL_MINORS;
	zso->zvo_queue = zso->zvo_disk->queue;
#else
	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)
		goto out_kmem;

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		goto out_kmem;
	}

	zso->zvo_disk->queue = zso->zvo_queue;
#endif /* HAVE_BLK_ALLOC_DISK */
#else
	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
	if (zso->zvo_queue == NULL)
		goto out_kmem;

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL) {
		blk_cleanup_queue(zso->zvo_queue);
		goto out_kmem;
	}

	zso->zvo_disk->queue = zso->zvo_queue;
#endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */

	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);

	/* Limit read-ahead to a single page to prevent over-prefetching. */
	blk_queue_set_read_ahead(zso->zvo_queue, 1);

	/* Disable write merging in favor of the ZIO pipeline. */
	blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);

	/* Enable /proc/diskstats */
	blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue);

	zso->zvo_queue->queuedata = zv;
	zso->zvo_dev = dev;
	zv->zv_open_count = 0;
	strlcpy(zv->zv_name, name, MAXNAMELEN);

	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);

	zso->zvo_disk->major = zvol_major;
	zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;

	/*
	 * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices.
	 * This is accomplished by limiting the number of minors for the
	 * device to one and explicitly disabling partition scanning.
	 */
	if (volmode == ZFS_VOLMODE_DEV) {
		zso->zvo_disk->minors = 1;
		zso->zvo_disk->flags &= ~ZFS_GENHD_FL_EXT_DEVT;
		zso->zvo_disk->flags |= ZFS_GENHD_FL_NO_PART;
	}

	zso->zvo_disk->first_minor = (dev & MINORMASK);
	zso->zvo_disk->fops = &zvol_ops;
	zso->zvo_disk->private_data = zv;
	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
	    ZVOL_DEV_NAME, (dev & MINORMASK));

	return (zv);

out_kmem:
	kmem_free(zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	return (NULL);
}

/*
 * Cleanup then free a zvol_state_t which was created by zvol_alloc().
 * At this time, the structure is not opened by anyone, is taken off
 * the zvol_state_list, and has its private data set to NULL.
 * The zvol_state_lock is dropped.
 *
 * This function may take many milliseconds to complete (e.g. we've seen
 * it take over 256ms), due to the calls to "blk_cleanup_queue" and
 * "del_gendisk".  Thus, consumers need to be careful to account for this
 * latency when calling this function.
 */
void
zvol_os_free(zvol_state_t *zv)
{

	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT0(zv->zv_open_count);
	ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	del_gendisk(zv->zv_zso->zvo_disk);
#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \
	defined(HAVE_BLK_ALLOC_DISK)
	blk_cleanup_disk(zv->zv_zso->zvo_disk);
#else
	blk_cleanup_queue(zv->zv_zso->zvo_queue);
	put_disk(zv->zv_zso->zvo_disk);
#endif

	ida_simple_remove(&zvol_ida,
	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);

	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
}

void
zvol_wait_close(zvol_state_t *zv)
{
}

/*
 * Create a block device minor node and setup the linkage between it
 * and the specified volume.  Once this function returns the block
 * device is live and ready for use.
 */
int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t len;
	unsigned minor = 0;
	int error = 0;
	int idx;
	uint64_t hash = zvol_name_hash(name);

	if (zvol_inhibit_dev)
		return (0);

	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
	if (idx < 0)
		return (SET_ERROR(-idx));
	minor = idx << ZVOL_MINOR_BITS;

	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
	if (zv) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		ida_simple_remove(&zvol_ida, idx);
		return (SET_ERROR(EEXIST));
	}

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
	if (zv == NULL) {
		error = SET_ERROR(EAGAIN);
		goto out_dmu_objset_disown;
	}
	zv->zv_hash = hash;

	if (dmu_objset_is_snapshot(os))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);

	blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
	    (DMU_MAX_ACCESS / 4) >> 9);
	blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
	blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
	blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
	blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
	blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
	blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
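	/*
	 * Sizing note: the discard limit above is zvol_max_discard_blocks
	 * volume blocks expressed in 512-byte sectors.  With the default of
	 * 16384 blocks and, for example, an 8 KiB volblocksize, a single
	 * discard request is capped at 16384 * 8192 bytes = 128 MiB.
	 */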
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
#ifdef QUEUE_FLAG_NONROT
	blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_ADD_RANDOM
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
#endif
	/* This flag was introduced in kernel version 4.12. */
#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif

	ASSERT3P(zv->zv_zilog, ==, NULL);
	zv->zv_zilog = zil_open(os, zvol_get_data);
	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(zv->zv_zilog, B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;
	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

	/*
	 * When udev detects the addition of the device it will immediately
	 * invoke blkid(8) to determine the type of content on the device.
	 * Prefetching the blocks commonly scanned by blkid(8) will speed
	 * up this process.
	 */
	len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
	if (len > 0) {
		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
		    ZIO_PRIORITY_SYNC_READ);
	}

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));

	/*
	 * Keep in mind that once add_disk() is called, the zvol is
	 * announced to the world, and zvol_open()/zvol_release() can
	 * be called at any time. Incidentally, add_disk() itself calls
	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
	 * directly as well.
	 */
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		rw_exit(&zvol_state_lock);
#ifdef HAVE_ADD_DISK_RET
		error = add_disk(zv->zv_zso->zvo_disk);
#else
		add_disk(zv->zv_zso->zvo_disk);
#endif
	} else {
		ida_simple_remove(&zvol_ida, idx);
	}

	return (error);
}

void
zvol_os_rename_minor(zvol_state_t *zv, const char *newname)
{
	int readonly = get_disk_ro(zv->zv_zso->zvo_disk);

	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));

	/* move to new hashtable entry */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	/*
	 * The block device's read-only state is briefly changed causing
	 * a KOBJ_CHANGE uevent to be issued.  This ensures udev detects
	 * the name change and fixes the symlinks.  This does not change
	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
	 * changes.  This would normally be done using kobject_uevent() but
	 * that is a GPL-only symbol which is why we need this workaround.
	 */
	set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
	set_disk_ro(zv->zv_zso->zvo_disk, readonly);
}

void
zvol_os_set_disk_ro(zvol_state_t *zv, int flags)
{

	set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

void
zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity)
{

	set_capacity(zv->zv_zso->zvo_disk, capacity);
}

int
zvol_init(void)
{
	int error;
	int threads = MIN(MAX(zvol_threads, 1), 1024);

	error = register_blkdev(zvol_major, ZVOL_DRIVER);
	if (error) {
		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
		return (error);
	}
	zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
	    threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
	if (zvol_taskq == NULL) {
		unregister_blkdev(zvol_major, ZVOL_DRIVER);
		return (-ENOMEM);
	}
	zvol_init_impl();
	ida_init(&zvol_ida);
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
	unregister_blkdev(zvol_major, ZVOL_DRIVER);
	taskq_destroy(zvol_taskq);
	ida_destroy(&zvol_ida);
}

/* BEGIN CSTYLED */
module_param(zvol_inhibit_dev, uint, 0644);
MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");

module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");

module_param(zvol_threads, uint, 0444);
MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");

module_param(zvol_request_sync, uint, 0644);
MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");

module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

module_param(zvol_prefetch_bytes, uint, 0644);
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");

module_param(zvol_volmode, uint, 0644);
MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
/* END CSTYLED */