/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 */

#include <sys/dataset_kstats.h>
#include <sys/dbuf.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_dir.h>
#include <sys/zap.h>
#include <sys/zfeature.h>
#include <sys/zil_impl.h>
#include <sys/dmu_tx.h>
#include <sys/zio.h>
#include <sys/zfs_rlock.h>
#include <sys/spa_impl.h>
#include <sys/zvol.h>
#include <sys/zvol_impl.h>

#include <linux/blkdev_compat.h>
#include <linux/task_io_accounting_ops.h>

unsigned int zvol_major = ZVOL_MAJOR;
unsigned int zvol_request_sync = 0;
unsigned int zvol_prefetch_bytes = (128 * 1024);
unsigned long zvol_max_discard_blocks = 16384;
unsigned int zvol_threads = 32;

struct zvol_state_os {
	struct gendisk		*zvo_disk;	/* generic disk */
	struct request_queue	*zvo_queue;	/* request queue */
	dev_t			zvo_dev;	/* device id */
};

taskq_t *zvol_taskq;
static struct ida zvol_ida;

typedef struct zv_request {
	zvol_state_t	*zv;
	struct bio	*bio;
	taskq_ent_t	ent;
} zv_request_t;

/*
 * Given a path, return TRUE if path is a ZVOL.
 */
static boolean_t
zvol_is_zvol_impl(const char *device)
{
	struct block_device *bdev;
	unsigned int major;

	bdev = vdev_lookup_bdev(device);
	if (IS_ERR(bdev))
		return (B_FALSE);

	major = MAJOR(bdev->bd_dev);
	bdput(bdev);

	if (major == zvol_major)
		return (B_TRUE);

	return (B_FALSE);
}

static void
uio_from_bio(uio_t *uio, struct bio *bio)
{
	uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)];
	uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio);
	uio->uio_loffset = BIO_BI_SECTOR(bio) << 9;
	uio->uio_segflg = UIO_BVEC;
	uio->uio_limit = MAXOFFSET_T;
	uio->uio_resid = BIO_BI_SIZE(bio);
	uio->uio_skip = BIO_BI_SKIP(bio);
}

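/*
 * Common write path, invoked either synchronously from zvol_request() or
 * asynchronously from the zvol taskq. Honors bio flush (bio_is_flush())
 * and FUA (bio_is_fua()) semantics, and logs each write to the ZIL.
 * zvol_request() acquires zv_suspend_lock as reader before dispatching
 * here; it is dropped in this function once the i/o completes.
 */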
static void
zvol_write(void *arg)
{
	int error = 0;

	zv_request_t *zvr = arg;
	struct bio *bio = zvr->bio;
	uio_t uio = { { 0 }, 0 };
	uio_from_bio(&uio, bio);

	zvol_state_t *zv = zvr->zv;
	ASSERT(zv && zv->zv_open_count > 0);
	ASSERT(zv->zv_zilog != NULL);

	/* A bio marked as FLUSH needs to flush before the write. */
	if (bio_is_flush(bio))
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	/* Some requests are just for flush and nothing else. */
	if (uio.uio_resid == 0) {
		rw_exit(&zv->zv_suspend_lock);
		BIO_END_IO(bio, 0);
		kmem_free(zvr, sizeof (zv_request_t));
		return;
	}

	ssize_t start_resid = uio.uio_resid;
	unsigned long start_jif = jiffies;
	blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE,
	    bio_sectors(bio), &zv->zv_zso->zvo_disk->part0);

	boolean_t sync =
	    bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_WRITER);

	uint64_t volsize = zv->zv_volsize;
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio.uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);

		/* This will only fail for ENOSPC */
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0) {
			zvol_log_write(zv, tx, off, bytes, sync);
		}
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);

	int64_t nwritten = start_resid - uio.uio_resid;
	dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten);
	task_io_account_write(nwritten);

	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

	rw_exit(&zv->zv_suspend_lock);
	blk_generic_end_io_acct(zv->zv_zso->zvo_queue,
	    WRITE, &zv->zv_zso->zvo_disk->part0, start_jif);
	BIO_END_IO(bio, -error);
	kmem_free(zvr, sizeof (zv_request_t));
}

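/*
 * Common discard path for DISCARD and SECURE_ERASE bios, invoked either
 * synchronously from zvol_request() or from the zvol taskq. Unless a
 * secure erase was requested, the range is first aligned to volume block
 * boundaries, and the freed range is logged to the ZIL. The caller's
 * read hold on zv_suspend_lock is dropped here.
 */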
static void
zvol_discard(void *arg)
{
	zv_request_t *zvr = arg;
	struct bio *bio = zvr->bio;
	zvol_state_t *zv = zvr->zv;
	uint64_t start = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	uint64_t end = start + size;
	boolean_t sync;
	int error = 0;
	dmu_tx_t *tx;
	unsigned long start_jif;

	ASSERT(zv && zv->zv_open_count > 0);
	ASSERT(zv->zv_zilog != NULL);

	start_jif = jiffies;
	blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE,
	    bio_sectors(bio), &zv->zv_zso->zvo_disk->part0);

	sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	if (end > zv->zv_volsize) {
		error = SET_ERROR(EIO);
		goto unlock;
	}

	/*
	 * Align the request to volume block boundaries when a secure erase is
	 * not required.  This will prevent dnode_free_range() from zeroing out
	 * the unaligned parts which is slow (read-modify-write) and useless
	 * since we are not freeing any space by doing so.
	 */
	if (!bio_is_secure_erase(bio)) {
		start = P2ROUNDUP(start, zv->zv_volblocksize);
		end = P2ALIGN(end, zv->zv_volblocksize);
		size = end - start;
	}

	if (start >= end)
		goto unlock;

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    start, size, RL_WRITER);

	tx = dmu_tx_create(zv->zv_objset);
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
	} else {
		zvol_log_truncate(zv, tx, start, size, B_TRUE);
		dmu_tx_commit(tx);
		error = dmu_free_long_range(zv->zv_objset,
		    ZVOL_OBJ, start, size);
	}
	zfs_rangelock_exit(lr);

	if (error == 0 && sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);

unlock:
	rw_exit(&zv->zv_suspend_lock);
	blk_generic_end_io_acct(zv->zv_zso->zvo_queue, WRITE,
	    &zv->zv_zso->zvo_disk->part0, start_jif);
	BIO_END_IO(bio, -error);
	kmem_free(zvr, sizeof (zv_request_t));
}

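/*
 * Common read path, invoked either synchronously from zvol_request() or
 * from the zvol taskq. Reads are clamped to the volume size, checksum
 * errors are converted to EIO, and the caller's read hold on
 * zv_suspend_lock is dropped once the i/o completes.
 */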
static void
zvol_read(void *arg)
{
	int error = 0;

	zv_request_t *zvr = arg;
	struct bio *bio = zvr->bio;
	uio_t uio = { { 0 }, 0 };
	uio_from_bio(&uio, bio);

	zvol_state_t *zv = zvr->zv;
	ASSERT(zv && zv->zv_open_count > 0);

	ssize_t start_resid = uio.uio_resid;
	unsigned long start_jif = jiffies;
	blk_generic_start_io_acct(zv->zv_zso->zvo_queue, READ, bio_sectors(bio),
	    &zv->zv_zso->zvo_disk->part0);

	zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock,
	    uio.uio_loffset, uio.uio_resid, RL_READER);

	uint64_t volsize = zv->zv_volsize;
	while (uio.uio_resid > 0 && uio.uio_loffset < volsize) {
		uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio.uio_loffset)
			bytes = volsize - uio.uio_loffset;

		error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
	}
	zfs_rangelock_exit(lr);

	int64_t nread = start_resid - uio.uio_resid;
	dataset_kstats_update_read_kstats(&zv->zv_kstat, nread);
	task_io_account_read(nread);

	rw_exit(&zv->zv_suspend_lock);
	blk_generic_end_io_acct(zv->zv_zso->zvo_queue, READ,
	    &zv->zv_zso->zvo_disk->part0, start_jif);
	BIO_END_IO(bio, -error);
	kmem_free(zvr, sizeof (zv_request_t));
}

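/*
 * Block layer entry point for all bios submitted to a zvol. Depending on
 * kernel version this is registered either as the submit_bio member of
 * block_device_operations or as the queue's make_request function. Bios
 * are validated and then dispatched to zvol_write(), zvol_discard(), or
 * zvol_read(), either inline (zvol_request_sync=1) or via the zvol taskq.
 */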
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
static blk_qc_t
zvol_submit_bio(struct bio *bio)
#else
static MAKE_REQUEST_FN_RET
zvol_request(struct request_queue *q, struct bio *bio)
#endif
{
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	struct request_queue *q = bio->bi_disk->queue;
#endif
	zvol_state_t *zv = q->queuedata;
	fstrans_cookie_t cookie = spl_fstrans_mark();
	uint64_t offset = BIO_BI_SECTOR(bio) << 9;
	uint64_t size = BIO_BI_SIZE(bio);
	int rw = bio_data_dir(bio);
	zv_request_t *zvr;

	if (bio_has_data(bio) && offset + size > zv->zv_volsize) {
		printk(KERN_INFO
		    "%s: bad access: offset=%llu, size=%lu\n",
		    zv->zv_zso->zvo_disk->disk_name,
		    (long long unsigned)offset,
		    (long unsigned)size);

		BIO_END_IO(bio, -SET_ERROR(EIO));
		goto out;
	}

	if (rw == WRITE) {
		if (unlikely(zv->zv_flags & ZVOL_RDONLY)) {
			BIO_END_IO(bio, -SET_ERROR(EROFS));
			goto out;
		}

		/*
		 * Prevents the zvol from being suspended, or the ZIL being
		 * concurrently opened.  Will be released after the i/o
		 * completes.
		 */
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/*
		 * Open a ZIL if this is the first time we have written to this
		 * zvol. We protect zv->zv_zilog with zv_suspend_lock rather
		 * than zv_state_lock so that we don't need to acquire an
		 * additional lock in this path.
		 */
		if (zv->zv_zilog == NULL) {
			rw_exit(&zv->zv_suspend_lock);
			rw_enter(&zv->zv_suspend_lock, RW_WRITER);
			if (zv->zv_zilog == NULL) {
				zv->zv_zilog = zil_open(zv->zv_objset,
				    zvol_get_data);
				zv->zv_flags |= ZVOL_WRITTEN_TO;
			}
			rw_downgrade(&zv->zv_suspend_lock);
		}

		zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
		zvr->zv = zv;
		zvr->bio = bio;
		taskq_init_ent(&zvr->ent);

		/*
		 * We don't want this thread to be blocked waiting for i/o to
		 * complete, so we instead wait from a taskq callback. The
		 * i/o may be a ZIL write (via zil_commit()), or a read of an
		 * indirect block, or a read of a data block (if this is a
		 * partial-block write).  We will indicate that the i/o is
		 * complete by calling BIO_END_IO() from the taskq callback.
		 *
		 * This design allows the calling thread to continue and
		 * initiate more concurrent operations by calling
		 * zvol_request() again. There are typically only a small
		 * number of threads available to call zvol_request() (e.g.
		 * one per iSCSI target), so keeping the latency of
		 * zvol_request() low is important for performance.
		 *
		 * The zvol_request_sync module parameter allows this
		 * behavior to be altered, for performance evaluation
		 * purposes.  If the callback blocks, setting
		 * zvol_request_sync=1 will result in much worse performance.
		 *
		 * We can have up to zvol_threads concurrent i/o's being
		 * processed for all zvols on the system.  This is typically
		 * a vast improvement over the zvol_request_sync=1 behavior
		 * of one i/o at a time per zvol.  However, an even better
		 * design would be for zvol_request() to initiate the zio
		 * directly, and then be notified by the zio_done callback,
		 * which would call BIO_END_IO().  Unfortunately, the DMU/ZIL
		 * interfaces lack this functionality (they block waiting for
		 * the i/o to complete).
		 */
		if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
			if (zvol_request_sync) {
				zvol_discard(zvr);
			} else {
				taskq_dispatch_ent(zvol_taskq,
				    zvol_discard, zvr, 0, &zvr->ent);
			}
		} else {
			if (zvol_request_sync) {
				zvol_write(zvr);
			} else {
				taskq_dispatch_ent(zvol_taskq,
				    zvol_write, zvr, 0, &zvr->ent);
			}
		}
	} else {
		/*
		 * The SCST driver, and possibly others, may issue READ I/Os
		 * with a length of zero bytes.  These empty I/Os contain no
		 * data and require no additional handling.
		 */
		if (size == 0) {
			BIO_END_IO(bio, 0);
			goto out;
		}

		zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP);
		zvr->zv = zv;
		zvr->bio = bio;
		taskq_init_ent(&zvr->ent);

		rw_enter(&zv->zv_suspend_lock, RW_READER);

		/* See comment in WRITE case above. */
		if (zvol_request_sync) {
			zvol_read(zvr);
		} else {
			taskq_dispatch_ent(zvol_taskq,
			    zvol_read, zvr, 0, &zvr->ent);
		}
	}

out:
	spl_fstrans_unmark(cookie);
#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \
	defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
	return (BLK_QC_T_NONE);
#endif
}

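/*
 * Linux block_device_operations open callback. On the first open the
 * zvol is protected from suspend by taking zv_suspend_lock as reader;
 * writable opens of a read-only zvol fail with EROFS.
 */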
static int
zvol_open(struct block_device *bdev, fmode_t flag)
{
	zvol_state_t *zv;
	int error = 0;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	/*
	 * Obtain a copy of private_data under the zvol_state_lock to make
	 * sure that either the result of zvol free code path setting
	 * bdev->bd_disk->private_data to NULL is observed, or zvol_free()
	 * is not called on this zv because of the positive zv_open_count.
	 */
	zv = bdev->bd_disk->private_data;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(-ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	/*
	 * Make sure the zvol is not suspended during the first open
	 * (hold zv_suspend_lock) and respect the proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
	ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock));

	if (zv->zv_open_count == 0) {
		error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
		if (error)
			goto out_mutex;
	}

	if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		error = -EROFS;
		goto out_open_count;
	}

	zv->zv_open_count++;

	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);

	check_disk_change(bdev);

	return (0);

out_open_count:
	if (zv->zv_open_count == 0)
		zvol_last_close(zv);

out_mutex:
	mutex_exit(&zv->zv_state_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	if (error == -EINTR) {
		error = -ERESTARTSYS;
		schedule();
	}
	return (SET_ERROR(error));
}

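/*
 * Linux block_device_operations release callback. Mirrors zvol_open():
 * zv_suspend_lock is taken for the last close so that zvol_last_close()
 * cannot race with a suspend of the zvol.
 */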
static void
zvol_release(struct gendisk *disk, fmode_t mode)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;

	rw_enter(&zvol_state_lock, RW_READER);
	zv = disk->private_data;

	mutex_enter(&zv->zv_state_lock);
	ASSERT(zv->zv_open_count > 0);
	/*
	 * Make sure the zvol is not suspended during the last close
	 * (hold zv_suspend_lock) and respect the proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock.
	 */
	if (zv->zv_open_count == 1) {
		if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 1) {
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
	ASSERT(zv->zv_open_count != 1 ||
	    RW_READ_HELD(&zv->zv_suspend_lock));

	zv->zv_open_count--;
	if (zv->zv_open_count == 0)
		zvol_last_close(zv);

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
}

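/*
 * Linux block_device_operations ioctl callback. BLKFLSBUF flushes and
 * invalidates the bdev page cache and waits for the backing pool to
 * sync; BLKZNAME copies the name of the backing dataset out to user
 * space.
 */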
static int
zvol_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned int cmd, unsigned long arg)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	int error = 0;

	ASSERT3U(zv->zv_open_count, >, 0);

	switch (cmd) {
	case BLKFLSBUF:
		fsync_bdev(bdev);
		invalidate_bdev(bdev);
		rw_enter(&zv->zv_suspend_lock, RW_READER);

		if (!(zv->zv_flags & ZVOL_RDONLY))
			txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);

		rw_exit(&zv->zv_suspend_lock);
		break;

	case BLKZNAME:
		mutex_enter(&zv->zv_state_lock);
		error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN);
		mutex_exit(&zv->zv_state_lock);
		break;

	default:
		error = -ENOTTY;
		break;
	}

	return (SET_ERROR(error));
}

#ifdef CONFIG_COMPAT
static int
zvol_compat_ioctl(struct block_device *bdev, fmode_t mode,
    unsigned cmd, unsigned long arg)
{
	return (zvol_ioctl(bdev, mode, cmd, arg));
}
#else
#define	zvol_compat_ioctl	NULL
#endif

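/*
 * Linux block_device_operations check_events callback. Reports (and
 * clears) a pending DISK_EVENT_MEDIA_CHANGE event, which is used with
 * check_disk_change() in zvol_open() to pick up volume changes.
 */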
static unsigned int
zvol_check_events(struct gendisk *disk, unsigned int clearing)
{
	unsigned int mask = 0;

	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0;
		zv->zv_changed = 0;
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (mask);
}

static int
zvol_revalidate_disk(struct gendisk *disk)
{
	rw_enter(&zvol_state_lock, RW_READER);

	zvol_state_t *zv = disk->private_data;
	if (zv != NULL) {
		mutex_enter(&zv->zv_state_lock);
		set_capacity(zv->zv_zso->zvo_disk,
		    zv->zv_volsize >> SECTOR_BITS);
		mutex_exit(&zv->zv_state_lock);
	}

	rw_exit(&zvol_state_lock);

	return (0);
}

static int
zvol_update_volsize(zvol_state_t *zv, uint64_t volsize)
{
	revalidate_disk(zv->zv_zso->zvo_disk);
	return (0);
}

static void
zvol_clear_private(zvol_state_t *zv)
{
	/*
	 * Cleared while holding zvol_state_lock as a writer
	 * which will prevent zvol_open() from opening it.
	 */
	zv->zv_zso->zvo_disk->private_data = NULL;
}

/*
 * Provide a simple virtual geometry for legacy compatibility.  For devices
 * smaller than 1 MiB a small head and sector count is used to allow very
 * tiny devices.  For devices over 1 MiB a standard head and sector count
 * is used to keep the cylinders count reasonable.
 */
static int
zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	zvol_state_t *zv = bdev->bd_disk->private_data;
	sector_t sectors;

	ASSERT3U(zv->zv_open_count, >, 0);

	sectors = get_capacity(zv->zv_zso->zvo_disk);

	if (sectors > 2048) {
		geo->heads = 16;
		geo->sectors = 63;
	} else {
		geo->heads = 2;
		geo->sectors = 4;
	}

	geo->start = 0;
	geo->cylinders = sectors / (geo->heads * geo->sectors);

	return (0);
}

/*
 * Find a zvol_state_t given the full major+minor dev_t. If found,
 * return with zv_state_lock taken, otherwise return NULL without
 * taking zv_state_lock.
 */
static zvol_state_t *
zvol_find_by_dev(dev_t dev)
{
	zvol_state_t *zv;

	rw_enter(&zvol_state_lock, RW_READER);
	for (zv = list_head(&zvol_state_list); zv != NULL;
	    zv = list_next(&zvol_state_list, zv)) {
		mutex_enter(&zv->zv_state_lock);
		if (zv->zv_zso->zvo_dev == dev) {
			rw_exit(&zvol_state_lock);
			return (zv);
		}
		mutex_exit(&zv->zv_state_lock);
	}
	rw_exit(&zvol_state_lock);

	return (NULL);
}

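/*
 * Block-region probe callback registered in zvol_init(). Resolves a
 * dev_t to the matching zvol's gendisk kobject, taking a disk and
 * module reference via get_disk_and_module().
 */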
static struct kobject *
zvol_probe(dev_t dev, int *part, void *arg)
{
	zvol_state_t *zv;
	struct kobject *kobj;

	zv = zvol_find_by_dev(dev);
	kobj = zv ? get_disk_and_module(zv->zv_zso->zvo_disk) : NULL;
	ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock));
	if (zv)
		mutex_exit(&zv->zv_state_lock);

	return (kobj);
}

static struct block_device_operations zvol_ops = {
	.open = zvol_open,
	.release = zvol_release,
	.ioctl = zvol_ioctl,
	.compat_ioctl = zvol_compat_ioctl,
	.check_events = zvol_check_events,
	.revalidate_disk = zvol_revalidate_disk,
	.getgeo = zvol_getgeo,
	.owner = THIS_MODULE,
#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	.submit_bio = zvol_submit_bio,
#endif
};

/*
 * Allocate memory for a new zvol_state_t and setup the required
 * request queue and generic disk structures for the block device.
 */
static zvol_state_t *
zvol_alloc(dev_t dev, const char *name)
{
	zvol_state_t *zv;
	struct zvol_state_os *zso;
	uint64_t volmode;

	if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0)
		return (NULL);

	if (volmode == ZFS_VOLMODE_DEFAULT)
		volmode = zvol_volmode;

	if (volmode == ZFS_VOLMODE_NONE)
		return (NULL);

	zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
	zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
	zv->zv_zso = zso;

	list_link_init(&zv->zv_next);
	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);

#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS
	zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE);
#else
	zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE);
#endif
	if (zso->zvo_queue == NULL)
		goto out_kmem;

	blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE);

	/* Limit read-ahead to a single page to prevent over-prefetching. */
	blk_queue_set_read_ahead(zso->zvo_queue, 1);

	/* Disable write merging in favor of the ZIO pipeline. */
	blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue);

	zso->zvo_disk = alloc_disk(ZVOL_MINORS);
	if (zso->zvo_disk == NULL)
		goto out_queue;

	zso->zvo_queue->queuedata = zv;
	zso->zvo_dev = dev;
	zv->zv_open_count = 0;
	strlcpy(zv->zv_name, name, MAXNAMELEN);

	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);

	zso->zvo_disk->major = zvol_major;
	zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE;

	if (volmode == ZFS_VOLMODE_DEV) {
		/*
		 * ZFS_VOLMODE_DEV disables partitioning on ZVOL devices: set
		 * gendisk->minors = 1 as noted in include/linux/genhd.h.
		 * Also disable extended partition numbers (GENHD_FL_EXT_DEVT)
		 * and suppress partition scanning (GENHD_FL_NO_PART_SCAN) by
		 * setting gendisk->flags accordingly.
		 */
		zso->zvo_disk->minors = 1;
#if defined(GENHD_FL_EXT_DEVT)
		zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT;
#endif
#if defined(GENHD_FL_NO_PART_SCAN)
		zso->zvo_disk->flags |= GENHD_FL_NO_PART_SCAN;
#endif
	}
	zso->zvo_disk->first_minor = (dev & MINORMASK);
	zso->zvo_disk->fops = &zvol_ops;
	zso->zvo_disk->private_data = zv;
	zso->zvo_disk->queue = zso->zvo_queue;
	snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d",
	    ZVOL_DEV_NAME, (dev & MINORMASK));

	return (zv);

out_queue:
	blk_cleanup_queue(zso->zvo_queue);
out_kmem:
	kmem_free(zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
	return (NULL);
}

/*
 * Cleanup then free a zvol_state_t which was created by zvol_alloc().
 * At this time, the structure is not opened by anyone, is taken off
 * the zvol_state_list, and has its private data set to NULL.
 * The zvol_state_lock is dropped.
 *
 * This function may take many milliseconds to complete (e.g. we've seen
 * it take over 256ms), due to the calls to "blk_cleanup_queue" and
 * "del_gendisk".  Thus, consumers need to be careful to account for this
 * latency when calling this function.
 */
static void
zvol_free(zvol_state_t *zv)
{
	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
	ASSERT(zv->zv_open_count == 0);
	ASSERT(zv->zv_zso->zvo_disk->private_data == NULL);

	rw_destroy(&zv->zv_suspend_lock);
	zfs_rangelock_fini(&zv->zv_rangelock);

	del_gendisk(zv->zv_zso->zvo_disk);
	blk_cleanup_queue(zv->zv_zso->zvo_queue);
	put_disk(zv->zv_zso->zvo_disk);

	ida_simple_remove(&zvol_ida,
	    MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS);

	mutex_destroy(&zv->zv_state_lock);
	dataset_kstats_destroy(&zv->zv_kstat);

	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
	kmem_free(zv, sizeof (zvol_state_t));
}

/*
 * Create a block device minor node and setup the linkage between it
 * and the specified volume.  Once this function returns the block
 * device is live and ready for use.
 */
static int
zvol_os_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t *doi;
	uint64_t volsize;
	uint64_t len;
	unsigned minor = 0;
	int error = 0;
	int idx;
	uint64_t hash = zvol_name_hash(name);

	if (zvol_inhibit_dev)
		return (0);

	idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP));
	if (idx < 0)
		return (SET_ERROR(-idx));
	minor = idx << ZVOL_MINOR_BITS;

	zv = zvol_find_by_name_hash(name, hash, RW_NONE);
	if (zv) {
		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
		mutex_exit(&zv->zv_state_lock);
		ida_simple_remove(&zvol_ida, idx);
		return (SET_ERROR(EEXIST));
	}

	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);

	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
	if (error)
		goto out_doi;

	error = dmu_object_info(os, ZVOL_OBJ, doi);
	if (error)
		goto out_dmu_objset_disown;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error)
		goto out_dmu_objset_disown;

	zv = zvol_alloc(MKDEV(zvol_major, minor), name);
	if (zv == NULL) {
		error = SET_ERROR(EAGAIN);
		goto out_dmu_objset_disown;
	}
	zv->zv_hash = hash;

	if (dmu_objset_is_snapshot(os))
		zv->zv_flags |= ZVOL_RDONLY;

	zv->zv_volblocksize = doi->doi_data_block_size;
	zv->zv_volsize = volsize;
	zv->zv_objset = os;

	set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9);

	blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue,
	    (DMU_MAX_ACCESS / 4) >> 9);
	blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX);
	blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX);
	blk_queue_physical_block_size(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
	blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize);
	blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue,
	    (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9);
	blk_queue_discard_granularity(zv->zv_zso->zvo_queue,
	    zv->zv_volblocksize);
	blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue);
#ifdef QUEUE_FLAG_NONROT
	blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue);
#endif
#ifdef QUEUE_FLAG_ADD_RANDOM
	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue);
#endif
	/* This flag was introduced in kernel version 4.12. */
#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH
	blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue);
#endif

	if (spa_writeable(dmu_objset_spa(os))) {
		if (zil_replay_disable)
			zil_destroy(dmu_objset_zil(os), B_FALSE);
		else
			zil_replay(os, zv, zvol_replay_vector);
	}
	ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL);
	dataset_kstats_create(&zv->zv_kstat, zv->zv_objset);

	/*
	 * When udev detects the addition of the device it will immediately
	 * invoke blkid(8) to determine the type of content on the device.
	 * Prefetching the blocks commonly scanned by blkid(8) will speed
	 * up this process.
	 */
	len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE);
	if (len > 0) {
		dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ);
		dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len,
		    ZIO_PRIORITY_SYNC_READ);
	}

	zv->zv_objset = NULL;
out_dmu_objset_disown:
	dmu_objset_disown(os, B_TRUE, FTAG);
out_doi:
	kmem_free(doi, sizeof (dmu_object_info_t));

	/*
	 * Keep in mind that once add_disk() is called, the zvol is
	 * announced to the world, and zvol_open()/zvol_release() can
	 * be called at any time.  Incidentally, add_disk() itself calls
	 * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close()
	 * directly as well.
	 */
	if (error == 0) {
		rw_enter(&zvol_state_lock, RW_WRITER);
		zvol_insert(zv);
		rw_exit(&zvol_state_lock);
		add_disk(zv->zv_zso->zvo_disk);
	} else {
		ida_simple_remove(&zvol_ida, idx);
	}

	return (error);
}

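/*
 * Rename the block device minor to track a dataset rename, updating the
 * name hash so lookups by name continue to work. Called with both
 * zvol_state_lock and zv_state_lock held.
 */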
static void
zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
	int readonly = get_disk_ro(zv->zv_zso->zvo_disk);

	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));

	/* move to new hashtable entry */
	zv->zv_hash = zvol_name_hash(zv->zv_name);
	hlist_del(&zv->zv_hlink);
	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));

	/*
	 * The block device's read-only state is briefly changed causing
	 * a KOBJ_CHANGE uevent to be issued.  This ensures udev detects
	 * the name change and fixes the symlinks.  This does not change
	 * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never
	 * changes.  This would normally be done using kobject_uevent() but
	 * that is a GPL-only symbol which is why we need this workaround.
	 */
	set_disk_ro(zv->zv_zso->zvo_disk, !readonly);
	set_disk_ro(zv->zv_zso->zvo_disk, readonly);
}

static void
zvol_set_disk_ro_impl(zvol_state_t *zv, int flags)
{
	set_disk_ro(zv->zv_zso->zvo_disk, flags);
}

static void
zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity)
{
	set_capacity(zv->zv_zso->zvo_disk, capacity);
}

static const zvol_platform_ops_t zvol_linux_ops = {
	.zv_free = zvol_free,
	.zv_rename_minor = zvol_rename_minor,
	.zv_create_minor = zvol_os_create_minor,
	.zv_update_volsize = zvol_update_volsize,
	.zv_clear_private = zvol_clear_private,
	.zv_is_zvol = zvol_is_zvol_impl,
	.zv_set_disk_ro = zvol_set_disk_ro_impl,
	.zv_set_capacity = zvol_set_capacity_impl,
};

int
zvol_init(void)
{
	int error;
	int threads = MIN(MAX(zvol_threads, 1), 1024);

	error = register_blkdev(zvol_major, ZVOL_DRIVER);
	if (error) {
		printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error);
		return (error);
	}
	zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri,
	    threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
	if (zvol_taskq == NULL) {
		unregister_blkdev(zvol_major, ZVOL_DRIVER);
		return (-ENOMEM);
	}
	zvol_init_impl();
	blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS,
	    THIS_MODULE, zvol_probe, NULL, NULL);

	ida_init(&zvol_ida);
	zvol_register_ops(&zvol_linux_ops);
	return (0);
}

void
zvol_fini(void)
{
	zvol_fini_impl();
	blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS);
	unregister_blkdev(zvol_major, ZVOL_DRIVER);
	taskq_destroy(zvol_taskq);
	ida_destroy(&zvol_ida);
}

/* BEGIN CSTYLED */
module_param(zvol_inhibit_dev, uint, 0644);
MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes");

module_param(zvol_major, uint, 0444);
MODULE_PARM_DESC(zvol_major, "Major number for zvol device");

module_param(zvol_threads, uint, 0444);
MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests");

module_param(zvol_request_sync, uint, 0644);
MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests");

module_param(zvol_max_discard_blocks, ulong, 0444);
MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard");

module_param(zvol_prefetch_bytes, uint, 0644);
MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end");

module_param(zvol_volmode, uint, 0644);
MODULE_PARM_DESC(zvol_volmode, "Default volmode property value");
/* END CSTYLED */