/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 *	/dev/zvol/dsk/<pool_name>/<dataset_name>
 *	/dev/zvol/rdsk/<pool_name>/<dataset_name>
 *
 * These links are created by the /dev filesystem (sdev_zvolops.c).
 * Volumes are persistent through reboot.  No user command needs to be
 * run before opening and using a device.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/modctl.h>
#include <sys/open.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_prop.h>
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <sys/byteorder.h>
#include <sys/pathname.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/crc32.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/mkdev.h>
#include <sys/zil.h>
#include <sys/refcount.h>
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_disk.h>
#include <sys/vdev_impl.h>
#include <sys/zvol.h>
#include <sys/dumphdr.h>
#include <sys/zil_impl.h>

#include "zfs_namecheck.h"

static void *zvol_state;
static char *zvol_tag = "zvol_tag";

#define	ZVOL_DUMPSIZE	"dumpsize"

/*
 * This lock protects the zvol_state structure from being modified
 * while it's being used, e.g. an open that comes in before a create
 * finishes.  It also protects temporary opens of the dataset so that,
 * e.g., an open doesn't get a spurious EBUSY.
 */
static kmutex_t zvol_state_lock;
static uint32_t zvol_minors;

typedef struct zvol_extent {
	list_node_t	ze_node;
	dva_t		ze_dva;		/* dva associated with this extent */
	uint64_t	ze_nblks;	/* number of blocks in extent */
} zvol_extent_t;
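/*
 * Illustrative example (not normative): a fully contiguous, preallocated
 * 1GB volume with an 8K volblocksize would be described by a single
 * zvol_extent_t with ze_nblks == 131072; each discontiguity in the
 * on-disk allocation adds one more extent to the list.
 */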
/*
 * The in-core state of each volume.
 */
typedef struct zvol_state {
	char		zv_name[MAXPATHLEN]; /* pool/dd name */
	uint64_t	zv_volsize;	/* amount of space we advertise */
	uint64_t	zv_volblocksize; /* volume block size */
	minor_t		zv_minor;	/* minor number */
	uint8_t		zv_min_bs;	/* minimum addressable block shift */
	uint8_t		zv_flags;	/* readonly, dumpified, etc. */
	objset_t	*zv_objset;	/* objset handle */
	uint32_t	zv_open_count[OTYPCNT];	/* open counts */
	uint32_t	zv_total_opens;	/* total open count */
	zilog_t		*zv_zilog;	/* ZIL handle */
	list_t		zv_extents;	/* List of extents for dump */
	znode_t		zv_znode;	/* for range locking */
} zvol_state_t;

/*
 * zvol specific flags
 */
#define	ZVOL_RDONLY	0x1
#define	ZVOL_DUMPIFIED	0x2
#define	ZVOL_EXCL	0x4
#define	ZVOL_WCE	0x8

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS/2;

static int zvol_remove_zv(zvol_state_t *);
extern int zfs_set_prop_nvlist(const char *, nvlist_t *);
static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio);
static int zvol_dumpify(zvol_state_t *zv);
static int zvol_dump_fini(zvol_state_t *zv);
static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);

static void
zvol_size_changed(uint64_t volsize, major_t maj, minor_t min)
{
	dev_t dev = makedevice(maj, min);

	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
	    "Size", volsize) == DDI_SUCCESS);
	VERIFY(ddi_prop_update_int64(dev, zfs_dip,
	    "Nblocks", lbtodb(volsize)) == DDI_SUCCESS);

	/* Notify specfs to invalidate the cached size */
	spec_size_invalidate(dev, VBLK);
	spec_size_invalidate(dev, VCHR);
}

int
zvol_check_volsize(uint64_t volsize, uint64_t blocksize)
{
	if (volsize == 0)
		return (EINVAL);

	if (volsize % blocksize != 0)
		return (EINVAL);

#ifdef _ILP32
	if (volsize - 1 > SPEC_MAXOFFSET_T)
		return (EOVERFLOW);
#endif
	return (0);
}

int
zvol_check_volblocksize(uint64_t volblocksize)
{
	if (volblocksize < SPA_MINBLOCKSIZE ||
	    volblocksize > SPA_MAXBLOCKSIZE ||
	    !ISP2(volblocksize))
		return (EDOM);

	return (0);
}
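/*
 * Example of the two checks above, with hypothetical values: an 8K-block,
 * 1GB volume passes, while a volsize that is not a multiple of the block
 * size does not:
 *
 *	zvol_check_volblocksize(8192)			returns 0
 *	zvol_check_volsize(1ULL << 30, 8192)		returns 0
 *	zvol_check_volsize((1ULL << 30) + 512, 8192)	returns EINVAL
 *	zvol_check_volblocksize(12345)			returns EDOM (!ISP2)
 */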
int
zvol_get_stats(objset_t *os, nvlist_t *nv)
{
	int error;
	dmu_object_info_t doi;
	uint64_t val;

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val);
	if (error)
		return (error);

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val);

	error = dmu_object_info(os, ZVOL_OBJ, &doi);

	if (error == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE,
		    doi.doi_data_block_size);
	}

	return (error);
}

/*
 * Find a free minor number.
 */
static minor_t
zvol_minor_alloc(void)
{
	minor_t minor;

	ASSERT(MUTEX_HELD(&zvol_state_lock));

	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++)
		if (ddi_get_soft_state(zvol_state, minor) == NULL)
			return (minor);

	return (0);
}

static zvol_state_t *
zvol_minor_lookup(const char *name)
{
	minor_t minor;
	zvol_state_t *zv;

	ASSERT(MUTEX_HELD(&zvol_state_lock));

	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) {
		zv = ddi_get_soft_state(zvol_state, minor);
		if (zv == NULL)
			continue;
		if (strcmp(zv->zv_name, name) == 0)
			return (zv);
	}

	return (NULL);
}

/* extent mapping arg */
struct maparg {
	zvol_state_t	*ma_zv;
	uint64_t	ma_blks;
};

/*ARGSUSED*/
static int
zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct maparg *ma = arg;
	zvol_extent_t *ze;
	int bs = ma->ma_zv->zv_volblocksize;

	if (bp == NULL || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
		return (0);

	VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
	ma->ma_blks++;

	/* Abort immediately if we have encountered gang blocks */
	if (BP_IS_GANG(bp))
		return (EFRAGS);

	/*
	 * See if the block is at the end of the previous extent.
	 */
	ze = list_tail(&ma->ma_zv->zv_extents);
	if (ze &&
	    DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
	    DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
	    DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
		ze->ze_nblks++;
		return (0);
	}

	dprintf_bp(bp, "%s", "next blkptr:");

	/* start a new extent */
	ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
	ze->ze_nblks = 1;
	list_insert_tail(&ma->ma_zv->zv_extents, ze);
	return (0);
}

static void
zvol_free_extents(zvol_state_t *zv)
{
	zvol_extent_t *ze;

	while ((ze = list_head(&zv->zv_extents)) != NULL) {
		list_remove(&zv->zv_extents, ze);
		kmem_free(ze, sizeof (zvol_extent_t));
	}
}

static int
zvol_get_lbas(zvol_state_t *zv)
{
	struct maparg	ma;
	int		err;

	ma.ma_zv = zv;
	ma.ma_blks = 0;
	zvol_free_extents(zv);

	err = traverse_dataset(dmu_objset_ds(zv->zv_objset), 0,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
	if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
		zvol_free_extents(zv);
		return (err ? err : EIO);
	}

	return (0);
}
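/*
 * Worked example of the extent-merging rule in zvol_map_block() above
 * (hypothetical numbers): with an 8K volblocksize, a block whose DVA is
 * <vdev 1, offset 0x20000> extends an existing tail extent whose ze_dva
 * is <vdev 1, offset 0x10000> and ze_nblks == 8, because
 * 0x10000 + 8 * 0x2000 == 0x20000.  Any other DVA starts a new extent.
 */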
/* ARGSUSED */
void
zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
{
	zfs_creat_t *zct = arg;
	nvlist_t *nvprops = zct->zct_props;
	int error;
	uint64_t volblocksize, volsize;

	VERIFY(nvlist_lookup_uint64(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0);
	if (nvlist_lookup_uint64(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0)
		volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE);

	/*
	 * These properties must be removed from the list so the generic
	 * property setting step won't apply to them.
	 */
	VERIFY(nvlist_remove_all(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0);
	(void) nvlist_remove_all(nvprops,
	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE));

	error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP,
	    DMU_OT_NONE, 0, tx);
	ASSERT(error == 0);

	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx);
	ASSERT(error == 0);
}

/*
 * Replay a TX_WRITE ZIL transaction that didn't get committed
 * after a system failure.
 */
static int
zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap)
{
	objset_t *os = zv->zv_objset;
	char *data = (char *)(lr + 1);	/* data follows lr_write_t */
	uint64_t offset, length;
	dmu_tx_t *tx;
	int error;

	if (byteswap)
		byteswap_uint64_array(lr, sizeof (*lr));

	offset = lr->lr_offset;
	length = lr->lr_length;

	/* If it's a dmu_sync() block, write the whole block */
	if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
		uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
		if (length < blocksize) {
			offset -= offset % blocksize;
			length = blocksize;
		}
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
	} else {
		dmu_write(os, ZVOL_OBJ, offset, length, data, tx);
		dmu_tx_commit(tx);
	}

	return (error);
}

/* ARGSUSED */
static int
zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap)
{
	return (ENOTSUP);
}

/*
 * Callback vectors for replaying records.
 * Only TX_WRITE is needed for zvol.
 */
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
	zvol_replay_err,	/* 0 no such transaction type */
	zvol_replay_err,	/* TX_CREATE */
	zvol_replay_err,	/* TX_MKDIR */
	zvol_replay_err,	/* TX_MKXATTR */
	zvol_replay_err,	/* TX_SYMLINK */
	zvol_replay_err,	/* TX_REMOVE */
	zvol_replay_err,	/* TX_RMDIR */
	zvol_replay_err,	/* TX_LINK */
	zvol_replay_err,	/* TX_RENAME */
	zvol_replay_write,	/* TX_WRITE */
	zvol_replay_err,	/* TX_TRUNCATE */
	zvol_replay_err,	/* TX_SETATTR */
	zvol_replay_err,	/* TX_ACL */
	zvol_replay_err,	/* TX_CREATE_ACL */
	zvol_replay_err,	/* TX_CREATE_ATTR */
	zvol_replay_err,	/* TX_CREATE_ACL_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL */
	zvol_replay_err,	/* TX_MKDIR_ATTR */
	zvol_replay_err,	/* TX_MKDIR_ACL_ATTR */
	zvol_replay_err,	/* TX_WRITE2 */
};
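/*
 * Note: a zvol has no namespace, so every transaction type other than
 * TX_WRITE replays as ENOTSUP.  zvol_replay_write() simply re-issues the
 * logged write through dmu_write(); for an indirect (dmu_sync()) record
 * it first rounds the I/O out to the full block.
 */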
int
zvol_name2minor(const char *name, minor_t *minor)
{
	zvol_state_t *zv;

	mutex_enter(&zvol_state_lock);
	zv = zvol_minor_lookup(name);
	if (minor && zv)
		*minor = zv->zv_minor;
	mutex_exit(&zvol_state_lock);
	return (zv ? 0 : -1);
}

/*
 * Create a minor node (plus a whole lot more) for the specified volume.
 */
int
zvol_create_minor(const char *name)
{
	zvol_state_t *zv;
	objset_t *os;
	dmu_object_info_t doi;
	minor_t minor = 0;
	char chrbuf[30], blkbuf[30];
	int error;

	mutex_enter(&zvol_state_lock);

	if ((zv = zvol_minor_lookup(name)) != NULL) {
		mutex_exit(&zvol_state_lock);
		return (EEXIST);
	}

	/* lie and say we're read-only */
	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os);

	if (error) {
		mutex_exit(&zvol_state_lock);
		return (error);
	}

	if ((minor = zvol_minor_alloc()) == 0) {
		dmu_objset_disown(os, zvol_tag);
		mutex_exit(&zvol_state_lock);
		return (ENXIO);
	}

	if (ddi_soft_state_zalloc(zvol_state, minor) != DDI_SUCCESS) {
		dmu_objset_disown(os, zvol_tag);
		mutex_exit(&zvol_state_lock);
		return (EAGAIN);
	}
	(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
	    (char *)name);

	(void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor);

	if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR,
	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
		ddi_soft_state_free(zvol_state, minor);
		dmu_objset_disown(os, zvol_tag);
		mutex_exit(&zvol_state_lock);
		return (EAGAIN);
	}

	(void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor);

	if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK,
	    minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
		ddi_remove_minor_node(zfs_dip, chrbuf);
		ddi_soft_state_free(zvol_state, minor);
		dmu_objset_disown(os, zvol_tag);
		mutex_exit(&zvol_state_lock);
		return (EAGAIN);
	}

	zv = ddi_get_soft_state(zvol_state, minor);

	(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
	zv->zv_min_bs = DEV_BSHIFT;
	zv->zv_minor = minor;
	zv->zv_objset = os;
	if (dmu_objset_is_snapshot(os))
		zv->zv_flags |= ZVOL_RDONLY;
	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
	    sizeof (rl_t), offsetof(rl_t, r_node));
	list_create(&zv->zv_extents, sizeof (zvol_extent_t),
	    offsetof(zvol_extent_t, ze_node));
	/* get and cache the blocksize */
	error = dmu_object_info(os, ZVOL_OBJ, &doi);
	ASSERT(error == 0);
	zv->zv_volblocksize = doi.doi_data_block_size;

	zil_replay(os, zv, zvol_replay_vector);
	dmu_objset_disown(os, zvol_tag);
	zv->zv_objset = NULL;

	zvol_minors++;

	mutex_exit(&zvol_state_lock);

	return (0);
}
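/*
 * For example, if zvol_minor_alloc() returned 3 above, the minor nodes
 * created are named "3,raw" (character) and "3" (block); sdev_zvolops.c
 * then surfaces them as /dev/zvol/rdsk/<pool>/<volume> and
 * /dev/zvol/dsk/<pool>/<volume> respectively.
 */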
/*
 * Remove minor node for the specified volume.
 */
static int
zvol_remove_zv(zvol_state_t *zv)
{
	char nmbuf[20];

	ASSERT(MUTEX_HELD(&zvol_state_lock));
	if (zv->zv_total_opens != 0)
		return (EBUSY);

	(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", zv->zv_minor);
	ddi_remove_minor_node(zfs_dip, nmbuf);

	(void) snprintf(nmbuf, sizeof (nmbuf), "%u", zv->zv_minor);
	ddi_remove_minor_node(zfs_dip, nmbuf);

	avl_destroy(&zv->zv_znode.z_range_avl);
	mutex_destroy(&zv->zv_znode.z_range_lock);

	ddi_soft_state_free(zvol_state, zv->zv_minor);

	zvol_minors--;
	return (0);
}

int
zvol_remove_minor(const char *name)
{
	zvol_state_t *zv;
	int rc;

	mutex_enter(&zvol_state_lock);
	if ((zv = zvol_minor_lookup(name)) == NULL) {
		mutex_exit(&zvol_state_lock);
		return (ENXIO);
	}
	rc = zvol_remove_zv(zv);
	mutex_exit(&zvol_state_lock);
	return (rc);
}

int
zvol_first_open(zvol_state_t *zv)
{
	objset_t *os;
	uint64_t volsize;
	int error;
	uint64_t readonly;

	/* lie and say we're read-only */
	error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE,
	    zvol_tag, &os);
	if (error)
		return (error);

	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
	if (error) {
		ASSERT(error == 0);
		dmu_objset_disown(os, zvol_tag);
		return (error);
	}
	zv->zv_objset = os;
	zv->zv_volsize = volsize;
	zv->zv_zilog = zil_open(os, zvol_get_data);
	zvol_size_changed(zv->zv_volsize, ddi_driver_major(zfs_dip),
	    zv->zv_minor);

	VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
	    NULL) == 0);
	if (readonly || dmu_objset_is_snapshot(os))
		zv->zv_flags |= ZVOL_RDONLY;
	else
		zv->zv_flags &= ~ZVOL_RDONLY;
	return (error);
}

void
zvol_last_close(zvol_state_t *zv)
{
	zil_close(zv->zv_zilog);
	zv->zv_zilog = NULL;
	dmu_objset_disown(zv->zv_objset, zvol_tag);
	zv->zv_objset = NULL;
}

int
zvol_prealloc(zvol_state_t *zv)
{
	objset_t *os = zv->zv_objset;
	dmu_tx_t *tx;
	uint64_t refd, avail, usedobjs, availobjs;
	uint64_t resid = zv->zv_volsize;
	uint64_t off = 0;

	/* Check the space usage before attempting to allocate the space */
	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
	if (avail < zv->zv_volsize)
		return (ENOSPC);

	/* Free old extents if they exist */
	zvol_free_extents(zv);

	while (resid != 0) {
		int error;
		uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);

		tx = dmu_tx_create(os);
		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			(void) dmu_free_long_range(os, ZVOL_OBJ, 0, off);
			return (error);
		}
		dmu_prealloc(os, ZVOL_OBJ, off, bytes, tx);
		dmu_tx_commit(tx);
		off += bytes;
		resid -= bytes;
	}
	txg_wait_synced(dmu_objset_pool(os), 0);

	return (0);
}

int
zvol_update_volsize(objset_t *os, uint64_t volsize)
{
	dmu_tx_t *tx;
	int error;

	ASSERT(MUTEX_HELD(&zvol_state_lock));

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1,
	    &volsize, tx);
	dmu_tx_commit(tx);

	if (error == 0)
		error = dmu_free_long_range(os,
		    ZVOL_OBJ, volsize, DMU_OBJECT_END);
	return (error);
}
void
zvol_remove_minors(const char *name)
{
	zvol_state_t *zv;
	char *namebuf;
	minor_t minor;

	namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
	(void) strncpy(namebuf, name, strlen(name));
	(void) strcat(namebuf, "/");
	mutex_enter(&zvol_state_lock);
	for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) {

		zv = ddi_get_soft_state(zvol_state, minor);
		if (zv == NULL)
			continue;
		if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
			(void) zvol_remove_zv(zv);
	}
	kmem_free(namebuf, strlen(name) + 2);

	mutex_exit(&zvol_state_lock);
}

int
zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
{
	zvol_state_t *zv = NULL;
	objset_t *os;
	int error;
	dmu_object_info_t doi;
	uint64_t old_volsize = 0ULL;
	uint64_t readonly;

	mutex_enter(&zvol_state_lock);
	zv = zvol_minor_lookup(name);
	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
		mutex_exit(&zvol_state_lock);
		return (error);
	}

	if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
	    (error = zvol_check_volsize(volsize,
	    doi.doi_data_block_size)) != 0)
		goto out;

	VERIFY(dsl_prop_get_integer(name, "readonly", &readonly,
	    NULL) == 0);
	if (readonly) {
		error = EROFS;
		goto out;
	}

	error = zvol_update_volsize(os, volsize);
	/*
	 * Reinitialize the dump area to the new size.  If we
	 * failed to resize the dump area then restore it back to
	 * its original size.
	 */
	if (zv && error == 0) {
		if (zv->zv_flags & ZVOL_DUMPIFIED) {
			old_volsize = zv->zv_volsize;
			zv->zv_volsize = volsize;
			if ((error = zvol_dumpify(zv)) != 0 ||
			    (error = dumpvp_resize()) != 0) {
				(void) zvol_update_volsize(os, old_volsize);
				zv->zv_volsize = old_volsize;
				error = zvol_dumpify(zv);
			}
		}
		if (error == 0) {
			zv->zv_volsize = volsize;
			zvol_size_changed(volsize, maj, zv->zv_minor);
		}
	}

	/*
	 * Generate a LUN expansion event.
	 */
	if (zv && error == 0) {
		sysevent_id_t eid;
		nvlist_t *attr;
		char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);

		(void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV,
		    zv->zv_minor);

		VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);

		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
		    ESC_DEV_DLE, attr, &eid, DDI_SLEEP);

		nvlist_free(attr);
		kmem_free(physpath, MAXPATHLEN);
	}

out:
	dmu_objset_rele(os, FTAG);

	mutex_exit(&zvol_state_lock);

	return (error);
}

/*ARGSUSED*/
int
zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
{
	minor_t minor = getminor(*devp);
	zvol_state_t *zv;
	int err = 0;

	if (minor == 0)			/* This is the control device */
		return (0);

	mutex_enter(&zvol_state_lock);

	zv = ddi_get_soft_state(zvol_state, minor);
	if (zv == NULL) {
		mutex_exit(&zvol_state_lock);
		return (ENXIO);
	}

	if (zv->zv_total_opens == 0)
		err = zvol_first_open(zv);
	if (err) {
		mutex_exit(&zvol_state_lock);
		return (err);
	}
	if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
		err = EROFS;
		goto out;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = EBUSY;
		goto out;
	}
	if (flag & FEXCL) {
		if (zv->zv_total_opens != 0) {
			err = EBUSY;
			goto out;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}

	if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
		zv->zv_open_count[otyp]++;
		zv->zv_total_opens++;
	}
	mutex_exit(&zvol_state_lock);

	return (err);
out:
	if (zv->zv_total_opens == 0)
		zvol_last_close(zv);
	mutex_exit(&zvol_state_lock);
	return (err);
}

/*ARGSUSED*/
int
zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	int error = 0;

	if (minor == 0)			/* This is the control device */
		return (0);

	mutex_enter(&zvol_state_lock);

	zv = ddi_get_soft_state(zvol_state, minor);
	if (zv == NULL) {
		mutex_exit(&zvol_state_lock);
		return (ENXIO);
	}

	if (zv->zv_flags & ZVOL_EXCL) {
		ASSERT(zv->zv_total_opens == 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT(zv->zv_open_count[otyp] != 0);
	ASSERT(zv->zv_total_opens != 0);

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count[otyp]--;
	zv->zv_total_opens--;

	if (zv->zv_total_opens == 0)
		zvol_last_close(zv);

	mutex_exit(&zvol_state_lock);
	return (error);
}

static void
zvol_get_done(zgd_t *zgd, int error)
{
	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}
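/*
 * zvol_get_done() runs in both completion paths: synchronously from
 * zvol_get_data() for immediate writes and errors, and asynchronously
 * as the dmu_sync() callback for indirect writes.  Either way it drops
 * the dbuf hold and the range lock taken in zvol_get_data().
 */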
/*
 * Get data to generate a TX_WRITE intent log record.
 */
static int
zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zvol_state_t *zv = arg;
	objset_t *os = zv->zv_objset;
	uint64_t object = ZVOL_OBJ;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;	/* length of user data */
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zv->zv_zilog;
	zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) {	/* immediate write */
		error = dmu_read(os, object, offset, size, buf,
		    DMU_READ_NO_PREFETCH);
	} else {
		size = zv->zv_volblocksize;
		offset = P2ALIGN(offset, size);
		error = dmu_buf_hold(os, object, offset, zgd, &db);
		if (error == 0) {
			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zvol_get_done, zgd);

			if (error == 0)
				return (0);
		}
	}

	zvol_get_done(zgd, error);

	return (error);
}
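/*
 * Summary of the write-state decision made in zvol_log_write() below
 * (derived from the code, not normative):
 *
 *	WR_INDIRECT	whole aligned block, no separate log device,
 *			block larger than the immediate-write cutoff;
 *			the data is written once by dmu_sync() and the
 *			log record carries only a block pointer.
 *	WR_COPIED	synchronous write small enough to log; the data
 *			is copied into the itx immediately via dmu_read().
 *	WR_NEED_COPY	asynchronous write (or WR_COPIED fallback after a
 *			failed dmu_read()); the data is copied into the
 *			log record later, only if a commit needs it.
 */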
/*
 * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions.
 *
 * We store data in the log buffers if it's small enough.
 * Otherwise we will later flush the data out via dmu_sync().
 */
ssize_t zvol_immediate_write_sz = 32768;

static void
zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
    boolean_t sync)
{
	uint32_t blocksize = zv->zv_volblocksize;
	zilog_t *zilog = zv->zv_zilog;
	boolean_t slogging;
	ssize_t immediate_write_sz;

	if (zil_disable)
		return;

	if (zil_replaying(zilog, tx))
		return;

	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
	    ? 0 : zvol_immediate_write_sz;

	slogging = spa_has_slogs(zilog->zl_spa) &&
	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);

	while (resid) {
		itx_t *itx;
		lr_write_t *lr;
		ssize_t len;
		itx_wr_state_t write_state;

		/*
		 * Unlike zfs_log_write() we can be called with
		 * up to DMU_MAX_ACCESS/2 (5MB) writes.
		 */
		if (blocksize > immediate_write_sz && !slogging &&
		    resid >= blocksize && off % blocksize == 0) {
			write_state = WR_INDIRECT; /* uses dmu_sync */
			len = blocksize;
		} else if (sync) {
			write_state = WR_COPIED;
			len = MIN(ZIL_MAX_LOG_DATA, resid);
		} else {
			write_state = WR_NEED_COPY;
			len = MIN(ZIL_MAX_LOG_DATA, resid);
		}

		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
		    (write_state == WR_COPIED ? len : 0));
		lr = (lr_write_t *)&itx->itx_lr;
		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
			zil_itx_destroy(itx);
			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
			lr = (lr_write_t *)&itx->itx_lr;
			write_state = WR_NEED_COPY;
		}

		itx->itx_wr_state = write_state;
		if (write_state == WR_NEED_COPY)
			itx->itx_sod += len;
		lr->lr_foid = ZVOL_OBJ;
		lr->lr_offset = off;
		lr->lr_length = len;
		lr->lr_blkoff = 0;
		BP_ZERO(&lr->lr_blkptr);

		itx->itx_private = zv;
		itx->itx_sync = sync;

		(void) zil_itx_assign(zilog, itx, tx);

		off += len;
		resid -= len;
	}
}

static int
zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
    boolean_t doread, boolean_t isdump)
{
	vdev_disk_t *dvd;
	int c;
	int numerrors = 0;

	for (c = 0; c < vd->vdev_children; c++) {
		ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
		    vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops);
		int err = zvol_dumpio_vdev(vd->vdev_child[c],
		    addr, offset, size, doread, isdump);
		if (err != 0) {
			numerrors++;
		} else if (doread) {
			break;
		}
	}

	if (!vd->vdev_ops->vdev_op_leaf)
		return (numerrors < vd->vdev_children ? 0 : EIO);

	if (doread && !vdev_readable(vd))
		return (EIO);
	else if (!doread && !vdev_writeable(vd))
		return (EIO);

	dvd = vd->vdev_tsd;
	ASSERT3P(dvd, !=, NULL);
	offset += VDEV_LABEL_START_SIZE;

	if (ddi_in_panic() || isdump) {
		ASSERT(!doread);
		if (doread)
			return (EIO);
		return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
		    lbtodb(size)));
	} else {
		return (vdev_disk_physio(dvd->vd_lh, addr, size, offset,
		    doread ? B_READ : B_WRITE));
	}
}
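/*
 * Note on the function above: at panic time (ddi_in_panic()) or for an
 * explicit dump (isdump), the I/O goes through ldi_dump() straight to the
 * leaf vdev, bypassing the DMU entirely.  This only works because a
 * dumpified zvol has been preallocated and its LBA-to-DVA extent map
 * built ahead of time.
 */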
static int
zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
    boolean_t doread, boolean_t isdump)
{
	vdev_t *vd;
	int error;
	zvol_extent_t *ze;
	spa_t *spa = dmu_objset_spa(zv->zv_objset);

	/* Must be sector aligned, and not straddle a block boundary. */
	if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
	    P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
		return (EINVAL);
	}
	ASSERT(size <= zv->zv_volblocksize);

	/* Locate the extent this belongs to */
	ze = list_head(&zv->zv_extents);
	while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
		offset -= ze->ze_nblks * zv->zv_volblocksize;
		ze = list_next(&zv->zv_extents, ze);
	}
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
	offset += DVA_GET_OFFSET(&ze->ze_dva);
	error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump);
	spa_config_exit(spa, SCL_STATE, FTAG);
	return (error);
}

int
zvol_strategy(buf_t *bp)
{
	zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev));
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	rl_t *rl;
	int error = 0;
	boolean_t doread = bp->b_flags & B_READ;
	boolean_t is_dump;
	boolean_t sync;

	if (zv == NULL) {
		bioerror(bp, ENXIO);
		biodone(bp);
		return (0);
	}

	if (getminor(bp->b_edev) == 0) {
		bioerror(bp, EINVAL);
		biodone(bp);
		return (0);
	}

	if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
		bioerror(bp, EROFS);
		biodone(bp);
		return (0);
	}

	/* don't touch zv until after the NULL check above */
	is_dump = zv->zv_flags & ZVOL_DUMPIFIED;
	off = ldbtob(bp->b_blkno);
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT(os != NULL);

	bp_mapin(bp);
	addr = bp->b_un.b_addr;
	resid = bp->b_bcount;

	if (resid > 0 && (off < 0 || off >= volsize)) {
		bioerror(bp, EIO);
		biodone(bp);
		return (0);
	}

	sync = !(bp->b_flags & B_ASYNC) && !doread && !is_dump &&
	    !(zv->zv_flags & ZVOL_WCE) && !zil_disable;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	rl = zfs_range_lock(&zv->zv_znode, off, resid,
	    doread ? RL_READER : RL_WRITER);

	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (is_dump) {
			size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
			error = zvol_dumpio(zv, addr, off, size,
			    doread, B_FALSE);
		} else if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write(tx, ZVOL_OBJ, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = EIO;
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
	zfs_range_unlock(rl);

	if ((bp->b_resid = resid) == bp->b_bcount)
		bioerror(bp, off > volsize ? EINVAL : error);

	if (sync)
		zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
	biodone(bp);

	return (0);
}
/*
 * Set the buffer count to the zvol maximum transfer.
 * Using our own routine instead of the default minphys()
 * means that for larger writes we write bigger buffers on X86
 * (128K instead of 56K) and flush the disk write cache less often
 * (every zvol_maxphys - currently 1MB) instead of minphys (currently
 * 56K on X86 and 128K on sparc).
 */
void
zvol_minphys(struct buf *bp)
{
	if (bp->b_bcount > zvol_maxphys)
		bp->b_bcount = zvol_maxphys;
}

int
zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	int error = 0;
	uint64_t size;
	uint64_t boff;
	uint64_t resid;

	if (minor == 0)			/* This is the control device */
		return (ENXIO);

	zv = ddi_get_soft_state(zvol_state, minor);
	if (zv == NULL)
		return (ENXIO);

	boff = ldbtob(blkno);
	resid = ldbtob(nblocks);

	VERIFY3U(boff + resid, <=, zv->zv_volsize);

	while (resid) {
		size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
		error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
		if (error)
			break;
		boff += size;
		addr += size;
		resid -= size;
	}

	return (error);
}

/*ARGSUSED*/
int
zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	uint64_t volsize;
	rl_t *rl;
	int error = 0;

	if (minor == 0)			/* This is the control device */
		return (ENXIO);

	zv = ddi_get_soft_state(zvol_state, minor);
	if (zv == NULL)
		return (ENXIO);

	volsize = zv->zv_volsize;
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
		return (EIO);

	if (zv->zv_flags & ZVOL_DUMPIFIED) {
		error = physio(zvol_strategy, NULL, dev, B_READ,
		    zvol_minphys, uio);
		return (error);
	}

	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
	    RL_READER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);

		/* don't read past the end */
		if (bytes > volsize - uio->uio_loffset)
			bytes = volsize - uio->uio_loffset;

		error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = EIO;
			break;
		}
	}
	zfs_range_unlock(rl);
	return (error);
}
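/*
 * As in zvol_read() above, a dumpified zvol routes I/O through
 * physio()/zvol_strategy(), which calls zvol_dumpio() against the
 * preallocated extents instead of going through the DMU.
 */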
/*ARGSUSED*/
int
zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
{
	minor_t minor = getminor(dev);
	zvol_state_t *zv;
	uint64_t volsize;
	rl_t *rl;
	int error = 0;
	boolean_t sync;

	if (minor == 0)			/* This is the control device */
		return (ENXIO);

	zv = ddi_get_soft_state(zvol_state, minor);
	if (zv == NULL)
		return (ENXIO);

	volsize = zv->zv_volsize;
	if (uio->uio_resid > 0 &&
	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
		return (EIO);

	if (zv->zv_flags & ZVOL_DUMPIFIED) {
		error = physio(zvol_strategy, NULL, dev, B_WRITE,
		    zvol_minphys, uio);
		return (error);
	}

	sync = !(zv->zv_flags & ZVOL_WCE) && !zil_disable;

	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
	    RL_WRITER);
	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
		uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1);
		uint64_t off = uio->uio_loffset;
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_range_unlock(rl);
	if (sync)
		zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
	return (error);
}

int
zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
{
	struct uuid uuid = EFI_RESERVED;
	efi_gpe_t gpe = { 0 };
	uint32_t crc;
	dk_efi_t efi;
	int length;
	char *ptr;

	if (ddi_copyin(arg, &efi, sizeof (dk_efi_t), flag))
		return (EFAULT);
	ptr = (char *)(uintptr_t)efi.dki_data_64;
	length = efi.dki_length;
	/*
	 * Some clients may attempt to request a PMBR for the
	 * zvol.  Currently this interface will return EINVAL to
	 * such requests.  These requests could be supported by
	 * adding a check for lba == 0 and consing up an appropriate
	 * PMBR.
	 */
	if (efi.dki_lba < 1 || efi.dki_lba > 2 || length <= 0)
		return (EINVAL);

	gpe.efi_gpe_StartingLBA = LE_64(34ULL);
	gpe.efi_gpe_EndingLBA = LE_64((vs >> bs) - 1);
	UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid);

	if (efi.dki_lba == 1) {
		efi_gpt_t gpt = { 0 };

		gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE);
		gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT);
		gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt));
		gpt.efi_gpt_MyLBA = LE_64(1ULL);
		gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL);
		gpt.efi_gpt_LastUsableLBA = LE_64((vs >> bs) - 1);
		gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL);
		gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1);
		gpt.efi_gpt_SizeOfPartitionEntry =
		    LE_32(sizeof (efi_gpe_t));
		CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table);
		gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc);
		CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table);
		gpt.efi_gpt_HeaderCRC32 = LE_32(~crc);
		if (ddi_copyout(&gpt, ptr, MIN(sizeof (gpt), length),
		    flag))
			return (EFAULT);
		ptr += sizeof (gpt);
		length -= sizeof (gpt);
	}
	if (length > 0 && ddi_copyout(&gpe, ptr, MIN(sizeof (gpe),
	    length), flag))
		return (EFAULT);
	return (0);
}
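/*
 * Sketch of the label synthesized by zvol_getefi() above (all fields
 * little-endian on disk):
 *
 *	LBA 0	PMBR - not provided; requests for it return EINVAL
 *	LBA 1	EFI GPT header, CRC-protected, one partition entry
 *	LBA 2	the partition entry array: a single EFI_RESERVED entry
 *		spanning LBA 34 through (volsize >> lbsize shift) - 1
 */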
/*
 * Dirtbag ioctls to support mkfs(1M) for UFS filesystems.  See dkio(7I).
 */
/*ARGSUSED*/
int
zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
{
	zvol_state_t *zv;
	struct dk_cinfo dki;
	struct dk_minfo dkm;
	struct dk_callback *dkc;
	int error = 0;
	rl_t *rl;

	mutex_enter(&zvol_state_lock);

	zv = ddi_get_soft_state(zvol_state, getminor(dev));

	if (zv == NULL) {
		mutex_exit(&zvol_state_lock);
		return (ENXIO);
	}
	ASSERT(zv->zv_total_opens > 0);

	switch (cmd) {

	case DKIOCINFO:
		bzero(&dki, sizeof (dki));
		(void) strcpy(dki.dki_cname, "zvol");
		(void) strcpy(dki.dki_dname, "zvol");
		dki.dki_ctype = DKC_UNKNOWN;
		dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
		mutex_exit(&zvol_state_lock);
		if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
			error = EFAULT;
		return (error);

	case DKIOCGMEDIAINFO:
		bzero(&dkm, sizeof (dkm));
		dkm.dki_lbsize = 1U << zv->zv_min_bs;
		dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
		dkm.dki_media_type = DK_UNKNOWN;
		mutex_exit(&zvol_state_lock);
		if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
			error = EFAULT;
		return (error);

	case DKIOCGETEFI:
	{
		uint64_t vs = zv->zv_volsize;
		uint8_t bs = zv->zv_min_bs;

		mutex_exit(&zvol_state_lock);
		error = zvol_getefi((void *)arg, flag, vs, bs);
		return (error);
	}

	case DKIOCFLUSHWRITECACHE:
		dkc = (struct dk_callback *)arg;
		mutex_exit(&zvol_state_lock);
		zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
		if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
			(*dkc->dkc_callback)(dkc->dkc_cookie, error);
			error = 0;
		}
		return (error);
	case DKIOCGETWCE:
	{
		int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
		if (ddi_copyout(&wce, (void *)arg, sizeof (int),
		    flag))
			error = EFAULT;
		break;
	}
	case DKIOCSETWCE:
	{
		int wce;
		if (ddi_copyin((void *)arg, &wce, sizeof (int),
		    flag)) {
			error = EFAULT;
			break;
		}
		if (wce) {
			zv->zv_flags |= ZVOL_WCE;
			mutex_exit(&zvol_state_lock);
		} else {
			zv->zv_flags &= ~ZVOL_WCE;
			mutex_exit(&zvol_state_lock);
			zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
		}
		return (0);
	}

	case DKIOCGGEOM:
	case DKIOCGVTOC:
		/*
		 * commands using these (like prtvtoc) expect ENOTSUP
		 * since we're emulating an EFI label
		 */
		error = ENOTSUP;
		break;

	case DKIOCDUMPINIT:
		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
		    RL_WRITER);
		error = zvol_dumpify(zv);
		zfs_range_unlock(rl);
		break;

	case DKIOCDUMPFINI:
		if (!(zv->zv_flags & ZVOL_DUMPIFIED))
			break;
		rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
		    RL_WRITER);
		error = zvol_dump_fini(zv);
		zfs_range_unlock(rl);
		break;

	default:
		error = ENOTTY;
		break;

	}
	mutex_exit(&zvol_state_lock);
	return (error);
}

int
zvol_busy(void)
{
	return (zvol_minors != 0);
}

void
zvol_init(void)
{
	VERIFY(ddi_soft_state_init(&zvol_state, sizeof (zvol_state_t), 1) == 0);
	mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL);
}

void
zvol_fini(void)
{
	mutex_destroy(&zvol_state_lock);
	ddi_soft_state_fini(&zvol_state);
}
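/*
 * Dumpifying is a multi-step process, implemented by zvol_dump_init()
 * and zvol_dumpify() below: free any existing blocks; save the current
 * checksum, compression, refreservation and volblocksize in the ZAP so
 * zvol_dump_fini() can restore them; force compression and checksums
 * off; grow the block size to SPA_MAXBLOCKSIZE; preallocate the entire
 * volume; and finally build the LBA-to-DVA extent map used at crash-dump
 * time.
 */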
static int
zvol_dump_init(zvol_state_t *zv, boolean_t resize)
{
	dmu_tx_t *tx;
	int error = 0;
	objset_t *os = zv->zv_objset;
	nvlist_t *nv = NULL;

	ASSERT(MUTEX_HELD(&zvol_state_lock));
	error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
	    DMU_OBJECT_END);
	/* wait for dmu_free_long_range to actually free the blocks */
	txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	dmu_tx_hold_bonus(tx, ZVOL_OBJ);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		return (error);
	}

	/*
	 * If we are resizing the dump device then we only need to
	 * update the refreservation to match the newly updated
	 * zvolsize.  Otherwise, we save off the original state of the
	 * zvol so that we can restore them if the zvol is ever undumpified.
	 */
	if (resize) {
		error = zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
		    &zv->zv_volsize, tx);
	} else {
		uint64_t checksum, compress, refresrv, vbs;

		error = dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
		error = error ? error : dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
		error = error ? error : dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
		error = error ? error : dsl_prop_get_integer(zv->zv_name,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);

		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
		    &compress, tx);
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx);
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
		    &refresrv, tx);
		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
		    &vbs, tx);
		error = error ? error : dmu_object_set_blocksize(
		    os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
		if (error == 0)
			zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
	}
	dmu_tx_commit(tx);

	/*
	 * We only need to update the zvol's properties if we are
	 * initializing the dump area for the first time.
	 */
	if (!resize) {
		VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
		VERIFY(nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
		    ZIO_COMPRESS_OFF) == 0);
		VERIFY(nvlist_add_uint64(nv,
		    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
		    ZIO_CHECKSUM_OFF) == 0);

		error = zfs_set_prop_nvlist(zv->zv_name, nv);
		nvlist_free(nv);

		if (error)
			return (error);
	}

	/* Allocate the space for the dump */
	error = zvol_prealloc(zv);
	return (error);
}

static int
zvol_dumpify(zvol_state_t *zv)
{
	int error = 0;
	uint64_t dumpsize = 0;
	dmu_tx_t *tx;
	objset_t *os = zv->zv_objset;

	if (zv->zv_flags & ZVOL_RDONLY)
		return (EROFS);

	if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE,
	    8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) {
		boolean_t resize = (dumpsize > 0) ? B_TRUE : B_FALSE;

		if ((error = zvol_dump_init(zv, resize)) != 0) {
			(void) zvol_dump_fini(zv);
			return (error);
		}
	}

	/*
	 * Build up our lba mapping.
	 */
	error = zvol_get_lbas(zv);
	if (error) {
		(void) zvol_dump_fini(zv);
		return (error);
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		(void) zvol_dump_fini(zv);
		return (error);
	}

	zv->zv_flags |= ZVOL_DUMPIFIED;
	error = zap_update(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1,
	    &zv->zv_volsize, tx);
	dmu_tx_commit(tx);

	if (error) {
		(void) zvol_dump_fini(zv);
		return (error);
	}

	txg_wait_synced(dmu_objset_pool(os), 0);
	return (0);
}
1741 */ 1742 1743 tx = dmu_tx_create(os); 1744 dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); 1745 error = dmu_tx_assign(tx, TXG_WAIT); 1746 if (error) { 1747 dmu_tx_abort(tx); 1748 return (error); 1749 } 1750 (void) zap_remove(os, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, tx); 1751 dmu_tx_commit(tx); 1752 1753 (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, 1754 zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum); 1755 (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, 1756 zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress); 1757 (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, 1758 zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv); 1759 (void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, 1760 zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs); 1761 1762 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1763 (void) nvlist_add_uint64(nv, 1764 zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum); 1765 (void) nvlist_add_uint64(nv, 1766 zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress); 1767 (void) nvlist_add_uint64(nv, 1768 zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv); 1769 (void) zfs_set_prop_nvlist(zv->zv_name, nv); 1770 nvlist_free(nv); 1771 1772 zvol_free_extents(zv); 1773 zv->zv_flags &= ~ZVOL_DUMPIFIED; 1774 (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END); 1775 /* wait for dmu_free_long_range to actually free the blocks */ 1776 txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); 1777 tx = dmu_tx_create(os); 1778 dmu_tx_hold_bonus(tx, ZVOL_OBJ); 1779 error = dmu_tx_assign(tx, TXG_WAIT); 1780 if (error) { 1781 dmu_tx_abort(tx); 1782 return (error); 1783 } 1784 if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0) 1785 zv->zv_volblocksize = vbs; 1786 dmu_tx_commit(tx); 1787 1788 return (0); 1789 } 1790