/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * ZFS volume emulation driver.
 *
 * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 * Volumes are accessed through the symbolic links named:
 *
 * /dev/zvol/dsk/<pool_name>/<dataset_name>
 * /dev/zvol/rdsk/<pool_name>/<dataset_name>
 *
 * These links are created by the ZFS-specific devfsadm link generator.
 * Volumes are persistent through reboot. No user command needs to be
 * run before opening and using a device.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/modctl.h>
#include <sys/open.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dsl_prop.h>
#include <sys/dkio.h>
#include <sys/efi_partition.h>
#include <sys/byteorder.h>
#include <sys/pathname.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/crc32.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/mkdev.h>
#include <sys/zil.h>
#include <sys/refcount.h>

#include "zfs_namecheck.h"

/*
 * Fixed object numbers inside a zvol objset: ZVOL_OBJ is the object that
 * every data read and write in this file targets, and ZVOL_ZAP_OBJ is the
 * ZAP object that holds the "size" entry.
 */
#define	ZVOL_OBJ 1ULL
#define	ZVOL_ZAP_OBJ 2ULL

/*
 * Soft-state anchor; per-volume zvol_state_t entries are allocated and
 * looked up in it by minor number.
 */
static void *zvol_state;

/*
 * This lock protects the zvol_state structure from being modified
 * while it's being used, e.g. an open that comes in before a create
 * finishes. It also protects temporary opens of the dataset so that,
 * e.g., an open doesn't get a spurious EBUSY.
 */
static kmutex_t zvol_state_lock;

/*
 * Count of minors currently created: bumped by zvol_create_minor(),
 * dropped by zvol_remove_minor(), and reported through zvol_busy().
 */
static uint32_t zvol_minors;

/*
 * The in-core state of each volume.
90 */ 91 typedef struct zvol_state { 92 char zv_name[MAXPATHLEN]; /* pool/dd name */ 93 uint64_t zv_volsize; /* amount of space we advertise */ 94 uint64_t zv_volblocksize; /* volume block size */ 95 minor_t zv_minor; /* minor number */ 96 uint8_t zv_min_bs; /* minimum addressable block shift */ 97 uint8_t zv_readonly; /* hard readonly; like write-protect */ 98 objset_t *zv_objset; /* objset handle */ 99 uint32_t zv_mode; /* DS_MODE_* flags at open time */ 100 uint32_t zv_open_count[OTYPCNT]; /* open counts */ 101 uint32_t zv_total_opens; /* total open count */ 102 zilog_t *zv_zilog; /* ZIL handle */ 103 uint64_t zv_txg_assign; /* txg to assign during ZIL replay */ 104 krwlock_t zv_dslock; /* dmu_sync() rwlock */ 105 } zvol_state_t; 106 107 /* 108 * zvol maximum transfer in one DMU tx. 109 */ 110 int zvol_maxphys = DMU_MAX_ACCESS/2; 111 112 static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio); 113 114 static void 115 zvol_size_changed(zvol_state_t *zv, dev_t dev) 116 { 117 dev = makedevice(getmajor(dev), zv->zv_minor); 118 119 VERIFY(ddi_prop_update_int64(dev, zfs_dip, 120 "Size", zv->zv_volsize) == DDI_SUCCESS); 121 VERIFY(ddi_prop_update_int64(dev, zfs_dip, 122 "Nblocks", lbtodb(zv->zv_volsize)) == DDI_SUCCESS); 123 } 124 125 int 126 zvol_check_volsize(uint64_t volsize, uint64_t blocksize) 127 { 128 if (volsize == 0) 129 return (EINVAL); 130 131 if (volsize % blocksize != 0) 132 return (EINVAL); 133 134 #ifdef _ILP32 135 if (volsize - 1 > SPEC_MAXOFFSET_T) 136 return (EOVERFLOW); 137 #endif 138 return (0); 139 } 140 141 int 142 zvol_check_volblocksize(uint64_t volblocksize) 143 { 144 if (volblocksize < SPA_MINBLOCKSIZE || 145 volblocksize > SPA_MAXBLOCKSIZE || 146 !ISP2(volblocksize)) 147 return (EDOM); 148 149 return (0); 150 } 151 152 static void 153 zvol_readonly_changed_cb(void *arg, uint64_t newval) 154 { 155 zvol_state_t *zv = arg; 156 157 zv->zv_readonly = (uint8_t)newval; 158 } 159 160 int 161 zvol_get_stats(objset_t *os, nvlist_t 
*nv) 162 { 163 int error; 164 dmu_object_info_t doi; 165 uint64_t val; 166 167 168 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); 169 if (error) 170 return (error); 171 172 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); 173 174 error = dmu_object_info(os, ZVOL_OBJ, &doi); 175 176 if (error == 0) { 177 dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, 178 doi.doi_data_block_size); 179 } 180 181 return (error); 182 } 183 184 /* 185 * Find a free minor number. 186 */ 187 static minor_t 188 zvol_minor_alloc(void) 189 { 190 minor_t minor; 191 192 ASSERT(MUTEX_HELD(&zvol_state_lock)); 193 194 for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) 195 if (ddi_get_soft_state(zvol_state, minor) == NULL) 196 return (minor); 197 198 return (0); 199 } 200 201 static zvol_state_t * 202 zvol_minor_lookup(const char *name) 203 { 204 minor_t minor; 205 zvol_state_t *zv; 206 207 ASSERT(MUTEX_HELD(&zvol_state_lock)); 208 209 for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) { 210 zv = ddi_get_soft_state(zvol_state, minor); 211 if (zv == NULL) 212 continue; 213 if (strcmp(zv->zv_name, name) == 0) 214 break; 215 } 216 217 return (zv); 218 } 219 220 void 221 zvol_create_cb(objset_t *os, void *arg, dmu_tx_t *tx) 222 { 223 zfs_create_data_t *zc = arg; 224 int error; 225 uint64_t volblocksize, volsize; 226 227 VERIFY(nvlist_lookup_uint64(zc->zc_props, 228 zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); 229 if (nvlist_lookup_uint64(zc->zc_props, 230 zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) 231 volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); 232 233 /* 234 * These properites must be removed from the list so the generic 235 * property setting step won't apply to them. 
236 */ 237 VERIFY(nvlist_remove_all(zc->zc_props, 238 zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); 239 (void) nvlist_remove_all(zc->zc_props, 240 zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); 241 242 error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, 243 DMU_OT_NONE, 0, tx); 244 ASSERT(error == 0); 245 246 error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, 247 DMU_OT_NONE, 0, tx); 248 ASSERT(error == 0); 249 250 error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); 251 ASSERT(error == 0); 252 } 253 254 /* 255 * Replay a TX_WRITE ZIL transaction that didn't get committed 256 * after a system failure 257 */ 258 static int 259 zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap) 260 { 261 objset_t *os = zv->zv_objset; 262 char *data = (char *)(lr + 1); /* data follows lr_write_t */ 263 uint64_t off = lr->lr_offset; 264 uint64_t len = lr->lr_length; 265 dmu_tx_t *tx; 266 int error; 267 268 if (byteswap) 269 byteswap_uint64_array(lr, sizeof (*lr)); 270 271 tx = dmu_tx_create(os); 272 dmu_tx_hold_write(tx, ZVOL_OBJ, off, len); 273 error = dmu_tx_assign(tx, zv->zv_txg_assign); 274 if (error) { 275 dmu_tx_abort(tx); 276 } else { 277 dmu_write(os, ZVOL_OBJ, off, len, data, tx); 278 dmu_tx_commit(tx); 279 } 280 281 return (error); 282 } 283 284 /* ARGSUSED */ 285 static int 286 zvol_replay_err(zvol_state_t *zv, lr_t *lr, boolean_t byteswap) 287 { 288 return (ENOTSUP); 289 } 290 291 /* 292 * Callback vectors for replaying records. 293 * Only TX_WRITE is needed for zvol. 
294 */ 295 zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { 296 zvol_replay_err, /* 0 no such transaction type */ 297 zvol_replay_err, /* TX_CREATE */ 298 zvol_replay_err, /* TX_MKDIR */ 299 zvol_replay_err, /* TX_MKXATTR */ 300 zvol_replay_err, /* TX_SYMLINK */ 301 zvol_replay_err, /* TX_REMOVE */ 302 zvol_replay_err, /* TX_RMDIR */ 303 zvol_replay_err, /* TX_LINK */ 304 zvol_replay_err, /* TX_RENAME */ 305 zvol_replay_write, /* TX_WRITE */ 306 zvol_replay_err, /* TX_TRUNCATE */ 307 zvol_replay_err, /* TX_SETATTR */ 308 zvol_replay_err, /* TX_ACL */ 309 }; 310 311 /* 312 * Create a minor node for the specified volume. 313 */ 314 int 315 zvol_create_minor(const char *name, dev_t dev) 316 { 317 zvol_state_t *zv; 318 objset_t *os; 319 dmu_object_info_t doi; 320 uint64_t volsize; 321 minor_t minor = 0; 322 struct pathname linkpath; 323 int ds_mode = DS_MODE_PRIMARY; 324 vnode_t *vp = NULL; 325 char *devpath; 326 size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + 1 + strlen(name) + 1; 327 char chrbuf[30], blkbuf[30]; 328 int error; 329 330 mutex_enter(&zvol_state_lock); 331 332 if ((zv = zvol_minor_lookup(name)) != NULL) { 333 mutex_exit(&zvol_state_lock); 334 return (EEXIST); 335 } 336 337 if (strchr(name, '@') != 0) 338 ds_mode |= DS_MODE_READONLY; 339 340 error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os); 341 342 if (error) { 343 mutex_exit(&zvol_state_lock); 344 return (error); 345 } 346 347 error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); 348 349 if (error) { 350 dmu_objset_close(os); 351 mutex_exit(&zvol_state_lock); 352 return (error); 353 } 354 355 /* 356 * If there's an existing /dev/zvol symlink, try to use the 357 * same minor number we used last time. 
358 */ 359 devpath = kmem_alloc(devpathlen, KM_SLEEP); 360 361 (void) sprintf(devpath, "%s/%s", ZVOL_FULL_DEV_DIR, name); 362 363 error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp); 364 365 kmem_free(devpath, devpathlen); 366 367 if (error == 0 && vp->v_type != VLNK) 368 error = EINVAL; 369 370 if (error == 0) { 371 pn_alloc(&linkpath); 372 error = pn_getsymlink(vp, &linkpath, kcred); 373 if (error == 0) { 374 char *ms = strstr(linkpath.pn_path, ZVOL_PSEUDO_DEV); 375 if (ms != NULL) { 376 ms += strlen(ZVOL_PSEUDO_DEV); 377 minor = stoi(&ms); 378 } 379 } 380 pn_free(&linkpath); 381 } 382 383 if (vp != NULL) 384 VN_RELE(vp); 385 386 /* 387 * If we found a minor but it's already in use, we must pick a new one. 388 */ 389 if (minor != 0 && ddi_get_soft_state(zvol_state, minor) != NULL) 390 minor = 0; 391 392 if (minor == 0) 393 minor = zvol_minor_alloc(); 394 395 if (minor == 0) { 396 dmu_objset_close(os); 397 mutex_exit(&zvol_state_lock); 398 return (ENXIO); 399 } 400 401 if (ddi_soft_state_zalloc(zvol_state, minor) != DDI_SUCCESS) { 402 dmu_objset_close(os); 403 mutex_exit(&zvol_state_lock); 404 return (EAGAIN); 405 } 406 407 (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME, 408 (char *)name); 409 410 (void) sprintf(chrbuf, "%uc,raw", minor); 411 412 if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR, 413 minor, DDI_PSEUDO, 0) == DDI_FAILURE) { 414 ddi_soft_state_free(zvol_state, minor); 415 dmu_objset_close(os); 416 mutex_exit(&zvol_state_lock); 417 return (EAGAIN); 418 } 419 420 (void) sprintf(blkbuf, "%uc", minor); 421 422 if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK, 423 minor, DDI_PSEUDO, 0) == DDI_FAILURE) { 424 ddi_remove_minor_node(zfs_dip, chrbuf); 425 ddi_soft_state_free(zvol_state, minor); 426 dmu_objset_close(os); 427 mutex_exit(&zvol_state_lock); 428 return (EAGAIN); 429 } 430 431 zv = ddi_get_soft_state(zvol_state, minor); 432 433 (void) strcpy(zv->zv_name, name); 434 zv->zv_min_bs = DEV_BSHIFT; 435 zv->zv_minor = minor; 
436 zv->zv_volsize = volsize; 437 zv->zv_objset = os; 438 zv->zv_mode = ds_mode; 439 zv->zv_zilog = zil_open(os, zvol_get_data); 440 441 /* get and cache the blocksize */ 442 error = dmu_object_info(os, ZVOL_OBJ, &doi); 443 ASSERT(error == 0); 444 zv->zv_volblocksize = doi.doi_data_block_size; 445 446 rw_init(&zv->zv_dslock, NULL, RW_DEFAULT, NULL); 447 448 zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector); 449 450 zvol_size_changed(zv, dev); 451 452 /* XXX this should handle the possible i/o error */ 453 VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset), 454 "readonly", zvol_readonly_changed_cb, zv) == 0); 455 456 zvol_minors++; 457 458 mutex_exit(&zvol_state_lock); 459 460 return (0); 461 } 462 463 /* 464 * Remove minor node for the specified volume. 465 */ 466 int 467 zvol_remove_minor(const char *name) 468 { 469 zvol_state_t *zv; 470 char namebuf[30]; 471 472 mutex_enter(&zvol_state_lock); 473 474 if ((zv = zvol_minor_lookup(name)) == NULL) { 475 mutex_exit(&zvol_state_lock); 476 return (ENXIO); 477 } 478 479 if (zv->zv_total_opens != 0) { 480 mutex_exit(&zvol_state_lock); 481 return (EBUSY); 482 } 483 484 (void) sprintf(namebuf, "%uc,raw", zv->zv_minor); 485 ddi_remove_minor_node(zfs_dip, namebuf); 486 487 (void) sprintf(namebuf, "%uc", zv->zv_minor); 488 ddi_remove_minor_node(zfs_dip, namebuf); 489 490 VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset), 491 "readonly", zvol_readonly_changed_cb, zv) == 0); 492 493 zil_close(zv->zv_zilog); 494 zv->zv_zilog = NULL; 495 dmu_objset_close(zv->zv_objset); 496 zv->zv_objset = NULL; 497 498 ddi_soft_state_free(zvol_state, zv->zv_minor); 499 500 zvol_minors--; 501 502 mutex_exit(&zvol_state_lock); 503 504 return (0); 505 } 506 507 int 508 zvol_set_volsize(const char *name, dev_t dev, uint64_t volsize) 509 { 510 zvol_state_t *zv; 511 dmu_tx_t *tx; 512 int error; 513 dmu_object_info_t doi; 514 515 mutex_enter(&zvol_state_lock); 516 517 if ((zv = zvol_minor_lookup(name)) == NULL) { 518 
mutex_exit(&zvol_state_lock); 519 return (ENXIO); 520 } 521 522 if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 || 523 (error = zvol_check_volsize(volsize, 524 doi.doi_data_block_size)) != 0) { 525 mutex_exit(&zvol_state_lock); 526 return (error); 527 } 528 529 if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) { 530 mutex_exit(&zvol_state_lock); 531 return (EROFS); 532 } 533 534 tx = dmu_tx_create(zv->zv_objset); 535 dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); 536 dmu_tx_hold_free(tx, ZVOL_OBJ, volsize, DMU_OBJECT_END); 537 error = dmu_tx_assign(tx, TXG_WAIT); 538 if (error) { 539 dmu_tx_abort(tx); 540 mutex_exit(&zvol_state_lock); 541 return (error); 542 } 543 544 error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1, 545 &volsize, tx); 546 if (error == 0) { 547 error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, volsize, 548 DMU_OBJECT_END, tx); 549 } 550 551 dmu_tx_commit(tx); 552 553 if (error == 0) { 554 zv->zv_volsize = volsize; 555 zvol_size_changed(zv, dev); 556 } 557 558 mutex_exit(&zvol_state_lock); 559 560 return (error); 561 } 562 563 int 564 zvol_set_volblocksize(const char *name, uint64_t volblocksize) 565 { 566 zvol_state_t *zv; 567 dmu_tx_t *tx; 568 int error; 569 570 mutex_enter(&zvol_state_lock); 571 572 if ((zv = zvol_minor_lookup(name)) == NULL) { 573 mutex_exit(&zvol_state_lock); 574 return (ENXIO); 575 } 576 577 if (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) { 578 mutex_exit(&zvol_state_lock); 579 return (EROFS); 580 } 581 582 tx = dmu_tx_create(zv->zv_objset); 583 dmu_tx_hold_bonus(tx, ZVOL_OBJ); 584 error = dmu_tx_assign(tx, TXG_WAIT); 585 if (error) { 586 dmu_tx_abort(tx); 587 } else { 588 error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ, 589 volblocksize, 0, tx); 590 if (error == ENOTSUP) 591 error = EBUSY; 592 dmu_tx_commit(tx); 593 } 594 595 mutex_exit(&zvol_state_lock); 596 597 return (error); 598 } 599 600 /*ARGSUSED*/ 601 int 602 zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr) 
603 { 604 minor_t minor = getminor(*devp); 605 zvol_state_t *zv; 606 607 if (minor == 0) /* This is the control device */ 608 return (0); 609 610 mutex_enter(&zvol_state_lock); 611 612 zv = ddi_get_soft_state(zvol_state, minor); 613 if (zv == NULL) { 614 mutex_exit(&zvol_state_lock); 615 return (ENXIO); 616 } 617 618 ASSERT(zv->zv_objset != NULL); 619 620 if ((flag & FWRITE) && 621 (zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY))) { 622 mutex_exit(&zvol_state_lock); 623 return (EROFS); 624 } 625 626 if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) { 627 zv->zv_open_count[otyp]++; 628 zv->zv_total_opens++; 629 } 630 631 mutex_exit(&zvol_state_lock); 632 633 return (0); 634 } 635 636 /*ARGSUSED*/ 637 int 638 zvol_close(dev_t dev, int flag, int otyp, cred_t *cr) 639 { 640 minor_t minor = getminor(dev); 641 zvol_state_t *zv; 642 643 if (minor == 0) /* This is the control device */ 644 return (0); 645 646 mutex_enter(&zvol_state_lock); 647 648 zv = ddi_get_soft_state(zvol_state, minor); 649 if (zv == NULL) { 650 mutex_exit(&zvol_state_lock); 651 return (ENXIO); 652 } 653 654 /* 655 * The next statement is a workaround for the following DDI bug: 656 * 6343604 specfs race: multiple "last-close" of the same device 657 */ 658 if (zv->zv_total_opens == 0) { 659 mutex_exit(&zvol_state_lock); 660 return (0); 661 } 662 663 /* 664 * If the open count is zero, this is a spurious close. 665 * That indicates a bug in the kernel / DDI framework. 666 */ 667 ASSERT(zv->zv_open_count[otyp] != 0); 668 ASSERT(zv->zv_total_opens != 0); 669 670 /* 671 * You may get multiple opens, but only one close. 
672 */ 673 zv->zv_open_count[otyp]--; 674 zv->zv_total_opens--; 675 676 mutex_exit(&zvol_state_lock); 677 678 return (0); 679 } 680 681 static void 682 zvol_get_done(dmu_buf_t *db, void *vzgd) 683 { 684 zgd_t *zgd = (zgd_t *)vzgd; 685 686 dmu_buf_rele(db, vzgd); 687 zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp))); 688 kmem_free(zgd, sizeof (zgd_t)); 689 } 690 691 /* 692 * Get data to generate a TX_WRITE intent log record. 693 */ 694 static int 695 zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) 696 { 697 zvol_state_t *zv = arg; 698 objset_t *os = zv->zv_objset; 699 dmu_buf_t *db; 700 zgd_t *zgd; 701 int error; 702 703 ASSERT(zio); 704 ASSERT(lr->lr_length != 0); 705 706 if (buf != NULL) 707 return (dmu_read(os, ZVOL_OBJ, 708 lr->lr_offset, lr->lr_length, buf)); 709 710 zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); 711 zgd->zgd_zilog = zv->zv_zilog; 712 zgd->zgd_bp = &lr->lr_blkptr; 713 714 VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db)); 715 /* 716 * Have to lock to ensure when when the data is 717 * written out and it's checksum is being calculated 718 * that no one can change the data. 719 */ 720 rw_enter(&zv->zv_dslock, RW_READER); 721 error = dmu_sync(zio, db, &lr->lr_blkptr, 722 lr->lr_common.lrc_txg, zvol_get_done, zgd); 723 rw_exit(&zv->zv_dslock); 724 if (error == 0) 725 zil_add_vdev(zv->zv_zilog, 726 DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr))); 727 /* 728 * If we get EINPROGRESS, then we need to wait for a 729 * write IO initiated by dmu_sync() to complete before 730 * we can release this dbuf. We will finish everything 731 * up in the zvol_get_done() callback. 732 */ 733 if (error == EINPROGRESS) 734 return (0); 735 dmu_buf_rele(db, zgd); 736 kmem_free(zgd, sizeof (zgd_t)); 737 return (error); 738 } 739 740 /* 741 * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. 742 * 743 * We store data in the log buffers if it's small enough. 
744 * Otherwise we will later flush the data out via dmu_sync(). 745 */ 746 ssize_t zvol_immediate_write_sz = 32768; 747 748 static void 749 zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len) 750 { 751 uint32_t blocksize = zv->zv_volblocksize; 752 lr_write_t *lr; 753 754 while (len) { 755 ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize)); 756 itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr)); 757 758 itx->itx_wr_state = 759 len > zvol_immediate_write_sz ? WR_INDIRECT : WR_NEED_COPY; 760 itx->itx_private = zv; 761 lr = (lr_write_t *)&itx->itx_lr; 762 lr->lr_foid = ZVOL_OBJ; 763 lr->lr_offset = off; 764 lr->lr_length = nbytes; 765 lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t); 766 BP_ZERO(&lr->lr_blkptr); 767 768 (void) zil_itx_assign(zv->zv_zilog, itx, tx); 769 len -= nbytes; 770 off += nbytes; 771 } 772 } 773 774 int 775 zvol_strategy(buf_t *bp) 776 { 777 zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev)); 778 uint64_t off, volsize; 779 size_t size, resid; 780 char *addr; 781 objset_t *os; 782 int error = 0; 783 int sync; 784 int reading; 785 786 if (zv == NULL) { 787 bioerror(bp, ENXIO); 788 biodone(bp); 789 return (0); 790 } 791 792 if (getminor(bp->b_edev) == 0) { 793 bioerror(bp, EINVAL); 794 biodone(bp); 795 return (0); 796 } 797 798 if ((zv->zv_readonly || (zv->zv_mode & DS_MODE_READONLY)) && 799 !(bp->b_flags & B_READ)) { 800 bioerror(bp, EROFS); 801 biodone(bp); 802 return (0); 803 } 804 805 off = ldbtob(bp->b_blkno); 806 volsize = zv->zv_volsize; 807 808 os = zv->zv_objset; 809 ASSERT(os != NULL); 810 sync = !(bp->b_flags & B_ASYNC) && !(zil_disable); 811 812 bp_mapin(bp); 813 addr = bp->b_un.b_addr; 814 resid = bp->b_bcount; 815 816 /* 817 * There must be no buffer changes when doing a dmu_sync() because 818 * we can't change the data whilst calculating the checksum. 819 * A better approach than a per zvol rwlock would be to lock ranges. 
820 */ 821 reading = bp->b_flags & B_READ; 822 if (reading || resid <= zvol_immediate_write_sz) 823 rw_enter(&zv->zv_dslock, RW_READER); 824 else 825 rw_enter(&zv->zv_dslock, RW_WRITER); 826 827 while (resid != 0 && off < volsize) { 828 829 size = MIN(resid, zvol_maxphys); /* zvol_maxphys per tx */ 830 831 if (size > volsize - off) /* don't write past the end */ 832 size = volsize - off; 833 834 if (reading) { 835 error = dmu_read(os, ZVOL_OBJ, off, size, addr); 836 } else { 837 dmu_tx_t *tx = dmu_tx_create(os); 838 dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); 839 error = dmu_tx_assign(tx, TXG_WAIT); 840 if (error) { 841 dmu_tx_abort(tx); 842 } else { 843 dmu_write(os, ZVOL_OBJ, off, size, addr, tx); 844 zvol_log_write(zv, tx, off, size); 845 dmu_tx_commit(tx); 846 } 847 } 848 if (error) 849 break; 850 off += size; 851 addr += size; 852 resid -= size; 853 } 854 rw_exit(&zv->zv_dslock); 855 856 if ((bp->b_resid = resid) == bp->b_bcount) 857 bioerror(bp, off > volsize ? EINVAL : error); 858 859 if (sync) 860 zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); 861 862 biodone(bp); 863 864 return (0); 865 } 866 867 /* 868 * Set the buffer count to the zvol maximum transfer. 869 * Using our own routine instead of the default minphys() 870 * means that for larger writes we write bigger buffers on X86 871 * (128K instead of 56K) and flush the disk write cache less often 872 * (every zvol_maxphys - currently 1MB) instead of minphys (currently 873 * 56K on X86 and 128K on sparc). 
874 */ 875 void 876 zvol_minphys(struct buf *bp) 877 { 878 if (bp->b_bcount > zvol_maxphys) 879 bp->b_bcount = zvol_maxphys; 880 } 881 882 /*ARGSUSED*/ 883 int 884 zvol_read(dev_t dev, uio_t *uio, cred_t *cr) 885 { 886 zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(dev)); 887 int error = 0; 888 889 while (uio->uio_resid > 0) { 890 uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); 891 892 error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes); 893 if (error) 894 break; 895 } 896 return (error); 897 } 898 899 /*ARGSUSED*/ 900 int 901 zvol_write(dev_t dev, uio_t *uio, cred_t *cr) 902 { 903 zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(dev)); 904 int error = 0; 905 906 while (uio->uio_resid > 0) { 907 uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); 908 uint64_t off = uio->uio_loffset; 909 910 dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); 911 dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes); 912 error = dmu_tx_assign(tx, TXG_WAIT); 913 if (error) { 914 dmu_tx_abort(tx); 915 break; 916 } 917 error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes, tx); 918 if (error == 0) 919 zvol_log_write(zv, tx, off, bytes); 920 dmu_tx_commit(tx); 921 922 if (error) 923 break; 924 } 925 return (error); 926 } 927 928 /* 929 * Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I). 
930 */ 931 /*ARGSUSED*/ 932 int 933 zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) 934 { 935 zvol_state_t *zv; 936 struct dk_cinfo dkc; 937 struct dk_minfo dkm; 938 dk_efi_t efi; 939 struct uuid uuid = EFI_RESERVED; 940 uint32_t crc; 941 int error = 0; 942 943 mutex_enter(&zvol_state_lock); 944 945 zv = ddi_get_soft_state(zvol_state, getminor(dev)); 946 947 if (zv == NULL) { 948 mutex_exit(&zvol_state_lock); 949 return (ENXIO); 950 } 951 952 switch (cmd) { 953 954 case DKIOCINFO: 955 bzero(&dkc, sizeof (dkc)); 956 (void) strcpy(dkc.dki_cname, "zvol"); 957 (void) strcpy(dkc.dki_dname, "zvol"); 958 dkc.dki_ctype = DKC_UNKNOWN; 959 dkc.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs); 960 mutex_exit(&zvol_state_lock); 961 if (ddi_copyout(&dkc, (void *)arg, sizeof (dkc), flag)) 962 error = EFAULT; 963 return (error); 964 965 case DKIOCGMEDIAINFO: 966 bzero(&dkm, sizeof (dkm)); 967 dkm.dki_lbsize = 1U << zv->zv_min_bs; 968 dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs; 969 dkm.dki_media_type = DK_UNKNOWN; 970 mutex_exit(&zvol_state_lock); 971 if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag)) 972 error = EFAULT; 973 return (error); 974 975 case DKIOCGETEFI: 976 if (ddi_copyin((void *)arg, &efi, sizeof (dk_efi_t), flag)) { 977 mutex_exit(&zvol_state_lock); 978 return (EFAULT); 979 } 980 efi.dki_data = (void *)(uintptr_t)efi.dki_data_64; 981 982 /* 983 * Some clients may attempt to request a PMBR for the 984 * zvol. Currently this interface will return ENOTTY to 985 * such requests. These requests could be supported by 986 * adding a check for lba == 0 and consing up an appropriate 987 * RMBR. 
988 */ 989 if (efi.dki_lba == 1) { 990 efi_gpt_t gpt; 991 efi_gpe_t gpe; 992 993 bzero(&gpt, sizeof (gpt)); 994 bzero(&gpe, sizeof (gpe)); 995 996 if (efi.dki_length < sizeof (gpt)) { 997 mutex_exit(&zvol_state_lock); 998 return (EINVAL); 999 } 1000 1001 gpt.efi_gpt_Signature = LE_64(EFI_SIGNATURE); 1002 gpt.efi_gpt_Revision = LE_32(EFI_VERSION_CURRENT); 1003 gpt.efi_gpt_HeaderSize = LE_32(sizeof (gpt)); 1004 gpt.efi_gpt_FirstUsableLBA = LE_64(34ULL); 1005 gpt.efi_gpt_LastUsableLBA = 1006 LE_64((zv->zv_volsize >> zv->zv_min_bs) - 1); 1007 gpt.efi_gpt_NumberOfPartitionEntries = LE_32(1); 1008 gpt.efi_gpt_PartitionEntryLBA = LE_64(2ULL); 1009 gpt.efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (gpe)); 1010 1011 UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid); 1012 gpe.efi_gpe_StartingLBA = gpt.efi_gpt_FirstUsableLBA; 1013 gpe.efi_gpe_EndingLBA = gpt.efi_gpt_LastUsableLBA; 1014 1015 CRC32(crc, &gpe, sizeof (gpe), -1U, crc32_table); 1016 gpt.efi_gpt_PartitionEntryArrayCRC32 = LE_32(~crc); 1017 1018 CRC32(crc, &gpt, sizeof (gpt), -1U, crc32_table); 1019 gpt.efi_gpt_HeaderCRC32 = LE_32(~crc); 1020 1021 mutex_exit(&zvol_state_lock); 1022 if (ddi_copyout(&gpt, efi.dki_data, sizeof (gpt), flag)) 1023 error = EFAULT; 1024 } else if (efi.dki_lba == 2) { 1025 efi_gpe_t gpe; 1026 1027 bzero(&gpe, sizeof (gpe)); 1028 1029 if (efi.dki_length < sizeof (gpe)) { 1030 mutex_exit(&zvol_state_lock); 1031 return (EINVAL); 1032 } 1033 1034 UUID_LE_CONVERT(gpe.efi_gpe_PartitionTypeGUID, uuid); 1035 gpe.efi_gpe_StartingLBA = LE_64(34ULL); 1036 gpe.efi_gpe_EndingLBA = 1037 LE_64((zv->zv_volsize >> zv->zv_min_bs) - 1); 1038 1039 mutex_exit(&zvol_state_lock); 1040 if (ddi_copyout(&gpe, efi.dki_data, sizeof (gpe), flag)) 1041 error = EFAULT; 1042 } else { 1043 mutex_exit(&zvol_state_lock); 1044 error = EINVAL; 1045 } 1046 return (error); 1047 1048 case DKIOCFLUSHWRITECACHE: 1049 zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); 1050 break; 1051 1052 case DKIOCGGEOM: 1053 case DKIOCGVTOC: 1054 
/* commands using these (like prtvtoc) expect ENOTSUP */ 1055 error = ENOTSUP; 1056 break; 1057 1058 default: 1059 error = ENOTTY; 1060 break; 1061 1062 } 1063 mutex_exit(&zvol_state_lock); 1064 return (error); 1065 } 1066 1067 int 1068 zvol_busy(void) 1069 { 1070 return (zvol_minors != 0); 1071 } 1072 1073 void 1074 zvol_init(void) 1075 { 1076 VERIFY(ddi_soft_state_init(&zvol_state, sizeof (zvol_state_t), 1) == 0); 1077 mutex_init(&zvol_state_lock, NULL, MUTEX_DEFAULT, NULL); 1078 } 1079 1080 void 1081 zvol_fini(void) 1082 { 1083 mutex_destroy(&zvol_state_lock); 1084 ddi_soft_state_fini(&zvol_state); 1085 } 1086