1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012 by Delphix. All rights reserved. 24 */ 25 26 #include <sys/zfs_context.h> 27 #include <sys/spa_impl.h> 28 #include <sys/refcount.h> 29 #include <sys/vdev_disk.h> 30 #include <sys/vdev_impl.h> 31 #include <sys/fs/zfs.h> 32 #include <sys/zio.h> 33 #include <sys/sunldi.h> 34 #include <sys/efi_partition.h> 35 #include <sys/fm/fs/zfs.h> 36 37 /* 38 * Virtual device vector for disks. 39 */ 40 41 extern ldi_ident_t zfs_li; 42 43 typedef struct vdev_disk_buf { 44 buf_t vdb_buf; 45 zio_t *vdb_io; 46 } vdev_disk_buf_t; 47 48 static void 49 vdev_disk_hold(vdev_t *vd) 50 { 51 ddi_devid_t devid; 52 char *minor; 53 54 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); 55 56 /* 57 * We must have a pathname, and it must be absolute. 58 */ 59 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') 60 return; 61 62 /* 63 * Only prefetch path and devid info if the device has 64 * never been opened. 65 */ 66 if (vd->vdev_tsd != NULL) 67 return; 68 69 if (vd->vdev_wholedisk == -1ULL) { 70 size_t len = strlen(vd->vdev_path) + 3; 71 char *buf = kmem_alloc(len, KM_SLEEP); 72 73 (void) snprintf(buf, len, "%ss0", vd->vdev_path); 74 75 (void) ldi_vp_from_name(buf, &vd->vdev_name_vp); 76 kmem_free(buf, len); 77 } 78 79 if (vd->vdev_name_vp == NULL) 80 (void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp); 81 82 if (vd->vdev_devid != NULL && 83 ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) { 84 (void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp); 85 ddi_devid_str_free(minor); 86 ddi_devid_free(devid); 87 } 88 } 89 90 static void 91 vdev_disk_rele(vdev_t *vd) 92 { 93 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); 94 95 if (vd->vdev_name_vp) { 96 VN_RELE_ASYNC(vd->vdev_name_vp, 97 dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); 98 vd->vdev_name_vp = NULL; 99 } 100 if (vd->vdev_devid_vp) { 101 VN_RELE_ASYNC(vd->vdev_devid_vp, 102 dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); 103 vd->vdev_devid_vp = NULL; 104 } 105 } 106 107 static uint64_t 108 vdev_disk_get_space(vdev_t *vd, uint64_t capacity, uint_t blksz) 109 { 110 ASSERT(vd->vdev_wholedisk); 111 112 vdev_disk_t *dvd = vd->vdev_tsd; 113 dk_efi_t dk_ioc; 114 efi_gpt_t *efi; 115 uint64_t avail_space = 0; 116 int efisize = EFI_LABEL_SIZE * 2; 117 118 dk_ioc.dki_data = kmem_alloc(efisize, KM_SLEEP); 119 dk_ioc.dki_lba = 1; 120 dk_ioc.dki_length = efisize; 121 dk_ioc.dki_data_64 = (uint64_t)(uintptr_t)dk_ioc.dki_data; 122 efi = dk_ioc.dki_data; 123 124 if (ldi_ioctl(dvd->vd_lh, DKIOCGETEFI, (intptr_t)&dk_ioc, 125 FKIOCTL, kcred, NULL) == 0) { 126 uint64_t efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA); 127 128 zfs_dbgmsg("vdev %s, capacity %llu, altern lba %llu", 129 vd->vdev_path, capacity, efi_altern_lba); 130 if (capacity > efi_altern_lba) 131 avail_space = (capacity - efi_altern_lba) * blksz; 132 } 133 kmem_free(dk_ioc.dki_data, efisize); 134 return (avail_space); 135 } 136 137 static int 138 vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, 139 uint64_t *ashift) 140 { 141 spa_t *spa = vd->vdev_spa; 142 vdev_disk_t *dvd; 143 struct dk_minfo_ext dkmext; 144 int error; 145 dev_t dev; 146 int otyp; 147 148 /* 149 * We must have a pathname, and it must be absolute. 150 */ 151 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { 152 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 153 return (EINVAL); 154 } 155 156 /* 157 * Reopen the device if it's not currently open. Otherwise, 158 * just update the physical size of the device. 159 */ 160 if (vd->vdev_tsd != NULL) { 161 ASSERT(vd->vdev_reopening); 162 dvd = vd->vdev_tsd; 163 goto skip_open; 164 } 165 166 dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); 167 168 /* 169 * When opening a disk device, we want to preserve the user's original 170 * intent. We always want to open the device by the path the user gave 171 * us, even if it is one of multiple paths to the save device. But we 172 * also want to be able to survive disks being removed/recabled. 173 * Therefore the sequence of opening devices is: 174 * 175 * 1. Try opening the device by path. For legacy pools without the 176 * 'whole_disk' property, attempt to fix the path by appending 's0'. 177 * 178 * 2. If the devid of the device matches the stored value, return 179 * success. 180 * 181 * 3. Otherwise, the device may have moved. Try opening the device 182 * by the devid instead. 183 */ 184 if (vd->vdev_devid != NULL) { 185 if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, 186 &dvd->vd_minor) != 0) { 187 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 188 return (EINVAL); 189 } 190 } 191 192 error = EINVAL; /* presume failure */ 193 194 if (vd->vdev_path != NULL) { 195 ddi_devid_t devid; 196 197 if (vd->vdev_wholedisk == -1ULL) { 198 size_t len = strlen(vd->vdev_path) + 3; 199 char *buf = kmem_alloc(len, KM_SLEEP); 200 ldi_handle_t lh; 201 202 (void) snprintf(buf, len, "%ss0", vd->vdev_path); 203 204 if (ldi_open_by_name(buf, spa_mode(spa), kcred, 205 &lh, zfs_li) == 0) { 206 spa_strfree(vd->vdev_path); 207 vd->vdev_path = buf; 208 vd->vdev_wholedisk = 1ULL; 209 (void) ldi_close(lh, spa_mode(spa), kcred); 210 } else { 211 kmem_free(buf, len); 212 } 213 } 214 215 error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred, 216 &dvd->vd_lh, zfs_li); 217 218 /* 219 * Compare the devid to the stored value. 220 */ 221 if (error == 0 && vd->vdev_devid != NULL && 222 ldi_get_devid(dvd->vd_lh, &devid) == 0) { 223 if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { 224 error = EINVAL; 225 (void) ldi_close(dvd->vd_lh, spa_mode(spa), 226 kcred); 227 dvd->vd_lh = NULL; 228 } 229 ddi_devid_free(devid); 230 } 231 232 /* 233 * If we succeeded in opening the device, but 'vdev_wholedisk' 234 * is not yet set, then this must be a slice. 235 */ 236 if (error == 0 && vd->vdev_wholedisk == -1ULL) 237 vd->vdev_wholedisk = 0; 238 } 239 240 /* 241 * If we were unable to open by path, or the devid check fails, open by 242 * devid instead. 243 */ 244 if (error != 0 && vd->vdev_devid != NULL) 245 error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, 246 spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); 247 248 /* 249 * If all else fails, then try opening by physical path (if available) 250 * or the logical path (if we failed due to the devid check). While not 251 * as reliable as the devid, this will give us something, and the higher 252 * level vdev validation will prevent us from opening the wrong device. 253 */ 254 if (error) { 255 if (vd->vdev_physpath != NULL && 256 (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) 257 error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa), 258 kcred, &dvd->vd_lh, zfs_li); 259 260 /* 261 * Note that we don't support the legacy auto-wholedisk support 262 * as above. This hasn't been used in a very long time and we 263 * don't need to propagate its oddities to this edge condition. 264 */ 265 if (error && vd->vdev_path != NULL) 266 error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), 267 kcred, &dvd->vd_lh, zfs_li); 268 } 269 270 if (error) { 271 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 272 return (error); 273 } 274 275 /* 276 * Once a device is opened, verify that the physical device path (if 277 * available) is up to date. 278 */ 279 if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && 280 ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { 281 char *physpath, *minorname; 282 283 physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); 284 minorname = NULL; 285 if (ddi_dev_pathname(dev, otyp, physpath) == 0 && 286 ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && 287 (vd->vdev_physpath == NULL || 288 strcmp(vd->vdev_physpath, physpath) != 0)) { 289 if (vd->vdev_physpath) 290 spa_strfree(vd->vdev_physpath); 291 (void) strlcat(physpath, ":", MAXPATHLEN); 292 (void) strlcat(physpath, minorname, MAXPATHLEN); 293 vd->vdev_physpath = spa_strdup(physpath); 294 } 295 if (minorname) 296 kmem_free(minorname, strlen(minorname) + 1); 297 kmem_free(physpath, MAXPATHLEN); 298 } 299 300 skip_open: 301 /* 302 * Determine the actual size of the device. 303 */ 304 if (ldi_get_size(dvd->vd_lh, psize) != 0) { 305 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 306 return (EINVAL); 307 } 308 309 /* 310 * Determine the device's minimum transfer size. 311 * If the ioctl isn't supported, assume DEV_BSIZE. 312 */ 313 if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, (intptr_t)&dkmext, 314 FKIOCTL, kcred, NULL) != 0) 315 dkmext.dki_pbsize = DEV_BSIZE; 316 317 *ashift = highbit(MAX(dkmext.dki_pbsize, SPA_MINBLOCKSIZE)) - 1; 318 319 if (vd->vdev_wholedisk == 1) { 320 uint64_t capacity = dkmext.dki_capacity - 1; 321 uint64_t blksz = dkmext.dki_lbsize; 322 int wce = 1; 323 324 /* 325 * If we own the whole disk, try to enable disk write caching. 326 * We ignore errors because it's OK if we can't do it. 327 */ 328 (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, 329 FKIOCTL, kcred, NULL); 330 331 *max_psize = *psize + vdev_disk_get_space(vd, capacity, blksz); 332 zfs_dbgmsg("capacity change: vdev %s, psize %llu, " 333 "max_psize %llu", vd->vdev_path, *psize, *max_psize); 334 } else { 335 *max_psize = *psize; 336 } 337 338 /* 339 * Clear the nowritecache bit, so that on a vdev_reopen() we will 340 * try again. 341 */ 342 vd->vdev_nowritecache = B_FALSE; 343 344 return (0); 345 } 346 347 static void 348 vdev_disk_close(vdev_t *vd) 349 { 350 vdev_disk_t *dvd = vd->vdev_tsd; 351 352 if (vd->vdev_reopening || dvd == NULL) 353 return; 354 355 if (dvd->vd_minor != NULL) 356 ddi_devid_str_free(dvd->vd_minor); 357 358 if (dvd->vd_devid != NULL) 359 ddi_devid_free(dvd->vd_devid); 360 361 if (dvd->vd_lh != NULL) 362 (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred); 363 364 vd->vdev_delayed_close = B_FALSE; 365 kmem_free(dvd, sizeof (vdev_disk_t)); 366 vd->vdev_tsd = NULL; 367 } 368 369 int 370 vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size, 371 uint64_t offset, int flags) 372 { 373 buf_t *bp; 374 int error = 0; 375 376 if (vd_lh == NULL) 377 return (EINVAL); 378 379 ASSERT(flags & B_READ || flags & B_WRITE); 380 381 bp = getrbuf(KM_SLEEP); 382 bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST; 383 bp->b_bcount = size; 384 bp->b_un.b_addr = (void *)data; 385 bp->b_lblkno = lbtodb(offset); 386 bp->b_bufsize = size; 387 388 error = ldi_strategy(vd_lh, bp); 389 ASSERT(error == 0); 390 if ((error = biowait(bp)) == 0 && bp->b_resid != 0) 391 error = EIO; 392 freerbuf(bp); 393 394 return (error); 395 } 396 397 static void 398 vdev_disk_io_intr(buf_t *bp) 399 { 400 vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp; 401 zio_t *zio = vdb->vdb_io; 402 403 /* 404 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. 405 * Rather than teach the rest of the stack about other error 406 * possibilities (EFAULT, etc), we normalize the error value here. 407 */ 408 zio->io_error = (geterror(bp) != 0 ? EIO : 0); 409 410 if (zio->io_error == 0 && bp->b_resid != 0) 411 zio->io_error = EIO; 412 413 kmem_free(vdb, sizeof (vdev_disk_buf_t)); 414 415 zio_interrupt(zio); 416 } 417 418 static void 419 vdev_disk_ioctl_free(zio_t *zio) 420 { 421 kmem_free(zio->io_vsd, sizeof (struct dk_callback)); 422 } 423 424 static const zio_vsd_ops_t vdev_disk_vsd_ops = { 425 vdev_disk_ioctl_free, 426 zio_vsd_default_cksum_report 427 }; 428 429 static void 430 vdev_disk_ioctl_done(void *zio_arg, int error) 431 { 432 zio_t *zio = zio_arg; 433 434 zio->io_error = error; 435 436 zio_interrupt(zio); 437 } 438 439 static int 440 vdev_disk_io_start(zio_t *zio) 441 { 442 vdev_t *vd = zio->io_vd; 443 vdev_disk_t *dvd = vd->vdev_tsd; 444 vdev_disk_buf_t *vdb; 445 struct dk_callback *dkc; 446 buf_t *bp; 447 int error; 448 449 if (zio->io_type == ZIO_TYPE_IOCTL) { 450 /* XXPOLICY */ 451 if (!vdev_readable(vd)) { 452 zio->io_error = ENXIO; 453 return (ZIO_PIPELINE_CONTINUE); 454 } 455 456 switch (zio->io_cmd) { 457 458 case DKIOCFLUSHWRITECACHE: 459 460 if (zfs_nocacheflush) 461 break; 462 463 if (vd->vdev_nowritecache) { 464 zio->io_error = ENOTSUP; 465 break; 466 } 467 468 zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP); 469 zio->io_vsd_ops = &vdev_disk_vsd_ops; 470 471 dkc->dkc_callback = vdev_disk_ioctl_done; 472 dkc->dkc_flag = FLUSH_VOLATILE; 473 dkc->dkc_cookie = zio; 474 475 error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, 476 (uintptr_t)dkc, FKIOCTL, kcred, NULL); 477 478 if (error == 0) { 479 /* 480 * The ioctl will be done asychronously, 481 * and will call vdev_disk_ioctl_done() 482 * upon completion. 483 */ 484 return (ZIO_PIPELINE_STOP); 485 } 486 487 if (error == ENOTSUP || error == ENOTTY) { 488 /* 489 * If we get ENOTSUP or ENOTTY, we know that 490 * no future attempts will ever succeed. 491 * In this case we set a persistent bit so 492 * that we don't bother with the ioctl in the 493 * future. 494 */ 495 vd->vdev_nowritecache = B_TRUE; 496 } 497 zio->io_error = error; 498 499 break; 500 501 default: 502 zio->io_error = ENOTSUP; 503 } 504 505 return (ZIO_PIPELINE_CONTINUE); 506 } 507 508 vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP); 509 510 vdb->vdb_io = zio; 511 bp = &vdb->vdb_buf; 512 513 bioinit(bp); 514 bp->b_flags = B_BUSY | B_NOCACHE | 515 (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); 516 if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) 517 bp->b_flags |= B_FAILFAST; 518 bp->b_bcount = zio->io_size; 519 bp->b_un.b_addr = zio->io_data; 520 bp->b_lblkno = lbtodb(zio->io_offset); 521 bp->b_bufsize = zio->io_size; 522 bp->b_iodone = (int (*)())vdev_disk_io_intr; 523 524 /* ldi_strategy() will return non-zero only on programming errors */ 525 VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0); 526 527 return (ZIO_PIPELINE_STOP); 528 } 529 530 static void 531 vdev_disk_io_done(zio_t *zio) 532 { 533 vdev_t *vd = zio->io_vd; 534 535 /* 536 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if 537 * the device has been removed. If this is the case, then we trigger an 538 * asynchronous removal of the device. Otherwise, probe the device and 539 * make sure it's still accessible. 540 */ 541 if (zio->io_error == EIO && !vd->vdev_remove_wanted) { 542 vdev_disk_t *dvd = vd->vdev_tsd; 543 int state = DKIO_NONE; 544 545 if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, 546 FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { 547 /* 548 * We post the resource as soon as possible, instead of 549 * when the async removal actually happens, because the 550 * DE is using this information to discard previous I/O 551 * errors. 552 */ 553 zfs_post_remove(zio->io_spa, vd); 554 vd->vdev_remove_wanted = B_TRUE; 555 spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); 556 } else if (!vd->vdev_delayed_close) { 557 vd->vdev_delayed_close = B_TRUE; 558 } 559 } 560 } 561 562 vdev_ops_t vdev_disk_ops = { 563 vdev_disk_open, 564 vdev_disk_close, 565 vdev_default_asize, 566 vdev_disk_io_start, 567 vdev_disk_io_done, 568 NULL, 569 vdev_disk_hold, 570 vdev_disk_rele, 571 VDEV_TYPE_DISK, /* name of this vdev type */ 572 B_TRUE /* leaf vdev */ 573 }; 574 575 /* 576 * Given the root disk device devid or pathname, read the label from 577 * the device, and construct a configuration nvlist. 578 */ 579 int 580 vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) 581 { 582 ldi_handle_t vd_lh; 583 vdev_label_t *label; 584 uint64_t s, size; 585 int l; 586 ddi_devid_t tmpdevid; 587 int error = -1; 588 char *minor_name; 589 590 /* 591 * Read the device label and build the nvlist. 592 */ 593 if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, 594 &minor_name) == 0) { 595 error = ldi_open_by_devid(tmpdevid, minor_name, 596 FREAD, kcred, &vd_lh, zfs_li); 597 ddi_devid_free(tmpdevid); 598 ddi_devid_str_free(minor_name); 599 } 600 601 if (error && (error = ldi_open_by_name(devpath, FREAD, kcred, &vd_lh, 602 zfs_li))) 603 return (error); 604 605 if (ldi_get_size(vd_lh, &s)) { 606 (void) ldi_close(vd_lh, FREAD, kcred); 607 return (EIO); 608 } 609 610 size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); 611 label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); 612 613 *config = NULL; 614 for (l = 0; l < VDEV_LABELS; l++) { 615 uint64_t offset, state, txg = 0; 616 617 /* read vdev label */ 618 offset = vdev_label_offset(size, l, 0); 619 if (vdev_disk_physio(vd_lh, (caddr_t)label, 620 VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) 621 continue; 622 623 if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, 624 sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) { 625 *config = NULL; 626 continue; 627 } 628 629 if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, 630 &state) != 0 || state >= POOL_STATE_DESTROYED) { 631 nvlist_free(*config); 632 *config = NULL; 633 continue; 634 } 635 636 if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, 637 &txg) != 0 || txg == 0) { 638 nvlist_free(*config); 639 *config = NULL; 640 continue; 641 } 642 643 break; 644 } 645 646 kmem_free(label, sizeof (vdev_label_t)); 647 (void) ldi_close(vd_lh, FREAD, kcred); 648 if (*config == NULL) 649 error = EIDRM; 650 651 return (error); 652 } 653