1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 24 * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 25 * Copyright 2020 Joyent, Inc. 26 * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org> 27 * Copyright 2022 Tintri by DDN, Inc. All rights reserved. 28 */ 29 30 #include <sys/zfs_context.h> 31 #include <sys/spa_impl.h> 32 #include <sys/refcount.h> 33 #include <sys/vdev_impl.h> 34 #include <sys/vdev_trim.h> 35 #include <sys/abd.h> 36 #include <sys/fs/zfs.h> 37 #include <sys/zio.h> 38 #include <sys/sunldi.h> 39 #include <sys/efi_partition.h> 40 #include <sys/fm/fs/zfs.h> 41 #include <sys/ddi.h> 42 43 /* 44 * Tunable to disable TRIM in case we're using a problematic SSD. 45 */ 46 uint_t zfs_no_trim = 0; 47 48 /* 49 * Tunable parameter for debugging or performance analysis. Setting this 50 * will cause pool corruption on power loss if a volatile out-of-order 51 * write cache is enabled. 52 */ 53 boolean_t zfs_nocacheflush = B_FALSE; 54 55 /* 56 * Virtual device vector for disks. 57 */ 58 59 extern ldi_ident_t zfs_li; 60 61 static void vdev_disk_close(vdev_t *); 62 63 typedef struct vdev_disk { 64 ddi_devid_t vd_devid; 65 char *vd_minor; 66 ldi_handle_t vd_lh; 67 list_t vd_ldi_cbs; 68 boolean_t vd_ldi_offline; 69 } vdev_disk_t; 70 71 typedef struct vdev_disk_buf { 72 buf_t vdb_buf; 73 zio_t *vdb_io; 74 } vdev_disk_buf_t; 75 76 typedef struct vdev_disk_ldi_cb { 77 list_node_t lcb_next; 78 ldi_callback_id_t lcb_id; 79 } vdev_disk_ldi_cb_t; 80 81 /* 82 * Bypass the devid when opening a disk vdev. 83 * There have been issues where the devids of several devices were shuffled, 84 * causing pool open failures. Note, that this flag is intended to be used 85 * for pool recovery only. 86 * 87 * Note that if a pool is imported with the devids bypassed, all its vdevs will 88 * cease storing devid information permanently. In practice, the devid is rarely 89 * useful as vdev paths do not tend to change unless the hardware is 90 * reconfigured. That said, if the paths do change and a pool fails to open 91 * automatically at boot, a simple zpool import should re-scan the paths and fix 92 * the issue. 93 */ 94 boolean_t vdev_disk_bypass_devid = B_FALSE; 95 96 static void 97 vdev_disk_alloc(vdev_t *vd) 98 { 99 vdev_disk_t *dvd; 100 101 dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); 102 /* 103 * Create the LDI event callback list. 104 */ 105 list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t), 106 offsetof(vdev_disk_ldi_cb_t, lcb_next)); 107 } 108 109 static void 110 vdev_disk_free(vdev_t *vd) 111 { 112 vdev_disk_t *dvd = vd->vdev_tsd; 113 vdev_disk_ldi_cb_t *lcb; 114 115 if (dvd == NULL) 116 return; 117 118 /* 119 * We have already closed the LDI handle. Clean up the LDI event 120 * callbacks and free vd->vdev_tsd. 121 */ 122 while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) { 123 list_remove(&dvd->vd_ldi_cbs, lcb); 124 (void) ldi_ev_remove_callbacks(lcb->lcb_id); 125 kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t)); 126 } 127 list_destroy(&dvd->vd_ldi_cbs); 128 kmem_free(dvd, sizeof (vdev_disk_t)); 129 vd->vdev_tsd = NULL; 130 } 131 132 static int 133 vdev_disk_off_notify(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie, 134 void *arg, void *ev_data __unused) 135 { 136 vdev_t *vd = (vdev_t *)arg; 137 vdev_disk_t *dvd = vd->vdev_tsd; 138 139 /* 140 * Ignore events other than offline. 141 */ 142 if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) 143 return (LDI_EV_SUCCESS); 144 145 /* 146 * Tell any new threads that stumble upon this vdev that they should not 147 * try to do I/O. 148 */ 149 dvd->vd_ldi_offline = B_TRUE; 150 151 /* 152 * Request that the spa_async_thread mark the device as REMOVED and 153 * notify FMA of the removal. This should also trigger a vdev_close() 154 * in the async thread. 155 */ 156 zfs_post_remove(vd->vdev_spa, vd); 157 vd->vdev_remove_wanted = B_TRUE; 158 spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); 159 160 return (LDI_EV_SUCCESS); 161 } 162 163 static void 164 vdev_disk_off_finalize(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie, 165 int ldi_result, void *arg, void *ev_data __unused) 166 { 167 vdev_t *vd = (vdev_t *)arg; 168 169 /* 170 * Ignore events other than offline. 171 */ 172 if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0) 173 return; 174 175 /* 176 * Request that the vdev be reopened if the offline state change was 177 * unsuccessful. 178 */ 179 if (ldi_result != LDI_EV_SUCCESS) { 180 vd->vdev_probe_wanted = B_TRUE; 181 spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE); 182 } 183 } 184 185 static ldi_ev_callback_t vdev_disk_off_callb = { 186 .cb_vers = LDI_EV_CB_VERS, 187 .cb_notify = vdev_disk_off_notify, 188 .cb_finalize = vdev_disk_off_finalize 189 }; 190 191 static void 192 vdev_disk_dgrd_finalize(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie, 193 int ldi_result, void *arg, void *ev_data __unused) 194 { 195 vdev_t *vd = (vdev_t *)arg; 196 197 /* 198 * Ignore events other than degrade. 199 */ 200 if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0) 201 return; 202 203 /* 204 * Degrade events always succeed. Mark the vdev as degraded. 205 * This status is purely informative for the user. 206 */ 207 (void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0); 208 } 209 210 static ldi_ev_callback_t vdev_disk_dgrd_callb = { 211 .cb_vers = LDI_EV_CB_VERS, 212 .cb_notify = NULL, 213 .cb_finalize = vdev_disk_dgrd_finalize 214 }; 215 216 static void 217 vdev_disk_hold(vdev_t *vd) 218 { 219 ddi_devid_t devid; 220 char *minor; 221 222 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); 223 224 /* 225 * We must have a pathname, and it must be absolute. 226 */ 227 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') 228 return; 229 230 /* 231 * Only prefetch path and devid info if the device has 232 * never been opened. 233 */ 234 if (vd->vdev_tsd != NULL) 235 return; 236 237 if (vd->vdev_wholedisk == -1ULL) { 238 size_t len = strlen(vd->vdev_path) + 3; 239 char *buf = kmem_alloc(len, KM_SLEEP); 240 241 (void) snprintf(buf, len, "%ss0", vd->vdev_path); 242 243 (void) ldi_vp_from_name(buf, &vd->vdev_name_vp); 244 kmem_free(buf, len); 245 } 246 247 if (vd->vdev_name_vp == NULL) 248 (void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp); 249 250 if (vd->vdev_devid != NULL && 251 ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) { 252 (void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp); 253 ddi_devid_str_free(minor); 254 ddi_devid_free(devid); 255 } 256 } 257 258 static void 259 vdev_disk_rele(vdev_t *vd) 260 { 261 ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); 262 263 if (vd->vdev_name_vp) { 264 VN_RELE_ASYNC(vd->vdev_name_vp, 265 dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); 266 vd->vdev_name_vp = NULL; 267 } 268 if (vd->vdev_devid_vp) { 269 VN_RELE_ASYNC(vd->vdev_devid_vp, 270 dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool)); 271 vd->vdev_devid_vp = NULL; 272 } 273 } 274 275 /* 276 * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when 277 * even a fallback to DKIOCGMEDIAINFO fails. 278 */ 279 #ifdef DEBUG 280 #define VDEV_DEBUG(...) cmn_err(CE_NOTE, __VA_ARGS__) 281 #else 282 #define VDEV_DEBUG(...) /* Nothing... */ 283 #endif 284 285 static int 286 vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, 287 uint64_t *ashift) 288 { 289 spa_t *spa = vd->vdev_spa; 290 vdev_disk_t *dvd = vd->vdev_tsd; 291 ldi_ev_cookie_t ecookie; 292 vdev_disk_ldi_cb_t *lcb; 293 union { 294 struct dk_minfo_ext ude; 295 struct dk_minfo ud; 296 } dks; 297 struct dk_minfo_ext *dkmext = &dks.ude; 298 struct dk_minfo *dkm = &dks.ud; 299 int error, can_free; 300 dev_t dev; 301 int otyp; 302 boolean_t validate_devid = B_FALSE; 303 uint64_t capacity = 0, blksz = 0, pbsize; 304 const char *rdpath = vdev_disk_preroot_force_path(); 305 306 /* 307 * We must have a pathname, and it must be absolute. 308 */ 309 if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { 310 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; 311 return (SET_ERROR(EINVAL)); 312 } 313 314 /* 315 * Reopen the device if it's not currently open. Otherwise, 316 * just update the physical size of the device. 317 */ 318 if (dvd != NULL) { 319 ASSERT(vd->vdev_reopening); 320 goto skip_open; 321 } 322 323 /* 324 * Create vd->vdev_tsd. 325 */ 326 vdev_disk_alloc(vd); 327 dvd = vd->vdev_tsd; 328 329 /* 330 * Allow bypassing the devid. 331 */ 332 if (vd->vdev_devid != NULL && 333 (vdev_disk_bypass_devid || rdpath != NULL)) { 334 vdev_dbgmsg(vd, "vdev_disk_open, devid %s bypassed", 335 vd->vdev_devid); 336 spa_strfree(vd->vdev_devid); 337 vd->vdev_devid = NULL; 338 } 339 340 /* 341 * When opening a disk device, we want to preserve the user's original 342 * intent. We always want to open the device by the path the user gave 343 * us, even if it is one of multiple paths to the same device. But we 344 * also want to be able to survive disks being removed/recabled. 345 * Therefore the sequence of opening devices is: 346 * 347 * 1. Try opening the device by path. For legacy pools without the 348 * 'whole_disk' property, attempt to fix the path by appending 's0'. 349 * 350 * 2. If the devid of the device matches the stored value, return 351 * success. 352 * 353 * 3. Otherwise, the device may have moved. Try opening the device 354 * by the devid instead. 355 */ 356 if (vd->vdev_devid != NULL) { 357 if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, 358 &dvd->vd_minor) != 0) { 359 vdev_dbgmsg(vd, 360 "vdev_disk_open, invalid devid %s bypassed", 361 vd->vdev_devid); 362 spa_strfree(vd->vdev_devid); 363 vd->vdev_devid = NULL; 364 } 365 } 366 367 error = EINVAL; /* presume failure */ 368 369 if (rdpath != NULL) { 370 /* 371 * We have been asked to open only a specific root device, and 372 * to fail otherwise. 373 */ 374 error = ldi_open_by_name((char *)rdpath, spa_mode(spa), kcred, 375 &dvd->vd_lh, zfs_li); 376 validate_devid = B_TRUE; 377 goto rootdisk_only; 378 } 379 380 if (vd->vdev_path != NULL) { 381 if (vd->vdev_wholedisk == -1ULL) { 382 size_t len = strlen(vd->vdev_path) + 3; 383 char *buf = kmem_alloc(len, KM_SLEEP); 384 385 (void) snprintf(buf, len, "%ss0", vd->vdev_path); 386 387 error = ldi_open_by_name(buf, spa_mode(spa), kcred, 388 &dvd->vd_lh, zfs_li); 389 if (error == 0) { 390 spa_strfree(vd->vdev_path); 391 vd->vdev_path = buf; 392 vd->vdev_wholedisk = 1ULL; 393 } else { 394 kmem_free(buf, len); 395 } 396 } 397 398 /* 399 * If we have not yet opened the device, try to open it by the 400 * specified path. 401 */ 402 if (error != 0) { 403 error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), 404 kcred, &dvd->vd_lh, zfs_li); 405 } 406 407 /* 408 * Compare the devid to the stored value. 409 */ 410 if (error == 0 && vd->vdev_devid != NULL) { 411 ddi_devid_t devid = NULL; 412 413 if (ldi_get_devid(dvd->vd_lh, &devid) != 0) { 414 /* 415 * We expected a devid on this device but it no 416 * longer appears to have one. The validation 417 * step may need to remove it from the 418 * configuration. 419 */ 420 validate_devid = B_TRUE; 421 422 } else if (ddi_devid_compare(devid, dvd->vd_devid) != 423 0) { 424 /* 425 * A mismatch here is unexpected, log it. 426 */ 427 char *devid_str = ddi_devid_str_encode(devid, 428 dvd->vd_minor); 429 vdev_dbgmsg(vd, "vdev_disk_open: devid " 430 "mismatch: %s != %s", vd->vdev_devid, 431 devid_str); 432 cmn_err(CE_NOTE, "vdev_disk_open %s: devid " 433 "mismatch: %s != %s", vd->vdev_path, 434 vd->vdev_devid, devid_str); 435 ddi_devid_str_free(devid_str); 436 437 error = SET_ERROR(EINVAL); 438 (void) ldi_close(dvd->vd_lh, spa_mode(spa), 439 kcred); 440 dvd->vd_lh = NULL; 441 } 442 443 if (devid != NULL) { 444 ddi_devid_free(devid); 445 } 446 } 447 448 /* 449 * If we succeeded in opening the device, but 'vdev_wholedisk' 450 * is not yet set, then this must be a slice. 451 */ 452 if (error == 0 && vd->vdev_wholedisk == -1ULL) 453 vd->vdev_wholedisk = 0; 454 } 455 456 /* 457 * If we were unable to open by path, or the devid check fails, open by 458 * devid instead. 459 */ 460 if (error != 0 && vd->vdev_devid != NULL) { 461 error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, 462 spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); 463 if (error != 0) { 464 vdev_dbgmsg(vd, "Failed to open by devid (%s)", 465 vd->vdev_devid); 466 } 467 } 468 469 /* 470 * If all else fails, then try opening by physical path (if available) 471 * or the logical path (if we failed due to the devid check). While not 472 * as reliable as the devid, this will give us something, and the higher 473 * level vdev validation will prevent us from opening the wrong device. 474 */ 475 if (error != 0) { 476 validate_devid = B_TRUE; 477 478 if (vd->vdev_physpath != NULL && 479 (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) { 480 error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa), 481 kcred, &dvd->vd_lh, zfs_li); 482 } 483 484 /* 485 * Note that we don't support the legacy auto-wholedisk support 486 * as above. This hasn't been used in a very long time and we 487 * don't need to propagate its oddities to this edge condition. 488 */ 489 if (error != 0 && vd->vdev_path != NULL) { 490 error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), 491 kcred, &dvd->vd_lh, zfs_li); 492 } 493 } 494 495 /* 496 * If this is early in boot, a sweep of available block devices may 497 * locate an alternative path that we can try. 498 */ 499 if (error != 0) { 500 const char *altdevpath = vdev_disk_preroot_lookup( 501 spa_guid(spa), vd->vdev_guid); 502 503 if (altdevpath != NULL) { 504 vdev_dbgmsg(vd, "Trying alternate preroot path (%s)", 505 altdevpath); 506 507 validate_devid = B_TRUE; 508 509 if ((error = ldi_open_by_name((char *)altdevpath, 510 spa_mode(spa), kcred, &dvd->vd_lh, zfs_li)) != 0) { 511 vdev_dbgmsg(vd, "Failed to open by preroot " 512 "path (%s)", altdevpath); 513 } 514 } 515 } 516 517 rootdisk_only: 518 if (error != 0) { 519 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 520 vdev_dbgmsg(vd, "vdev_disk_open: failed to open [error=%d]", 521 error); 522 return (error); 523 } 524 525 /* 526 * Now that the device has been successfully opened, update the devid 527 * if necessary. 528 */ 529 if (validate_devid) { 530 ddi_devid_t devid = NULL; 531 char *minorname = NULL; 532 char *vd_devid = NULL; 533 boolean_t remove = B_FALSE, update = B_FALSE; 534 535 /* 536 * Get the current devid and minor name for the device we 537 * opened. 538 */ 539 if (ldi_get_devid(dvd->vd_lh, &devid) != 0 || 540 ldi_get_minor_name(dvd->vd_lh, &minorname) != 0) { 541 /* 542 * If we are unable to get the devid or the minor name 543 * for the device, we need to remove them from the 544 * configuration to prevent potential inconsistencies. 545 */ 546 if (dvd->vd_minor != NULL || dvd->vd_devid != NULL || 547 vd->vdev_devid != NULL) { 548 /* 549 * We only need to remove the devid if one 550 * exists. 551 */ 552 remove = B_TRUE; 553 } 554 555 } else if (dvd->vd_devid == NULL || dvd->vd_minor == NULL) { 556 /* 557 * There was previously no devid at all so we need to 558 * add one. 559 */ 560 update = B_TRUE; 561 562 } else if (ddi_devid_compare(devid, dvd->vd_devid) != 0 || 563 strcmp(minorname, dvd->vd_minor) != 0) { 564 /* 565 * The devid or minor name on file does not match the 566 * one from the opened device. 567 */ 568 update = B_TRUE; 569 } 570 571 if (update) { 572 /* 573 * Render the new devid and minor name as a string for 574 * logging and to store in the vdev configuration. 575 */ 576 vd_devid = ddi_devid_str_encode(devid, minorname); 577 } 578 579 if (update || remove) { 580 vdev_dbgmsg(vd, "vdev_disk_open: update devid from " 581 "'%s' to '%s'", 582 vd->vdev_devid != NULL ? vd->vdev_devid : "<none>", 583 vd_devid != NULL ? vd_devid : "<none>"); 584 cmn_err(CE_NOTE, "vdev_disk_open %s: update devid " 585 "from '%s' to '%s'", 586 vd->vdev_path != NULL ? vd->vdev_path : "?", 587 vd->vdev_devid != NULL ? vd->vdev_devid : "<none>", 588 vd_devid != NULL ? vd_devid : "<none>"); 589 590 /* 591 * Remove and free any existing values. 592 */ 593 if (dvd->vd_minor != NULL) { 594 ddi_devid_str_free(dvd->vd_minor); 595 dvd->vd_minor = NULL; 596 } 597 if (dvd->vd_devid != NULL) { 598 ddi_devid_free(dvd->vd_devid); 599 dvd->vd_devid = NULL; 600 } 601 if (vd->vdev_devid != NULL) { 602 spa_strfree(vd->vdev_devid); 603 vd->vdev_devid = NULL; 604 } 605 } 606 607 if (update) { 608 /* 609 * Install the new values. 610 */ 611 vd->vdev_devid = vd_devid; 612 dvd->vd_minor = minorname; 613 dvd->vd_devid = devid; 614 615 } else { 616 if (devid != NULL) { 617 ddi_devid_free(devid); 618 } 619 if (minorname != NULL) { 620 kmem_free(minorname, strlen(minorname) + 1); 621 } 622 } 623 } 624 625 /* 626 * Once a device is opened, verify that the physical device path (if 627 * available) is up to date. 628 */ 629 if (ldi_get_dev(dvd->vd_lh, &dev) == 0 && 630 ldi_get_otyp(dvd->vd_lh, &otyp) == 0) { 631 char *physpath, *minorname; 632 633 physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP); 634 minorname = NULL; 635 if (ddi_dev_pathname(dev, otyp, physpath) == 0 && 636 ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 && 637 (vd->vdev_physpath == NULL || 638 strcmp(vd->vdev_physpath, physpath) != 0)) { 639 if (vd->vdev_physpath) 640 spa_strfree(vd->vdev_physpath); 641 (void) strlcat(physpath, ":", MAXPATHLEN); 642 (void) strlcat(physpath, minorname, MAXPATHLEN); 643 vd->vdev_physpath = spa_strdup(physpath); 644 } 645 if (minorname) 646 kmem_free(minorname, strlen(minorname) + 1); 647 kmem_free(physpath, MAXPATHLEN); 648 } 649 650 /* 651 * Register callbacks for the LDI offline event. 652 */ 653 if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) == 654 LDI_EV_SUCCESS) { 655 lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); 656 list_insert_tail(&dvd->vd_ldi_cbs, lcb); 657 (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, 658 &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id); 659 } 660 661 /* 662 * Register callbacks for the LDI degrade event. 663 */ 664 if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) == 665 LDI_EV_SUCCESS) { 666 lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP); 667 list_insert_tail(&dvd->vd_ldi_cbs, lcb); 668 (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie, 669 &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id); 670 } 671 672 skip_open: 673 /* 674 * Determine the actual size of the device. 675 */ 676 if (ldi_get_size(dvd->vd_lh, psize) != 0) { 677 vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; 678 vdev_dbgmsg(vd, "vdev_disk_open: failed to get size"); 679 return (SET_ERROR(EINVAL)); 680 } 681 682 *max_psize = *psize; 683 684 /* 685 * Determine the device's minimum transfer size. 686 * If the ioctl isn't supported, assume DEV_BSIZE. 687 */ 688 if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, 689 (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) { 690 capacity = dkmext->dki_capacity - 1; 691 blksz = dkmext->dki_lbsize; 692 pbsize = dkmext->dki_pbsize; 693 } else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO, 694 (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) { 695 VDEV_DEBUG( 696 "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n", 697 vd->vdev_path); 698 capacity = dkm->dki_capacity - 1; 699 blksz = dkm->dki_lbsize; 700 pbsize = blksz; 701 } else { 702 VDEV_DEBUG("vdev_disk_open(\"%s\"): " 703 "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n", 704 vd->vdev_path, error); 705 pbsize = DEV_BSIZE; 706 } 707 708 *ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1; 709 710 if (vd->vdev_wholedisk == 1) { 711 int wce = 1; 712 713 if (error == 0) { 714 /* 715 * If we have the capability to expand, we'd have 716 * found out via success from DKIOCGMEDIAINFO{,EXT}. 717 * Adjust max_psize upward accordingly since we know 718 * we own the whole disk now. 719 */ 720 *max_psize = capacity * blksz; 721 } 722 723 /* 724 * Since we own the whole disk, try to enable disk write 725 * caching. We ignore errors because it's OK if we can't do it. 726 */ 727 (void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce, 728 FKIOCTL, kcred, NULL); 729 } 730 731 /* 732 * Clear the nowritecache bit, so that on a vdev_reopen() we will 733 * try again. 734 */ 735 vd->vdev_nowritecache = B_FALSE; 736 737 if (ldi_ioctl(dvd->vd_lh, DKIOC_CANFREE, (intptr_t)&can_free, FKIOCTL, 738 kcred, NULL) == 0 && can_free == 1) { 739 vd->vdev_has_trim = B_TRUE; 740 } else { 741 vd->vdev_has_trim = B_FALSE; 742 } 743 744 if (zfs_no_trim == 1) 745 vd->vdev_has_trim = B_FALSE; 746 747 /* Currently only supported for ZoL. */ 748 vd->vdev_has_securetrim = B_FALSE; 749 750 /* Inform the ZIO pipeline that we are non-rotational */ 751 vd->vdev_nonrot = B_FALSE; 752 if (ldi_prop_exists(dvd->vd_lh, DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, 753 "device-solid-state")) { 754 if (ldi_prop_get_int(dvd->vd_lh, 755 LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, 756 "device-solid-state", B_FALSE) != 0) 757 vd->vdev_nonrot = B_TRUE; 758 } 759 760 return (0); 761 } 762 763 static void 764 vdev_disk_close(vdev_t *vd) 765 { 766 vdev_disk_t *dvd = vd->vdev_tsd; 767 768 if (vd->vdev_reopening || dvd == NULL) 769 return; 770 771 if (dvd->vd_minor != NULL) { 772 ddi_devid_str_free(dvd->vd_minor); 773 dvd->vd_minor = NULL; 774 } 775 776 if (dvd->vd_devid != NULL) { 777 ddi_devid_free(dvd->vd_devid); 778 dvd->vd_devid = NULL; 779 } 780 781 if (dvd->vd_lh != NULL) { 782 (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred); 783 dvd->vd_lh = NULL; 784 } 785 786 vd->vdev_delayed_close = B_FALSE; 787 vdev_disk_free(vd); 788 } 789 790 static int 791 vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data, 792 size_t size, uint64_t offset, int flags) 793 { 794 buf_t *bp; 795 int error = 0; 796 797 if (vd_lh == NULL) 798 return (SET_ERROR(EINVAL)); 799 800 ASSERT(flags & B_READ || flags & B_WRITE); 801 802 bp = getrbuf(KM_SLEEP); 803 bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST; 804 bp->b_bcount = size; 805 bp->b_un.b_addr = (void *)data; 806 bp->b_lblkno = lbtodb(offset); 807 bp->b_bufsize = size; 808 809 error = ldi_strategy(vd_lh, bp); 810 ASSERT(error == 0); 811 if ((error = biowait(bp)) == 0 && bp->b_resid != 0) 812 error = SET_ERROR(EIO); 813 freerbuf(bp); 814 815 return (error); 816 } 817 818 static int 819 vdev_disk_dumpio(vdev_t *vd, caddr_t data, size_t size, 820 uint64_t offset, uint64_t origoffset __unused, boolean_t doread, 821 boolean_t isdump) 822 { 823 vdev_disk_t *dvd = vd->vdev_tsd; 824 int flags = doread ? B_READ : B_WRITE; 825 826 /* 827 * If the vdev is closed, it's likely in the REMOVED or FAULTED state. 828 * Nothing to be done here but return failure. 829 */ 830 if (dvd == NULL || dvd->vd_ldi_offline) { 831 return (SET_ERROR(ENXIO)); 832 } 833 834 ASSERT(vd->vdev_ops == &vdev_disk_ops); 835 836 offset += VDEV_LABEL_START_SIZE; 837 838 /* 839 * If in the context of an active crash dump, use the ldi_dump(9F) 840 * call instead of ldi_strategy(9F) as usual. 841 */ 842 if (isdump) { 843 ASSERT3P(dvd, !=, NULL); 844 return (ldi_dump(dvd->vd_lh, data, lbtodb(offset), 845 lbtodb(size))); 846 } 847 848 return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags)); 849 } 850 851 static int 852 vdev_disk_io_intr(buf_t *bp) 853 { 854 vdev_buf_t *vb = (vdev_buf_t *)bp; 855 zio_t *zio = vb->vb_io; 856 857 /* 858 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO. 859 * Rather than teach the rest of the stack about other error 860 * possibilities (EFAULT, etc), we normalize the error value here. 861 */ 862 zio->io_error = (geterror(bp) != 0 ? EIO : 0); 863 864 if (zio->io_error == 0 && bp->b_resid != 0) 865 zio->io_error = SET_ERROR(EIO); 866 867 if (zio->io_type == ZIO_TYPE_READ) { 868 abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size); 869 } else { 870 abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size); 871 } 872 873 kmem_free(vb, sizeof (vdev_buf_t)); 874 875 zio_delay_interrupt(zio); 876 return (0); 877 } 878 879 static void 880 vdev_disk_ioctl_free(zio_t *zio) 881 { 882 kmem_free(zio->io_vsd, sizeof (struct dk_callback)); 883 } 884 885 static const zio_vsd_ops_t vdev_disk_vsd_ops = { 886 vdev_disk_ioctl_free, 887 zio_vsd_default_cksum_report 888 }; 889 890 static void 891 vdev_disk_ioctl_done(void *zio_arg, int error) 892 { 893 zio_t *zio = zio_arg; 894 895 zio->io_error = error; 896 897 zio_interrupt(zio); 898 } 899 900 static void 901 vdev_disk_io_start(zio_t *zio) 902 { 903 vdev_t *vd = zio->io_vd; 904 vdev_disk_t *dvd = vd->vdev_tsd; 905 unsigned long trim_flags = 0; 906 vdev_buf_t *vb; 907 struct dk_callback *dkc; 908 buf_t *bp; 909 int error; 910 911 /* 912 * If the vdev is closed, it's likely in the REMOVED or FAULTED state. 913 * Nothing to be done here but return failure. 914 */ 915 if (dvd == NULL || dvd->vd_ldi_offline) { 916 zio->io_error = ENXIO; 917 zio_interrupt(zio); 918 return; 919 } 920 921 switch (zio->io_type) { 922 case ZIO_TYPE_IOCTL: 923 /* XXPOLICY */ 924 if (!vdev_readable(vd)) { 925 zio->io_error = SET_ERROR(ENXIO); 926 zio_interrupt(zio); 927 return; 928 } 929 930 switch (zio->io_cmd) { 931 932 case DKIOCFLUSHWRITECACHE: 933 934 if (zfs_nocacheflush) 935 break; 936 937 if (vd->vdev_nowritecache) { 938 zio->io_error = SET_ERROR(ENOTSUP); 939 break; 940 } 941 942 zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP); 943 zio->io_vsd_ops = &vdev_disk_vsd_ops; 944 945 dkc->dkc_callback = vdev_disk_ioctl_done; 946 dkc->dkc_flag = FLUSH_VOLATILE; 947 dkc->dkc_cookie = zio; 948 949 error = ldi_ioctl(dvd->vd_lh, zio->io_cmd, 950 (uintptr_t)dkc, FKIOCTL, kcred, NULL); 951 952 if (error == 0) { 953 /* 954 * The ioctl will be done asychronously, 955 * and will call vdev_disk_ioctl_done() 956 * upon completion. 957 */ 958 return; 959 } 960 961 zio->io_error = error; 962 963 break; 964 965 default: 966 zio->io_error = SET_ERROR(ENOTSUP); 967 } 968 969 zio_execute(zio); 970 return; 971 972 case ZIO_TYPE_TRIM: 973 if (zfs_no_trim == 1 || !vd->vdev_has_trim) { 974 zio->io_error = SET_ERROR(ENOTSUP); 975 zio_execute(zio); 976 return; 977 } 978 /* Currently only supported on ZoL. */ 979 ASSERT0(zio->io_trim_flags & ZIO_TRIM_SECURE); 980 981 /* dkioc_free_list_t is already declared to hold one entry */ 982 dkioc_free_list_t dfl; 983 dfl.dfl_flags = 0; 984 dfl.dfl_num_exts = 1; 985 dfl.dfl_offset = 0; 986 dfl.dfl_exts[0].dfle_start = zio->io_offset; 987 dfl.dfl_exts[0].dfle_length = zio->io_size; 988 989 zio->io_error = ldi_ioctl(dvd->vd_lh, DKIOCFREE, 990 (uintptr_t)&dfl, FKIOCTL, kcred, NULL); 991 992 if (zio->io_error == ENOTSUP || zio->io_error == ENOTTY) { 993 /* 994 * The device must have changed and now TRIM is 995 * no longer supported. 996 */ 997 vd->vdev_has_trim = B_FALSE; 998 } 999 1000 zio_interrupt(zio); 1001 return; 1002 } 1003 1004 ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); 1005 zio->io_target_timestamp = zio_handle_io_delay(zio); 1006 1007 vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); 1008 1009 vb->vb_io = zio; 1010 bp = &vb->vb_buf; 1011 1012 bioinit(bp); 1013 bp->b_flags = B_BUSY | B_NOCACHE | 1014 (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); 1015 if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) 1016 bp->b_flags |= B_FAILFAST; 1017 bp->b_bcount = zio->io_size; 1018 1019 if (zio->io_type == ZIO_TYPE_READ) { 1020 bp->b_un.b_addr = 1021 abd_borrow_buf(zio->io_abd, zio->io_size); 1022 } else { 1023 bp->b_un.b_addr = 1024 abd_borrow_buf_copy(zio->io_abd, zio->io_size); 1025 } 1026 1027 bp->b_lblkno = lbtodb(zio->io_offset); 1028 bp->b_bufsize = zio->io_size; 1029 bp->b_iodone = vdev_disk_io_intr; 1030 1031 /* 1032 * In general we would expect ldi_strategy() to return non-zero only 1033 * because of programming errors, but we've also seen this fail shortly 1034 * after a disk dies. 1035 */ 1036 if (ldi_strategy(dvd->vd_lh, bp) != 0) { 1037 zio->io_error = ENXIO; 1038 zio_interrupt(zio); 1039 } 1040 } 1041 1042 static void 1043 vdev_disk_io_done(zio_t *zio) 1044 { 1045 vdev_t *vd = zio->io_vd; 1046 1047 /* 1048 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see if 1049 * the device has been removed. If this is the case, then we trigger an 1050 * asynchronous removal of the device. Otherwise, probe the device and 1051 * make sure it's still accessible. 1052 */ 1053 if (zio->io_error == EIO && !vd->vdev_remove_wanted) { 1054 vdev_disk_t *dvd = vd->vdev_tsd; 1055 int state = DKIO_NONE; 1056 1057 if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, 1058 FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { 1059 /* 1060 * We post the resource as soon as possible, instead of 1061 * when the async removal actually happens, because the 1062 * DE is using this information to discard previous I/O 1063 * errors. 1064 */ 1065 zfs_post_remove(zio->io_spa, vd); 1066 vd->vdev_remove_wanted = B_TRUE; 1067 spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); 1068 } else if (!vd->vdev_delayed_close) { 1069 vd->vdev_delayed_close = B_TRUE; 1070 } 1071 } 1072 } 1073 1074 vdev_ops_t vdev_disk_ops = { 1075 .vdev_op_open = vdev_disk_open, 1076 .vdev_op_close = vdev_disk_close, 1077 .vdev_op_asize = vdev_default_asize, 1078 .vdev_op_io_start = vdev_disk_io_start, 1079 .vdev_op_io_done = vdev_disk_io_done, 1080 .vdev_op_state_change = NULL, 1081 .vdev_op_need_resilver = NULL, 1082 .vdev_op_hold = vdev_disk_hold, 1083 .vdev_op_rele = vdev_disk_rele, 1084 .vdev_op_remap = NULL, 1085 .vdev_op_xlate = vdev_default_xlate, 1086 .vdev_op_dumpio = vdev_disk_dumpio, 1087 .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ 1088 .vdev_op_leaf = B_TRUE /* leaf vdev */ 1089 }; 1090 1091 /* 1092 * Given the root disk device devid or pathname, read the label from 1093 * the device, and construct a configuration nvlist. 1094 */ 1095 int 1096 vdev_disk_read_rootlabel(const char *devpath, const char *devid, 1097 nvlist_t **config) 1098 { 1099 ldi_handle_t vd_lh; 1100 vdev_label_t *label; 1101 uint64_t s, size; 1102 int l; 1103 ddi_devid_t tmpdevid; 1104 int error = -1; 1105 char *minor_name; 1106 1107 /* 1108 * Read the device label and build the nvlist. 1109 */ 1110 if (devid != NULL && ddi_devid_str_decode((char *)devid, &tmpdevid, 1111 &minor_name) == 0) { 1112 error = ldi_open_by_devid(tmpdevid, minor_name, 1113 FREAD, kcred, &vd_lh, zfs_li); 1114 ddi_devid_free(tmpdevid); 1115 ddi_devid_str_free(minor_name); 1116 } 1117 1118 if (error != 0 && (error = ldi_open_by_name((char *)devpath, FREAD, 1119 kcred, &vd_lh, zfs_li)) != 0) { 1120 return (error); 1121 } 1122 1123 if (ldi_get_size(vd_lh, &s)) { 1124 (void) ldi_close(vd_lh, FREAD, kcred); 1125 return (SET_ERROR(EIO)); 1126 } 1127 1128 size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); 1129 label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); 1130 1131 *config = NULL; 1132 for (l = 0; l < VDEV_LABELS; l++) { 1133 uint64_t offset, state, txg = 0; 1134 1135 /* read vdev label */ 1136 offset = vdev_label_offset(size, l, 0); 1137 if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label, 1138 VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) 1139 continue; 1140 1141 if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, 1142 sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) { 1143 *config = NULL; 1144 continue; 1145 } 1146 1147 if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, 1148 &state) != 0 || state >= POOL_STATE_DESTROYED) { 1149 nvlist_free(*config); 1150 *config = NULL; 1151 continue; 1152 } 1153 1154 if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, 1155 &txg) != 0 || txg == 0) { 1156 nvlist_free(*config); 1157 *config = NULL; 1158 continue; 1159 } 1160 1161 break; 1162 } 1163 1164 kmem_free(label, sizeof (vdev_label_t)); 1165 (void) ldi_close(vd_lh, FREAD, kcred); 1166 if (*config == NULL) 1167 error = SET_ERROR(EIDRM); 1168 1169 return (error); 1170 } 1171 1172 struct veb { 1173 list_t veb_ents; 1174 boolean_t veb_scanned; 1175 char *veb_force_path; 1176 }; 1177 1178 struct veb_ent { 1179 uint64_t vebe_pool_guid; 1180 uint64_t vebe_vdev_guid; 1181 1182 char *vebe_devpath; 1183 1184 list_node_t vebe_link; 1185 }; 1186 1187 static kmutex_t veb_lock; 1188 static struct veb *veb; 1189 1190 static int 1191 vdev_disk_preroot_scan_walk(const char *devpath, void *arg) 1192 { 1193 int r; 1194 nvlist_t *cfg = NULL; 1195 uint64_t pguid = 0, vguid = 0; 1196 1197 /* 1198 * Attempt to read the label from this block device. 1199 */ 1200 if ((r = vdev_disk_read_rootlabel(devpath, NULL, &cfg)) != 0) { 1201 /* 1202 * Many of the available block devices will represent slices or 1203 * partitions of disks, or may represent disks that are not at 1204 * all initialised with ZFS. As this is a best effort 1205 * mechanism to locate an alternate path to a particular vdev, 1206 * we will ignore any failures and keep scanning. 1207 */ 1208 return (PREROOT_WALK_BLOCK_DEVICES_NEXT); 1209 } 1210 1211 /* 1212 * Determine the pool and vdev GUID read from the label for this 1213 * device. Both values must be present and have a non-zero value. 1214 */ 1215 if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pguid) != 0 || 1216 nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_GUID, &vguid) != 0 || 1217 pguid == 0 || vguid == 0) { 1218 /* 1219 * This label was not complete. 1220 */ 1221 goto out; 1222 } 1223 1224 /* 1225 * Keep track of all of the GUID-to-devpath mappings we find so that 1226 * vdev_disk_preroot_lookup() can search them. 1227 */ 1228 struct veb_ent *vebe = kmem_zalloc(sizeof (*vebe), KM_SLEEP); 1229 vebe->vebe_pool_guid = pguid; 1230 vebe->vebe_vdev_guid = vguid; 1231 vebe->vebe_devpath = spa_strdup(devpath); 1232 1233 list_insert_tail(&veb->veb_ents, vebe); 1234 1235 out: 1236 nvlist_free(cfg); 1237 return (PREROOT_WALK_BLOCK_DEVICES_NEXT); 1238 } 1239 1240 const char * 1241 vdev_disk_preroot_lookup(uint64_t pool_guid, uint64_t vdev_guid) 1242 { 1243 if (pool_guid == 0 || vdev_guid == 0) { 1244 /* 1245 * If we aren't provided both a pool and a vdev GUID, we cannot 1246 * perform a lookup. 1247 */ 1248 return (NULL); 1249 } 1250 1251 mutex_enter(&veb_lock); 1252 if (veb == NULL) { 1253 /* 1254 * If vdev_disk_preroot_fini() has been called already, there 1255 * is nothing we can do. 1256 */ 1257 mutex_exit(&veb_lock); 1258 return (NULL); 1259 } 1260 1261 /* 1262 * We want to perform at most one scan of all block devices per boot. 1263 */ 1264 if (!veb->veb_scanned) { 1265 cmn_err(CE_NOTE, "Performing full ZFS device scan!"); 1266 1267 preroot_walk_block_devices(vdev_disk_preroot_scan_walk, NULL); 1268 1269 veb->veb_scanned = B_TRUE; 1270 } 1271 1272 const char *path = NULL; 1273 for (struct veb_ent *vebe = list_head(&veb->veb_ents); vebe != NULL; 1274 vebe = list_next(&veb->veb_ents, vebe)) { 1275 if (vebe->vebe_pool_guid == pool_guid && 1276 vebe->vebe_vdev_guid == vdev_guid) { 1277 path = vebe->vebe_devpath; 1278 break; 1279 } 1280 } 1281 1282 mutex_exit(&veb_lock); 1283 1284 return (path); 1285 } 1286 1287 const char * 1288 vdev_disk_preroot_force_path(void) 1289 { 1290 const char *force_path = NULL; 1291 1292 mutex_enter(&veb_lock); 1293 if (veb != NULL) { 1294 force_path = veb->veb_force_path; 1295 } 1296 mutex_exit(&veb_lock); 1297 1298 return (force_path); 1299 } 1300 1301 void 1302 vdev_disk_preroot_init(const char *force_path) 1303 { 1304 mutex_init(&veb_lock, NULL, MUTEX_DEFAULT, NULL); 1305 1306 VERIFY3P(veb, ==, NULL); 1307 veb = kmem_zalloc(sizeof (*veb), KM_SLEEP); 1308 list_create(&veb->veb_ents, sizeof (struct veb_ent), 1309 offsetof(struct veb_ent, vebe_link)); 1310 veb->veb_scanned = B_FALSE; 1311 if (force_path != NULL) { 1312 veb->veb_force_path = spa_strdup(force_path); 1313 } 1314 } 1315 1316 void 1317 vdev_disk_preroot_fini(void) 1318 { 1319 mutex_enter(&veb_lock); 1320 1321 if (veb != NULL) { 1322 while (!list_is_empty(&veb->veb_ents)) { 1323 struct veb_ent *vebe = list_remove_head(&veb->veb_ents); 1324 1325 spa_strfree(vebe->vebe_devpath); 1326 1327 kmem_free(vebe, sizeof (*vebe)); 1328 } 1329 1330 if (veb->veb_force_path != NULL) { 1331 spa_strfree(veb->veb_force_path); 1332 } 1333 1334 kmem_free(veb, sizeof (*veb)); 1335 veb = NULL; 1336 } 1337 1338 mutex_exit(&veb_lock); 1339 } 1340