/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2020 Joyent, Inc.
 * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org>
 * Copyright 2022 Tintri by DDN, Inc. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/refcount.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>
#include <sys/efi_partition.h>
#include <sys/fm/fs/zfs.h>
#include <sys/ddi.h>

/*
 * Tunable to disable TRIM in case we're using a problematic SSD.
 */
uint_t zfs_no_trim = 0;

/*
 * Tunable parameter for debugging or performance analysis. Setting this
 * will cause pool corruption on power loss if a volatile out-of-order
 * write cache is enabled.
 */
boolean_t zfs_nocacheflush = B_FALSE;
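/*
 * As a sketch of how these might be set in the field (assuming the usual
 * illumos tunable mechanism and the "zfs" module name), an /etc/system
 * entry would look like:
 *
 *	set zfs:zfs_no_trim = 1
 *	set zfs:zfs_nocacheflush = 1
 *
 * Keep in mind that zfs_nocacheflush trades crash safety for speed and is
 * intended for debugging and performance analysis only.
 */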

/*
 * Virtual device vector for disks.
 */

extern ldi_ident_t zfs_li;

static void vdev_disk_close(vdev_t *);

typedef struct vdev_disk {
	ddi_devid_t vd_devid;
	char *vd_minor;
	ldi_handle_t vd_lh;
	list_t vd_ldi_cbs;
	boolean_t vd_ldi_offline;
} vdev_disk_t;

typedef struct vdev_disk_buf {
	buf_t vdb_buf;
	zio_t *vdb_io;
} vdev_disk_buf_t;

typedef struct vdev_disk_ldi_cb {
	list_node_t lcb_next;
	ldi_callback_id_t lcb_id;
} vdev_disk_ldi_cb_t;

/*
 * Bypass the devid when opening a disk vdev.
 * There have been issues where the devids of several devices were shuffled,
 * causing pool open failures. Note that this flag is intended to be used
 * for pool recovery only.
 *
 * Note that if a pool is imported with the devids bypassed, all its vdevs will
 * cease storing devid information permanently. In practice, the devid is
 * rarely useful as vdev paths do not tend to change unless the hardware is
 * reconfigured. That said, if the paths do change and a pool fails to open
 * automatically at boot, a simple zpool import should re-scan the paths and
 * fix the issue.
 */
boolean_t vdev_disk_bypass_devid = B_FALSE;
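
/*
 * As a sketch of the intended recovery flow (assuming the standard
 * /etc/system tunable mechanism): set "zfs:vdev_disk_bypass_devid = 1",
 * import the pool so that every vdev is opened purely by path, and clear
 * the flag again once the import has succeeded.
 */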

static void
vdev_disk_alloc(vdev_t *vd)
{
	vdev_disk_t *dvd;

	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
	/*
	 * Create the LDI event callback list.
	 */
	list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
	    offsetof(vdev_disk_ldi_cb_t, lcb_next));
}

static void
vdev_disk_free(vdev_t *vd)
{
	vdev_disk_t *dvd = vd->vdev_tsd;
	vdev_disk_ldi_cb_t *lcb;

	if (dvd == NULL)
		return;

	/*
	 * We have already closed the LDI handle. Clean up the LDI event
	 * callbacks and free vd->vdev_tsd.
	 */
	while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
		list_remove(&dvd->vd_ldi_cbs, lcb);
		(void) ldi_ev_remove_callbacks(lcb->lcb_id);
		kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
	}
	list_destroy(&dvd->vd_ldi_cbs);
	kmem_free(dvd, sizeof (vdev_disk_t));
	vd->vdev_tsd = NULL;
}

static int
vdev_disk_off_notify(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie,
    void *arg, void *ev_data __unused)
{
	vdev_t *vd = (vdev_t *)arg;
	vdev_disk_t *dvd = vd->vdev_tsd;

	/*
	 * Ignore events other than offline.
	 */
	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
		return (LDI_EV_SUCCESS);

	/*
	 * Tell any new threads that stumble upon this vdev that they should
	 * not try to do I/O.
	 */
	dvd->vd_ldi_offline = B_TRUE;

	/*
	 * Request that the spa_async_thread mark the device as REMOVED and
	 * notify FMA of the removal. This should also trigger a vdev_close()
	 * in the async thread.
	 */
	zfs_post_remove(vd->vdev_spa, vd);
	vd->vdev_remove_wanted = B_TRUE;
	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);

	return (LDI_EV_SUCCESS);
}

static void
vdev_disk_off_finalize(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie,
    int ldi_result, void *arg, void *ev_data __unused)
{
	vdev_t *vd = (vdev_t *)arg;

	/*
	 * Ignore events other than offline.
	 */
	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
		return;

	/*
	 * Request that the vdev be reopened if the offline state change was
	 * unsuccessful.
	 */
	if (ldi_result != LDI_EV_SUCCESS) {
		vd->vdev_probe_wanted = B_TRUE;
		spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE);
	}
}

static ldi_ev_callback_t vdev_disk_off_callb = {
	.cb_vers = LDI_EV_CB_VERS,
	.cb_notify = vdev_disk_off_notify,
	.cb_finalize = vdev_disk_off_finalize
};

static void
vdev_disk_dgrd_finalize(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie,
    int ldi_result, void *arg, void *ev_data __unused)
{
	vdev_t *vd = (vdev_t *)arg;

	/*
	 * Ignore events other than degrade.
	 */
	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0)
		return;

	/*
	 * Degrade events always succeed. Mark the vdev as degraded.
	 * This status is purely informative for the user.
	 */
	(void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0);
}

static ldi_ev_callback_t vdev_disk_dgrd_callb = {
	.cb_vers = LDI_EV_CB_VERS,
	.cb_notify = NULL,
	.cb_finalize = vdev_disk_dgrd_finalize
};

static void
vdev_disk_hold(vdev_t *vd)
{
	ddi_devid_t devid;
	char *minor;

	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

	if (vd->vdev_wholedisk == -1ULL) {
		size_t len = strlen(vd->vdev_path) + 3;
		char *buf = kmem_alloc(len, KM_SLEEP);

		(void) snprintf(buf, len, "%ss0", vd->vdev_path);

		(void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
		kmem_free(buf, len);
	}

	if (vd->vdev_name_vp == NULL)
		(void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);

	if (vd->vdev_devid != NULL &&
	    ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
		(void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
		ddi_devid_str_free(minor);
		ddi_devid_free(devid);
	}
}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	if (vd->vdev_name_vp) {
		VN_RELE_ASYNC(vd->vdev_name_vp,
		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
		vd->vdev_name_vp = NULL;
	}
	if (vd->vdev_devid_vp) {
		VN_RELE_ASYNC(vd->vdev_devid_vp,
		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
		vd->vdev_devid_vp = NULL;
	}
}

/*
 * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when
 * even a fallback to DKIOCGMEDIAINFO fails.
 */
#ifdef DEBUG
#define	VDEV_DEBUG(...)	cmn_err(CE_NOTE, __VA_ARGS__)
#else
#define	VDEV_DEBUG(...)	/* Nothing... */
#endif

static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	spa_t *spa = vd->vdev_spa;
	vdev_disk_t *dvd = vd->vdev_tsd;
	ldi_ev_cookie_t ecookie;
	vdev_disk_ldi_cb_t *lcb;
	union {
		struct dk_minfo_ext ude;
		struct dk_minfo ud;
	} dks;
	struct dk_minfo_ext *dkmext = &dks.ude;
	struct dk_minfo *dkm = &dks.ud;
	int error, can_free;
	dev_t dev;
	int otyp;
	boolean_t validate_devid = B_FALSE;
	uint64_t capacity = 0, blksz = 0, pbsize;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it's not currently open. Otherwise,
	 * just update the physical size of the device.
	 */
	if (dvd != NULL) {
		ASSERT(vd->vdev_reopening);
		goto skip_open;
	}

	/*
	 * Create vd->vdev_tsd.
	 */
	vdev_disk_alloc(vd);
	dvd = vd->vdev_tsd;

	/*
	 * Allow bypassing the devid.
	 */
	if (vd->vdev_devid != NULL && vdev_disk_bypass_devid) {
		vdev_dbgmsg(vd, "vdev_disk_open, devid %s bypassed",
		    vd->vdev_devid);
		spa_strfree(vd->vdev_devid);
		vd->vdev_devid = NULL;
	}

	/*
	 * When opening a disk device, we want to preserve the user's original
	 * intent. We always want to open the device by the path the user gave
	 * us, even if it is one of multiple paths to the same device. But we
	 * also want to be able to survive disks being removed/recabled.
	 * Therefore the sequence of opening devices is:
	 *
	 * 1. Try opening the device by path. For legacy pools without the
	 *    'whole_disk' property, attempt to fix the path by appending 's0'.
	 *
	 * 2. If the devid of the device matches the stored value, return
	 *    success.
	 *
	 * 3. Otherwise, the device may have moved. Try opening the device
	 *    by the devid instead.
	 */
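	/*
	 * For illustration, a stored devid string typically looks something
	 * like "id1,sd@SSEAGATE_ST336704LSUN36G_3CD1QS3Z00007114A8SE/a",
	 * where the component after the '/' is the minor name;
	 * ddi_devid_str_decode() splits the string into the binary devid and
	 * that minor name.
	 */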
	if (vd->vdev_devid != NULL) {
		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
		    &dvd->vd_minor) != 0) {
			vdev_dbgmsg(vd,
			    "vdev_disk_open, invalid devid %s bypassed",
			    vd->vdev_devid);
			spa_strfree(vd->vdev_devid);
			vd->vdev_devid = NULL;
		}
	}

	error = EINVAL;		/* presume failure */

	if (vd->vdev_path != NULL) {
		if (vd->vdev_wholedisk == -1ULL) {
			size_t len = strlen(vd->vdev_path) + 3;
			char *buf = kmem_alloc(len, KM_SLEEP);

			(void) snprintf(buf, len, "%ss0", vd->vdev_path);

			error = ldi_open_by_name(buf, spa_mode(spa), kcred,
			    &dvd->vd_lh, zfs_li);
			if (error == 0) {
				spa_strfree(vd->vdev_path);
				vd->vdev_path = buf;
				vd->vdev_wholedisk = 1ULL;
			} else {
				kmem_free(buf, len);
			}
		}

		/*
		 * If we have not yet opened the device, try to open it by the
		 * specified path.
		 */
		if (error != 0) {
			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);
		}

		/*
		 * Compare the devid to the stored value.
		 */
		if (error == 0 && vd->vdev_devid != NULL) {
			ddi_devid_t devid = NULL;

			if (ldi_get_devid(dvd->vd_lh, &devid) != 0) {
				/*
				 * We expected a devid on this device but it no
				 * longer appears to have one. The validation
				 * step may need to remove it from the
				 * configuration.
				 */
				validate_devid = B_TRUE;

			} else if (ddi_devid_compare(devid, dvd->vd_devid) !=
			    0) {
				/*
				 * A mismatch here is unexpected, log it.
				 */
				char *devid_str = ddi_devid_str_encode(devid,
				    dvd->vd_minor);
				vdev_dbgmsg(vd, "vdev_disk_open: devid "
				    "mismatch: %s != %s", vd->vdev_devid,
				    devid_str);
				cmn_err(CE_NOTE, "vdev_disk_open %s: devid "
				    "mismatch: %s != %s", vd->vdev_path,
				    vd->vdev_devid, devid_str);
				ddi_devid_str_free(devid_str);

				error = SET_ERROR(EINVAL);
				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
				    kcred);
				dvd->vd_lh = NULL;
			}

			if (devid != NULL) {
				ddi_devid_free(devid);
			}
		}

		/*
		 * If we succeeded in opening the device, but 'vdev_wholedisk'
		 * is not yet set, then this must be a slice.
		 */
		if (error == 0 && vd->vdev_wholedisk == -1ULL)
			vd->vdev_wholedisk = 0;
	}

	/*
	 * If we were unable to open by path, or the devid check fails, open by
	 * devid instead.
	 */
	if (error != 0 && vd->vdev_devid != NULL) {
		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
		if (error != 0) {
			vdev_dbgmsg(vd, "Failed to open by devid (%s)",
			    vd->vdev_devid);
		}
	}

	/*
	 * If all else fails, then try opening by physical path (if available)
	 * or the logical path (if we failed due to the devid check). While
	 * not as reliable as the devid, this will give us something, and the
	 * higher level vdev validation will prevent us from opening the wrong
	 * device.
	 */
	if (error != 0) {
		validate_devid = B_TRUE;

		if (vd->vdev_physpath != NULL &&
		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) {
			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);
		}

		/*
		 * Note that we don't support the legacy auto-wholedisk support
		 * as above. This hasn't been used in a very long time and we
		 * don't need to propagate its oddities to this edge condition.
		 */
		if (error != 0 && vd->vdev_path != NULL) {
			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);
		}
	}

	/*
	 * If this is early in boot, a sweep of available block devices may
	 * locate an alternative path that we can try.
	 */
	if (error != 0) {
		const char *altdevpath = vdev_disk_preroot_lookup(
		    spa_guid(spa), vd->vdev_guid);

		if (altdevpath != NULL) {
			vdev_dbgmsg(vd, "Trying alternate preroot path (%s)",
			    altdevpath);

			validate_devid = B_TRUE;

			if ((error = ldi_open_by_name((char *)altdevpath,
			    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li)) != 0) {
				vdev_dbgmsg(vd, "Failed to open by preroot "
				    "path (%s)", altdevpath);
			}
		}
	}

	if (error != 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		vdev_dbgmsg(vd, "vdev_disk_open: failed to open [error=%d]",
		    error);
		return (error);
	}

	/*
	 * Now that the device has been successfully opened, update the devid
	 * if necessary.
	 */
	if (validate_devid) {
		ddi_devid_t devid = NULL;
		char *minorname = NULL;
		char *vd_devid = NULL;
		boolean_t remove = B_FALSE, update = B_FALSE;

		/*
		 * Get the current devid and minor name for the device we
		 * opened.
		 */
		if (ldi_get_devid(dvd->vd_lh, &devid) != 0 ||
		    ldi_get_minor_name(dvd->vd_lh, &minorname) != 0) {
			/*
			 * If we are unable to get the devid or the minor name
			 * for the device, we need to remove them from the
			 * configuration to prevent potential inconsistencies.
			 */
			if (dvd->vd_minor != NULL || dvd->vd_devid != NULL ||
			    vd->vdev_devid != NULL) {
				/*
				 * We only need to remove the devid if one
				 * exists.
				 */
				remove = B_TRUE;
			}

		} else if (dvd->vd_devid == NULL || dvd->vd_minor == NULL) {
			/*
			 * There was previously no devid at all so we need to
			 * add one.
			 */
			update = B_TRUE;

		} else if (ddi_devid_compare(devid, dvd->vd_devid) != 0 ||
		    strcmp(minorname, dvd->vd_minor) != 0) {
			/*
			 * The devid or minor name on file does not match the
			 * one from the opened device.
			 */
			update = B_TRUE;
		}

		if (update) {
			/*
			 * Render the new devid and minor name as a string for
			 * logging and to store in the vdev configuration.
			 */
			vd_devid = ddi_devid_str_encode(devid, minorname);
		}

		if (update || remove) {
			vdev_dbgmsg(vd, "vdev_disk_open: update devid from "
			    "'%s' to '%s'",
			    vd->vdev_devid != NULL ? vd->vdev_devid : "<none>",
			    vd_devid != NULL ? vd_devid : "<none>");
			cmn_err(CE_NOTE, "vdev_disk_open %s: update devid "
			    "from '%s' to '%s'",
			    vd->vdev_path != NULL ? vd->vdev_path : "?",
			    vd->vdev_devid != NULL ? vd->vdev_devid : "<none>",
			    vd_devid != NULL ? vd_devid : "<none>");

			/*
			 * Remove and free any existing values.
			 */
			if (dvd->vd_minor != NULL) {
				ddi_devid_str_free(dvd->vd_minor);
				dvd->vd_minor = NULL;
			}
			if (dvd->vd_devid != NULL) {
				ddi_devid_free(dvd->vd_devid);
				dvd->vd_devid = NULL;
			}
			if (vd->vdev_devid != NULL) {
				spa_strfree(vd->vdev_devid);
				vd->vdev_devid = NULL;
			}
		}

		if (update) {
			/*
			 * Install the new values.
			 */
			vd->vdev_devid = vd_devid;
			dvd->vd_minor = minorname;
			dvd->vd_devid = devid;

		} else {
			if (devid != NULL) {
				ddi_devid_free(devid);
			}
			if (minorname != NULL) {
				kmem_free(minorname, strlen(minorname) + 1);
			}
		}
	}

	/*
	 * Once a device is opened, verify that the physical device path (if
	 * available) is up to date.
	 */
	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
		char *physpath, *minorname;

		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		minorname = NULL;
		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
		    (vd->vdev_physpath == NULL ||
		    strcmp(vd->vdev_physpath, physpath) != 0)) {
			if (vd->vdev_physpath)
				spa_strfree(vd->vdev_physpath);
			(void) strlcat(physpath, ":", MAXPATHLEN);
			(void) strlcat(physpath, minorname, MAXPATHLEN);
			vd->vdev_physpath = spa_strdup(physpath);
		}
		if (minorname)
			kmem_free(minorname, strlen(minorname) + 1);
		kmem_free(physpath, MAXPATHLEN);
	}

	/*
	 * Register callbacks for the LDI offline event.
	 */
	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) ==
	    LDI_EV_SUCCESS) {
		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
		    &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id);
	}

	/*
	 * Register callbacks for the LDI degrade event.
	 */
	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) ==
	    LDI_EV_SUCCESS) {
		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
		    &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id);
	}

skip_open:
	/*
	 * Determine the actual size of the device.
	 */
	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		vdev_dbgmsg(vd, "vdev_disk_open: failed to get size");
		return (SET_ERROR(EINVAL));
	}

	*max_psize = *psize;

	/*
	 * Determine the device's minimum transfer size.
	 * If the ioctl isn't supported, assume DEV_BSIZE.
	 */
	if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT,
	    (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) {
		capacity = dkmext->dki_capacity - 1;
		blksz = dkmext->dki_lbsize;
		pbsize = dkmext->dki_pbsize;
	} else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO,
	    (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) {
		VDEV_DEBUG(
		    "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n",
		    vd->vdev_path);
		capacity = dkm->dki_capacity - 1;
		blksz = dkm->dki_lbsize;
		pbsize = blksz;
	} else {
		VDEV_DEBUG("vdev_disk_open(\"%s\"): "
		    "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n",
		    vd->vdev_path, error);
		pbsize = DEV_BSIZE;
	}

	*ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1;
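
	/*
	 * For example, a disk reporting a 4096-byte physical sector size
	 * yields highbit64(4096) - 1 == 12, i.e. an ashift of 12 (2^12-byte
	 * blocks); a 512-byte-sector disk is clamped up to SPA_MINBLOCKSIZE
	 * (512) and yields an ashift of 9.
	 */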
	if (vd->vdev_wholedisk == 1) {
		int wce = 1;

		if (error == 0) {
			/*
			 * If we have the capability to expand, we'd have
			 * found out via success from DKIOCGMEDIAINFO{,EXT}.
			 * Adjust max_psize upward accordingly since we know
			 * we own the whole disk now.
			 */
			*max_psize = capacity * blksz;
		}

		/*
		 * Since we own the whole disk, try to enable disk write
		 * caching. We ignore errors because it's OK if we can't do it.
		 */
		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
		    FKIOCTL, kcred, NULL);
	}

	/*
	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
	 * try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	if (ldi_ioctl(dvd->vd_lh, DKIOC_CANFREE, (intptr_t)&can_free, FKIOCTL,
	    kcred, NULL) == 0 && can_free == 1) {
		vd->vdev_has_trim = B_TRUE;
	} else {
		vd->vdev_has_trim = B_FALSE;
	}

	if (zfs_no_trim == 1)
		vd->vdev_has_trim = B_FALSE;

	/* Currently only supported for ZoL. */
	vd->vdev_has_securetrim = B_FALSE;

	/* Inform the ZIO pipeline that we are non-rotational */
	vd->vdev_nonrot = B_FALSE;
	if (ldi_prop_exists(dvd->vd_lh, DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
	    "device-solid-state")) {
		if (ldi_prop_get_int(dvd->vd_lh,
		    LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
		    "device-solid-state", B_FALSE) != 0)
			vd->vdev_nonrot = B_TRUE;
	}

	return (0);
}

static void
vdev_disk_close(vdev_t *vd)
{
	vdev_disk_t *dvd = vd->vdev_tsd;

	if (vd->vdev_reopening || dvd == NULL)
		return;

	if (dvd->vd_minor != NULL) {
		ddi_devid_str_free(dvd->vd_minor);
		dvd->vd_minor = NULL;
	}

	if (dvd->vd_devid != NULL) {
		ddi_devid_free(dvd->vd_devid);
		dvd->vd_devid = NULL;
	}

	if (dvd->vd_lh != NULL) {
		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
		dvd->vd_lh = NULL;
	}

	vd->vdev_delayed_close = B_FALSE;
	vdev_disk_free(vd);
}

static int
vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
    size_t size, uint64_t offset, int flags)
{
	buf_t *bp;
	int error = 0;

	if (vd_lh == NULL)
		return (SET_ERROR(EINVAL));

	ASSERT(flags & B_READ || flags & B_WRITE);

	bp = getrbuf(KM_SLEEP);
	bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
	bp->b_bcount = size;
	bp->b_un.b_addr = (void *)data;
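	/*
	 * Note that lbtodb() simply shifts the byte offset down by
	 * DEV_BSHIFT, expressing it in DEV_BSIZE (512-byte) blocks, so
	 * callers are expected to pass 512-byte-aligned offsets and sizes.
	 */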
	bp->b_lblkno = lbtodb(offset);
	bp->b_bufsize = size;

	error = ldi_strategy(vd_lh, bp);
	ASSERT(error == 0);
	if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
		error = SET_ERROR(EIO);
	freerbuf(bp);

	return (error);
}

static int
vdev_disk_dumpio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset __unused, boolean_t doread,
    boolean_t isdump)
{
	vdev_disk_t *dvd = vd->vdev_tsd;
	int flags = doread ? B_READ : B_WRITE;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (dvd == NULL || dvd->vd_ldi_offline) {
		return (SET_ERROR(ENXIO));
	}

	ASSERT(vd->vdev_ops == &vdev_disk_ops);

	offset += VDEV_LABEL_START_SIZE;

	/*
	 * If in the context of an active crash dump, use the ldi_dump(9F)
	 * call instead of ldi_strategy(9F) as usual.
	 */
	if (isdump) {
		ASSERT3P(dvd, !=, NULL);
		return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
		    lbtodb(size)));
	}

	return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
}

static int
vdev_disk_io_intr(buf_t *bp)
{
	vdev_buf_t *vb = (vdev_buf_t *)bp;
	zio_t *zio = vb->vb_io;

	/*
	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
	 * Rather than teach the rest of the stack about other error
	 * possibilities (EFAULT, etc), we normalize the error value here.
	 */
	zio->io_error = (geterror(bp) != 0 ? EIO : 0);

	if (zio->io_error == 0 && bp->b_resid != 0)
		zio->io_error = SET_ERROR(EIO);

	if (zio->io_type == ZIO_TYPE_READ) {
		abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr, zio->io_size);
	} else {
		abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
	}

	kmem_free(vb, sizeof (vdev_buf_t));

	zio_delay_interrupt(zio);
	return (0);
}

static void
vdev_disk_ioctl_free(zio_t *zio)
{
	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
}

static const zio_vsd_ops_t vdev_disk_vsd_ops = {
	vdev_disk_ioctl_free,
	zio_vsd_default_cksum_report
};

static void
vdev_disk_ioctl_done(void *zio_arg, int error)
{
	zio_t *zio = zio_arg;

	zio->io_error = error;

	zio_interrupt(zio);
}

static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_disk_t *dvd = vd->vdev_tsd;
	unsigned long trim_flags = 0;
	vdev_buf_t *vb;
	struct dk_callback *dkc;
	buf_t *bp;
	int error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
	 * Nothing to be done here but return failure.
	 */
	if (dvd == NULL || dvd->vd_ldi_offline) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {

		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (vd->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP);
			zio->io_vsd_ops = &vdev_disk_vsd_ops;

			dkc->dkc_callback = vdev_disk_ioctl_done;
			dkc->dkc_flag = FLUSH_VOLATILE;
			dkc->dkc_cookie = zio;

			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
			    (uintptr_t)dkc, FKIOCTL, kcred, NULL);

			if (error == 0) {
				/*
				 * The ioctl will be done asynchronously,
				 * and will call vdev_disk_ioctl_done()
				 * upon completion.
				 */
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		zio_execute(zio);
		return;

	case ZIO_TYPE_TRIM:
		if (zfs_no_trim == 1 || !vd->vdev_has_trim) {
			zio->io_error = SET_ERROR(ENOTSUP);
			zio_execute(zio);
			return;
		}
		/* Currently only supported on ZoL. */
		ASSERT0(zio->io_trim_flags & ZIO_TRIM_SECURE);

		/* dkioc_free_list_t is already declared to hold one entry */
		dkioc_free_list_t dfl;
		dfl.dfl_flags = 0;
		dfl.dfl_num_exts = 1;
		dfl.dfl_offset = 0;
		dfl.dfl_exts[0].dfle_start = zio->io_offset;
		dfl.dfl_exts[0].dfle_length = zio->io_size;

		zio->io_error = ldi_ioctl(dvd->vd_lh, DKIOCFREE,
		    (uintptr_t)&dfl, FKIOCTL, kcred, NULL);

		if (zio->io_error == ENOTSUP || zio->io_error == ENOTTY) {
			/*
			 * The device must have changed and now TRIM is
			 * no longer supported.
			 */
			vd->vdev_has_trim = B_FALSE;
		}

		zio_interrupt(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
	zio->io_target_timestamp = zio_handle_io_delay(zio);

	vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);

	vb->vb_io = zio;
	bp = &vb->vb_buf;

	bioinit(bp);
	bp->b_flags = B_BUSY | B_NOCACHE |
	    (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bp->b_flags |= B_FAILFAST;
	bp->b_bcount = zio->io_size;

	if (zio->io_type == ZIO_TYPE_READ) {
		bp->b_un.b_addr =
		    abd_borrow_buf(zio->io_abd, zio->io_size);
	} else {
		bp->b_un.b_addr =
		    abd_borrow_buf_copy(zio->io_abd, zio->io_size);
	}

	bp->b_lblkno = lbtodb(zio->io_offset);
	bp->b_bufsize = zio->io_size;
	bp->b_iodone = vdev_disk_io_intr;

	/*
	 * In general we would expect ldi_strategy() to return non-zero only
	 * because of programming errors, but we've also seen this fail shortly
	 * after a disk dies.
	 */
	if (ldi_strategy(dvd->vd_lh, bp) != 0) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	/*
	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see
	 * if the device has been removed. If this is the case, then we
	 * trigger an asynchronous removal of the device. Otherwise, probe the
	 * device and make sure it's still accessible.
	 */
	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
		vdev_disk_t *dvd = vd->vdev_tsd;
		int state = DKIO_NONE;

		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
			/*
			 * We post the resource as soon as possible, instead of
			 * when the async removal actually happens, because the
			 * DE is using this information to discard previous I/O
			 * errors.
			 */
			zfs_post_remove(zio->io_spa, vd);
			vd->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		} else if (!vd->vdev_delayed_close) {
			vd->vdev_delayed_close = B_TRUE;
		}
	}
}

vdev_ops_t vdev_disk_ops = {
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_io_start = vdev_disk_io_start,
	.vdev_op_io_done = vdev_disk_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_disk_hold,
	.vdev_op_rele = vdev_disk_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_dumpio = vdev_disk_dumpio,
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
	.vdev_op_leaf = B_TRUE			/* leaf vdev */
};

/*
 * Given the root disk device devid or pathname, read the label from
 * the device, and construct a configuration nvlist.
 */
int
vdev_disk_read_rootlabel(const char *devpath, const char *devid,
    nvlist_t **config)
{
	ldi_handle_t vd_lh;
	vdev_label_t *label;
	uint64_t s, size;
	int l;
	ddi_devid_t tmpdevid;
	int error = -1;
	char *minor_name;

	/*
	 * Read the device label and build the nvlist.
	 */
	if (devid != NULL && ddi_devid_str_decode((char *)devid, &tmpdevid,
	    &minor_name) == 0) {
		error = ldi_open_by_devid(tmpdevid, minor_name,
		    FREAD, kcred, &vd_lh, zfs_li);
		ddi_devid_free(tmpdevid);
		ddi_devid_str_free(minor_name);
	}

	if (error != 0 && (error = ldi_open_by_name((char *)devpath, FREAD,
	    kcred, &vd_lh, zfs_li)) != 0) {
		return (error);
	}

	if (ldi_get_size(vd_lh, &s)) {
		(void) ldi_close(vd_lh, FREAD, kcred);
		return (SET_ERROR(EIO));
	}

	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);

	*config = NULL;
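
	/*
	 * As a reminder of the on-disk layout: labels 0 and 1 live at the
	 * front of the device (offsets 0 and sizeof (vdev_label_t)), while
	 * labels 2 and 3 live at the tail, which is why the device size is
	 * first truncated down to a label-sized multiple above.
	 */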
	for (l = 0; l < VDEV_LABELS; l++) {
		uint64_t offset, state, txg = 0;

		/* read vdev label */
		offset = vdev_label_offset(size, l, 0);
		if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
			continue;

		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
			*config = NULL;
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		break;
	}

	kmem_free(label, sizeof (vdev_label_t));
	(void) ldi_close(vd_lh, FREAD, kcred);
	if (*config == NULL)
		error = SET_ERROR(EIDRM);

	return (error);
}

struct veb {
	list_t veb_ents;
	boolean_t veb_scanned;
};

struct veb_ent {
	uint64_t vebe_pool_guid;
	uint64_t vebe_vdev_guid;

	char *vebe_devpath;

	list_node_t vebe_link;
};

static kmutex_t veb_lock;
static struct veb *veb;

static int
vdev_disk_preroot_scan_walk(const char *devpath, void *arg)
{
	int r;
	nvlist_t *cfg = NULL;
	uint64_t pguid = 0, vguid = 0;

	/*
	 * Attempt to read the label from this block device.
	 */
	if ((r = vdev_disk_read_rootlabel(devpath, NULL, &cfg)) != 0) {
		/*
		 * Many of the available block devices will represent slices or
		 * partitions of disks, or may represent disks that are not at
		 * all initialised with ZFS. As this is a best effort
		 * mechanism to locate an alternate path to a particular vdev,
		 * we will ignore any failures and keep scanning.
		 */
		return (PREROOT_WALK_BLOCK_DEVICES_NEXT);
	}

	/*
	 * Determine the pool and vdev GUID read from the label for this
	 * device. Both values must be present and have a non-zero value.
	 */
	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pguid) != 0 ||
	    nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_GUID, &vguid) != 0 ||
	    pguid == 0 || vguid == 0) {
		/*
		 * This label was not complete.
		 */
		goto out;
	}

	/*
	 * Keep track of all of the GUID-to-devpath mappings we find so that
	 * vdev_disk_preroot_lookup() can search them.
	 */
	struct veb_ent *vebe = kmem_zalloc(sizeof (*vebe), KM_SLEEP);
	vebe->vebe_pool_guid = pguid;
	vebe->vebe_vdev_guid = vguid;
	vebe->vebe_devpath = spa_strdup(devpath);

	list_insert_tail(&veb->veb_ents, vebe);

out:
	nvlist_free(cfg);
	return (PREROOT_WALK_BLOCK_DEVICES_NEXT);
}

const char *
vdev_disk_preroot_lookup(uint64_t pool_guid, uint64_t vdev_guid)
{
	if (pool_guid == 0 || vdev_guid == 0) {
		/*
		 * If we aren't provided both a pool and a vdev GUID, we cannot
		 * perform a lookup.
		 */
		return (NULL);
	}

	mutex_enter(&veb_lock);
	if (veb == NULL) {
		/*
		 * If vdev_disk_preroot_fini() has been called already, there
		 * is nothing we can do.
		 */
		mutex_exit(&veb_lock);
		return (NULL);
	}

	/*
	 * We want to perform at most one scan of all block devices per boot.
	 */
	if (!veb->veb_scanned) {
		cmn_err(CE_NOTE, "Performing full ZFS device scan!");

		preroot_walk_block_devices(vdev_disk_preroot_scan_walk, NULL);

		veb->veb_scanned = B_TRUE;
	}

	const char *path = NULL;
	for (struct veb_ent *vebe = list_head(&veb->veb_ents); vebe != NULL;
	    vebe = list_next(&veb->veb_ents, vebe)) {
		if (vebe->vebe_pool_guid == pool_guid &&
		    vebe->vebe_vdev_guid == vdev_guid) {
			path = vebe->vebe_devpath;
			break;
		}
	}

	mutex_exit(&veb_lock);

	return (path);
}

void
vdev_disk_preroot_init(void)
{
	mutex_init(&veb_lock, NULL, MUTEX_DEFAULT, NULL);

	VERIFY3P(veb, ==, NULL);
	veb = kmem_zalloc(sizeof (*veb), KM_SLEEP);
	list_create(&veb->veb_ents, sizeof (struct veb_ent),
	    offsetof(struct veb_ent, vebe_link));
	veb->veb_scanned = B_FALSE;
}

void
vdev_disk_preroot_fini(void)
{
	mutex_enter(&veb_lock);

	if (veb != NULL) {
		while (!list_is_empty(&veb->veb_ents)) {
			struct veb_ent *vebe =
			    list_remove_head(&veb->veb_ents);

			spa_strfree(vebe->vebe_devpath);

			kmem_free(vebe, sizeof (*vebe));
		}

		kmem_free(veb, sizeof (*veb));
		veb = NULL;
	}

	mutex_exit(&veb_lock);
}