/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2020 Joyent, Inc.
 * Copyright 2020 Joshua M. Clulow <josh@sysmgr.org>
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/refcount.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_trim.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>
#include <sys/zio.h>
#include <sys/sunldi.h>
#include <sys/efi_partition.h>
#include <sys/fm/fs/zfs.h>
#include <sys/ddi.h>

/*
 * Tunable to disable TRIM in case we're using a problematic SSD.
 */
uint_t zfs_no_trim = 0;

/*
 * Tunable parameter for debugging or performance analysis. Setting this
 * will cause pool corruption on power loss if a volatile out-of-order
 * write cache is enabled.
 */
boolean_t zfs_nocacheflush = B_FALSE;

/*
 * Virtual device vector for disks.
 */

extern ldi_ident_t zfs_li;

static void vdev_disk_close(vdev_t *);

typedef struct vdev_disk {
	ddi_devid_t	vd_devid;
	char		*vd_minor;
	ldi_handle_t	vd_lh;
	list_t		vd_ldi_cbs;
	boolean_t	vd_ldi_offline;
} vdev_disk_t;

typedef struct vdev_disk_buf {
	buf_t	vdb_buf;
	zio_t	*vdb_io;
} vdev_disk_buf_t;

typedef struct vdev_disk_ldi_cb {
	list_node_t		lcb_next;
	ldi_callback_id_t	lcb_id;
} vdev_disk_ldi_cb_t;

/*
 * Bypass the devid when opening a disk vdev.  There have been issues where
 * the devids of several devices were shuffled, causing pool open failures.
 * Note that this flag is intended to be used for pool recovery only.
 *
 * Note that if a pool is imported with the devids bypassed, all of its vdevs
 * will permanently stop storing devid information.  In practice the devid is
 * rarely useful, as vdev paths do not tend to change unless the hardware is
 * reconfigured.  That said, if the paths do change and a pool fails to open
 * automatically at boot, a simple "zpool import" should re-scan the paths and
 * fix the issue.
 */
boolean_t vdev_disk_bypass_devid = B_FALSE;

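/*
 * A minimal usage sketch: the tunables above are plain kernel variables, so
 * on illumos they can be set at boot via /etc/system (assuming the standard
 * "set module:variable" mechanism; shown for illustration only):
 *
 *	set zfs:zfs_no_trim = 1
 *	set zfs:zfs_nocacheflush = 1
 *	set zfs:vdev_disk_bypass_devid = 1
 *
 * They can also be changed on a live system with mdb -kw.
 */
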
static void
vdev_disk_alloc(vdev_t *vd)
{
	vdev_disk_t *dvd;

	dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);

	/*
	 * Create the LDI event callback list.
	 */
	list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
	    offsetof(vdev_disk_ldi_cb_t, lcb_next));
}

static void
vdev_disk_free(vdev_t *vd)
{
	vdev_disk_t *dvd = vd->vdev_tsd;
	vdev_disk_ldi_cb_t *lcb;

	if (dvd == NULL)
		return;

	/*
	 * We have already closed the LDI handle. Clean up the LDI event
	 * callbacks and free vd->vdev_tsd.
	 */
	while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
		list_remove(&dvd->vd_ldi_cbs, lcb);
		(void) ldi_ev_remove_callbacks(lcb->lcb_id);
		kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
	}
	list_destroy(&dvd->vd_ldi_cbs);
	kmem_free(dvd, sizeof (vdev_disk_t));
	vd->vdev_tsd = NULL;
}

static int
vdev_disk_off_notify(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie,
    void *arg, void *ev_data __unused)
{
	vdev_t *vd = (vdev_t *)arg;
	vdev_disk_t *dvd = vd->vdev_tsd;

	/*
	 * Ignore events other than offline.
	 */
	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
		return (LDI_EV_SUCCESS);

	/*
	 * Tell any new threads that stumble upon this vdev that they should
	 * not try to do I/O.
	 */
	dvd->vd_ldi_offline = B_TRUE;

	/*
	 * Request that the spa_async_thread mark the device as REMOVED and
	 * notify FMA of the removal.  This should also trigger a vdev_close()
	 * in the async thread.
	 */
	zfs_post_remove(vd->vdev_spa, vd);
	vd->vdev_remove_wanted = B_TRUE;
	spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);

	return (LDI_EV_SUCCESS);
}

static void
vdev_disk_off_finalize(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie,
    int ldi_result, void *arg, void *ev_data __unused)
{
	vdev_t *vd = (vdev_t *)arg;

	/*
	 * Ignore events other than offline.
	 */
	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
		return;

	/*
	 * Request that the vdev be reopened if the offline state change was
	 * unsuccessful.
	 */
	if (ldi_result != LDI_EV_SUCCESS) {
		vd->vdev_probe_wanted = B_TRUE;
		spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE);
	}
}

static ldi_ev_callback_t vdev_disk_off_callb = {
	.cb_vers = LDI_EV_CB_VERS,
	.cb_notify = vdev_disk_off_notify,
	.cb_finalize = vdev_disk_off_finalize
};

static void
vdev_disk_dgrd_finalize(ldi_handle_t lh __unused, ldi_ev_cookie_t ecookie,
    int ldi_result, void *arg, void *ev_data __unused)
{
	vdev_t *vd = (vdev_t *)arg;

	/*
	 * Ignore events other than degrade.
	 */
	if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0)
		return;

	/*
	 * Degrade events always succeed.  Mark the vdev as degraded.
	 * This status is purely informative for the user.
	 */
	(void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0);
}

static ldi_ev_callback_t vdev_disk_dgrd_callb = {
	.cb_vers = LDI_EV_CB_VERS,
	.cb_notify = NULL,
	.cb_finalize = vdev_disk_dgrd_finalize
};

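/*
 * vdev_disk_hold() and vdev_disk_rele() below take and drop vnode holds on
 * the device nodes named by the vdev's path (and devid, when present).  The
 * holds are taken while the SCL_STATE config lock is write-held, keeping the
 * device nodes from disappearing out from under a pending open; the release
 * is pushed to the dsl_pool vnrele taskq, presumably so that VN_RELE cannot
 * block or recurse while the config lock is still held.
 */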
static void
vdev_disk_hold(vdev_t *vd)
{
	ddi_devid_t devid;
	char *minor;

	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/')
		return;

	/*
	 * Only prefetch path and devid info if the device has
	 * never been opened.
	 */
	if (vd->vdev_tsd != NULL)
		return;

	if (vd->vdev_wholedisk == -1ULL) {
		size_t len = strlen(vd->vdev_path) + 3;
		char *buf = kmem_alloc(len, KM_SLEEP);

		(void) snprintf(buf, len, "%ss0", vd->vdev_path);

		(void) ldi_vp_from_name(buf, &vd->vdev_name_vp);
		kmem_free(buf, len);
	}

	if (vd->vdev_name_vp == NULL)
		(void) ldi_vp_from_name(vd->vdev_path, &vd->vdev_name_vp);

	if (vd->vdev_devid != NULL &&
	    ddi_devid_str_decode(vd->vdev_devid, &devid, &minor) == 0) {
		(void) ldi_vp_from_devid(devid, minor, &vd->vdev_devid_vp);
		ddi_devid_str_free(minor);
		ddi_devid_free(devid);
	}
}

static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	if (vd->vdev_name_vp) {
		VN_RELE_ASYNC(vd->vdev_name_vp,
		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
		vd->vdev_name_vp = NULL;
	}
	if (vd->vdev_devid_vp) {
		VN_RELE_ASYNC(vd->vdev_devid_vp,
		    dsl_pool_vnrele_taskq(vd->vdev_spa->spa_dsl_pool));
		vd->vdev_devid_vp = NULL;
	}
}

/*
 * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when
 * even a fallback to DKIOCGMEDIAINFO fails.
 */
#ifdef DEBUG
#define	VDEV_DEBUG(...)	cmn_err(CE_NOTE, __VA_ARGS__)
#else
#define	VDEV_DEBUG(...)	/* Nothing... */
#endif

static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
    uint64_t *ashift)
{
	spa_t *spa = vd->vdev_spa;
	vdev_disk_t *dvd = vd->vdev_tsd;
	ldi_ev_cookie_t ecookie;
	vdev_disk_ldi_cb_t *lcb;
	union {
		struct dk_minfo_ext ude;
		struct dk_minfo ud;
	} dks;
	struct dk_minfo_ext *dkmext = &dks.ude;
	struct dk_minfo *dkm = &dks.ud;
	int error, can_free;
	dev_t dev;
	int otyp;
	boolean_t validate_devid = B_FALSE;
	uint64_t capacity = 0, blksz = 0, pbsize;

	/*
	 * We must have a pathname, and it must be absolute.
	 */
	if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Reopen the device if it's not currently open.  Otherwise,
	 * just update the physical size of the device.
	 */
	if (dvd != NULL) {
		ASSERT(vd->vdev_reopening);
		goto skip_open;
	}

	/*
	 * Create vd->vdev_tsd.
	 */
	vdev_disk_alloc(vd);
	dvd = vd->vdev_tsd;

	/*
	 * Allow bypassing the devid.
	 */
	if (vd->vdev_devid != NULL && vdev_disk_bypass_devid) {
		vdev_dbgmsg(vd, "vdev_disk_open, devid %s bypassed",
		    vd->vdev_devid);
		spa_strfree(vd->vdev_devid);
		vd->vdev_devid = NULL;
	}

	/*
	 * When opening a disk device, we want to preserve the user's original
	 * intent.  We always want to open the device by the path the user
	 * gave us, even if it is one of multiple paths to the same device.
	 * But we also want to be able to survive disks being removed and
	 * recabled.  Therefore the sequence of opening devices is:
	 *
	 * 1. Try opening the device by path.  For legacy pools without the
	 *    'whole_disk' property, attempt to fix the path by appending
	 *    's0'.
	 *
	 * 2. If the devid of the device matches the stored value, return
	 *    success.
	 *
	 * 3. Otherwise, the device may have moved.  Try opening the device
	 *    by the devid instead.
	 */
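	/*
	 * For example (illustrative paths only): a legacy whole-disk vdev
	 * recorded as /dev/dsk/c0t0d0 is first retried as /dev/dsk/c0t0d0s0
	 * in step 1.  If the disk has since moved to a different controller,
	 * step 1 opens the wrong (or no) device, the devid comparison in
	 * step 2 fails, and step 3 finds the disk again by its devid.
	 */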
	if (vd->vdev_devid != NULL) {
		if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid,
		    &dvd->vd_minor) != 0) {
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			vdev_dbgmsg(vd, "vdev_disk_open: invalid "
			    "vdev_devid '%s'", vd->vdev_devid);
			return (SET_ERROR(EINVAL));
		}
	}

	error = EINVAL;		/* presume failure */

	if (vd->vdev_path != NULL) {
		if (vd->vdev_wholedisk == -1ULL) {
			size_t len = strlen(vd->vdev_path) + 3;
			char *buf = kmem_alloc(len, KM_SLEEP);

			(void) snprintf(buf, len, "%ss0", vd->vdev_path);

			error = ldi_open_by_name(buf, spa_mode(spa), kcred,
			    &dvd->vd_lh, zfs_li);
			if (error == 0) {
				spa_strfree(vd->vdev_path);
				vd->vdev_path = buf;
				vd->vdev_wholedisk = 1ULL;
			} else {
				kmem_free(buf, len);
			}
		}

		/*
		 * If we have not yet opened the device, try to open it by
		 * the specified path.
		 */
		if (error != 0) {
			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);
		}

		/*
		 * Compare the devid to the stored value.
		 */
		if (error == 0 && vd->vdev_devid != NULL) {
			ddi_devid_t devid = NULL;

			if (ldi_get_devid(dvd->vd_lh, &devid) != 0) {
				/*
				 * We expected a devid on this device but it
				 * no longer appears to have one.  The
				 * validation step may need to remove it from
				 * the configuration.
				 */
				validate_devid = B_TRUE;

			} else if (ddi_devid_compare(devid,
			    dvd->vd_devid) != 0) {
				/*
				 * A mismatch here is unexpected, log it.
				 */
				char *devid_str = ddi_devid_str_encode(devid,
				    dvd->vd_minor);
				vdev_dbgmsg(vd, "vdev_disk_open: devid "
				    "mismatch: %s != %s", vd->vdev_devid,
				    devid_str);
				cmn_err(CE_NOTE, "vdev_disk_open %s: devid "
				    "mismatch: %s != %s", vd->vdev_path,
				    vd->vdev_devid, devid_str);
				ddi_devid_str_free(devid_str);

				error = SET_ERROR(EINVAL);
				(void) ldi_close(dvd->vd_lh, spa_mode(spa),
				    kcred);
				dvd->vd_lh = NULL;
			}

			if (devid != NULL) {
				ddi_devid_free(devid);
			}
		}

		/*
		 * If we succeeded in opening the device, but 'vdev_wholedisk'
		 * is not yet set, then this must be a slice.
		 */
		if (error == 0 && vd->vdev_wholedisk == -1ULL)
			vd->vdev_wholedisk = 0;
	}

	/*
	 * If we were unable to open by path, or the devid check fails, open
	 * by devid instead.
	 */
	if (error != 0 && vd->vdev_devid != NULL) {
		error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor,
		    spa_mode(spa), kcred, &dvd->vd_lh, zfs_li);
		if (error != 0) {
			vdev_dbgmsg(vd, "Failed to open by devid (%s)",
			    vd->vdev_devid);
		}
	}

	/*
	 * If all else fails, then try opening by physical path (if available)
	 * or the logical path (if we failed due to the devid check).  While
	 * not as reliable as the devid, this will give us something, and the
	 * higher level vdev validation will prevent us from opening the
	 * wrong device.
	 */
	if (error != 0) {
		validate_devid = B_TRUE;

		if (vd->vdev_physpath != NULL &&
		    (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) !=
		    NODEV) {
			error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);
		}

		/*
		 * Note that we don't apply the legacy auto-wholedisk ('s0')
		 * fixup here, as we do above.  It hasn't been used in a very
		 * long time and we don't need to propagate its oddities to
		 * this edge condition.
		 */
		if (error != 0 && vd->vdev_path != NULL) {
			error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
			    kcred, &dvd->vd_lh, zfs_li);
		}
	}

	/*
	 * If this is early in boot, a sweep of available block devices may
	 * locate an alternative path that we can try.
	 */
	if (error != 0) {
		const char *altdevpath = vdev_disk_preroot_lookup(
		    spa_guid(spa), vd->vdev_guid);

		if (altdevpath != NULL) {
			vdev_dbgmsg(vd, "Trying alternate preroot path (%s)",
			    altdevpath);

			validate_devid = B_TRUE;

			if ((error = ldi_open_by_name((char *)altdevpath,
			    spa_mode(spa), kcred, &dvd->vd_lh,
			    zfs_li)) != 0) {
				vdev_dbgmsg(vd, "Failed to open by preroot "
				    "path (%s)", altdevpath);
			}
		}
	}

	if (error != 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		vdev_dbgmsg(vd, "vdev_disk_open: failed to open [error=%d]",
		    error);
		return (error);
	}

	/*
	 * Now that the device has been successfully opened, update the devid
	 * if necessary.
	 */
	if (validate_devid) {
		ddi_devid_t devid = NULL;
		char *minorname = NULL;
		char *vd_devid = NULL;
		boolean_t remove = B_FALSE, update = B_FALSE;

		/*
		 * Get the current devid and minor name for the device we
		 * opened.
		 */
		if (ldi_get_devid(dvd->vd_lh, &devid) != 0 ||
		    ldi_get_minor_name(dvd->vd_lh, &minorname) != 0) {
			/*
			 * If we are unable to get the devid or the minor name
			 * for the device, we need to remove them from the
			 * configuration to prevent potential inconsistencies.
			 */
			if (dvd->vd_minor != NULL || dvd->vd_devid != NULL ||
			    vd->vdev_devid != NULL) {
				/*
				 * We only need to remove the devid if one
				 * exists.
				 */
				remove = B_TRUE;
			}

		} else if (dvd->vd_devid == NULL || dvd->vd_minor == NULL) {
			/*
			 * There was previously no devid at all so we need to
			 * add one.
			 */
			update = B_TRUE;

		} else if (ddi_devid_compare(devid, dvd->vd_devid) != 0 ||
		    strcmp(minorname, dvd->vd_minor) != 0) {
			/*
			 * The devid or minor name on file does not match the
			 * one from the opened device.
			 */
			update = B_TRUE;
		}

		if (update) {
			/*
			 * Render the new devid and minor name as a string for
			 * logging and to store in the vdev configuration.
			 */
			vd_devid = ddi_devid_str_encode(devid, minorname);
		}

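		/*
		 * The encoded form is the same "devid/minor" string stored
		 * in the pool configuration; as a rough illustration it
		 * looks something like "id1,sd@SSEAGATE_..._<serial>/a".
		 * The exact encoding is driver-specific, so treat that
		 * example as an assumption rather than a format guarantee.
		 */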
		if (update || remove) {
			vdev_dbgmsg(vd, "vdev_disk_open: update devid from "
			    "'%s' to '%s'",
			    vd->vdev_devid != NULL ?
			    vd->vdev_devid : "<none>",
			    vd_devid != NULL ? vd_devid : "<none>");
			cmn_err(CE_NOTE, "vdev_disk_open %s: update devid "
			    "from '%s' to '%s'",
			    vd->vdev_path != NULL ? vd->vdev_path : "?",
			    vd->vdev_devid != NULL ?
			    vd->vdev_devid : "<none>",
			    vd_devid != NULL ? vd_devid : "<none>");

			/*
			 * Remove and free any existing values.
			 */
			if (dvd->vd_minor != NULL) {
				ddi_devid_str_free(dvd->vd_minor);
				dvd->vd_minor = NULL;
			}
			if (dvd->vd_devid != NULL) {
				ddi_devid_free(dvd->vd_devid);
				dvd->vd_devid = NULL;
			}
			if (vd->vdev_devid != NULL) {
				spa_strfree(vd->vdev_devid);
				vd->vdev_devid = NULL;
			}
		}

		if (update) {
			/*
			 * Install the new values.
			 */
			vd->vdev_devid = vd_devid;
			dvd->vd_minor = minorname;
			dvd->vd_devid = devid;

		} else {
			if (devid != NULL) {
				ddi_devid_free(devid);
			}
			if (minorname != NULL) {
				kmem_free(minorname, strlen(minorname) + 1);
			}
		}
	}

	/*
	 * Once a device is opened, verify that the physical device path (if
	 * available) is up to date.
	 */
	if (ldi_get_dev(dvd->vd_lh, &dev) == 0 &&
	    ldi_get_otyp(dvd->vd_lh, &otyp) == 0) {
		char *physpath, *minorname;

		physpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
		minorname = NULL;
		if (ddi_dev_pathname(dev, otyp, physpath) == 0 &&
		    ldi_get_minor_name(dvd->vd_lh, &minorname) == 0 &&
		    (vd->vdev_physpath == NULL ||
		    strcmp(vd->vdev_physpath, physpath) != 0)) {
			if (vd->vdev_physpath)
				spa_strfree(vd->vdev_physpath);
			(void) strlcat(physpath, ":", MAXPATHLEN);
			(void) strlcat(physpath, minorname, MAXPATHLEN);
			vd->vdev_physpath = spa_strdup(physpath);
		}
		if (minorname)
			kmem_free(minorname, strlen(minorname) + 1);
		kmem_free(physpath, MAXPATHLEN);
	}

	/*
	 * Register callbacks for the LDI offline event.
	 */
	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) ==
	    LDI_EV_SUCCESS) {
		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
		    &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id);
	}

	/*
	 * Register callbacks for the LDI degrade event.
	 */
	if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) ==
	    LDI_EV_SUCCESS) {
		lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
		list_insert_tail(&dvd->vd_ldi_cbs, lcb);
		(void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
		    &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id);
	}

skip_open:
	/*
	 * Determine the actual size of the device.
	 */
	if (ldi_get_size(dvd->vd_lh, psize) != 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
		vdev_dbgmsg(vd, "vdev_disk_open: failed to get size");
		return (SET_ERROR(EINVAL));
	}

	*max_psize = *psize;

	/*
	 * Determine the device's minimum transfer size.
	 * If the ioctl isn't supported, assume DEV_BSIZE.
	 */
	if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT,
	    (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) {
		capacity = dkmext->dki_capacity - 1;
		blksz = dkmext->dki_lbsize;
		pbsize = dkmext->dki_pbsize;
	} else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO,
	    (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) {
		VDEV_DEBUG(
		    "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n",
		    vd->vdev_path);
		capacity = dkm->dki_capacity - 1;
		blksz = dkm->dki_lbsize;
		pbsize = blksz;
	} else {
		VDEV_DEBUG("vdev_disk_open(\"%s\"): "
		    "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n",
		    vd->vdev_path, error);
		pbsize = DEV_BSIZE;
	}

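	/*
	 * Derive the ashift (log2 of the sector size ZFS will align I/O to)
	 * from the physical block size.  For example, a 512-byte-sector disk
	 * yields an ashift of 9, and a 4Kn disk reporting dki_pbsize == 4096
	 * yields an ashift of 12, since highbit64(4096) - 1 == 12.
	 */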
	*ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1;

	if (vd->vdev_wholedisk == 1) {
		int wce = 1;

		if (error == 0) {
			/*
			 * If we have the capability to expand, we'd have
			 * found out via success from DKIOCGMEDIAINFO{,EXT}.
			 * Adjust max_psize upward accordingly since we know
			 * we own the whole disk now.
			 */
			*max_psize = capacity * blksz;
		}

		/*
		 * Since we own the whole disk, try to enable disk write
		 * caching.  We ignore errors because it's OK if we can't
		 * do it.
		 */
		(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
		    FKIOCTL, kcred, NULL);
	}

	/*
	 * Clear the nowritecache bit, so that on a vdev_reopen() we will
	 * try again.
	 */
	vd->vdev_nowritecache = B_FALSE;

	if (ldi_ioctl(dvd->vd_lh, DKIOC_CANFREE, (intptr_t)&can_free, FKIOCTL,
	    kcred, NULL) == 0 && can_free == 1) {
		vd->vdev_has_trim = B_TRUE;
	} else {
		vd->vdev_has_trim = B_FALSE;
	}

	if (zfs_no_trim == 1)
		vd->vdev_has_trim = B_FALSE;

	/* Secure TRIM is currently only supported on ZoL. */
	vd->vdev_has_securetrim = B_FALSE;

	/* Inform the ZIO pipeline whether or not we are non-rotational. */
	vd->vdev_nonrot = B_FALSE;
	if (ldi_prop_exists(dvd->vd_lh, DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
	    "device-solid-state")) {
		if (ldi_prop_get_int(dvd->vd_lh,
		    LDI_DEV_T_ANY | DDI_PROP_DONTPASS | DDI_PROP_NOTPROM,
		    "device-solid-state", B_FALSE) != 0)
			vd->vdev_nonrot = B_TRUE;
	}

	return (0);
}

static void
vdev_disk_close(vdev_t *vd)
{
	vdev_disk_t *dvd = vd->vdev_tsd;

	if (vd->vdev_reopening || dvd == NULL)
		return;

	if (dvd->vd_minor != NULL) {
		ddi_devid_str_free(dvd->vd_minor);
		dvd->vd_minor = NULL;
	}

	if (dvd->vd_devid != NULL) {
		ddi_devid_free(dvd->vd_devid);
		dvd->vd_devid = NULL;
	}

	if (dvd->vd_lh != NULL) {
		(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
		dvd->vd_lh = NULL;
	}

	vd->vdev_delayed_close = B_FALSE;
	vdev_disk_free(vd);
}

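/*
 * Synchronous helper for raw reads and writes through an LDI handle: build a
 * buf_t for the request, submit it with ldi_strategy(9F), and wait in
 * biowait() for completion.  This bypasses the zio pipeline entirely and is
 * used for label reads (see vdev_disk_read_rootlabel() below) and for dump
 * I/O outside of an active crash-dump context.
 */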
static int
vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
    size_t size, uint64_t offset, int flags)
{
	buf_t *bp;
	int error = 0;

	if (vd_lh == NULL)
		return (SET_ERROR(EINVAL));

	ASSERT(flags & B_READ || flags & B_WRITE);

	bp = getrbuf(KM_SLEEP);
	bp->b_flags = flags | B_BUSY | B_NOCACHE | B_FAILFAST;
	bp->b_bcount = size;
	bp->b_un.b_addr = (void *)data;
	bp->b_lblkno = lbtodb(offset);
	bp->b_bufsize = size;

	error = ldi_strategy(vd_lh, bp);
	ASSERT(error == 0);
	if ((error = biowait(bp)) == 0 && bp->b_resid != 0)
		error = SET_ERROR(EIO);
	freerbuf(bp);

	return (error);
}

static int
vdev_disk_dumpio(vdev_t *vd, caddr_t data, size_t size,
    uint64_t offset, uint64_t origoffset __unused, boolean_t doread,
    boolean_t isdump)
{
	vdev_disk_t *dvd = vd->vdev_tsd;
	int flags = doread ? B_READ : B_WRITE;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED
	 * state.  Nothing to be done here but return failure.
	 */
	if (dvd == NULL || dvd->vd_ldi_offline) {
		return (SET_ERROR(ENXIO));
	}

	ASSERT(vd->vdev_ops == &vdev_disk_ops);

	offset += VDEV_LABEL_START_SIZE;

	/*
	 * If in the context of an active crash dump, use the ldi_dump(9F)
	 * call instead of ldi_strategy(9F) as usual.
	 */
	if (isdump) {
		ASSERT3P(dvd, !=, NULL);
		return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
		    lbtodb(size)));
	}

	return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
}

static int
vdev_disk_io_intr(buf_t *bp)
{
	vdev_buf_t *vb = (vdev_buf_t *)bp;
	zio_t *zio = vb->vb_io;

	/*
	 * The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
	 * Rather than teach the rest of the stack about other error
	 * possibilities (EFAULT, etc), we normalize the error value here.
	 */
	zio->io_error = (geterror(bp) != 0 ? EIO : 0);

	if (zio->io_error == 0 && bp->b_resid != 0)
		zio->io_error = SET_ERROR(EIO);

	if (zio->io_type == ZIO_TYPE_READ) {
		abd_return_buf_copy(zio->io_abd, bp->b_un.b_addr,
		    zio->io_size);
	} else {
		abd_return_buf(zio->io_abd, bp->b_un.b_addr, zio->io_size);
	}

	kmem_free(vb, sizeof (vdev_buf_t));

	zio_delay_interrupt(zio);
	return (0);
}

static void
vdev_disk_ioctl_free(zio_t *zio)
{
	kmem_free(zio->io_vsd, sizeof (struct dk_callback));
}

static const zio_vsd_ops_t vdev_disk_vsd_ops = {
	vdev_disk_ioctl_free,
	zio_vsd_default_cksum_report
};

static void
vdev_disk_ioctl_done(void *zio_arg, int error)
{
	zio_t *zio = zio_arg;

	zio->io_error = error;

	zio_interrupt(zio);
}

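/*
 * I/O entry point called from the ZIO pipeline.  Ioctls (cache flush) and
 * TRIM requests are handled inline below; reads and writes are wrapped in a
 * buf_t and dispatched through ldi_strategy(9F), with vdev_disk_io_intr()
 * completing the zio from biodone context.
 */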
static void
vdev_disk_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_disk_t *dvd = vd->vdev_tsd;
	unsigned long trim_flags = 0;
	vdev_buf_t *vb;
	struct dk_callback *dkc;
	buf_t *bp;
	int error;

	/*
	 * If the vdev is closed, it's likely in the REMOVED or FAULTED
	 * state.  Nothing to be done here but return failure.
	 */
	if (dvd == NULL || dvd->vd_ldi_offline) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
		return;
	}

	switch (zio->io_type) {
	case ZIO_TYPE_IOCTL:
		/* XXPOLICY */
		if (!vdev_readable(vd)) {
			zio->io_error = SET_ERROR(ENXIO);
			zio_interrupt(zio);
			return;
		}

		switch (zio->io_cmd) {

		case DKIOCFLUSHWRITECACHE:

			if (zfs_nocacheflush)
				break;

			if (vd->vdev_nowritecache) {
				zio->io_error = SET_ERROR(ENOTSUP);
				break;
			}

			zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc),
			    KM_SLEEP);
			zio->io_vsd_ops = &vdev_disk_vsd_ops;

			dkc->dkc_callback = vdev_disk_ioctl_done;
			dkc->dkc_flag = FLUSH_VOLATILE;
			dkc->dkc_cookie = zio;

			error = ldi_ioctl(dvd->vd_lh, zio->io_cmd,
			    (uintptr_t)dkc, FKIOCTL, kcred, NULL);

			if (error == 0) {
				/*
				 * The ioctl will be done asynchronously,
				 * and will call vdev_disk_ioctl_done()
				 * upon completion.
				 */
				return;
			}

			zio->io_error = error;

			break;

		default:
			zio->io_error = SET_ERROR(ENOTSUP);
		}

		zio_execute(zio);
		return;

	case ZIO_TYPE_TRIM:
		if (zfs_no_trim == 1 || !vd->vdev_has_trim) {
			zio->io_error = SET_ERROR(ENOTSUP);
			zio_execute(zio);
			return;
		}
		/* Secure TRIM is currently only supported on ZoL. */
		ASSERT0(zio->io_trim_flags & ZIO_TRIM_SECURE);

		/* dkioc_free_list_t is already declared to hold one entry */
		dkioc_free_list_t dfl;
		dfl.dfl_flags = 0;
		dfl.dfl_num_exts = 1;
		dfl.dfl_offset = 0;
		dfl.dfl_exts[0].dfle_start = zio->io_offset;
		dfl.dfl_exts[0].dfle_length = zio->io_size;

		zio->io_error = ldi_ioctl(dvd->vd_lh, DKIOCFREE,
		    (uintptr_t)&dfl, FKIOCTL, kcred, NULL);

		if (zio->io_error == ENOTSUP || zio->io_error == ENOTTY) {
			/*
			 * The device must have changed and now TRIM is
			 * no longer supported.
			 */
			vd->vdev_has_trim = B_FALSE;
		}

		zio_interrupt(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ ||
	    zio->io_type == ZIO_TYPE_WRITE);
	zio->io_target_timestamp = zio_handle_io_delay(zio);

	vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);

	vb->vb_io = zio;
	bp = &vb->vb_buf;

	bioinit(bp);
	bp->b_flags = B_BUSY | B_NOCACHE |
	    (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
	if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))
		bp->b_flags |= B_FAILFAST;
	bp->b_bcount = zio->io_size;

	if (zio->io_type == ZIO_TYPE_READ) {
		bp->b_un.b_addr =
		    abd_borrow_buf(zio->io_abd, zio->io_size);
	} else {
		bp->b_un.b_addr =
		    abd_borrow_buf_copy(zio->io_abd, zio->io_size);
	}

	bp->b_lblkno = lbtodb(zio->io_offset);
	bp->b_bufsize = zio->io_size;
	bp->b_iodone = vdev_disk_io_intr;

	/*
	 * In general we would expect ldi_strategy() to return non-zero only
	 * because of programming errors, but we've also seen this fail
	 * shortly after a disk dies.
	 */
	if (ldi_strategy(dvd->vd_lh, bp) != 0) {
		zio->io_error = ENXIO;
		zio_interrupt(zio);
	}
}

static void
vdev_disk_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	/*
	 * If the device returned EIO, then attempt a DKIOCSTATE ioctl to see
	 * if the device has been removed.  If this is the case, then we
	 * trigger an asynchronous removal of the device.  Otherwise, probe
	 * the device and make sure it's still accessible.
	 */
	if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
		vdev_disk_t *dvd = vd->vdev_tsd;
		int state = DKIO_NONE;

		if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state,
		    FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) {
			/*
			 * We post the resource as soon as possible, instead
			 * of when the async removal actually happens, because
			 * the DE is using this information to discard
			 * previous I/O errors.
			 */
			zfs_post_remove(zio->io_spa, vd);
			vd->vdev_remove_wanted = B_TRUE;
			spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
		} else if (!vd->vdev_delayed_close) {
			vd->vdev_delayed_close = B_TRUE;
		}
	}
}

vdev_ops_t vdev_disk_ops = {
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_io_start = vdev_disk_io_start,
	.vdev_op_io_done = vdev_disk_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_disk_hold,
	.vdev_op_rele = vdev_disk_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_dumpio = vdev_disk_dumpio,
	.vdev_op_type = VDEV_TYPE_DISK,	/* name of this vdev type */
	.vdev_op_leaf = B_TRUE		/* leaf vdev */
};

/*
 * Given the root disk device devid or pathname, read the label from the
 * device, and construct a configuration nvlist.
 */
int
vdev_disk_read_rootlabel(const char *devpath, const char *devid,
    nvlist_t **config)
{
	ldi_handle_t vd_lh;
	vdev_label_t *label;
	uint64_t s, size;
	int l;
	ddi_devid_t tmpdevid;
	int error = -1;
	char *minor_name;

	/*
	 * Read the device label and build the nvlist.
	 */
	if (devid != NULL && ddi_devid_str_decode((char *)devid, &tmpdevid,
	    &minor_name) == 0) {
		error = ldi_open_by_devid(tmpdevid, minor_name,
		    FREAD, kcred, &vd_lh, zfs_li);
		ddi_devid_free(tmpdevid);
		ddi_devid_str_free(minor_name);
	}

	if (error != 0 && (error = ldi_open_by_name((char *)devpath, FREAD,
	    kcred, &vd_lh, zfs_li)) != 0) {
		return (error);
	}

	if (ldi_get_size(vd_lh, &s)) {
		(void) ldi_close(vd_lh, FREAD, kcred);
		return (SET_ERROR(EIO));
	}

	size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t);
	label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP);

	*config = NULL;
	for (l = 0; l < VDEV_LABELS; l++) {
		uint64_t offset, state, txg = 0;

		/* read vdev label */
		offset = vdev_label_offset(size, l, 0);
		if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
		    VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
			continue;

		if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist,
		    sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) {
			*config = NULL;
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0 || state >= POOL_STATE_DESTROYED) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
		    &txg) != 0 || txg == 0) {
			nvlist_free(*config);
			*config = NULL;
			continue;
		}

		break;
	}

	kmem_free(label, sizeof (vdev_label_t));
	(void) ldi_close(vd_lh, FREAD, kcred);
	if (*config == NULL)
		error = SET_ERROR(EIDRM);

	return (error);
}

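/*
 * The "veb" cache below backs the early-boot alternate-path lookup used by
 * vdev_disk_open().  A single walk of all visible block devices reads each
 * ZFS label via vdev_disk_read_rootlabel() and records a
 * (pool GUID, vdev GUID) -> device path mapping for every label found, so
 * that subsequent lookups do not have to re-scan the hardware.
 */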
struct veb {
	list_t		veb_ents;
	boolean_t	veb_scanned;
};

struct veb_ent {
	uint64_t	vebe_pool_guid;
	uint64_t	vebe_vdev_guid;

	char		*vebe_devpath;

	list_node_t	vebe_link;
};

static kmutex_t veb_lock;
static struct veb *veb;

static int
vdev_disk_preroot_scan_walk(const char *devpath, void *arg)
{
	int r;
	nvlist_t *cfg = NULL;
	uint64_t pguid = 0, vguid = 0;

	/*
	 * Attempt to read the label from this block device.
	 */
	if ((r = vdev_disk_read_rootlabel(devpath, NULL, &cfg)) != 0) {
		/*
		 * Many of the available block devices will represent slices
		 * or partitions of disks, or may represent disks that are
		 * not at all initialised with ZFS.  As this is a best effort
		 * mechanism to locate an alternate path to a particular
		 * vdev, we will ignore any failures and keep scanning.
		 */
		return (PREROOT_WALK_BLOCK_DEVICES_NEXT);
	}

	/*
	 * Determine the pool and vdev GUID read from the label for this
	 * device.  Both values must be present and have a non-zero value.
	 */
	if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pguid) != 0 ||
	    nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_GUID, &vguid) != 0 ||
	    pguid == 0 || vguid == 0) {
		/*
		 * This label was not complete.
		 */
		goto out;
	}

	/*
	 * Keep track of all of the GUID-to-devpath mappings we find so that
	 * vdev_disk_preroot_lookup() can search them.
	 */
	struct veb_ent *vebe = kmem_zalloc(sizeof (*vebe), KM_SLEEP);
	vebe->vebe_pool_guid = pguid;
	vebe->vebe_vdev_guid = vguid;
	vebe->vebe_devpath = spa_strdup(devpath);

	list_insert_tail(&veb->veb_ents, vebe);

out:
	nvlist_free(cfg);
	return (PREROOT_WALK_BLOCK_DEVICES_NEXT);
}

const char *
vdev_disk_preroot_lookup(uint64_t pool_guid, uint64_t vdev_guid)
{
	if (pool_guid == 0 || vdev_guid == 0) {
		/*
		 * If we aren't provided both a pool and a vdev GUID, we
		 * cannot perform a lookup.
		 */
		return (NULL);
	}

	mutex_enter(&veb_lock);
	if (veb == NULL) {
		/*
		 * If vdev_disk_preroot_fini() has been called already, there
		 * is nothing we can do.
		 */
		mutex_exit(&veb_lock);
		return (NULL);
	}

	/*
	 * We want to perform at most one scan of all block devices per boot.
	 */
	if (!veb->veb_scanned) {
		cmn_err(CE_NOTE, "Performing full ZFS device scan!");

		preroot_walk_block_devices(vdev_disk_preroot_scan_walk, NULL);

		veb->veb_scanned = B_TRUE;
	}

	const char *path = NULL;
	for (struct veb_ent *vebe = list_head(&veb->veb_ents); vebe != NULL;
	    vebe = list_next(&veb->veb_ents, vebe)) {
		if (vebe->vebe_pool_guid == pool_guid &&
		    vebe->vebe_vdev_guid == vdev_guid) {
			path = vebe->vebe_devpath;
			break;
		}
	}

	mutex_exit(&veb_lock);

	return (path);
}

void
vdev_disk_preroot_init(void)
{
	mutex_init(&veb_lock, NULL, MUTEX_DEFAULT, NULL);

	VERIFY3P(veb, ==, NULL);
	veb = kmem_zalloc(sizeof (*veb), KM_SLEEP);
	list_create(&veb->veb_ents, sizeof (struct veb_ent),
	    offsetof(struct veb_ent, vebe_link));
	veb->veb_scanned = B_FALSE;
}

void
vdev_disk_preroot_fini(void)
{
	mutex_enter(&veb_lock);

	if (veb != NULL) {
		while (!list_is_empty(&veb->veb_ents)) {
			struct veb_ent *vebe =
			    list_remove_head(&veb->veb_ents);

			spa_strfree(vebe->vebe_devpath);

			kmem_free(vebe, sizeof (*vebe));
		}

		kmem_free(veb, sizeof (*veb));
		veb = NULL;
	}

	mutex_exit(&veb_lock);
}