1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #include <sys/zfs_context.h> 28 #include <sys/fm/fs/zfs.h> 29 #include <sys/spa.h> 30 #include <sys/spa_impl.h> 31 #include <sys/dmu.h> 32 #include <sys/dmu_tx.h> 33 #include <sys/vdev_impl.h> 34 #include <sys/uberblock_impl.h> 35 #include <sys/metaslab.h> 36 #include <sys/metaslab_impl.h> 37 #include <sys/space_map.h> 38 #include <sys/zio.h> 39 #include <sys/zap.h> 40 #include <sys/fs/zfs.h> 41 #include <sys/arc.h> 42 43 /* 44 * Virtual device management. 45 */ 46 47 static vdev_ops_t *vdev_ops_table[] = { 48 &vdev_root_ops, 49 &vdev_raidz_ops, 50 &vdev_mirror_ops, 51 &vdev_replacing_ops, 52 &vdev_spare_ops, 53 &vdev_disk_ops, 54 &vdev_file_ops, 55 &vdev_missing_ops, 56 NULL 57 }; 58 59 /* maximum scrub/resilver I/O queue per leaf vdev */ 60 int zfs_scrub_limit = 10; 61 62 /* 63 * Given a vdev type, return the appropriate ops vector. 64 */ 65 static vdev_ops_t * 66 vdev_getops(const char *type) 67 { 68 vdev_ops_t *ops, **opspp; 69 70 for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) 71 if (strcmp(ops->vdev_op_type, type) == 0) 72 break; 73 74 return (ops); 75 } 76 77 /* 78 * Default asize function: return the MAX of psize with the asize of 79 * all children. This is what's used by anything other than RAID-Z. 80 */ 81 uint64_t 82 vdev_default_asize(vdev_t *vd, uint64_t psize) 83 { 84 uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); 85 uint64_t csize; 86 uint64_t c; 87 88 for (c = 0; c < vd->vdev_children; c++) { 89 csize = vdev_psize_to_asize(vd->vdev_child[c], psize); 90 asize = MAX(asize, csize); 91 } 92 93 return (asize); 94 } 95 96 /* 97 * Get the replaceable or attachable device size. 98 * If the parent is a mirror or raidz, the replaceable size is the minimum 99 * psize of all its children. For the rest, just return our own psize. 100 * 101 * e.g. 102 * psize rsize 103 * root - - 104 * mirror/raidz - - 105 * disk1 20g 20g 106 * disk2 40g 20g 107 * disk3 80g 80g 108 */ 109 uint64_t 110 vdev_get_rsize(vdev_t *vd) 111 { 112 vdev_t *pvd, *cvd; 113 uint64_t c, rsize; 114 115 pvd = vd->vdev_parent; 116 117 /* 118 * If our parent is NULL or the root, just return our own psize. 
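	 *
	 * Note that the loop below initializes rsize to 0 and relies on
	 * unsigned wraparound: on the first pass MIN(rsize - 1, ...)
	 * compares against UINT64_MAX, so the first child's psize always
	 * wins; after that we keep the minimum child psize.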
119 */ 120 if (pvd == NULL || pvd->vdev_parent == NULL) 121 return (vd->vdev_psize); 122 123 rsize = 0; 124 125 for (c = 0; c < pvd->vdev_children; c++) { 126 cvd = pvd->vdev_child[c]; 127 rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1; 128 } 129 130 return (rsize); 131 } 132 133 vdev_t * 134 vdev_lookup_top(spa_t *spa, uint64_t vdev) 135 { 136 vdev_t *rvd = spa->spa_root_vdev; 137 138 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); 139 140 if (vdev < rvd->vdev_children) { 141 ASSERT(rvd->vdev_child[vdev] != NULL); 142 return (rvd->vdev_child[vdev]); 143 } 144 145 return (NULL); 146 } 147 148 vdev_t * 149 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) 150 { 151 int c; 152 vdev_t *mvd; 153 154 if (vd->vdev_guid == guid) 155 return (vd); 156 157 for (c = 0; c < vd->vdev_children; c++) 158 if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != 159 NULL) 160 return (mvd); 161 162 return (NULL); 163 } 164 165 void 166 vdev_add_child(vdev_t *pvd, vdev_t *cvd) 167 { 168 size_t oldsize, newsize; 169 uint64_t id = cvd->vdev_id; 170 vdev_t **newchild; 171 172 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 173 ASSERT(cvd->vdev_parent == NULL); 174 175 cvd->vdev_parent = pvd; 176 177 if (pvd == NULL) 178 return; 179 180 ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); 181 182 oldsize = pvd->vdev_children * sizeof (vdev_t *); 183 pvd->vdev_children = MAX(pvd->vdev_children, id + 1); 184 newsize = pvd->vdev_children * sizeof (vdev_t *); 185 186 newchild = kmem_zalloc(newsize, KM_SLEEP); 187 if (pvd->vdev_child != NULL) { 188 bcopy(pvd->vdev_child, newchild, oldsize); 189 kmem_free(pvd->vdev_child, oldsize); 190 } 191 192 pvd->vdev_child = newchild; 193 pvd->vdev_child[id] = cvd; 194 195 cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); 196 ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); 197 198 /* 199 * Walk up all ancestors to update guid sum. 200 */ 201 for (; pvd != NULL; pvd = pvd->vdev_parent) 202 pvd->vdev_guid_sum += cvd->vdev_guid_sum; 203 204 if (cvd->vdev_ops->vdev_op_leaf) 205 cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit; 206 } 207 208 void 209 vdev_remove_child(vdev_t *pvd, vdev_t *cvd) 210 { 211 int c; 212 uint_t id = cvd->vdev_id; 213 214 ASSERT(cvd->vdev_parent == pvd); 215 216 if (pvd == NULL) 217 return; 218 219 ASSERT(id < pvd->vdev_children); 220 ASSERT(pvd->vdev_child[id] == cvd); 221 222 pvd->vdev_child[id] = NULL; 223 cvd->vdev_parent = NULL; 224 225 for (c = 0; c < pvd->vdev_children; c++) 226 if (pvd->vdev_child[c]) 227 break; 228 229 if (c == pvd->vdev_children) { 230 kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); 231 pvd->vdev_child = NULL; 232 pvd->vdev_children = 0; 233 } 234 235 /* 236 * Walk up all ancestors to update guid sum. 237 */ 238 for (; pvd != NULL; pvd = pvd->vdev_parent) 239 pvd->vdev_guid_sum -= cvd->vdev_guid_sum; 240 241 if (cvd->vdev_ops->vdev_op_leaf) 242 cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit; 243 } 244 245 /* 246 * Remove any holes in the child array. 
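 *
 * For example, if vdev_remove_child() left the array as { A, NULL, C },
 * the result is { A, C } with the surviving children renumbered to
 * vdev_id 0 and 1 and vdev_children reduced to 2.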
247 */ 248 void 249 vdev_compact_children(vdev_t *pvd) 250 { 251 vdev_t **newchild, *cvd; 252 int oldc = pvd->vdev_children; 253 int newc, c; 254 255 ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 256 257 for (c = newc = 0; c < oldc; c++) 258 if (pvd->vdev_child[c]) 259 newc++; 260 261 newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); 262 263 for (c = newc = 0; c < oldc; c++) { 264 if ((cvd = pvd->vdev_child[c]) != NULL) { 265 newchild[newc] = cvd; 266 cvd->vdev_id = newc++; 267 } 268 } 269 270 kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); 271 pvd->vdev_child = newchild; 272 pvd->vdev_children = newc; 273 } 274 275 /* 276 * Allocate and minimally initialize a vdev_t. 277 */ 278 static vdev_t * 279 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) 280 { 281 vdev_t *vd; 282 283 vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 284 285 if (spa->spa_root_vdev == NULL) { 286 ASSERT(ops == &vdev_root_ops); 287 spa->spa_root_vdev = vd; 288 } 289 290 if (guid == 0) { 291 if (spa->spa_root_vdev == vd) { 292 /* 293 * The root vdev's guid will also be the pool guid, 294 * which must be unique among all pools. 295 */ 296 while (guid == 0 || spa_guid_exists(guid, 0)) 297 guid = spa_get_random(-1ULL); 298 } else { 299 /* 300 * Any other vdev's guid must be unique within the pool. 301 */ 302 while (guid == 0 || 303 spa_guid_exists(spa_guid(spa), guid)) 304 guid = spa_get_random(-1ULL); 305 } 306 ASSERT(!spa_guid_exists(spa_guid(spa), guid)); 307 } 308 309 vd->vdev_spa = spa; 310 vd->vdev_id = id; 311 vd->vdev_guid = guid; 312 vd->vdev_guid_sum = guid; 313 vd->vdev_ops = ops; 314 vd->vdev_state = VDEV_STATE_CLOSED; 315 316 mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); 317 mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); 318 mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); 319 for (int t = 0; t < DTL_TYPES; t++) { 320 space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0, 321 &vd->vdev_dtl_lock); 322 } 323 txg_list_create(&vd->vdev_ms_list, 324 offsetof(struct metaslab, ms_txg_node)); 325 txg_list_create(&vd->vdev_dtl_list, 326 offsetof(struct vdev, vdev_dtl_node)); 327 vd->vdev_stat.vs_timestamp = gethrtime(); 328 vdev_queue_init(vd); 329 vdev_cache_init(vd); 330 331 return (vd); 332 } 333 334 /* 335 * Allocate a new vdev. The 'alloctype' is used to control whether we are 336 * creating a new vdev or loading an existing one - the behavior is slightly 337 * different for each case. 338 */ 339 int 340 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, 341 int alloctype) 342 { 343 vdev_ops_t *ops; 344 char *type; 345 uint64_t guid = 0, islog, nparity; 346 vdev_t *vd; 347 348 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 349 350 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) 351 return (EINVAL); 352 353 if ((ops = vdev_getops(type)) == NULL) 354 return (EINVAL); 355 356 /* 357 * If this is a load, get the vdev guid from the nvlist. 358 * Otherwise, vdev_alloc_common() will generate one for us. 
359 */ 360 if (alloctype == VDEV_ALLOC_LOAD) { 361 uint64_t label_id; 362 363 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || 364 label_id != id) 365 return (EINVAL); 366 367 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 368 return (EINVAL); 369 } else if (alloctype == VDEV_ALLOC_SPARE) { 370 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 371 return (EINVAL); 372 } else if (alloctype == VDEV_ALLOC_L2CACHE) { 373 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 374 return (EINVAL); 375 } 376 377 /* 378 * The first allocated vdev must be of type 'root'. 379 */ 380 if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) 381 return (EINVAL); 382 383 /* 384 * Determine whether we're a log vdev. 385 */ 386 islog = 0; 387 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); 388 if (islog && spa_version(spa) < SPA_VERSION_SLOGS) 389 return (ENOTSUP); 390 391 /* 392 * Set the nparity property for RAID-Z vdevs. 393 */ 394 nparity = -1ULL; 395 if (ops == &vdev_raidz_ops) { 396 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, 397 &nparity) == 0) { 398 /* 399 * Currently, we can only support 2 parity devices. 400 */ 401 if (nparity == 0 || nparity > 2) 402 return (EINVAL); 403 /* 404 * Older versions can only support 1 parity device. 405 */ 406 if (nparity == 2 && 407 spa_version(spa) < SPA_VERSION_RAID6) 408 return (ENOTSUP); 409 } else { 410 /* 411 * We require the parity to be specified for SPAs that 412 * support multiple parity levels. 413 */ 414 if (spa_version(spa) >= SPA_VERSION_RAID6) 415 return (EINVAL); 416 /* 417 * Otherwise, we default to 1 parity device for RAID-Z. 418 */ 419 nparity = 1; 420 } 421 } else { 422 nparity = 0; 423 } 424 ASSERT(nparity != -1ULL); 425 426 vd = vdev_alloc_common(spa, id, guid, ops); 427 428 vd->vdev_islog = islog; 429 vd->vdev_nparity = nparity; 430 431 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) 432 vd->vdev_path = spa_strdup(vd->vdev_path); 433 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) 434 vd->vdev_devid = spa_strdup(vd->vdev_devid); 435 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, 436 &vd->vdev_physpath) == 0) 437 vd->vdev_physpath = spa_strdup(vd->vdev_physpath); 438 439 /* 440 * Set the whole_disk property. If it's not specified, leave the value 441 * as -1. 442 */ 443 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 444 &vd->vdev_wholedisk) != 0) 445 vd->vdev_wholedisk = -1ULL; 446 447 /* 448 * Look for the 'not present' flag. This will only be set if the device 449 * was not present at the time of import. 450 */ 451 if (!spa->spa_import_faulted) 452 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 453 &vd->vdev_not_present); 454 455 /* 456 * Get the alignment requirement. 457 */ 458 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); 459 460 /* 461 * If we're a top-level vdev, try to load the allocation parameters. 462 */ 463 if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { 464 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, 465 &vd->vdev_ms_array); 466 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, 467 &vd->vdev_ms_shift); 468 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, 469 &vd->vdev_asize); 470 } 471 472 /* 473 * If we're a leaf vdev, try to load the DTL object and other state. 
474 */ 475 if (vd->vdev_ops->vdev_op_leaf && 476 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) { 477 if (alloctype == VDEV_ALLOC_LOAD) { 478 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, 479 &vd->vdev_dtl_smo.smo_object); 480 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, 481 &vd->vdev_unspare); 482 } 483 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, 484 &vd->vdev_offline); 485 486 /* 487 * When importing a pool, we want to ignore the persistent fault 488 * state, as the diagnosis made on another system may not be 489 * valid in the current context. 490 */ 491 if (spa->spa_load_state == SPA_LOAD_OPEN) { 492 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, 493 &vd->vdev_faulted); 494 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, 495 &vd->vdev_degraded); 496 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, 497 &vd->vdev_removed); 498 } 499 } 500 501 /* 502 * Add ourselves to the parent's list of children. 503 */ 504 vdev_add_child(parent, vd); 505 506 *vdp = vd; 507 508 return (0); 509 } 510 511 void 512 vdev_free(vdev_t *vd) 513 { 514 int c; 515 spa_t *spa = vd->vdev_spa; 516 517 /* 518 * vdev_free() implies closing the vdev first. This is simpler than 519 * trying to ensure complicated semantics for all callers. 520 */ 521 vdev_close(vd); 522 523 ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); 524 525 /* 526 * Free all children. 527 */ 528 for (c = 0; c < vd->vdev_children; c++) 529 vdev_free(vd->vdev_child[c]); 530 531 ASSERT(vd->vdev_child == NULL); 532 ASSERT(vd->vdev_guid_sum == vd->vdev_guid); 533 534 /* 535 * Discard allocation state. 536 */ 537 if (vd == vd->vdev_top) 538 vdev_metaslab_fini(vd); 539 540 ASSERT3U(vd->vdev_stat.vs_space, ==, 0); 541 ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0); 542 ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); 543 544 /* 545 * Remove this vdev from its parent's child list. 546 */ 547 vdev_remove_child(vd->vdev_parent, vd); 548 549 ASSERT(vd->vdev_parent == NULL); 550 551 /* 552 * Clean up vdev structure. 553 */ 554 vdev_queue_fini(vd); 555 vdev_cache_fini(vd); 556 557 if (vd->vdev_path) 558 spa_strfree(vd->vdev_path); 559 if (vd->vdev_devid) 560 spa_strfree(vd->vdev_devid); 561 if (vd->vdev_physpath) 562 spa_strfree(vd->vdev_physpath); 563 564 if (vd->vdev_isspare) 565 spa_spare_remove(vd); 566 if (vd->vdev_isl2cache) 567 spa_l2cache_remove(vd); 568 569 txg_list_destroy(&vd->vdev_ms_list); 570 txg_list_destroy(&vd->vdev_dtl_list); 571 572 mutex_enter(&vd->vdev_dtl_lock); 573 for (int t = 0; t < DTL_TYPES; t++) { 574 space_map_unload(&vd->vdev_dtl[t]); 575 space_map_destroy(&vd->vdev_dtl[t]); 576 } 577 mutex_exit(&vd->vdev_dtl_lock); 578 579 mutex_destroy(&vd->vdev_dtl_lock); 580 mutex_destroy(&vd->vdev_stat_lock); 581 mutex_destroy(&vd->vdev_probe_lock); 582 583 if (vd == spa->spa_root_vdev) 584 spa->spa_root_vdev = NULL; 585 586 kmem_free(vd, sizeof (vdev_t)); 587 } 588 589 /* 590 * Transfer top-level vdev state from svd to tvd. 
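 *
 * This is used by vdev_add_parent() and vdev_remove_parent() below: when a
 * mirror/replacing vdev is interposed above, or removed from above, a
 * top-level vdev, the metaslab state, space accounting and dirty-list
 * membership must follow whichever vdev is now the top-level vdev.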
591 */ 592 static void 593 vdev_top_transfer(vdev_t *svd, vdev_t *tvd) 594 { 595 spa_t *spa = svd->vdev_spa; 596 metaslab_t *msp; 597 vdev_t *vd; 598 int t; 599 600 ASSERT(tvd == tvd->vdev_top); 601 602 tvd->vdev_ms_array = svd->vdev_ms_array; 603 tvd->vdev_ms_shift = svd->vdev_ms_shift; 604 tvd->vdev_ms_count = svd->vdev_ms_count; 605 606 svd->vdev_ms_array = 0; 607 svd->vdev_ms_shift = 0; 608 svd->vdev_ms_count = 0; 609 610 tvd->vdev_mg = svd->vdev_mg; 611 tvd->vdev_ms = svd->vdev_ms; 612 613 svd->vdev_mg = NULL; 614 svd->vdev_ms = NULL; 615 616 if (tvd->vdev_mg != NULL) 617 tvd->vdev_mg->mg_vd = tvd; 618 619 tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; 620 tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; 621 tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; 622 623 svd->vdev_stat.vs_alloc = 0; 624 svd->vdev_stat.vs_space = 0; 625 svd->vdev_stat.vs_dspace = 0; 626 627 for (t = 0; t < TXG_SIZE; t++) { 628 while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) 629 (void) txg_list_add(&tvd->vdev_ms_list, msp, t); 630 while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) 631 (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); 632 if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) 633 (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); 634 } 635 636 if (list_link_active(&svd->vdev_config_dirty_node)) { 637 vdev_config_clean(svd); 638 vdev_config_dirty(tvd); 639 } 640 641 if (list_link_active(&svd->vdev_state_dirty_node)) { 642 vdev_state_clean(svd); 643 vdev_state_dirty(tvd); 644 } 645 646 tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; 647 svd->vdev_deflate_ratio = 0; 648 649 tvd->vdev_islog = svd->vdev_islog; 650 svd->vdev_islog = 0; 651 } 652 653 static void 654 vdev_top_update(vdev_t *tvd, vdev_t *vd) 655 { 656 int c; 657 658 if (vd == NULL) 659 return; 660 661 vd->vdev_top = tvd; 662 663 for (c = 0; c < vd->vdev_children; c++) 664 vdev_top_update(tvd, vd->vdev_child[c]); 665 } 666 667 /* 668 * Add a mirror/replacing vdev above an existing vdev. 669 */ 670 vdev_t * 671 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) 672 { 673 spa_t *spa = cvd->vdev_spa; 674 vdev_t *pvd = cvd->vdev_parent; 675 vdev_t *mvd; 676 677 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 678 679 mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); 680 681 mvd->vdev_asize = cvd->vdev_asize; 682 mvd->vdev_ashift = cvd->vdev_ashift; 683 mvd->vdev_state = cvd->vdev_state; 684 685 vdev_remove_child(pvd, cvd); 686 vdev_add_child(pvd, mvd); 687 cvd->vdev_id = mvd->vdev_children; 688 vdev_add_child(mvd, cvd); 689 vdev_top_update(cvd->vdev_top, cvd->vdev_top); 690 691 if (mvd == mvd->vdev_top) 692 vdev_top_transfer(cvd, mvd); 693 694 return (mvd); 695 } 696 697 /* 698 * Remove a 1-way mirror/replacing vdev from the tree. 699 */ 700 void 701 vdev_remove_parent(vdev_t *cvd) 702 { 703 vdev_t *mvd = cvd->vdev_parent; 704 vdev_t *pvd = mvd->vdev_parent; 705 706 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 707 708 ASSERT(mvd->vdev_children == 1); 709 ASSERT(mvd->vdev_ops == &vdev_mirror_ops || 710 mvd->vdev_ops == &vdev_replacing_ops || 711 mvd->vdev_ops == &vdev_spare_ops); 712 cvd->vdev_ashift = mvd->vdev_ashift; 713 714 vdev_remove_child(mvd, cvd); 715 vdev_remove_child(pvd, mvd); 716 717 /* 718 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. 
719 * Otherwise, we could have detached an offline device, and when we 720 * go to import the pool we'll think we have two top-level vdevs, 721 * instead of a different version of the same top-level vdev. 722 */ 723 if (mvd->vdev_top == mvd) { 724 uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; 725 cvd->vdev_guid += guid_delta; 726 cvd->vdev_guid_sum += guid_delta; 727 } 728 cvd->vdev_id = mvd->vdev_id; 729 vdev_add_child(pvd, cvd); 730 vdev_top_update(cvd->vdev_top, cvd->vdev_top); 731 732 if (cvd == cvd->vdev_top) 733 vdev_top_transfer(mvd, cvd); 734 735 ASSERT(mvd->vdev_children == 0); 736 vdev_free(mvd); 737 } 738 739 int 740 vdev_metaslab_init(vdev_t *vd, uint64_t txg) 741 { 742 spa_t *spa = vd->vdev_spa; 743 objset_t *mos = spa->spa_meta_objset; 744 metaslab_class_t *mc; 745 uint64_t m; 746 uint64_t oldc = vd->vdev_ms_count; 747 uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; 748 metaslab_t **mspp; 749 int error; 750 751 if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ 752 return (0); 753 754 ASSERT(oldc <= newc); 755 756 if (vd->vdev_islog) 757 mc = spa->spa_log_class; 758 else 759 mc = spa->spa_normal_class; 760 761 if (vd->vdev_mg == NULL) 762 vd->vdev_mg = metaslab_group_create(mc, vd); 763 764 mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); 765 766 if (oldc != 0) { 767 bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); 768 kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); 769 } 770 771 vd->vdev_ms = mspp; 772 vd->vdev_ms_count = newc; 773 774 for (m = oldc; m < newc; m++) { 775 space_map_obj_t smo = { 0, 0, 0 }; 776 if (txg == 0) { 777 uint64_t object = 0; 778 error = dmu_read(mos, vd->vdev_ms_array, 779 m * sizeof (uint64_t), sizeof (uint64_t), &object); 780 if (error) 781 return (error); 782 if (object != 0) { 783 dmu_buf_t *db; 784 error = dmu_bonus_hold(mos, object, FTAG, &db); 785 if (error) 786 return (error); 787 ASSERT3U(db->db_size, >=, sizeof (smo)); 788 bcopy(db->db_data, &smo, sizeof (smo)); 789 ASSERT3U(smo.smo_object, ==, object); 790 dmu_buf_rele(db, FTAG); 791 } 792 } 793 vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo, 794 m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); 795 } 796 797 return (0); 798 } 799 800 void 801 vdev_metaslab_fini(vdev_t *vd) 802 { 803 uint64_t m; 804 uint64_t count = vd->vdev_ms_count; 805 806 if (vd->vdev_ms != NULL) { 807 for (m = 0; m < count; m++) 808 if (vd->vdev_ms[m] != NULL) 809 metaslab_fini(vd->vdev_ms[m]); 810 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); 811 vd->vdev_ms = NULL; 812 } 813 } 814 815 typedef struct vdev_probe_stats { 816 boolean_t vps_readable; 817 boolean_t vps_writeable; 818 int vps_flags; 819 zio_t *vps_root; 820 vdev_t *vps_vd; 821 } vdev_probe_stats_t; 822 823 static void 824 vdev_probe_done(zio_t *zio) 825 { 826 spa_t *spa = zio->io_spa; 827 vdev_probe_stats_t *vps = zio->io_private; 828 vdev_t *vd = vps->vps_vd; 829 830 if (zio->io_type == ZIO_TYPE_READ) { 831 ASSERT(zio->io_vd == vd); 832 if (zio->io_error == 0) 833 vps->vps_readable = 1; 834 if (zio->io_error == 0 && spa_writeable(spa)) { 835 zio_nowait(zio_write_phys(vps->vps_root, vd, 836 zio->io_offset, zio->io_size, zio->io_data, 837 ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 838 ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); 839 } else { 840 zio_buf_free(zio->io_data, zio->io_size); 841 } 842 } else if (zio->io_type == ZIO_TYPE_WRITE) { 843 ASSERT(zio->io_vd == vd); 844 if (zio->io_error == 0) 845 vps->vps_writeable = 1; 846 zio_buf_free(zio->io_data, zio->io_size); 847 } else if (zio->io_type == 
ZIO_TYPE_NULL) { 848 ASSERT(zio->io_vd == NULL); 849 ASSERT(zio == vps->vps_root); 850 851 vd->vdev_cant_read |= !vps->vps_readable; 852 vd->vdev_cant_write |= !vps->vps_writeable; 853 854 if (vdev_readable(vd) && 855 (vdev_writeable(vd) || !spa_writeable(spa))) { 856 zio->io_error = 0; 857 } else { 858 ASSERT(zio->io_error != 0); 859 zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, 860 spa, vd, NULL, 0, 0); 861 zio->io_error = ENXIO; 862 } 863 kmem_free(vps, sizeof (*vps)); 864 } 865 } 866 867 /* 868 * Determine whether this device is accessible by reading and writing 869 * to several known locations: the pad regions of each vdev label 870 * but the first (which we leave alone in case it contains a VTOC). 871 */ 872 zio_t * 873 vdev_probe(vdev_t *vd, zio_t *pio) 874 { 875 spa_t *spa = vd->vdev_spa; 876 vdev_probe_stats_t *vps; 877 zio_t *zio; 878 879 vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); 880 881 vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | 882 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_DONT_RETRY; 883 884 if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { 885 /* 886 * vdev_cant_read and vdev_cant_write can only transition 887 * from TRUE to FALSE when we have the SCL_ZIO lock as writer; 888 * otherwise they can only transition from FALSE to TRUE. 889 * This ensures that any zio looking at these values can 890 * assume that failures persist for the life of the I/O. 891 * That's important because when a device has intermittent 892 * connectivity problems, we want to ensure that they're 893 * ascribed to the device (ENXIO) and not the zio (EIO). 894 * 895 * Since we hold SCL_ZIO as writer here, clear both values 896 * so the probe can reevaluate from first principles. 897 */ 898 vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; 899 vd->vdev_cant_read = B_FALSE; 900 vd->vdev_cant_write = B_FALSE; 901 } 902 903 ASSERT(vd->vdev_ops->vdev_op_leaf); 904 905 zio = zio_null(pio, spa, vdev_probe_done, vps, vps->vps_flags); 906 907 vps->vps_root = zio; 908 vps->vps_vd = vd; 909 910 for (int l = 1; l < VDEV_LABELS; l++) { 911 zio_nowait(zio_read_phys(zio, vd, 912 vdev_label_offset(vd->vdev_psize, l, 913 offsetof(vdev_label_t, vl_pad)), 914 VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE), 915 ZIO_CHECKSUM_OFF, vdev_probe_done, vps, 916 ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); 917 } 918 919 return (zio); 920 } 921 922 /* 923 * Prepare a virtual device for access. 
924 */ 925 int 926 vdev_open(vdev_t *vd) 927 { 928 spa_t *spa = vd->vdev_spa; 929 int error; 930 int c; 931 uint64_t osize = 0; 932 uint64_t asize, psize; 933 uint64_t ashift = 0; 934 935 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 936 937 ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || 938 vd->vdev_state == VDEV_STATE_CANT_OPEN || 939 vd->vdev_state == VDEV_STATE_OFFLINE); 940 941 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 942 943 if (!vd->vdev_removed && vd->vdev_faulted) { 944 ASSERT(vd->vdev_children == 0); 945 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 946 VDEV_AUX_ERR_EXCEEDED); 947 return (ENXIO); 948 } else if (vd->vdev_offline) { 949 ASSERT(vd->vdev_children == 0); 950 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); 951 return (ENXIO); 952 } 953 954 error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); 955 956 if (zio_injection_enabled && error == 0) 957 error = zio_handle_device_injection(vd, ENXIO); 958 959 if (error) { 960 if (vd->vdev_removed && 961 vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) 962 vd->vdev_removed = B_FALSE; 963 964 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 965 vd->vdev_stat.vs_aux); 966 return (error); 967 } 968 969 vd->vdev_removed = B_FALSE; 970 971 if (vd->vdev_degraded) { 972 ASSERT(vd->vdev_children == 0); 973 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 974 VDEV_AUX_ERR_EXCEEDED); 975 } else { 976 vd->vdev_state = VDEV_STATE_HEALTHY; 977 } 978 979 for (c = 0; c < vd->vdev_children; c++) 980 if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { 981 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 982 VDEV_AUX_NONE); 983 break; 984 } 985 986 osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); 987 988 if (vd->vdev_children == 0) { 989 if (osize < SPA_MINDEVSIZE) { 990 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 991 VDEV_AUX_TOO_SMALL); 992 return (EOVERFLOW); 993 } 994 psize = osize; 995 asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); 996 } else { 997 if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - 998 (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { 999 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1000 VDEV_AUX_TOO_SMALL); 1001 return (EOVERFLOW); 1002 } 1003 psize = 0; 1004 asize = osize; 1005 } 1006 1007 vd->vdev_psize = psize; 1008 1009 if (vd->vdev_asize == 0) { 1010 /* 1011 * This is the first-ever open, so use the computed values. 1012 * For testing purposes, a higher ashift can be requested. 1013 */ 1014 vd->vdev_asize = asize; 1015 vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); 1016 } else { 1017 /* 1018 * Make sure the alignment requirement hasn't increased. 1019 */ 1020 if (ashift > vd->vdev_top->vdev_ashift) { 1021 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1022 VDEV_AUX_BAD_LABEL); 1023 return (EINVAL); 1024 } 1025 1026 /* 1027 * Make sure the device hasn't shrunk. 1028 */ 1029 if (asize < vd->vdev_asize) { 1030 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1031 VDEV_AUX_BAD_LABEL); 1032 return (EINVAL); 1033 } 1034 1035 /* 1036 * If all children are healthy and the asize has increased, 1037 * then we've experienced dynamic LUN growth. 1038 */ 1039 if (vd->vdev_state == VDEV_STATE_HEALTHY && 1040 asize > vd->vdev_asize) { 1041 vd->vdev_asize = asize; 1042 } 1043 } 1044 1045 /* 1046 * Ensure we can issue some IO before declaring the 1047 * vdev open for business. 
1048 */ 1049 if (vd->vdev_ops->vdev_op_leaf && 1050 (error = zio_wait(vdev_probe(vd, NULL))) != 0) { 1051 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1052 VDEV_AUX_IO_FAILURE); 1053 return (error); 1054 } 1055 1056 /* 1057 * If this is a top-level vdev, compute the raidz-deflation 1058 * ratio. Note, we hard-code in 128k (1<<17) because it is the 1059 * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE 1060 * changes, this algorithm must never change, or we will 1061 * inconsistently account for existing bp's. 1062 */ 1063 if (vd->vdev_top == vd) { 1064 vd->vdev_deflate_ratio = (1<<17) / 1065 (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT); 1066 } 1067 1068 /* 1069 * If a leaf vdev has a DTL, and seems healthy, then kick off a 1070 * resilver. But don't do this if we are doing a reopen for a scrub, 1071 * since this would just restart the scrub we are already doing. 1072 */ 1073 if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && 1074 vdev_resilver_needed(vd, NULL, NULL)) 1075 spa_async_request(spa, SPA_ASYNC_RESILVER); 1076 1077 return (0); 1078 } 1079 1080 /* 1081 * Called once the vdevs are all opened, this routine validates the label 1082 * contents. This needs to be done before vdev_load() so that we don't 1083 * inadvertently do repair I/Os to the wrong device. 1084 * 1085 * This function will only return failure if one of the vdevs indicates that it 1086 * has since been destroyed or exported. This is only possible if 1087 * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state 1088 * will be updated but the function will return 0. 1089 */ 1090 int 1091 vdev_validate(vdev_t *vd) 1092 { 1093 spa_t *spa = vd->vdev_spa; 1094 int c; 1095 nvlist_t *label; 1096 uint64_t guid, top_guid; 1097 uint64_t state; 1098 1099 for (c = 0; c < vd->vdev_children; c++) 1100 if (vdev_validate(vd->vdev_child[c]) != 0) 1101 return (EBADF); 1102 1103 /* 1104 * If the device has already failed, or was marked offline, don't do 1105 * any further validation. Otherwise, label I/O will fail and we will 1106 * overwrite the previous state. 1107 */ 1108 if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { 1109 1110 if ((label = vdev_label_read_config(vd)) == NULL) { 1111 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1112 VDEV_AUX_BAD_LABEL); 1113 return (0); 1114 } 1115 1116 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, 1117 &guid) != 0 || guid != spa_guid(spa)) { 1118 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1119 VDEV_AUX_CORRUPT_DATA); 1120 nvlist_free(label); 1121 return (0); 1122 } 1123 1124 /* 1125 * If this vdev just became a top-level vdev because its 1126 * sibling was detached, it will have adopted the parent's 1127 * vdev guid -- but the label may or may not be on disk yet. 1128 * Fortunately, either version of the label will have the 1129 * same top guid, so if we're a top-level vdev, we can 1130 * safely compare to that instead. 
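		 *
		 * (This is the counterpart of the guid_delta adjustment in
		 * vdev_remove_parent(): the surviving child inherits the
		 * detached parent's guid, so the label's per-vdev guid may
		 * lag behind while its top guid still matches.)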
		 */
		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
		    &guid) != 0 ||
		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
		    &top_guid) != 0 ||
		    (vd->vdev_guid != guid &&
		    (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		nvlist_free(label);

		if (spa->spa_load_state == SPA_LOAD_OPEN &&
		    state != POOL_STATE_ACTIVE)
			return (EBADF);

		/*
		 * If we were able to open and validate a vdev that was
		 * previously marked permanently unavailable, clear that state
		 * now.
		 */
		if (vd->vdev_not_present)
			vd->vdev_not_present = 0;
	}

	return (0);
}

/*
 * Close a virtual device.
 */
void
vdev_close(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	vd->vdev_ops->vdev_op_close(vd);

	vdev_cache_purge(vd);

	/*
	 * We record the previous state before we close it, so that if we are
	 * doing a reopen(), we don't generate FMA ereports if we notice that
	 * it's still faulted.
	 */
	vd->vdev_prevstate = vd->vdev_state;

	if (vd->vdev_offline)
		vd->vdev_state = VDEV_STATE_OFFLINE;
	else
		vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}

void
vdev_reopen(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	vdev_close(vd);
	(void) vdev_open(vd);

	/*
	 * Call vdev_validate() here to make sure we have the same device.
	 * Otherwise, a device with an invalid label could be successfully
	 * opened in response to vdev_reopen().
	 */
	if (vd->vdev_aux) {
		(void) vdev_validate_aux(vd);
		if (vdev_readable(vd) && vdev_writeable(vd) &&
		    !l2arc_vdev_present(vd)) {
			uint64_t size = vdev_get_rsize(vd);
			l2arc_add_vdev(spa, vd,
			    VDEV_LABEL_START_SIZE,
			    size - VDEV_LABEL_START_SIZE);
		}
	} else {
		(void) vdev_validate(vd);
	}

	/*
	 * Reassess parent vdev's health.
	 */
	vdev_propagate_state(vd);
}

int
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : ENXIO);
	}

	/*
	 * Recursively initialize all labels.
	 */
	if ((error = vdev_label_init(vd, txg, isreplacing ?
	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}

/*
 * This is the latter half of vdev_create().  It is distinct because it
 * involves initiating transactions in order to do metaslab creation.
 * For creation, we want to try to create all vdevs at once and then undo it
 * if anything fails; this is much harder if we have pending transactions.
 */
void
vdev_init(vdev_t *vd, uint64_t txg)
{
	/*
	 * Aim for roughly 200 metaslabs per vdev.
	 */
	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);

	/*
	 * Initialize the vdev's metaslabs.  This can't fail because
	 * there's nothing to read when creating all new metaslabs.
	 */
	VERIFY(vdev_metaslab_init(vd, txg) == 0);
}

void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
	ASSERT(vd == vd->vdev_top);
	ASSERT(ISP2(flags));

	if (flags & VDD_METASLAB)
		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

	if (flags & VDD_DTL)
		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}

/*
 * DTLs.
 *
 * A vdev's DTL (dirty time log) is the set of transaction groups for which
 * the vdev has less than perfect replication.  There are four kinds of DTL:
 *
 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
 *
 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
 *
 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
 *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
 *	txgs that was scrubbed.
 *
 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
 *	persistent errors or just some device being offline.
 *	Unlike the other three, the DTL_OUTAGE map is not generally
 *	maintained; it's only computed when needed, typically to
 *	determine whether a device can be detached.
 *
 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
 * either has the data or it doesn't.
 *
 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
 * if any child is less than fully replicated, then so is its parent.
 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
 * comprising only those txgs which appear in more than 'maxfaults' children;
 * those are the txgs we don't have enough replication to read.  For example,
 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
 * two child DTL_MISSING maps.
 *
 * It should be clear from the above that to compute the DTLs and outage maps
 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
 * Therefore, that is all we keep on disk.  When loading the pool, or after
 * a configuration change, we generate all other DTLs from first principles.
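 *
 * As a concrete illustration of the rule above: in vdev_dtl_reassess() below,
 * a 2-way mirror uses minref == 2 (its child count), so a txg lands in the
 * parent's DTL_MISSING only when both children are missing it, while raidz2
 * uses minref == nparity + 1 == 3, i.e. a txg must be missing from more than
 * two children before it becomes unreadable.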
 */
void
vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	space_map_t *sm = &vd->vdev_dtl[t];

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

	mutex_enter(sm->sm_lock);
	if (!space_map_contains(sm, txg, size))
		space_map_add(sm, txg, size);
	mutex_exit(sm->sm_lock);
}

boolean_t
vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	space_map_t *sm = &vd->vdev_dtl[t];
	boolean_t dirty = B_FALSE;

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

	mutex_enter(sm->sm_lock);
	if (sm->sm_space != 0)
		dirty = space_map_contains(sm, txg, size);
	mutex_exit(sm->sm_lock);

	return (dirty);
}

boolean_t
vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
{
	space_map_t *sm = &vd->vdev_dtl[t];
	boolean_t empty;

	mutex_enter(sm->sm_lock);
	empty = (sm->sm_space == 0);
	mutex_exit(sm->sm_lock);

	return (empty);
}

/*
 * Reassess DTLs after a config change or scrub completion.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	spa_t *spa = vd->vdev_spa;
	avl_tree_t reftree;
	int minref;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_dtl_reassess(vd->vdev_child[c], txg,
		    scrub_txg, scrub_done);

	if (vd == spa->spa_root_vdev)
		return;

	if (vd->vdev_ops->vdev_op_leaf) {
		mutex_enter(&vd->vdev_dtl_lock);
		if (scrub_txg != 0 &&
		    (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
			/* XXX should check scrub_done? */
			/*
			 * We completed a scrub up to scrub_txg.  If we
			 * did it without rebooting, then the scrub dtl
			 * will be valid, so excise the old region and
			 * fold in the scrub dtl.  Otherwise, leave the
			 * dtl as-is if there was an error.
			 *
			 * There's a little trick here: to excise the beginning
			 * of the DTL_MISSING map, we put it into a reference
			 * tree and then add a segment with refcnt -1 that
			 * covers the range [0, scrub_txg).  This means
			 * that each txg in that range has refcnt -1 or 0.
			 * We then add DTL_SCRUB with a refcnt of 2, so that
			 * entries in the range [0, scrub_txg) will have a
			 * positive refcnt -- either 1 or 2.  We then convert
			 * the reference tree into the new DTL_MISSING map.
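			 *
			 * For example, with hypothetical numbers and
			 * scrub_txg == 100: a repaired txg 60 that is in
			 * DTL_MISSING but not DTL_SCRUB nets 1 - 1 == 0 and
			 * is excised; an unrepairable txg 70 present in both
			 * maps nets 1 - 1 + 2 == 2 and stays; and txg 150,
			 * beyond the scrubbed range, keeps its refcnt of 1
			 * and stays as well.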
1420 */ 1421 space_map_ref_create(&reftree); 1422 space_map_ref_add_map(&reftree, 1423 &vd->vdev_dtl[DTL_MISSING], 1); 1424 space_map_ref_add_seg(&reftree, 0, scrub_txg, -1); 1425 space_map_ref_add_map(&reftree, 1426 &vd->vdev_dtl[DTL_SCRUB], 2); 1427 space_map_ref_generate_map(&reftree, 1428 &vd->vdev_dtl[DTL_MISSING], 1); 1429 space_map_ref_destroy(&reftree); 1430 } 1431 space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); 1432 space_map_walk(&vd->vdev_dtl[DTL_MISSING], 1433 space_map_add, &vd->vdev_dtl[DTL_PARTIAL]); 1434 if (scrub_done) 1435 space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL); 1436 space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); 1437 if (!vdev_readable(vd)) 1438 space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); 1439 else 1440 space_map_walk(&vd->vdev_dtl[DTL_MISSING], 1441 space_map_add, &vd->vdev_dtl[DTL_OUTAGE]); 1442 mutex_exit(&vd->vdev_dtl_lock); 1443 1444 if (txg != 0) 1445 vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); 1446 return; 1447 } 1448 1449 mutex_enter(&vd->vdev_dtl_lock); 1450 for (int t = 0; t < DTL_TYPES; t++) { 1451 if (t == DTL_SCRUB) 1452 continue; /* leaf vdevs only */ 1453 if (t == DTL_PARTIAL) 1454 minref = 1; /* i.e. non-zero */ 1455 else if (vd->vdev_nparity != 0) 1456 minref = vd->vdev_nparity + 1; /* RAID-Z */ 1457 else 1458 minref = vd->vdev_children; /* any kind of mirror */ 1459 space_map_ref_create(&reftree); 1460 for (int c = 0; c < vd->vdev_children; c++) { 1461 vdev_t *cvd = vd->vdev_child[c]; 1462 mutex_enter(&cvd->vdev_dtl_lock); 1463 space_map_ref_add_map(&reftree, &cvd->vdev_dtl[t], 1); 1464 mutex_exit(&cvd->vdev_dtl_lock); 1465 } 1466 space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref); 1467 space_map_ref_destroy(&reftree); 1468 } 1469 mutex_exit(&vd->vdev_dtl_lock); 1470 } 1471 1472 static int 1473 vdev_dtl_load(vdev_t *vd) 1474 { 1475 spa_t *spa = vd->vdev_spa; 1476 space_map_obj_t *smo = &vd->vdev_dtl_smo; 1477 objset_t *mos = spa->spa_meta_objset; 1478 dmu_buf_t *db; 1479 int error; 1480 1481 ASSERT(vd->vdev_children == 0); 1482 1483 if (smo->smo_object == 0) 1484 return (0); 1485 1486 if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) 1487 return (error); 1488 1489 ASSERT3U(db->db_size, >=, sizeof (*smo)); 1490 bcopy(db->db_data, smo, sizeof (*smo)); 1491 dmu_buf_rele(db, FTAG); 1492 1493 mutex_enter(&vd->vdev_dtl_lock); 1494 error = space_map_load(&vd->vdev_dtl[DTL_MISSING], 1495 NULL, SM_ALLOC, smo, mos); 1496 mutex_exit(&vd->vdev_dtl_lock); 1497 1498 return (error); 1499 } 1500 1501 void 1502 vdev_dtl_sync(vdev_t *vd, uint64_t txg) 1503 { 1504 spa_t *spa = vd->vdev_spa; 1505 space_map_obj_t *smo = &vd->vdev_dtl_smo; 1506 space_map_t *sm = &vd->vdev_dtl[DTL_MISSING]; 1507 objset_t *mos = spa->spa_meta_objset; 1508 space_map_t smsync; 1509 kmutex_t smlock; 1510 dmu_buf_t *db; 1511 dmu_tx_t *tx; 1512 1513 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1514 1515 if (vd->vdev_detached) { 1516 if (smo->smo_object != 0) { 1517 int err = dmu_object_free(mos, smo->smo_object, tx); 1518 ASSERT3U(err, ==, 0); 1519 smo->smo_object = 0; 1520 } 1521 dmu_tx_commit(tx); 1522 return; 1523 } 1524 1525 if (smo->smo_object == 0) { 1526 ASSERT(smo->smo_objsize == 0); 1527 ASSERT(smo->smo_alloc == 0); 1528 smo->smo_object = dmu_object_alloc(mos, 1529 DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, 1530 DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); 1531 ASSERT(smo->smo_object != 0); 1532 vdev_config_dirty(vd->vdev_top); 1533 } 1534 1535 mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL); 1536 1537 
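	/*
	 * Copy the DTL into a private space map under its own lock; the
	 * copy, rather than the live DTL, is what gets truncated and synced
	 * to the MOS object below, so vdev_dtl_lock is not held across the
	 * on-disk update.
	 */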
space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift, 1538 &smlock); 1539 1540 mutex_enter(&smlock); 1541 1542 mutex_enter(&vd->vdev_dtl_lock); 1543 space_map_walk(sm, space_map_add, &smsync); 1544 mutex_exit(&vd->vdev_dtl_lock); 1545 1546 space_map_truncate(smo, mos, tx); 1547 space_map_sync(&smsync, SM_ALLOC, smo, mos, tx); 1548 1549 space_map_destroy(&smsync); 1550 1551 mutex_exit(&smlock); 1552 mutex_destroy(&smlock); 1553 1554 VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); 1555 dmu_buf_will_dirty(db, tx); 1556 ASSERT3U(db->db_size, >=, sizeof (*smo)); 1557 bcopy(smo, db->db_data, sizeof (*smo)); 1558 dmu_buf_rele(db, FTAG); 1559 1560 dmu_tx_commit(tx); 1561 } 1562 1563 /* 1564 * Determine whether the specified vdev can be offlined/detached/removed 1565 * without losing data. 1566 */ 1567 boolean_t 1568 vdev_dtl_required(vdev_t *vd) 1569 { 1570 spa_t *spa = vd->vdev_spa; 1571 vdev_t *tvd = vd->vdev_top; 1572 uint8_t cant_read = vd->vdev_cant_read; 1573 boolean_t required; 1574 1575 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1576 1577 if (vd == spa->spa_root_vdev || vd == tvd) 1578 return (B_TRUE); 1579 1580 /* 1581 * Temporarily mark the device as unreadable, and then determine 1582 * whether this results in any DTL outages in the top-level vdev. 1583 * If not, we can safely offline/detach/remove the device. 1584 */ 1585 vd->vdev_cant_read = B_TRUE; 1586 vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 1587 required = !vdev_dtl_empty(tvd, DTL_OUTAGE); 1588 vd->vdev_cant_read = cant_read; 1589 vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 1590 1591 return (required); 1592 } 1593 1594 /* 1595 * Determine if resilver is needed, and if so the txg range. 1596 */ 1597 boolean_t 1598 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) 1599 { 1600 boolean_t needed = B_FALSE; 1601 uint64_t thismin = UINT64_MAX; 1602 uint64_t thismax = 0; 1603 1604 if (vd->vdev_children == 0) { 1605 mutex_enter(&vd->vdev_dtl_lock); 1606 if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 && 1607 vdev_writeable(vd)) { 1608 space_seg_t *ss; 1609 1610 ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root); 1611 thismin = ss->ss_start - 1; 1612 ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root); 1613 thismax = ss->ss_end; 1614 needed = B_TRUE; 1615 } 1616 mutex_exit(&vd->vdev_dtl_lock); 1617 } else { 1618 for (int c = 0; c < vd->vdev_children; c++) { 1619 vdev_t *cvd = vd->vdev_child[c]; 1620 uint64_t cmin, cmax; 1621 1622 if (vdev_resilver_needed(cvd, &cmin, &cmax)) { 1623 thismin = MIN(thismin, cmin); 1624 thismax = MAX(thismax, cmax); 1625 needed = B_TRUE; 1626 } 1627 } 1628 } 1629 1630 if (needed && minp) { 1631 *minp = thismin; 1632 *maxp = thismax; 1633 } 1634 return (needed); 1635 } 1636 1637 void 1638 vdev_load(vdev_t *vd) 1639 { 1640 /* 1641 * Recursively load all children. 1642 */ 1643 for (int c = 0; c < vd->vdev_children; c++) 1644 vdev_load(vd->vdev_child[c]); 1645 1646 /* 1647 * If this is a top-level vdev, initialize its metaslabs. 1648 */ 1649 if (vd == vd->vdev_top && 1650 (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || 1651 vdev_metaslab_init(vd, 0) != 0)) 1652 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1653 VDEV_AUX_CORRUPT_DATA); 1654 1655 /* 1656 * If this is a leaf vdev, load its DTL. 1657 */ 1658 if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) 1659 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1660 VDEV_AUX_CORRUPT_DATA); 1661 } 1662 1663 /* 1664 * The special vdev case is used for hot spares and l2cache devices. 
Its
 * sole purpose is to set the vdev state for the associated vdev.  To do this,
 * we make sure that we can open the underlying device, then try to read the
 * label, and make sure that the label is sane and that it hasn't been
 * repurposed to another pool.
 */
int
vdev_validate_aux(vdev_t *vd)
{
	nvlist_t *label;
	uint64_t guid, version;
	uint64_t state;

	if (!vdev_readable(vd))
		return (0);

	if ((label = vdev_label_read_config(vd)) == NULL) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		return (-1);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
	    version > SPA_VERSION ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
	    guid != vd->vdev_guid ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		return (-1);
	}

	/*
	 * We don't actually check the pool state here.  If it's in fact in
	 * use by another pool, we update this fact on the fly when requested.
	 */
	nvlist_free(label);
	return (0);
}

void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
	metaslab_t *msp;

	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
		metaslab_sync_done(msp, txg);
}

void
vdev_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *lvd;
	metaslab_t *msp;
	dmu_tx_t *tx;

	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
		ASSERT(vd == vd->vdev_top);
		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
		ASSERT(vd->vdev_ms_array != 0);
		vdev_config_dirty(vd);
		dmu_tx_commit(tx);
	}

	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
		metaslab_sync(msp, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
	}

	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
		vdev_dtl_sync(lvd, txg);

	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
}

uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
	return (vd->vdev_ops->vdev_op_asize(vd, psize));
}

/*
 * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
 * not be opened, and no I/O is attempted.
 */
int
vdev_fault(spa_t *spa, uint64_t guid)
{
	vdev_t *vd;

	spa_vdev_state_enter(spa);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));

	/*
	 * Faulted state takes precedence over degraded.
	 */
	vd->vdev_faulted = 1ULL;
	vd->vdev_degraded = 0ULL;
	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED);

	/*
	 * If marking the vdev as faulted causes the top-level vdev to become
	 * unavailable, then back off and simply mark the vdev as degraded
	 * instead.
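	 *
	 * This can happen, for example, when the faulted leaf is itself the
	 * top-level vdev (a plain disk), or when every other child of its
	 * mirror is already unavailable; vdev_is_dead() on the top-level
	 * vdev is the check used below.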
1777 */ 1778 if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { 1779 vd->vdev_degraded = 1ULL; 1780 vd->vdev_faulted = 0ULL; 1781 1782 /* 1783 * If we reopen the device and it's not dead, only then do we 1784 * mark it degraded. 1785 */ 1786 vdev_reopen(vd); 1787 1788 if (vdev_readable(vd)) { 1789 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 1790 VDEV_AUX_ERR_EXCEEDED); 1791 } 1792 } 1793 1794 return (spa_vdev_state_exit(spa, vd, 0)); 1795 } 1796 1797 /* 1798 * Mark the given vdev degraded. A degraded vdev is purely an indication to the 1799 * user that something is wrong. The vdev continues to operate as normal as far 1800 * as I/O is concerned. 1801 */ 1802 int 1803 vdev_degrade(spa_t *spa, uint64_t guid) 1804 { 1805 vdev_t *vd; 1806 1807 spa_vdev_state_enter(spa); 1808 1809 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 1810 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 1811 1812 if (!vd->vdev_ops->vdev_op_leaf) 1813 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 1814 1815 /* 1816 * If the vdev is already faulted, then don't do anything. 1817 */ 1818 if (vd->vdev_faulted || vd->vdev_degraded) 1819 return (spa_vdev_state_exit(spa, NULL, 0)); 1820 1821 vd->vdev_degraded = 1ULL; 1822 if (!vdev_is_dead(vd)) 1823 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 1824 VDEV_AUX_ERR_EXCEEDED); 1825 1826 return (spa_vdev_state_exit(spa, vd, 0)); 1827 } 1828 1829 /* 1830 * Online the given vdev. If 'unspare' is set, it implies two things. First, 1831 * any attached spare device should be detached when the device finishes 1832 * resilvering. Second, the online should be treated like a 'test' online case, 1833 * so no FMA events are generated if the device fails to open. 1834 */ 1835 int 1836 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) 1837 { 1838 vdev_t *vd; 1839 1840 spa_vdev_state_enter(spa); 1841 1842 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 1843 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 1844 1845 if (!vd->vdev_ops->vdev_op_leaf) 1846 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 1847 1848 vd->vdev_offline = B_FALSE; 1849 vd->vdev_tmpoffline = B_FALSE; 1850 vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); 1851 vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); 1852 vdev_reopen(vd->vdev_top); 1853 vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; 1854 1855 if (newstate) 1856 *newstate = vd->vdev_state; 1857 if ((flags & ZFS_ONLINE_UNSPARE) && 1858 !vdev_is_dead(vd) && vd->vdev_parent && 1859 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 1860 vd->vdev_parent->vdev_child[0] == vd) 1861 vd->vdev_unspare = B_TRUE; 1862 1863 return (spa_vdev_state_exit(spa, vd, 0)); 1864 } 1865 1866 int 1867 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) 1868 { 1869 vdev_t *vd; 1870 1871 spa_vdev_state_enter(spa); 1872 1873 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 1874 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 1875 1876 if (!vd->vdev_ops->vdev_op_leaf) 1877 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 1878 1879 /* 1880 * If the device isn't already offline, try to offline it. 1881 */ 1882 if (!vd->vdev_offline) { 1883 /* 1884 * If this device has the only valid copy of some data, 1885 * don't allow it to be offlined. 1886 */ 1887 if (vd->vdev_aux == NULL && vdev_dtl_required(vd)) 1888 return (spa_vdev_state_exit(spa, NULL, EBUSY)); 1889 1890 /* 1891 * Offline this device and reopen its top-level vdev. 
1892 * If this action results in the top-level vdev becoming 1893 * unusable, undo it and fail the request. 1894 */ 1895 vd->vdev_offline = B_TRUE; 1896 vdev_reopen(vd->vdev_top); 1897 if (vd->vdev_aux == NULL && vdev_is_dead(vd->vdev_top)) { 1898 vd->vdev_offline = B_FALSE; 1899 vdev_reopen(vd->vdev_top); 1900 return (spa_vdev_state_exit(spa, NULL, EBUSY)); 1901 } 1902 } 1903 1904 vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); 1905 1906 return (spa_vdev_state_exit(spa, vd, 0)); 1907 } 1908 1909 /* 1910 * Clear the error counts associated with this vdev. Unlike vdev_online() and 1911 * vdev_offline(), we assume the spa config is locked. We also clear all 1912 * children. If 'vd' is NULL, then the user wants to clear all vdevs. 1913 */ 1914 void 1915 vdev_clear(spa_t *spa, vdev_t *vd) 1916 { 1917 vdev_t *rvd = spa->spa_root_vdev; 1918 1919 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 1920 1921 if (vd == NULL) 1922 vd = rvd; 1923 1924 vd->vdev_stat.vs_read_errors = 0; 1925 vd->vdev_stat.vs_write_errors = 0; 1926 vd->vdev_stat.vs_checksum_errors = 0; 1927 1928 for (int c = 0; c < vd->vdev_children; c++) 1929 vdev_clear(spa, vd->vdev_child[c]); 1930 1931 /* 1932 * If we're in the FAULTED state or have experienced failed I/O, then 1933 * clear the persistent state and attempt to reopen the device. We 1934 * also mark the vdev config dirty, so that the new faulted state is 1935 * written out to disk. 1936 */ 1937 if (vd->vdev_faulted || vd->vdev_degraded || 1938 !vdev_readable(vd) || !vdev_writeable(vd)) { 1939 1940 vd->vdev_faulted = vd->vdev_degraded = 0; 1941 vd->vdev_cant_read = B_FALSE; 1942 vd->vdev_cant_write = B_FALSE; 1943 1944 vdev_reopen(vd); 1945 1946 if (vd != rvd) 1947 vdev_state_dirty(vd->vdev_top); 1948 1949 if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) 1950 spa_async_request(spa, SPA_ASYNC_RESILVER); 1951 1952 spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); 1953 } 1954 } 1955 1956 boolean_t 1957 vdev_is_dead(vdev_t *vd) 1958 { 1959 return (vd->vdev_state < VDEV_STATE_DEGRADED); 1960 } 1961 1962 boolean_t 1963 vdev_readable(vdev_t *vd) 1964 { 1965 return (!vdev_is_dead(vd) && !vd->vdev_cant_read); 1966 } 1967 1968 boolean_t 1969 vdev_writeable(vdev_t *vd) 1970 { 1971 return (!vdev_is_dead(vd) && !vd->vdev_cant_write); 1972 } 1973 1974 boolean_t 1975 vdev_allocatable(vdev_t *vd) 1976 { 1977 uint64_t state = vd->vdev_state; 1978 1979 /* 1980 * We currently allow allocations from vdevs which may be in the 1981 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device 1982 * fails to reopen then we'll catch it later when we're holding 1983 * the proper locks. Note that we have to get the vdev state 1984 * in a local variable because although it changes atomically, 1985 * we're asking two separate questions about it. 1986 */ 1987 return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && 1988 !vd->vdev_cant_write); 1989 } 1990 1991 boolean_t 1992 vdev_accessible(vdev_t *vd, zio_t *zio) 1993 { 1994 ASSERT(zio->io_vd == vd); 1995 1996 if (vdev_is_dead(vd) || vd->vdev_remove_wanted) 1997 return (B_FALSE); 1998 1999 if (zio->io_type == ZIO_TYPE_READ) 2000 return (!vd->vdev_cant_read); 2001 2002 if (zio->io_type == ZIO_TYPE_WRITE) 2003 return (!vd->vdev_cant_write); 2004 2005 return (B_TRUE); 2006 } 2007 2008 /* 2009 * Get statistics for the given vdev. 
/*
 * Get statistics for the given vdev.
 */
void
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
        vdev_t *rvd = vd->vdev_spa->spa_root_vdev;

        mutex_enter(&vd->vdev_stat_lock);
        bcopy(&vd->vdev_stat, vs, sizeof (*vs));
        vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors;
        vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
        vs->vs_state = vd->vdev_state;
        vs->vs_rsize = vdev_get_rsize(vd);
        mutex_exit(&vd->vdev_stat_lock);

        /*
         * If we're getting stats on the root vdev, aggregate the I/O counts
         * over all top-level vdevs (i.e. the direct children of the root).
         */
        if (vd == rvd) {
                for (int c = 0; c < rvd->vdev_children; c++) {
                        vdev_t *cvd = rvd->vdev_child[c];
                        vdev_stat_t *cvs = &cvd->vdev_stat;

                        mutex_enter(&vd->vdev_stat_lock);
                        for (int t = 0; t < ZIO_TYPES; t++) {
                                vs->vs_ops[t] += cvs->vs_ops[t];
                                vs->vs_bytes[t] += cvs->vs_bytes[t];
                        }
                        vs->vs_scrub_examined += cvs->vs_scrub_examined;
                        mutex_exit(&vd->vdev_stat_lock);
                }
        }
}

void
vdev_clear_stats(vdev_t *vd)
{
        mutex_enter(&vd->vdev_stat_lock);
        vd->vdev_stat.vs_space = 0;
        vd->vdev_stat.vs_dspace = 0;
        vd->vdev_stat.vs_alloc = 0;
        mutex_exit(&vd->vdev_stat_lock);
}

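/*
 * Update the in-core I/O statistics for the vdev that serviced 'zio' and,
 * for failed writes, record the affected txg in the relevant DTLs.
 */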
void
vdev_stat_update(zio_t *zio, uint64_t psize)
{
        spa_t *spa = zio->io_spa;
        vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
        vdev_t *pvd;
        uint64_t txg = zio->io_txg;
        vdev_stat_t *vs = &vd->vdev_stat;
        zio_type_t type = zio->io_type;
        int flags = zio->io_flags;

        /*
         * If this i/o is a gang leader, it didn't do any actual work.
         */
        if (zio->io_gang_tree)
                return;

        if (zio->io_error == 0) {
                /*
                 * If this is a root i/o, don't count it -- we've already
                 * counted the top-level vdevs, and vdev_get_stats() will
                 * aggregate them when asked. This reduces contention on
                 * the root vdev_stat_lock and implicitly handles blocks
                 * that compress away to holes, for which there is no i/o.
                 * (Holes never create vdev children, so all the counters
                 * remain zero, which is what we want.)
                 *
                 * Note: this only applies to successful i/o (io_error == 0)
                 * because unlike i/o counts, errors are not additive.
                 * When reading a ditto block, for example, failure of
                 * one top-level vdev does not imply a root-level error.
                 */
                if (vd == rvd)
                        return;

                ASSERT(vd == zio->io_vd);

                if (flags & ZIO_FLAG_IO_BYPASS)
                        return;

                mutex_enter(&vd->vdev_stat_lock);

                if (flags & ZIO_FLAG_IO_REPAIR) {
                        if (flags & ZIO_FLAG_SCRUB_THREAD)
                                vs->vs_scrub_repaired += psize;
                        if (flags & ZIO_FLAG_SELF_HEAL)
                                vs->vs_self_healed += psize;
                }

                vs->vs_ops[type]++;
                vs->vs_bytes[type] += psize;

                mutex_exit(&vd->vdev_stat_lock);
                return;
        }

        if (flags & ZIO_FLAG_SPECULATIVE)
                return;

        mutex_enter(&vd->vdev_stat_lock);
        if (type == ZIO_TYPE_READ) {
                if (zio->io_error == ECKSUM)
                        vs->vs_checksum_errors++;
                else
                        vs->vs_read_errors++;
        }
        if (type == ZIO_TYPE_WRITE)
                vs->vs_write_errors++;
        mutex_exit(&vd->vdev_stat_lock);

        if (type == ZIO_TYPE_WRITE && txg != 0 &&
            (!(flags & ZIO_FLAG_IO_REPAIR) ||
            (flags & ZIO_FLAG_SCRUB_THREAD))) {
                /*
                 * This is either a normal write (not a repair), or it's a
                 * repair induced by the scrub thread. In the normal case,
                 * we commit the DTL change in the same txg as the block
                 * was born. In the scrub-induced repair case, we know that
                 * scrubs run in first-pass syncing context, so we commit
                 * the DTL change in spa->spa_syncing_txg.
                 *
                 * We currently do not make DTL entries for failed spontaneous
                 * self-healing writes triggered by normal (non-scrubbing)
                 * reads, because we have no transactional context in which to
                 * do so -- and it's not clear that it'd be desirable anyway.
                 */
                if (vd->vdev_ops->vdev_op_leaf) {
                        uint64_t commit_txg = txg;
                        if (flags & ZIO_FLAG_SCRUB_THREAD) {
                                ASSERT(flags & ZIO_FLAG_IO_REPAIR);
                                ASSERT(spa_sync_pass(spa) == 1);
                                vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
                                commit_txg = spa->spa_syncing_txg;
                        }
                        ASSERT(commit_txg >= spa->spa_syncing_txg);
                        if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
                                return;
                        for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
                                vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
                        vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
                }
                if (vd != rvd)
                        vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
        }
}

void
vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
{
        int c;
        vdev_stat_t *vs = &vd->vdev_stat;

        for (c = 0; c < vd->vdev_children; c++)
                vdev_scrub_stat_update(vd->vdev_child[c], type, complete);

        mutex_enter(&vd->vdev_stat_lock);

        if (type == POOL_SCRUB_NONE) {
                /*
                 * Update completion and end time. Leave everything else alone
                 * so we can report what happened during the previous scrub.
                 */
                vs->vs_scrub_complete = complete;
                vs->vs_scrub_end = gethrestime_sec();
        } else {
                vs->vs_scrub_type = type;
                vs->vs_scrub_complete = 0;
                vs->vs_scrub_examined = 0;
                vs->vs_scrub_repaired = 0;
                vs->vs_scrub_start = gethrestime_sec();
                vs->vs_scrub_end = 0;
        }

        mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Update the in-core space usage stats for this vdev and the root vdev.
 */
void
vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
    boolean_t update_root)
{
        int64_t dspace_delta = space_delta;
        spa_t *spa = vd->vdev_spa;
        vdev_t *rvd = spa->spa_root_vdev;

        ASSERT(vd == vd->vdev_top);

        /*
         * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion
         * factor. We must calculate this here and not at the root vdev
         * because the root vdev's psize-to-asize is simply the max of its
         * children's, thus not accurate enough for us.
         */
        ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
        dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
            vd->vdev_deflate_ratio;
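
        /*
         * Illustrative example: a RAID-Z layout that expands four sectors of
         * data into five sectors of allocated space has a psize-to-asize
         * factor of 5/4, so the computation above scales the raw delta by
         * roughly 4/5 (roughly, because the deflate ratio is maintained with
         * integer arithmetic) before it is folded into vs_dspace.
         */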

        mutex_enter(&vd->vdev_stat_lock);
        vd->vdev_stat.vs_space += space_delta;
        vd->vdev_stat.vs_alloc += alloc_delta;
        vd->vdev_stat.vs_dspace += dspace_delta;
        mutex_exit(&vd->vdev_stat_lock);

        if (update_root) {
                ASSERT(rvd == vd->vdev_parent);
                ASSERT(vd->vdev_ms_count != 0);

                /*
                 * Don't count non-normal (e.g. intent log) space as part of
                 * the pool's capacity.
                 */
                if (vd->vdev_mg->mg_class != spa->spa_normal_class)
                        return;

                mutex_enter(&rvd->vdev_stat_lock);
                rvd->vdev_stat.vs_space += space_delta;
                rvd->vdev_stat.vs_alloc += alloc_delta;
                rvd->vdev_stat.vs_dspace += dspace_delta;
                mutex_exit(&rvd->vdev_stat_lock);
        }
}

/*
 * Mark a top-level vdev's config as dirty, placing it on the dirty list
 * so that it will be written out next time the vdev configuration is synced.
 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
 */
void
vdev_config_dirty(vdev_t *vd)
{
        spa_t *spa = vd->vdev_spa;
        vdev_t *rvd = spa->spa_root_vdev;
        int c;

        /*
         * If this is an aux vdev (as with l2cache devices), then we update the
         * vdev config manually and set the sync flag.
         */
        if (vd->vdev_aux != NULL) {
                spa_aux_vdev_t *sav = vd->vdev_aux;
                nvlist_t **aux;
                uint_t naux;

                for (c = 0; c < sav->sav_count; c++) {
                        if (sav->sav_vdevs[c] == vd)
                                break;
                }

                if (c == sav->sav_count) {
                        /*
                         * We're being removed. There's nothing more to do.
                         */
                        ASSERT(sav->sav_sync == B_TRUE);
                        return;
                }

                sav->sav_sync = B_TRUE;

                VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
                    ZPOOL_CONFIG_L2CACHE, &aux, &naux) == 0);

                ASSERT(c < naux);

                /*
                 * Setting the nvlist in the middle of the array is a little
                 * sketchy, but it will work.
                 */
                nvlist_free(aux[c]);
                aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE);

                return;
        }

        /*
         * The dirty list is protected by the SCL_CONFIG lock. The caller
         * must either hold SCL_CONFIG as writer, or must be the sync thread
         * (which holds SCL_CONFIG as reader). There's only one sync thread,
         * so this is sufficient to ensure mutual exclusion.
         */
        ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
            (dsl_pool_sync_context(spa_get_dsl(spa)) &&
            spa_config_held(spa, SCL_CONFIG, RW_READER)));

        if (vd == rvd) {
                for (c = 0; c < rvd->vdev_children; c++)
                        vdev_config_dirty(rvd->vdev_child[c]);
        } else {
                ASSERT(vd == vd->vdev_top);

                if (!list_link_active(&vd->vdev_config_dirty_node))
                        list_insert_head(&spa->spa_config_dirty_list, vd);
        }
}

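/*
 * Remove the given top-level vdev from the dirty config list. The locking
 * rules are the same as for vdev_config_dirty() above.
 */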
void
vdev_config_clean(vdev_t *vd)
{
        spa_t *spa = vd->vdev_spa;

        ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
            (dsl_pool_sync_context(spa_get_dsl(spa)) &&
            spa_config_held(spa, SCL_CONFIG, RW_READER)));

        ASSERT(list_link_active(&vd->vdev_config_dirty_node));
        list_remove(&spa->spa_config_dirty_list, vd);
}

/*
 * Mark a top-level vdev's state as dirty, so that the next pass of
 * spa_sync() can convert this into vdev_config_dirty(). We distinguish
 * the state changes from larger config changes because they require
 * much less locking, and are often needed for administrative actions.
 */
void
vdev_state_dirty(vdev_t *vd)
{
        spa_t *spa = vd->vdev_spa;

        ASSERT(vd == vd->vdev_top);

        /*
         * The state list is protected by the SCL_STATE lock. The caller
         * must either hold SCL_STATE as writer, or must be the sync thread
         * (which holds SCL_STATE as reader). There's only one sync thread,
         * so this is sufficient to ensure mutual exclusion.
         */
        ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
            (dsl_pool_sync_context(spa_get_dsl(spa)) &&
            spa_config_held(spa, SCL_STATE, RW_READER)));

        if (!list_link_active(&vd->vdev_state_dirty_node))
                list_insert_head(&spa->spa_state_dirty_list, vd);
}

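/*
 * Remove the given top-level vdev from the dirty state list. The locking
 * rules are the same as for vdev_state_dirty() above.
 */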
void
vdev_state_clean(vdev_t *vd)
{
        spa_t *spa = vd->vdev_spa;

        ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
            (dsl_pool_sync_context(spa_get_dsl(spa)) &&
            spa_config_held(spa, SCL_STATE, RW_READER)));

        ASSERT(list_link_active(&vd->vdev_state_dirty_node));
        list_remove(&spa->spa_state_dirty_list, vd);
}

/*
 * Propagate vdev state up from children to parent.
 */
void
vdev_propagate_state(vdev_t *vd)
{
        spa_t *spa = vd->vdev_spa;
        vdev_t *rvd = spa->spa_root_vdev;
        int degraded = 0, faulted = 0;
        int corrupted = 0;
        int c;
        vdev_t *child;

        if (vd->vdev_children > 0) {
                for (c = 0; c < vd->vdev_children; c++) {
                        child = vd->vdev_child[c];

                        if (!vdev_readable(child) ||
                            (!vdev_writeable(child) && spa_writeable(spa))) {
                                /*
                                 * Root special: if there is a top-level log
                                 * device, treat the root vdev as if it were
                                 * degraded.
                                 */
                                if (child->vdev_islog && vd == rvd)
                                        degraded++;
                                else
                                        faulted++;
                        } else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
                                degraded++;
                        }

                        if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
                                corrupted++;
                }

                vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);

                /*
                 * Root special: if there is a top-level vdev that cannot be
                 * opened due to corrupted metadata, then propagate the root
                 * vdev's aux state as 'corrupt' rather than 'insufficient
                 * replicas'.
                 */
                if (corrupted && vd == rvd &&
                    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
                        vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
                            VDEV_AUX_CORRUPT_DATA);
        }

        if (vd->vdev_parent)
                vdev_propagate_state(vd->vdev_parent);
}

/*
 * Set a vdev's state. If this is during an open, we don't update the parent
 * state, because we're in the process of opening children depth-first.
 * Otherwise, we propagate the change to the parent.
 *
 * If this routine places a device in a faulted state, an appropriate ereport is
 * generated.
 */
void
vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
        uint64_t save_state;
        spa_t *spa = vd->vdev_spa;

        if (state == vd->vdev_state) {
                vd->vdev_stat.vs_aux = aux;
                return;
        }

        save_state = vd->vdev_state;

        vd->vdev_state = state;
        vd->vdev_stat.vs_aux = aux;

        /*
         * If we are setting the vdev state to anything but an open state, then
         * always close the underlying device. Otherwise, we keep accessible
         * but invalid devices open forever. We don't call vdev_close() itself,
         * because that implies some extra checks (offline, etc) that we don't
         * want here. This is limited to leaf devices, because otherwise
         * closing the device will affect other children.
         */
        if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf)
                vd->vdev_ops->vdev_op_close(vd);

        if (vd->vdev_removed &&
            state == VDEV_STATE_CANT_OPEN &&
            (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
                /*
                 * If the previous state is set to VDEV_STATE_REMOVED, then this
                 * device was previously marked removed and someone attempted to
                 * reopen it. If this failed due to a nonexistent device, then
                 * keep the device in the REMOVED state. We also leave it in
                 * the REMOVED state if this is one of our special test online
                 * cases, which only attempt to online the device and shouldn't
                 * generate an FMA fault.
                 */
                vd->vdev_state = VDEV_STATE_REMOVED;
                vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
        } else if (state == VDEV_STATE_REMOVED) {
                /*
                 * Indicate to the ZFS DE that this device has been removed, and
                 * any recent errors should be ignored.
                 */
                zfs_post_remove(spa, vd);
                vd->vdev_removed = B_TRUE;
        } else if (state == VDEV_STATE_CANT_OPEN) {
                /*
                 * If we fail to open a vdev during an import, we mark it as
                 * "not available", which signifies that it was never there to
                 * begin with. Failure to open such a device is not considered
                 * an error.
                 */
                if (spa->spa_load_state == SPA_LOAD_IMPORT &&
                    !spa->spa_import_faulted &&
                    vd->vdev_ops->vdev_op_leaf)
                        vd->vdev_not_present = 1;

                /*
                 * Post the appropriate ereport. If the 'prevstate' field is
                 * set to something other than VDEV_STATE_UNKNOWN, it indicates
                 * that this is part of a vdev_reopen(). In this case, we don't
                 * want to post the ereport if the device was already in the
                 * CANT_OPEN state beforehand.
                 *
                 * If the 'checkremove' flag is set, then this is an attempt to
                 * online the device in response to an insertion event. If we
                 * hit this case, then we have detected an insertion event for a
                 * faulted or offline device that wasn't in the removed state.
                 * In this scenario, we don't post an ereport because we are
                 * about to replace the device, or attempt an online with
                 * vdev_forcefault, which will generate the fault for us.
                 */
                if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
                    !vd->vdev_not_present && !vd->vdev_checkremove &&
                    vd != spa->spa_root_vdev) {
                        const char *class;

                        switch (aux) {
                        case VDEV_AUX_OPEN_FAILED:
                                class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
                                break;
                        case VDEV_AUX_CORRUPT_DATA:
                                class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
                                break;
                        case VDEV_AUX_NO_REPLICAS:
                                class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
                                break;
                        case VDEV_AUX_BAD_GUID_SUM:
                                class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
                                break;
                        case VDEV_AUX_TOO_SMALL:
                                class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
                                break;
                        case VDEV_AUX_BAD_LABEL:
                                class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
                                break;
                        case VDEV_AUX_IO_FAILURE:
                                class = FM_EREPORT_ZFS_IO_FAILURE;
                                break;
                        default:
                                class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
                        }

                        zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
                }

                /* Erase any notion of persistent removed state */
                vd->vdev_removed = B_FALSE;
        } else {
                vd->vdev_removed = B_FALSE;
        }

        if (!isopen)
                vdev_propagate_state(vd);
}

/*
 * Check the vdev configuration to ensure that it's capable of supporting
 * a root pool. Currently, we do not support RAID-Z or partial configuration.
 * In addition, only a single top-level vdev is allowed and none of the leaves
 * can be wholedisks.
 */
boolean_t
vdev_is_bootable(vdev_t *vd)
{
        int c;

        if (!vd->vdev_ops->vdev_op_leaf) {
                char *vdev_type = vd->vdev_ops->vdev_op_type;

                if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
                    vd->vdev_children > 1) {
                        return (B_FALSE);
                } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
                    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
                        return (B_FALSE);
                }
        } else if (vd->vdev_wholedisk == 1) {
                return (B_FALSE);
        }

        for (c = 0; c < vd->vdev_children; c++) {
                if (!vdev_is_bootable(vd->vdev_child[c]))
                        return (B_FALSE);
        }
        return (B_TRUE);
}
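
/*
 * For example, under the checks above a pool consisting of a single top-level
 * mirror of disk slices is considered bootable, while any configuration that
 * contains a raidz vdev, a missing vdev, more than one top-level vdev, or a
 * whole-disk leaf is not.
 */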