/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/space_reftree.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/zil.h>
#include <sys/dsl_scan.h>

/*
 * Virtual device management.
 */

static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_spare_ops,
	&vdev_disk_ops,
	&vdev_file_ops,
	&vdev_missing_ops,
	&vdev_hole_ops,
	NULL
};

/* maximum scrub/resilver I/O queue per leaf vdev */
int zfs_scrub_limit = 10;

/*
 * When a vdev is added, it will be divided into approximately (but no
 * more than) this number of metaslabs.
 */
int metaslabs_per_vdev = 200;

/*
 * Given a vdev type, return the appropriate ops vector.
 */
static vdev_ops_t *
vdev_getops(const char *type)
{
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
			break;

	return (ops);
}

/*
 * Default asize function: return the MAX of psize with the asize of
 * all children.  This is what's used by anything other than RAID-Z.
 */
uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
	uint64_t csize;

	for (int c = 0; c < vd->vdev_children; c++) {
		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
		asize = MAX(asize, csize);
	}

	return (asize);
}

/*
 * Get the minimum allocatable size.  We define the allocatable size as
 * the vdev's asize rounded to the nearest metaslab.  This allows us to
 * replace or attach devices which don't have the same physical size but
 * can still satisfy the same number of allocations.
 */
uint64_t
vdev_get_min_asize(vdev_t *vd)
{
	vdev_t *pvd = vd->vdev_parent;

	/*
	 * If our parent is NULL (inactive spare or cache) or is the root,
	 * just return our own asize.
	 */
	if (pvd == NULL)
		return (vd->vdev_asize);

	/*
	 * The top-level vdev just returns the allocatable size rounded
	 * to the nearest metaslab.
	 */
	if (vd == vd->vdev_top)
		return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift));

	/*
	 * The allocatable space for a raidz vdev is N * sizeof(smallest child),
	 * so each child must provide at least 1/Nth of its asize.
	 */
	if (pvd->vdev_ops == &vdev_raidz_ops)
		return (pvd->vdev_min_asize / pvd->vdev_children);

	return (pvd->vdev_min_asize);
}
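/*
 * Recursively recompute and cache vdev_min_asize for a vdev and all of
 * its children.
 */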
void
vdev_set_min_asize(vdev_t *vd)
{
	vd->vdev_min_asize = vdev_get_min_asize(vd);

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_set_min_asize(vd->vdev_child[c]);
}

vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	if (vdev < rvd->vdev_children) {
		ASSERT(rvd->vdev_child[vdev] != NULL);
		return (rvd->vdev_child[vdev]);
	}

	return (NULL);
}

vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
	vdev_t *mvd;

	if (vd->vdev_guid == guid)
		return (vd);

	for (int c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

static int
vdev_count_leaves_impl(vdev_t *vd)
{
	int n = 0;

	if (vd->vdev_ops->vdev_op_leaf)
		return (1);

	for (int c = 0; c < vd->vdev_children; c++)
		n += vdev_count_leaves_impl(vd->vdev_child[c]);

	return (n);
}

int
vdev_count_leaves(spa_t *spa)
{
	return (vdev_count_leaves_impl(spa->spa_root_vdev));
}

void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
	size_t oldsize, newsize;
	uint64_t id = cvd->vdev_id;
	vdev_t **newchild;
	spa_t *spa = cvd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
	ASSERT(cvd->vdev_parent == NULL);

	cvd->vdev_parent = pvd;

	if (pvd == NULL)
		return;

	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

	oldsize = pvd->vdev_children * sizeof (vdev_t *);
	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
	newsize = pvd->vdev_children * sizeof (vdev_t *);

	newchild = kmem_zalloc(newsize, KM_SLEEP);
	if (pvd->vdev_child != NULL) {
		bcopy(pvd->vdev_child, newchild, oldsize);
		kmem_free(pvd->vdev_child, oldsize);
	}

	pvd->vdev_child = newchild;
	pvd->vdev_child[id] = cvd;

	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
}

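/*
 * Remove a child vdev from its parent's child array and subtract its
 * guid sum from all ancestors.
 */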
void
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
{
	int c;
	uint_t id = cvd->vdev_id;

	ASSERT(cvd->vdev_parent == pvd);

	if (pvd == NULL)
		return;

	ASSERT(id < pvd->vdev_children);
	ASSERT(pvd->vdev_child[id] == cvd);

	pvd->vdev_child[id] = NULL;
	cvd->vdev_parent = NULL;

	for (c = 0; c < pvd->vdev_children; c++)
		if (pvd->vdev_child[c])
			break;

	if (c == pvd->vdev_children) {
		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
		pvd->vdev_child = NULL;
		pvd->vdev_children = 0;
	}

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
}

/*
 * Remove any holes in the child array.
 */
void
vdev_compact_children(vdev_t *pvd)
{
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;
	int newc;

	ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	for (int c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])
			newc++;

	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

	for (int c = newc = 0; c < oldc; c++) {
		if ((cvd = pvd->vdev_child[c]) != NULL) {
			newchild[newc] = cvd;
			cvd->vdev_id = newc++;
		}
	}

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
}

/*
 * Allocate and minimally initialize a vdev_t.
 */
vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);

	if (spa->spa_root_vdev == NULL) {
		ASSERT(ops == &vdev_root_ops);
		spa->spa_root_vdev = vd;
		spa->spa_load_guid = spa_generate_guid(NULL);
	}

	if (guid == 0 && ops != &vdev_hole_ops) {
		if (spa->spa_root_vdev == vd) {
			/*
			 * The root vdev's guid will also be the pool guid,
			 * which must be unique among all pools.
			 */
			guid = spa_generate_guid(NULL);
		} else {
			/*
			 * Any other vdev's guid must be unique within the pool.
			 */
			guid = spa_generate_guid(spa);
		}
		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
	}

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_ishole = (ops == &vdev_hole_ops);

	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
	for (int t = 0; t < DTL_TYPES; t++) {
		vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
		    &vd->vdev_dtl_lock);
	}
	txg_list_create(&vd->vdev_ms_list,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();
	vdev_queue_init(vd);
	vdev_cache_init(vd);

	return (vd);
}

/*
 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 */
int
vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
    int alloctype)
{
	vdev_ops_t *ops;
	char *type;
	uint64_t guid = 0, islog, nparity;
	vdev_t *vd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (SET_ERROR(EINVAL));

	if ((ops = vdev_getops(type)) == NULL)
		return (SET_ERROR(EINVAL));

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (SET_ERROR(EINVAL));

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_SPARE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	} else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (SET_ERROR(EINVAL));
	}

	/*
	 * The first allocated vdev must be of type 'root'.
	 */
	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
		return (SET_ERROR(EINVAL));

	/*
	 * Determine whether we're a log vdev.
	 */
	islog = 0;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
		return (SET_ERROR(ENOTSUP));

	if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
		return (SET_ERROR(ENOTSUP));

	/*
	 * Set the nparity property for RAID-Z vdevs.
	 */
	nparity = -1ULL;
	if (ops == &vdev_raidz_ops) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
		    &nparity) == 0) {
			if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
				return (SET_ERROR(EINVAL));
			/*
			 * Previous versions could only support 1 or 2 parity
			 * devices.
			 */
			if (nparity > 1 &&
			    spa_version(spa) < SPA_VERSION_RAIDZ2)
				return (SET_ERROR(ENOTSUP));
			if (nparity > 2 &&
			    spa_version(spa) < SPA_VERSION_RAIDZ3)
				return (SET_ERROR(ENOTSUP));
		} else {
			/*
			 * We require the parity to be specified for SPAs that
			 * support multiple parity levels.
			 */
			if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
				return (SET_ERROR(EINVAL));
			/*
			 * Otherwise, we default to 1 parity device for RAID-Z.
			 */
			nparity = 1;
		}
	} else {
		nparity = 0;
	}
	ASSERT(nparity != -1ULL);

	vd = vdev_alloc_common(spa, id, guid, ops);

	vd->vdev_islog = islog;
	vd->vdev_nparity = nparity;

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
		vd->vdev_path = spa_strdup(vd->vdev_path);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
		vd->vdev_devid = spa_strdup(vd->vdev_devid);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
	    &vd->vdev_physpath) == 0)
		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0)
		vd->vdev_fru = spa_strdup(vd->vdev_fru);

	/*
	 * Set the whole_disk property.  If it's not specified, leave the value
	 * as -1.
	 */
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &vd->vdev_wholedisk) != 0)
		vd->vdev_wholedisk = -1ULL;

	/*
	 * Look for the 'not present' flag.  This will only be set if the device
	 * was not present at the time of import.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
	    &vd->vdev_not_present);

	/*
	 * Get the alignment requirement.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);

	/*
	 * Retrieve the vdev creation time.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
	    &vd->vdev_crtxg);

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (parent && !parent->vdev_parent &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
		    &vd->vdev_removing);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
		    &vd->vdev_top_zap);
	} else {
		ASSERT0(vd->vdev_top_zap);
	}

	if (parent && !parent->vdev_parent && alloctype != VDEV_ALLOC_ATTACH) {
		ASSERT(alloctype == VDEV_ALLOC_LOAD ||
		    alloctype == VDEV_ALLOC_ADD ||
		    alloctype == VDEV_ALLOC_SPLIT ||
		    alloctype == VDEV_ALLOC_ROOTPOOL);
		vd->vdev_mg = metaslab_group_create(islog ?
		    spa_log_class(spa) : spa_normal_class(spa), vd);
	}

	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
		(void) nvlist_lookup_uint64(nv,
		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
	} else {
		ASSERT0(vd->vdev_leaf_zap);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object and other state.
	 */

	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
	    alloctype == VDEV_ALLOC_ROOTPOOL)) {
		if (alloctype == VDEV_ALLOC_LOAD) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
			    &vd->vdev_dtl_object);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
			    &vd->vdev_unspare);
		}

		if (alloctype == VDEV_ALLOC_ROOTPOOL) {
			uint64_t spare = 0;

			if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
			    &spare) == 0 && spare)
				spa_spare_add(vd);
		}

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
		    &vd->vdev_offline);

		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
		    &vd->vdev_resilver_txg);

		/*
		 * When importing a pool, we want to ignore the persistent fault
		 * state, as the diagnosis made on another system may not be
		 * valid in the current context.  Local vdevs will
		 * remain in the faulted state.
		 */
		if (spa_load_state(spa) == SPA_LOAD_OPEN) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
			    &vd->vdev_faulted);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
			    &vd->vdev_degraded);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
			    &vd->vdev_removed);

			if (vd->vdev_faulted || vd->vdev_degraded) {
				char *aux;

				vd->vdev_label_aux =
				    VDEV_AUX_ERR_EXCEEDED;
				if (nvlist_lookup_string(nv,
				    ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
				    strcmp(aux, "external") == 0)
					vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
			}
		}
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	*vdp = vd;

	return (0);
}

void
vdev_free(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	/*
	 * vdev_free() implies closing the vdev first.  This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);

	ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));

	/*
	 * Free all children.
	 */
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	 * Discard allocation state.
	 */
	if (vd->vdev_mg != NULL) {
		vdev_metaslab_fini(vd);
		metaslab_group_destroy(vd->vdev_mg);
	}

	ASSERT0(vd->vdev_stat.vs_space);
	ASSERT0(vd->vdev_stat.vs_dspace);
	ASSERT0(vd->vdev_stat.vs_alloc);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	/*
	 * Clean up vdev structure.
	 */
	vdev_queue_fini(vd);
	vdev_cache_fini(vd);

	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);
	if (vd->vdev_physpath)
		spa_strfree(vd->vdev_physpath);
	if (vd->vdev_fru)
		spa_strfree(vd->vdev_fru);

	if (vd->vdev_isspare)
		spa_spare_remove(vd);
	if (vd->vdev_isl2cache)
		spa_l2cache_remove(vd);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_close(vd->vdev_dtl_sm);
	for (int t = 0; t < DTL_TYPES; t++) {
		range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
		range_tree_destroy(vd->vdev_dtl[t]);
	}
	mutex_exit(&vd->vdev_dtl_lock);

	mutex_destroy(&vd->vdev_queue_lock);
	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_stat_lock);
	mutex_destroy(&vd->vdev_probe_lock);

	if (vd == spa->spa_root_vdev)
		spa->spa_root_vdev = NULL;

	kmem_free(vd, sizeof (vdev_t));
}

/*
 * Transfer top-level vdev state from svd to tvd.
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;
	tvd->vdev_top_zap = svd->vdev_top_zap;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;
	svd->vdev_top_zap = 0;

	if (tvd->vdev_mg)
		ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_ms = svd->vdev_ms;

	svd->vdev_mg = NULL;
	svd->vdev_ms = NULL;

	if (tvd->vdev_mg != NULL)
		tvd->vdev_mg->mg_vd = tvd;

	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;
	svd->vdev_stat.vs_dspace = 0;

	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
	}

	if (list_link_active(&svd->vdev_config_dirty_node)) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	if (list_link_active(&svd->vdev_state_dirty_node)) {
		vdev_state_clean(svd);
		vdev_state_dirty(tvd);
	}

	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
	svd->vdev_deflate_ratio = 0;

	tvd->vdev_islog = svd->vdev_islog;
	svd->vdev_islog = 0;
}

static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
	if (vd == NULL)
		return;

	vd->vdev_top = tvd;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
}

/*
 * Add a mirror/replacing vdev above an existing vdev.
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_min_asize = cvd->vdev_min_asize;
	mvd->vdev_max_asize = cvd->vdev_max_asize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_state = cvd->vdev_state;
	mvd->vdev_crtxg = cvd->vdev_crtxg;

	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}

/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops ||
	    mvd->vdev_ops == &vdev_spare_ops);
	cvd->vdev_ashift = mvd->vdev_ashift;

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);

	/*
	 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
	 * Otherwise, we could have detached an offline device, and when we
	 * go to import the pool we'll think we have two top-level vdevs,
	 * instead of a different version of the same top-level vdev.
	 */
	if (mvd->vdev_top == mvd) {
		uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
		cvd->vdev_orig_guid = cvd->vdev_guid;
		cvd->vdev_guid += guid_delta;
		cvd->vdev_guid_sum += guid_delta;
	}
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}

int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	uint64_t m;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	metaslab_t **mspp;
	int error;

	ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));

	/*
	 * This vdev is not being allocated from yet or is a hole.
	 */
	if (vd->vdev_ms_shift == 0)
		return (0);

	ASSERT(!vd->vdev_ishole);

	/*
	 * Compute the raidz-deflation ratio.  Note, we hard-code
	 * in 128k (1 << 17) because it is the "typical" blocksize.
	 * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
	 * otherwise it would inconsistently account for existing bp's.
	 */
	vd->vdev_deflate_ratio = (1 << 17) /
	    (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);

	ASSERT(oldc <= newc);

	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);

	if (oldc != 0) {
		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
		kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
	}

	vd->vdev_ms = mspp;
	vd->vdev_ms_count = newc;

	for (m = oldc; m < newc; m++) {
		uint64_t object = 0;

		if (txg == 0) {
			error = dmu_read(mos, vd->vdev_ms_array,
			    m * sizeof (uint64_t), sizeof (uint64_t), &object,
			    DMU_READ_PREFETCH);
			if (error)
				return (error);
		}

		error = metaslab_init(vd->vdev_mg, m, object, txg,
		    &(vd->vdev_ms[m]));
		if (error)
			return (error);
	}

	if (txg == 0)
		spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);

	/*
	 * If the vdev is being removed we don't activate
	 * the metaslabs since we want to ensure that no new
	 * allocations are performed on this device.
	 */
	if (oldc == 0 && !vd->vdev_removing)
		metaslab_group_activate(vd->vdev_mg);

	if (txg == 0)
		spa_config_exit(spa, SCL_ALLOC, FTAG);

	return (0);
}

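/*
 * Release a vdev's metaslabs: passivate its metaslab group, tear down
 * each metaslab, and free the metaslab array.
 */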
void
vdev_metaslab_fini(vdev_t *vd)
{
	uint64_t m;
	uint64_t count = vd->vdev_ms_count;

	if (vd->vdev_ms != NULL) {
		metaslab_group_passivate(vd->vdev_mg);
		for (m = 0; m < count; m++) {
			metaslab_t *msp = vd->vdev_ms[m];

			if (msp != NULL)
				metaslab_fini(msp);
		}
		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;
	}
}

typedef struct vdev_probe_stats {
	boolean_t	vps_readable;
	boolean_t	vps_writeable;
	int		vps_flags;
} vdev_probe_stats_t;

static void
vdev_probe_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	vdev_t *vd = zio->io_vd;
	vdev_probe_stats_t *vps = zio->io_private;

	ASSERT(vd->vdev_probe_zio != NULL);

	if (zio->io_type == ZIO_TYPE_READ) {
		if (zio->io_error == 0)
			vps->vps_readable = 1;
		if (zio->io_error == 0 && spa_writeable(spa)) {
			zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
			    zio->io_offset, zio->io_size, zio->io_data,
			    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
			    ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
		} else {
			zio_buf_free(zio->io_data, zio->io_size);
		}
	} else if (zio->io_type == ZIO_TYPE_WRITE) {
		if (zio->io_error == 0)
			vps->vps_writeable = 1;
		zio_buf_free(zio->io_data, zio->io_size);
	} else if (zio->io_type == ZIO_TYPE_NULL) {
		zio_t *pio;

		vd->vdev_cant_read |= !vps->vps_readable;
		vd->vdev_cant_write |= !vps->vps_writeable;

		if (vdev_readable(vd) &&
		    (vdev_writeable(vd) || !spa_writeable(spa))) {
			zio->io_error = 0;
		} else {
			ASSERT(zio->io_error != 0);
			zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
			    spa, vd, NULL, 0, 0);
			zio->io_error = SET_ERROR(ENXIO);
		}

		mutex_enter(&vd->vdev_probe_lock);
		ASSERT(vd->vdev_probe_zio == zio);
		vd->vdev_probe_zio = NULL;
		mutex_exit(&vd->vdev_probe_lock);

		zio_link_t *zl = NULL;
		while ((pio = zio_walk_parents(zio, &zl)) != NULL)
			if (!vdev_accessible(vd, pio))
				pio->io_error = SET_ERROR(ENXIO);

		kmem_free(vps, sizeof (*vps));
	}
}

/*
 * Determine whether this device is accessible.
 *
 * Read and write to several known locations: the pad regions of each
 * vdev label but the first, which we leave alone in case it contains
 * a VTOC.
 */
zio_t *
vdev_probe(vdev_t *vd, zio_t *zio)
{
	spa_t *spa = vd->vdev_spa;
	vdev_probe_stats_t *vps = NULL;
	zio_t *pio;

	ASSERT(vd->vdev_ops->vdev_op_leaf);

	/*
	 * Don't probe the probe.
	 */
	if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
		return (NULL);

	/*
	 * To prevent 'probe storms' when a device fails, we create
	 * just one probe i/o at a time.  All zios that want to probe
	 * this vdev will become parents of the probe io.
	 */
	mutex_enter(&vd->vdev_probe_lock);

	if ((pio = vd->vdev_probe_zio) == NULL) {
		vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);

		vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE |
		    ZIO_FLAG_TRYHARD;

		if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
			/*
			 * vdev_cant_read and vdev_cant_write can only
			 * transition from TRUE to FALSE when we have the
			 * SCL_ZIO lock as writer; otherwise they can only
			 * transition from FALSE to TRUE.  This ensures that
			 * any zio looking at these values can assume that
			 * failures persist for the life of the I/O.  That's
			 * important because when a device has intermittent
			 * connectivity problems, we want to ensure that
			 * they're ascribed to the device (ENXIO) and not
			 * the zio (EIO).
			 *
			 * Since we hold SCL_ZIO as writer here, clear both
			 * values so the probe can reevaluate from first
			 * principles.
			 */
			vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
			vd->vdev_cant_read = B_FALSE;
			vd->vdev_cant_write = B_FALSE;
		}

		vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
		    vdev_probe_done, vps,
		    vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);

		/*
		 * We can't change the vdev state in this context, so we
		 * kick off an async task to do it on our behalf.
		 */
		if (zio != NULL) {
			vd->vdev_probe_wanted = B_TRUE;
			spa_async_request(spa, SPA_ASYNC_PROBE);
		}
	}

	if (zio != NULL)
		zio_add_child(zio, pio);

	mutex_exit(&vd->vdev_probe_lock);

	if (vps == NULL) {
		ASSERT(zio != NULL);
		return (NULL);
	}

	for (int l = 1; l < VDEV_LABELS; l++) {
		zio_nowait(zio_read_phys(pio, vd,
		    vdev_label_offset(vd->vdev_psize, l,
		    offsetof(vdev_label_t, vl_pad2)),
		    VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE),
		    ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
		    ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
	}

	if (zio == NULL)
		return (pio);

	zio_nowait(pio);
	return (NULL);
}

static void
vdev_open_child(void *arg)
{
	vdev_t *vd = arg;

	vd->vdev_open_thread = curthread;
	vd->vdev_open_error = vdev_open(vd);
	vd->vdev_open_thread = NULL;
}

boolean_t
vdev_uses_zvols(vdev_t *vd)
{
	if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR,
	    strlen(ZVOL_DIR)) == 0)
		return (B_TRUE);
	for (int c = 0; c < vd->vdev_children; c++)
		if (vdev_uses_zvols(vd->vdev_child[c]))
			return (B_TRUE);
	return (B_FALSE);
}

void
vdev_open_children(vdev_t *vd)
{
	taskq_t *tq;
	int children = vd->vdev_children;

	/*
	 * in order to handle pools on top of zvols, do the opens
	 * in a single thread so that the same thread holds the
	 * spa_namespace_lock
	 */
	if (vdev_uses_zvols(vd)) {
		for (int c = 0; c < children; c++)
			vd->vdev_child[c]->vdev_open_error =
			    vdev_open(vd->vdev_child[c]);
		return;
	}
	tq = taskq_create("vdev_open", children, minclsyspri,
	    children, children, TASKQ_PREPOPULATE);

	for (int c = 0; c < children; c++)
		VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c],
		    TQ_SLEEP) != NULL);

	taskq_destroy(tq);
}

/*
 * Prepare a virtual device for access.
 */
int
vdev_open(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	int error;
	uint64_t osize = 0;
	uint64_t max_osize = 0;
	uint64_t asize, max_asize, psize;
	uint64_t ashift = 0;

	ASSERT(vd->vdev_open_thread == curthread ||
	    spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	    vd->vdev_state == VDEV_STATE_OFFLINE);

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	vd->vdev_cant_read = B_FALSE;
	vd->vdev_cant_write = B_FALSE;
	vd->vdev_min_asize = vdev_get_min_asize(vd);

	/*
	 * If this vdev is not removed, check its fault status.  If it's
	 * faulted, bail out of the open.
	 */
	if (!vd->vdev_removed && vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    vd->vdev_label_aux);
		return (SET_ERROR(ENXIO));
	} else if (vd->vdev_offline) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
		return (SET_ERROR(ENXIO));
	}

	error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);

	/*
	 * Reset the vdev_reopening flag so that we actually close
	 * the vdev on error.
	 */
	vd->vdev_reopening = B_FALSE;
	if (zio_injection_enabled && error == 0)
		error = zio_handle_device_injection(vd, NULL, ENXIO);

	if (error) {
		if (vd->vdev_removed &&
		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
			vd->vdev_removed = B_FALSE;

		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    vd->vdev_stat.vs_aux);
		return (error);
	}

	vd->vdev_removed = B_FALSE;

	/*
	 * Recheck the faulted flag now that we have confirmed that
	 * the vdev is accessible.  If we're faulted, bail.
	 */
	if (vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
		    vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    vd->vdev_label_aux);
		return (SET_ERROR(ENXIO));
	}

	if (vd->vdev_degraded) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
		    VDEV_AUX_ERR_EXCEEDED);
	} else {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
	}

	/*
	 * For hole or missing vdevs we just return success.
	 */
	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
		return (0);

	for (int c = 0; c < vd->vdev_children; c++) {
		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_NONE);
			break;
		}
	}

	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
	max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t));

	if (vd->vdev_children == 0) {
		if (osize < SPA_MINDEVSIZE) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (SET_ERROR(EOVERFLOW));
		}
		psize = osize;
		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
		max_asize = max_osize - (VDEV_LABEL_START_SIZE +
		    VDEV_LABEL_END_SIZE);
	} else {
		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (SET_ERROR(EOVERFLOW));
		}
		psize = 0;
		asize = osize;
		max_asize = max_osize;
	}

	vd->vdev_psize = psize;

	/*
	 * Make sure the allocatable size hasn't shrunk.
	 */
	if (asize < vd->vdev_min_asize) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_BAD_LABEL);
		return (SET_ERROR(EINVAL));
	}

	if (vd->vdev_asize == 0) {
		/*
		 * This is the first-ever open, so use the computed values.
		 * For testing purposes, a higher ashift can be requested.
		 */
		vd->vdev_asize = asize;
		vd->vdev_max_asize = max_asize;
		vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
	} else {
		/*
		 * Detect if the alignment requirement has increased.
		 * We don't want to make the pool unavailable, just
		 * issue a warning instead.
		 */
		if (ashift > vd->vdev_top->vdev_ashift &&
		    vd->vdev_ops->vdev_op_leaf) {
			cmn_err(CE_WARN,
			    "Disk, '%s', has a block alignment that is "
			    "larger than the pool's alignment\n",
			    vd->vdev_path);
		}
		vd->vdev_max_asize = max_asize;
	}

	/*
	 * If all children are healthy and the asize has increased,
	 * then we've experienced dynamic LUN growth.  If automatic
	 * expansion is enabled then use the additional space.
	 */
	if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize &&
	    (vd->vdev_expanding || spa->spa_autoexpand))
		vd->vdev_asize = asize;

	vdev_set_min_asize(vd);

	/*
	 * Ensure we can issue some IO before declaring the
	 * vdev open for business.
	 */
	if (vd->vdev_ops->vdev_op_leaf &&
	    (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    VDEV_AUX_ERR_EXCEEDED);
		return (error);
	}

	/*
	 * Track the min and max ashift values for normal data devices.
	 */
	if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
	    !vd->vdev_islog && vd->vdev_aux == NULL) {
		if (vd->vdev_ashift > spa->spa_max_ashift)
			spa->spa_max_ashift = vd->vdev_ashift;
		if (vd->vdev_ashift < spa->spa_min_ashift)
			spa->spa_min_ashift = vd->vdev_ashift;
	}

	/*
	 * If a leaf vdev has a DTL, and seems healthy, then kick off a
	 * resilver.  But don't do this if we are doing a reopen for a scrub,
	 * since this would just restart the scrub we are already doing.
	 */
	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
	    vdev_resilver_needed(vd, NULL, NULL))
		spa_async_request(spa, SPA_ASYNC_RESILVER);

	return (0);
}

/*
 * Called once the vdevs are all opened, this routine validates the label
 * contents.  This needs to be done before vdev_load() so that we don't
 * inadvertently do repair I/Os to the wrong device.
 *
 * If 'strict' is false ignore the spa guid check.  This is necessary because
 * if the machine crashed during a re-guid the new guid might have been written
 * to all of the vdev labels, but not the cached config.  The strict check
 * will be performed when the pool is opened again using the mos config.
 *
 * This function will only return failure if one of the vdevs indicates that it
 * has since been destroyed or exported.  This is only possible if
 * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
 * will be updated but the function will return 0.
 */
int
vdev_validate(vdev_t *vd, boolean_t strict)
{
	spa_t *spa = vd->vdev_spa;
	nvlist_t *label;
	uint64_t guid = 0, top_guid;
	uint64_t state;

	for (int c = 0; c < vd->vdev_children; c++)
		if (vdev_validate(vd->vdev_child[c], strict) != 0)
			return (SET_ERROR(EBADF));

	/*
	 * If the device has already failed, or was marked offline, don't do
	 * any further validation.  Otherwise, label I/O will fail and we will
	 * overwrite the previous state.
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
		uint64_t aux_guid = 0;
		nvlist_t *nvl;
		uint64_t txg = spa_last_synced_txg(spa) != 0 ?
		    spa_last_synced_txg(spa) : -1ULL;

		if ((label = vdev_label_read_config(vd, txg)) == NULL) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (0);
		}

		/*
		 * Determine if this vdev has been split off into another
		 * pool.  If so, then refuse to open it.
		 */
		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
		    &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_SPLIT_POOL);
			nvlist_free(label);
			return (0);
		}

		if (strict && (nvlist_lookup_uint64(label,
		    ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
		    guid != spa_guid(spa))) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
		    != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
		    &aux_guid) != 0)
			aux_guid = 0;

		/*
		 * If this vdev just became a top-level vdev because its
		 * sibling was detached, it will have adopted the parent's
		 * vdev guid -- but the label may or may not be on disk yet.
		 * Fortunately, either version of the label will have the
		 * same top guid, so if we're a top-level vdev, we can
		 * safely compare to that instead.
		 *
		 * If we split this vdev off instead, then we also check the
		 * original pool's guid.  We don't want to consider the vdev
		 * corrupt if it is partway through a split operation.
		 */
		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
		    &guid) != 0 ||
		    nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID,
		    &top_guid) != 0 ||
		    ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) &&
		    (vd->vdev_guid != top_guid || vd != vd->vdev_top))) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		nvlist_free(label);

		/*
		 * If this is a verbatim import, no need to check the
		 * state of the pool.
		 */
		if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
		    spa_load_state(spa) == SPA_LOAD_OPEN &&
		    state != POOL_STATE_ACTIVE)
			return (SET_ERROR(EBADF));

		/*
		 * If we were able to open and validate a vdev that was
		 * previously marked permanently unavailable, clear that state
		 * now.
		 */
		if (vd->vdev_not_present)
			vd->vdev_not_present = 0;
	}

	return (0);
}

/*
 * Close a virtual device.
 */
void
vdev_close(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *pvd = vd->vdev_parent;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/*
	 * If our parent is reopening, then we are as well, unless we are
	 * going offline.
	 */
	if (pvd != NULL && pvd->vdev_reopening)
		vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);

	vd->vdev_ops->vdev_op_close(vd);

	vdev_cache_purge(vd);

	/*
	 * We record the previous state before we close it, so that if we are
	 * doing a reopen(), we don't generate FMA ereports if we notice that
	 * it's still faulted.
	 */
	vd->vdev_prevstate = vd->vdev_state;

	if (vd->vdev_offline)
		vd->vdev_state = VDEV_STATE_OFFLINE;
	else
		vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}

void
vdev_hold(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_is_root(spa));
	if (spa->spa_state == POOL_STATE_UNINITIALIZED)
		return;

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_hold(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_hold(vd);
}

void
vdev_rele(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_is_root(spa));
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_rele(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_rele(vd);
}

/*
 * Reopen all interior vdevs and any unopened leaves.  We don't actually
 * reopen leaf vdevs which had previously been opened as they might deadlock
 * on the spa_config_lock.  Instead we only obtain the leaf's physical size.
 * If the leaf has never been opened then open it, as usual.
 */
void
vdev_reopen(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);

	/* set the reopening flag unless we're taking the vdev offline */
	vd->vdev_reopening = !vd->vdev_offline;
	vdev_close(vd);
	(void) vdev_open(vd);

	/*
	 * Call vdev_validate() here to make sure we have the same device.
	 * Otherwise, a device with an invalid label could be successfully
	 * opened in response to vdev_reopen().
	 */
	if (vd->vdev_aux) {
		(void) vdev_validate_aux(vd);
		if (vdev_readable(vd) && vdev_writeable(vd) &&
		    vd->vdev_aux == &spa->spa_l2cache &&
		    !l2arc_vdev_present(vd))
			l2arc_add_vdev(spa, vd);
	} else {
		(void) vdev_validate(vd, B_TRUE);
	}

	/*
	 * Reassess parent vdev's health.
	 */
	vdev_propagate_state(vd);
}

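/*
 * Open a vdev that is being created, added, or attached.  Unlike a normal
 * open, fail if any component cannot be opened; on success, load the
 * vdev's DTLs and initialize its labels.
 */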
int
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : ENXIO);
	}

	/*
	 * Recursively load DTLs and initialize all labels.
	 */
	if ((error = vdev_dtl_load(vd)) != 0 ||
	    (error = vdev_label_init(vd, txg, isreplacing ?
	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}

void
vdev_metaslab_set_size(vdev_t *vd)
{
	/*
	 * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev.
	 */
	vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev);
	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
}

void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
	ASSERT(vd == vd->vdev_top);
	ASSERT(!vd->vdev_ishole);
	ASSERT(ISP2(flags));
	ASSERT(spa_writeable(vd->vdev_spa));

	if (flags & VDD_METASLAB)
		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

	if (flags & VDD_DTL)
		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}

void
vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
{
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_dirty_leaves(vd->vdev_child[c], flags, txg);

	if (vd->vdev_ops->vdev_op_leaf)
		vdev_dirty(vd->vdev_top, flags, vd, txg);
}

/*
 * DTLs.
 *
 * A vdev's DTL (dirty time log) is the set of transaction groups for which
 * the vdev has less than perfect replication.  There are four kinds of DTL:
 *
 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
 *
 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
 *
 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
 *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
 *	txgs that was scrubbed.
 *
 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
 *	persistent errors or just some device being offline.
 *	Unlike the other three, the DTL_OUTAGE map is not generally
 *	maintained; it's only computed when needed, typically to
 *	determine whether a device can be detached.
 *
 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
 * either has the data or it doesn't.
 *
 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
 * if any child is less than fully replicated, then so is its parent.
 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
 * comprising only those txgs which appear in 'maxfaults' or more children;
 * those are the txgs we don't have enough replication to read.  For example,
 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
 * two child DTL_MISSING maps.
 *
 * It should be clear from the above that to compute the DTLs and outage maps
 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
 * Therefore, that is all we keep on disk.  When loading the pool, or after
 * a configuration change, we generate all other DTLs from first principles.
 */
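
/*
 * Add the range [txg, txg + size) to the indicated DTL if it is not
 * already present.
 */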
void
vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	range_tree_t *rt = vd->vdev_dtl[t];

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
	ASSERT(spa_writeable(vd->vdev_spa));

	mutex_enter(rt->rt_lock);
	if (!range_tree_contains(rt, txg, size))
		range_tree_add(rt, txg, size);
	mutex_exit(rt->rt_lock);
}

boolean_t
vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
	range_tree_t *rt = vd->vdev_dtl[t];
	boolean_t dirty = B_FALSE;

	ASSERT(t < DTL_TYPES);
	ASSERT(vd != vd->vdev_spa->spa_root_vdev);

	mutex_enter(rt->rt_lock);
	if (range_tree_space(rt) != 0)
		dirty = range_tree_contains(rt, txg, size);
	mutex_exit(rt->rt_lock);

	return (dirty);
}

boolean_t
vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
{
	range_tree_t *rt = vd->vdev_dtl[t];
	boolean_t empty;

	mutex_enter(rt->rt_lock);
	empty = (range_tree_space(rt) == 0);
	mutex_exit(rt->rt_lock);

	return (empty);
}

/*
 * Returns the lowest txg in the DTL range.
 */
static uint64_t
vdev_dtl_min(vdev_t *vd)
{
	range_seg_t *rs;

	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
	ASSERT0(vd->vdev_children);

	rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
	return (rs->rs_start - 1);
}

/*
 * Returns the highest txg in the DTL.
 */
static uint64_t
vdev_dtl_max(vdev_t *vd)
{
	range_seg_t *rs;

	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
	ASSERT0(vd->vdev_children);

	rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
	return (rs->rs_end);
}

/*
 * Determine if a resilvering vdev should remove any DTL entries from
 * its range.  If the vdev was resilvering for the entire duration of the
 * scan then it should excise that range from its DTLs.  Otherwise, this
 * vdev is considered partially resilvered and should leave its DTL
 * entries intact.  The comment in vdev_dtl_reassess() describes how we
 * excise the DTLs.
 */
static boolean_t
vdev_dtl_should_excise(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

	ASSERT0(scn->scn_phys.scn_errors);
	ASSERT0(vd->vdev_children);

	if (vd->vdev_resilver_txg == 0 ||
	    range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
		return (B_TRUE);

	/*
	 * When a resilver is initiated the scan will assign the scn_max_txg
	 * value to the highest txg value that exists in all DTLs.  If this
	 * device's max DTL is not part of this scan (i.e. it is not in
	 * the range (scn_min_txg, scn_max_txg]) then it is not eligible
	 * for excision.
	 */
	if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
		ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
		ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
		ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
		return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * Reassess DTLs after a config change or scrub completion.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	spa_t *spa = vd->vdev_spa;
	avl_tree_t reftree;
	int minref;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);

	for (int c = 0; c < vd->vdev_children; c++)
		vdev_dtl_reassess(vd->vdev_child[c], txg,
		    scrub_txg, scrub_done);

	if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux)
		return;

	if (vd->vdev_ops->vdev_op_leaf) {
		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;

		mutex_enter(&vd->vdev_dtl_lock);

		/*
		 * If we've completed a scan cleanly then determine
		 * if this vdev should remove any DTLs.  We only want to
		 * excise regions on vdevs that were available during
		 * the entire duration of this scan.
		 */
		if (scrub_txg != 0 &&
		    (spa->spa_scrub_started ||
		    (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
		    vdev_dtl_should_excise(vd)) {
			/*
			 * We completed a scrub up to scrub_txg.  If we
			 * did it without rebooting, then the scrub dtl
			 * will be valid, so excise the old region and
			 * fold in the scrub dtl.  Otherwise, leave the
			 * dtl as-is if there was an error.
			 *
			 * There's a little trick here: to excise the beginning
			 * of the DTL_MISSING map, we put it into a reference
			 * tree and then add a segment with refcnt -1 that
			 * covers the range [0, scrub_txg).  This means
			 * that each txg in that range has refcnt -1 or 0.
			 * We then add DTL_SCRUB with a refcnt of 2, so that
			 * entries in the range [0, scrub_txg) will have a
			 * positive refcnt -- either 1 or 2.  We then convert
			 * the reference tree into the new DTL_MISSING map.
			 */
			space_reftree_create(&reftree);
			space_reftree_add_map(&reftree,
			    vd->vdev_dtl[DTL_MISSING], 1);
			space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
			space_reftree_add_map(&reftree,
			    vd->vdev_dtl[DTL_SCRUB], 2);
			space_reftree_generate_map(&reftree,
			    vd->vdev_dtl[DTL_MISSING], 1);
			space_reftree_destroy(&reftree);
		}
		range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
		range_tree_walk(vd->vdev_dtl[DTL_MISSING],
		    range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
		if (scrub_done)
			range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
		range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
		if (!vdev_readable(vd))
			range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
		else
			range_tree_walk(vd->vdev_dtl[DTL_MISSING],
			    range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);

		/*
		 * If the vdev was resilvering and no longer has any
		 * DTLs then reset its resilvering flag.
		 */
		if (vd->vdev_resilver_txg != 0 &&
		    range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
		    range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0)
			vd->vdev_resilver_txg = 0;

		mutex_exit(&vd->vdev_dtl_lock);

		if (txg != 0)
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
		return;
	}

	mutex_enter(&vd->vdev_dtl_lock);
	for (int t = 0; t < DTL_TYPES; t++) {
		/* account for child's outage in parent's missing map */
		int s = (t == DTL_MISSING) ? DTL_OUTAGE : t;
		if (t == DTL_SCRUB)
			continue;			/* leaf vdevs only */
		if (t == DTL_PARTIAL)
			minref = 1;			/* i.e. non-zero */
		else if (vd->vdev_nparity != 0)
			minref = vd->vdev_nparity + 1;	/* RAID-Z */
		else
			minref = vd->vdev_children;	/* any kind of mirror */
		space_reftree_create(&reftree);
		for (int c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			mutex_enter(&cvd->vdev_dtl_lock);
			space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
			mutex_exit(&cvd->vdev_dtl_lock);
		}
		space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
		space_reftree_destroy(&reftree);
	}
	mutex_exit(&vd->vdev_dtl_lock);
}

int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	int error = 0;

	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
		ASSERT(!vd->vdev_ishole);

		error = space_map_open(&vd->vdev_dtl_sm, mos,
		    vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock);
		if (error)
			return (error);
		ASSERT(vd->vdev_dtl_sm != NULL);

		mutex_enter(&vd->vdev_dtl_lock);

		/*
		 * Now that we've opened the space_map we need to update
		 * the in-core DTL.
		 */
		space_map_update(vd->vdev_dtl_sm);

		error = space_map_load(vd->vdev_dtl_sm,
		    vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
		mutex_exit(&vd->vdev_dtl_lock);

		return (error);
	}

	for (int c = 0; c < vd->vdev_children; c++) {
		error = vdev_dtl_load(vd->vdev_child[c]);
		if (error != 0)
			break;
	}

	return (error);
}

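/*
 * Destroy a vdev ZAP object and remove it from the pool-wide list of
 * vdev ZAPs.
 */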
1935 */ 1936 space_map_update(vd->vdev_dtl_sm); 1937 1938 error = space_map_load(vd->vdev_dtl_sm, 1939 vd->vdev_dtl[DTL_MISSING], SM_ALLOC); 1940 mutex_exit(&vd->vdev_dtl_lock); 1941 1942 return (error); 1943 } 1944 1945 for (int c = 0; c < vd->vdev_children; c++) { 1946 error = vdev_dtl_load(vd->vdev_child[c]); 1947 if (error != 0) 1948 break; 1949 } 1950 1951 return (error); 1952 } 1953 1954 void 1955 vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) 1956 { 1957 spa_t *spa = vd->vdev_spa; 1958 1959 VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); 1960 VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, 1961 zapobj, tx)); 1962 } 1963 1964 uint64_t 1965 vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) 1966 { 1967 spa_t *spa = vd->vdev_spa; 1968 uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, 1969 DMU_OT_NONE, 0, tx); 1970 1971 ASSERT(zap != 0); 1972 VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, 1973 zap, tx)); 1974 1975 return (zap); 1976 } 1977 1978 void 1979 vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) 1980 { 1981 if (vd->vdev_ops != &vdev_hole_ops && 1982 vd->vdev_ops != &vdev_missing_ops && 1983 vd->vdev_ops != &vdev_root_ops && 1984 !vd->vdev_top->vdev_removing) { 1985 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { 1986 vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); 1987 } 1988 if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { 1989 vd->vdev_top_zap = vdev_create_link_zap(vd, tx); 1990 } 1991 } 1992 for (uint64_t i = 0; i < vd->vdev_children; i++) { 1993 vdev_construct_zaps(vd->vdev_child[i], tx); 1994 } 1995 } 1996 1997 void 1998 vdev_dtl_sync(vdev_t *vd, uint64_t txg) 1999 { 2000 spa_t *spa = vd->vdev_spa; 2001 range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; 2002 objset_t *mos = spa->spa_meta_objset; 2003 range_tree_t *rtsync; 2004 kmutex_t rtlock; 2005 dmu_tx_t *tx; 2006 uint64_t object = space_map_object(vd->vdev_dtl_sm); 2007 2008 ASSERT(!vd->vdev_ishole); 2009 ASSERT(vd->vdev_ops->vdev_op_leaf); 2010 2011 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2012 2013 if (vd->vdev_detached || vd->vdev_top->vdev_removing) { 2014 mutex_enter(&vd->vdev_dtl_lock); 2015 space_map_free(vd->vdev_dtl_sm, tx); 2016 space_map_close(vd->vdev_dtl_sm); 2017 vd->vdev_dtl_sm = NULL; 2018 mutex_exit(&vd->vdev_dtl_lock); 2019 2020 /* 2021 * We only destroy the leaf ZAP for detached leaves or for 2022 * removed log devices. Removed data devices handle leaf ZAP 2023 * cleanup later, once cancellation is no longer possible. 
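		 * (vdev_destroy_unlink_zap() both frees the ZAP object and
		 * drops its entry from spa_all_vdev_zaps, undoing what
		 * vdev_create_link_zap() set up.)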
2024 */ 2025 if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || 2026 vd->vdev_top->vdev_islog)) { 2027 vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); 2028 vd->vdev_leaf_zap = 0; 2029 } 2030 2031 dmu_tx_commit(tx); 2032 return; 2033 } 2034 2035 if (vd->vdev_dtl_sm == NULL) { 2036 uint64_t new_object; 2037 2038 new_object = space_map_alloc(mos, tx); 2039 VERIFY3U(new_object, !=, 0); 2040 2041 VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, 2042 0, -1ULL, 0, &vd->vdev_dtl_lock)); 2043 ASSERT(vd->vdev_dtl_sm != NULL); 2044 } 2045 2046 mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL); 2047 2048 rtsync = range_tree_create(NULL, NULL, &rtlock); 2049 2050 mutex_enter(&rtlock); 2051 2052 mutex_enter(&vd->vdev_dtl_lock); 2053 range_tree_walk(rt, range_tree_add, rtsync); 2054 mutex_exit(&vd->vdev_dtl_lock); 2055 2056 space_map_truncate(vd->vdev_dtl_sm, tx); 2057 space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx); 2058 range_tree_vacate(rtsync, NULL, NULL); 2059 2060 range_tree_destroy(rtsync); 2061 2062 mutex_exit(&rtlock); 2063 mutex_destroy(&rtlock); 2064 2065 /* 2066 * If the object for the space map has changed then dirty 2067 * the top level so that we update the config. 2068 */ 2069 if (object != space_map_object(vd->vdev_dtl_sm)) { 2070 zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, " 2071 "new object %llu", txg, spa_name(spa), object, 2072 space_map_object(vd->vdev_dtl_sm)); 2073 vdev_config_dirty(vd->vdev_top); 2074 } 2075 2076 dmu_tx_commit(tx); 2077 2078 mutex_enter(&vd->vdev_dtl_lock); 2079 space_map_update(vd->vdev_dtl_sm); 2080 mutex_exit(&vd->vdev_dtl_lock); 2081 } 2082 2083 /* 2084 * Determine whether the specified vdev can be offlined/detached/removed 2085 * without losing data. 2086 */ 2087 boolean_t 2088 vdev_dtl_required(vdev_t *vd) 2089 { 2090 spa_t *spa = vd->vdev_spa; 2091 vdev_t *tvd = vd->vdev_top; 2092 uint8_t cant_read = vd->vdev_cant_read; 2093 boolean_t required; 2094 2095 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 2096 2097 if (vd == spa->spa_root_vdev || vd == tvd) 2098 return (B_TRUE); 2099 2100 /* 2101 * Temporarily mark the device as unreadable, and then determine 2102 * whether this results in any DTL outages in the top-level vdev. 2103 * If not, we can safely offline/detach/remove the device. 2104 */ 2105 vd->vdev_cant_read = B_TRUE; 2106 vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 2107 required = !vdev_dtl_empty(tvd, DTL_OUTAGE); 2108 vd->vdev_cant_read = cant_read; 2109 vdev_dtl_reassess(tvd, 0, 0, B_FALSE); 2110 2111 if (!required && zio_injection_enabled) 2112 required = !!zio_handle_device_injection(vd, NULL, ECHILD); 2113 2114 return (required); 2115 } 2116 2117 /* 2118 * Determine if resilver is needed, and if so the txg range. 
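 *
 * A leaf reports the [vdev_dtl_min, vdev_dtl_max] span of its DTL_MISSING
 * map, provided the map is non-empty and the device is writeable; an
 * interior vdev reports the union of its children's ranges, i.e. the MIN
 * of their minimums and the MAX of their maximums.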
2119  */
2120 boolean_t
2121 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
2122 {
2123 	boolean_t needed = B_FALSE;
2124 	uint64_t thismin = UINT64_MAX;
2125 	uint64_t thismax = 0;
2126 
2127 	if (vd->vdev_children == 0) {
2128 		mutex_enter(&vd->vdev_dtl_lock);
2129 		if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
2130 		    vdev_writeable(vd)) {
2131 
2132 			thismin = vdev_dtl_min(vd);
2133 			thismax = vdev_dtl_max(vd);
2134 			needed = B_TRUE;
2135 		}
2136 		mutex_exit(&vd->vdev_dtl_lock);
2137 	} else {
2138 		for (int c = 0; c < vd->vdev_children; c++) {
2139 			vdev_t *cvd = vd->vdev_child[c];
2140 			uint64_t cmin, cmax;
2141 
2142 			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
2143 				thismin = MIN(thismin, cmin);
2144 				thismax = MAX(thismax, cmax);
2145 				needed = B_TRUE;
2146 			}
2147 		}
2148 	}
2149 
2150 	if (needed && minp) {
2151 		*minp = thismin;
2152 		*maxp = thismax;
2153 	}
2154 	return (needed);
2155 }
2156 
2157 void
2158 vdev_load(vdev_t *vd)
2159 {
2160 	/*
2161 	 * Recursively load all children.
2162 	 */
2163 	for (int c = 0; c < vd->vdev_children; c++)
2164 		vdev_load(vd->vdev_child[c]);
2165 
2166 	/*
2167 	 * If this is a top-level vdev, initialize its metaslabs.
2168 	 */
2169 	if (vd == vd->vdev_top && !vd->vdev_ishole &&
2170 	    (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
2171 	    vdev_metaslab_init(vd, 0) != 0))
2172 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2173 		    VDEV_AUX_CORRUPT_DATA);
2174 
2175 	/*
2176 	 * If this is a leaf vdev, load its DTL.
2177 	 */
2178 	if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
2179 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2180 		    VDEV_AUX_CORRUPT_DATA);
2181 }
2182 
2183 /*
2184  * The special vdev case is used for hot spares and l2cache devices. Its
2185  * sole purpose is to set the vdev state for the associated vdev. To do this,
2186  * we make sure that we can open the underlying device, then try to read the
2187  * label, and make sure that the label is sane and that it hasn't been
2188  * repurposed to another pool.
2189  */
2190 int
2191 vdev_validate_aux(vdev_t *vd)
2192 {
2193 	nvlist_t *label;
2194 	uint64_t guid, version;
2195 	uint64_t state;
2196 
2197 	if (!vdev_readable(vd))
2198 		return (0);
2199 
2200 	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
2201 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2202 		    VDEV_AUX_CORRUPT_DATA);
2203 		return (-1);
2204 	}
2205 
2206 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
2207 	    !SPA_VERSION_IS_SUPPORTED(version) ||
2208 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
2209 	    guid != vd->vdev_guid ||
2210 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
2211 		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2212 		    VDEV_AUX_CORRUPT_DATA);
2213 		nvlist_free(label);
2214 		return (-1);
2215 	}
2216 
2217 	/*
2218 	 * We don't actually check the pool state here. If it's in fact in
2219 	 * use by another pool, we update this fact on the fly when requested.
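	 * (The lookup above only confirms that a POOL_STATE entry exists,
	 * i.e. that the label is complete; whether the device is actively
	 * in use elsewhere is presumably decided later by the callers that
	 * activate the spare or l2cache device.)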
2220 */ 2221 nvlist_free(label); 2222 return (0); 2223 } 2224 2225 void 2226 vdev_remove(vdev_t *vd, uint64_t txg) 2227 { 2228 spa_t *spa = vd->vdev_spa; 2229 objset_t *mos = spa->spa_meta_objset; 2230 dmu_tx_t *tx; 2231 2232 tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); 2233 ASSERT(vd == vd->vdev_top); 2234 ASSERT3U(txg, ==, spa_syncing_txg(spa)); 2235 2236 if (vd->vdev_ms != NULL) { 2237 metaslab_group_t *mg = vd->vdev_mg; 2238 2239 metaslab_group_histogram_verify(mg); 2240 metaslab_class_histogram_verify(mg->mg_class); 2241 2242 for (int m = 0; m < vd->vdev_ms_count; m++) { 2243 metaslab_t *msp = vd->vdev_ms[m]; 2244 2245 if (msp == NULL || msp->ms_sm == NULL) 2246 continue; 2247 2248 mutex_enter(&msp->ms_lock); 2249 /* 2250 * If the metaslab was not loaded when the vdev 2251 * was removed then the histogram accounting may 2252 * not be accurate. Update the histogram information 2253 * here so that we ensure that the metaslab group 2254 * and metaslab class are up-to-date. 2255 */ 2256 metaslab_group_histogram_remove(mg, msp); 2257 2258 VERIFY0(space_map_allocated(msp->ms_sm)); 2259 space_map_free(msp->ms_sm, tx); 2260 space_map_close(msp->ms_sm); 2261 msp->ms_sm = NULL; 2262 mutex_exit(&msp->ms_lock); 2263 } 2264 2265 metaslab_group_histogram_verify(mg); 2266 metaslab_class_histogram_verify(mg->mg_class); 2267 for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) 2268 ASSERT0(mg->mg_histogram[i]); 2269 2270 } 2271 2272 if (vd->vdev_ms_array) { 2273 (void) dmu_object_free(mos, vd->vdev_ms_array, tx); 2274 vd->vdev_ms_array = 0; 2275 } 2276 2277 if (vd->vdev_islog && vd->vdev_top_zap != 0) { 2278 vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); 2279 vd->vdev_top_zap = 0; 2280 } 2281 dmu_tx_commit(tx); 2282 } 2283 2284 void 2285 vdev_sync_done(vdev_t *vd, uint64_t txg) 2286 { 2287 metaslab_t *msp; 2288 boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); 2289 2290 ASSERT(!vd->vdev_ishole); 2291 2292 while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) 2293 metaslab_sync_done(msp, txg); 2294 2295 if (reassess) 2296 metaslab_sync_reassess(vd->vdev_mg); 2297 } 2298 2299 void 2300 vdev_sync(vdev_t *vd, uint64_t txg) 2301 { 2302 spa_t *spa = vd->vdev_spa; 2303 vdev_t *lvd; 2304 metaslab_t *msp; 2305 dmu_tx_t *tx; 2306 2307 ASSERT(!vd->vdev_ishole); 2308 2309 if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { 2310 ASSERT(vd == vd->vdev_top); 2311 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 2312 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, 2313 DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); 2314 ASSERT(vd->vdev_ms_array != 0); 2315 vdev_config_dirty(vd); 2316 dmu_tx_commit(tx); 2317 } 2318 2319 /* 2320 * Remove the metadata associated with this vdev once it's empty. 2321 */ 2322 if (vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) 2323 vdev_remove(vd, txg); 2324 2325 while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { 2326 metaslab_sync(msp, txg); 2327 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); 2328 } 2329 2330 while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) 2331 vdev_dtl_sync(lvd, txg); 2332 2333 (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); 2334 } 2335 2336 uint64_t 2337 vdev_psize_to_asize(vdev_t *vd, uint64_t psize) 2338 { 2339 return (vd->vdev_ops->vdev_op_asize(vd, psize)); 2340 } 2341 2342 /* 2343 * Mark the given vdev faulted. A faulted vdev behaves as if the device could 2344 * not be opened, and no I/O is attempted. 
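 *
 * "No I/O is attempted" follows from the state ordering: FAULTED sorts
 * below DEGRADED, so vdev_is_dead() is true and both vdev_readable() and
 * vdev_writeable() return B_FALSE for the device.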
2345 */ 2346 int 2347 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) 2348 { 2349 vdev_t *vd, *tvd; 2350 2351 spa_vdev_state_enter(spa, SCL_NONE); 2352 2353 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2354 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2355 2356 if (!vd->vdev_ops->vdev_op_leaf) 2357 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2358 2359 tvd = vd->vdev_top; 2360 2361 /* 2362 * We don't directly use the aux state here, but if we do a 2363 * vdev_reopen(), we need this value to be present to remember why we 2364 * were faulted. 2365 */ 2366 vd->vdev_label_aux = aux; 2367 2368 /* 2369 * Faulted state takes precedence over degraded. 2370 */ 2371 vd->vdev_delayed_close = B_FALSE; 2372 vd->vdev_faulted = 1ULL; 2373 vd->vdev_degraded = 0ULL; 2374 vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); 2375 2376 /* 2377 * If this device has the only valid copy of the data, then 2378 * back off and simply mark the vdev as degraded instead. 2379 */ 2380 if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { 2381 vd->vdev_degraded = 1ULL; 2382 vd->vdev_faulted = 0ULL; 2383 2384 /* 2385 * If we reopen the device and it's not dead, only then do we 2386 * mark it degraded. 2387 */ 2388 vdev_reopen(tvd); 2389 2390 if (vdev_readable(vd)) 2391 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); 2392 } 2393 2394 return (spa_vdev_state_exit(spa, vd, 0)); 2395 } 2396 2397 /* 2398 * Mark the given vdev degraded. A degraded vdev is purely an indication to the 2399 * user that something is wrong. The vdev continues to operate as normal as far 2400 * as I/O is concerned. 2401 */ 2402 int 2403 vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) 2404 { 2405 vdev_t *vd; 2406 2407 spa_vdev_state_enter(spa, SCL_NONE); 2408 2409 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2410 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2411 2412 if (!vd->vdev_ops->vdev_op_leaf) 2413 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2414 2415 /* 2416 * If the vdev is already faulted, then don't do anything. 2417 */ 2418 if (vd->vdev_faulted || vd->vdev_degraded) 2419 return (spa_vdev_state_exit(spa, NULL, 0)); 2420 2421 vd->vdev_degraded = 1ULL; 2422 if (!vdev_is_dead(vd)) 2423 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 2424 aux); 2425 2426 return (spa_vdev_state_exit(spa, vd, 0)); 2427 } 2428 2429 /* 2430 * Online the given vdev. 2431 * 2432 * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached 2433 * spare device should be detached when the device finishes resilvering. 2434 * Second, the online should be treated like a 'test' online case, so no FMA 2435 * events are generated if the device fails to open. 2436 */ 2437 int 2438 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) 2439 { 2440 vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; 2441 boolean_t postevent = B_FALSE; 2442 2443 spa_vdev_state_enter(spa, SCL_NONE); 2444 2445 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2446 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2447 2448 if (!vd->vdev_ops->vdev_op_leaf) 2449 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2450 2451 postevent = 2452 (vd->vdev_offline == B_TRUE || vd->vdev_tmpoffline == B_TRUE) ? 
2453 B_TRUE : B_FALSE; 2454 2455 tvd = vd->vdev_top; 2456 vd->vdev_offline = B_FALSE; 2457 vd->vdev_tmpoffline = B_FALSE; 2458 vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); 2459 vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); 2460 2461 /* XXX - L2ARC 1.0 does not support expansion */ 2462 if (!vd->vdev_aux) { 2463 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2464 pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); 2465 } 2466 2467 vdev_reopen(tvd); 2468 vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; 2469 2470 if (!vd->vdev_aux) { 2471 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) 2472 pvd->vdev_expanding = B_FALSE; 2473 } 2474 2475 if (newstate) 2476 *newstate = vd->vdev_state; 2477 if ((flags & ZFS_ONLINE_UNSPARE) && 2478 !vdev_is_dead(vd) && vd->vdev_parent && 2479 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 2480 vd->vdev_parent->vdev_child[0] == vd) 2481 vd->vdev_unspare = B_TRUE; 2482 2483 if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { 2484 2485 /* XXX - L2ARC 1.0 does not support expansion */ 2486 if (vd->vdev_aux) 2487 return (spa_vdev_state_exit(spa, vd, ENOTSUP)); 2488 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2489 } 2490 2491 if (postevent) 2492 spa_event_notify(spa, vd, ESC_ZFS_VDEV_ONLINE); 2493 2494 return (spa_vdev_state_exit(spa, vd, 0)); 2495 } 2496 2497 static int 2498 vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) 2499 { 2500 vdev_t *vd, *tvd; 2501 int error = 0; 2502 uint64_t generation; 2503 metaslab_group_t *mg; 2504 2505 top: 2506 spa_vdev_state_enter(spa, SCL_ALLOC); 2507 2508 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 2509 return (spa_vdev_state_exit(spa, NULL, ENODEV)); 2510 2511 if (!vd->vdev_ops->vdev_op_leaf) 2512 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 2513 2514 tvd = vd->vdev_top; 2515 mg = tvd->vdev_mg; 2516 generation = spa->spa_config_generation + 1; 2517 2518 /* 2519 * If the device isn't already offline, try to offline it. 2520 */ 2521 if (!vd->vdev_offline) { 2522 /* 2523 * If this device has the only valid copy of some data, 2524 * don't allow it to be offlined. Log devices are always 2525 * expendable. 2526 */ 2527 if (!tvd->vdev_islog && vd->vdev_aux == NULL && 2528 vdev_dtl_required(vd)) 2529 return (spa_vdev_state_exit(spa, NULL, EBUSY)); 2530 2531 /* 2532 * If the top-level is a slog and it has had allocations 2533 * then proceed. We check that the vdev's metaslab group 2534 * is not NULL since it's possible that we may have just 2535 * added this vdev but not yet initialized its metaslabs. 2536 */ 2537 if (tvd->vdev_islog && mg != NULL) { 2538 /* 2539 * Prevent any future allocations. 2540 */ 2541 metaslab_group_passivate(mg); 2542 (void) spa_vdev_state_exit(spa, vd, 0); 2543 2544 error = spa_offline_log(spa); 2545 2546 spa_vdev_state_enter(spa, SCL_ALLOC); 2547 2548 /* 2549 * Check to see if the config has changed. 2550 */ 2551 if (error || generation != spa->spa_config_generation) { 2552 metaslab_group_activate(mg); 2553 if (error) 2554 return (spa_vdev_state_exit(spa, 2555 vd, error)); 2556 (void) spa_vdev_state_exit(spa, vd, 0); 2557 goto top; 2558 } 2559 ASSERT0(tvd->vdev_stat.vs_alloc); 2560 } 2561 2562 /* 2563 * Offline this device and reopen its top-level vdev. 2564 * If the top-level vdev is a log device then just offline 2565 * it. Otherwise, if this action results in the top-level 2566 * vdev becoming unusable, undo it and fail the request. 
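		 * For example (hypothetical): offlining the only healthy
		 * side of a two-way mirror whose other side is already
		 * faulted would leave the top-level vdev dead, so the
		 * offline is rolled back and EBUSY is returned.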
2567 		 */
2568 		vd->vdev_offline = B_TRUE;
2569 		vdev_reopen(tvd);
2570 
2571 		if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
2572 		    vdev_is_dead(tvd)) {
2573 			vd->vdev_offline = B_FALSE;
2574 			vdev_reopen(tvd);
2575 			return (spa_vdev_state_exit(spa, NULL, EBUSY));
2576 		}
2577 
2578 		/*
2579 		 * Add the device back into the metaslab rotor so that
2580 		 * once we online the device it's open for business.
2581 		 */
2582 		if (tvd->vdev_islog && mg != NULL)
2583 			metaslab_group_activate(mg);
2584 	}
2585 
2586 	vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
2587 
2588 	return (spa_vdev_state_exit(spa, vd, 0));
2589 }
2590 
2591 int
2592 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
2593 {
2594 	int error;
2595 
2596 	mutex_enter(&spa->spa_vdev_top_lock);
2597 	error = vdev_offline_locked(spa, guid, flags);
2598 	mutex_exit(&spa->spa_vdev_top_lock);
2599 
2600 	return (error);
2601 }
2602 
2603 /*
2604  * Clear the error counts associated with this vdev. Unlike vdev_online() and
2605  * vdev_offline(), we assume the spa config is locked. We also clear all
2606  * children. If 'vd' is NULL, then the user wants to clear all vdevs.
2607  */
2608 void
2609 vdev_clear(spa_t *spa, vdev_t *vd)
2610 {
2611 	vdev_t *rvd = spa->spa_root_vdev;
2612 
2613 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2614 
2615 	if (vd == NULL)
2616 		vd = rvd;
2617 
2618 	vd->vdev_stat.vs_read_errors = 0;
2619 	vd->vdev_stat.vs_write_errors = 0;
2620 	vd->vdev_stat.vs_checksum_errors = 0;
2621 
2622 	for (int c = 0; c < vd->vdev_children; c++)
2623 		vdev_clear(spa, vd->vdev_child[c]);
2624 
2625 	/*
2626 	 * If we're in the FAULTED state or have experienced failed I/O, then
2627 	 * clear the persistent state and attempt to reopen the device. We
2628 	 * also mark the vdev config dirty, so that the new faulted state is
2629 	 * written out to disk.
2630 	 */
2631 	if (vd->vdev_faulted || vd->vdev_degraded ||
2632 	    !vdev_readable(vd) || !vdev_writeable(vd)) {
2633 
2634 		/*
2635 		 * When reopening in response to a clear event, it may be due to
2636 		 * a fmadm repair request. In this case, if the device is
2637 		 * still broken, we still want to post the ereport again.
2638 		 */
2639 		vd->vdev_forcefault = B_TRUE;
2640 
2641 		vd->vdev_faulted = vd->vdev_degraded = 0ULL;
2642 		vd->vdev_cant_read = B_FALSE;
2643 		vd->vdev_cant_write = B_FALSE;
2644 
2645 		vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
2646 
2647 		vd->vdev_forcefault = B_FALSE;
2648 
2649 		if (vd != rvd && vdev_writeable(vd->vdev_top))
2650 			vdev_state_dirty(vd->vdev_top);
2651 
2652 		if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
2653 			spa_async_request(spa, SPA_ASYNC_RESILVER);
2654 
2655 		spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
2656 	}
2657 
2658 	/*
2659 	 * When clearing a FMA-diagnosed fault, we always want to
2660 	 * unspare the device, as we assume that the original spare was
2661 	 * done in response to the FMA fault.
2662 	 */
2663 	if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
2664 	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
2665 	    vd->vdev_parent->vdev_child[0] == vd)
2666 		vd->vdev_unspare = B_TRUE;
2667 }
2668 
2669 boolean_t
2670 vdev_is_dead(vdev_t *vd)
2671 {
2672 	/*
2673 	 * Holes and missing devices are always considered "dead".
2674 	 * This simplifies the code since we don't have to check for
2675 	 * these types of devices in the various code paths.
2676 	 * Instead we rely on the fact that we skip over dead devices
2677 	 * before issuing I/O to them.
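	 *
	 * "Dead" here means any state ordered below VDEV_STATE_DEGRADED
	 * (closed, offline, removed, can't-open, faulted); only DEGRADED
	 * and HEALTHY vdevs are treated as alive.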
2678 	 */
2679 	return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole ||
2680 	    vd->vdev_ops == &vdev_missing_ops);
2681 }
2682 
2683 boolean_t
2684 vdev_readable(vdev_t *vd)
2685 {
2686 	return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
2687 }
2688 
2689 boolean_t
2690 vdev_writeable(vdev_t *vd)
2691 {
2692 	return (!vdev_is_dead(vd) && !vd->vdev_cant_write);
2693 }
2694 
2695 boolean_t
2696 vdev_allocatable(vdev_t *vd)
2697 {
2698 	uint64_t state = vd->vdev_state;
2699 
2700 	/*
2701 	 * We currently allow allocations from vdevs which may be in the
2702 	 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
2703 	 * fails to reopen then we'll catch it later when we're holding
2704 	 * the proper locks. Note that we have to get the vdev state
2705 	 * in a local variable because although it changes atomically,
2706 	 * we're asking two separate questions about it.
2707 	 */
2708 	return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
2709 	    !vd->vdev_cant_write && !vd->vdev_ishole &&
2710 	    vd->vdev_mg->mg_initialized);
2711 }
2712 
2713 boolean_t
2714 vdev_accessible(vdev_t *vd, zio_t *zio)
2715 {
2716 	ASSERT(zio->io_vd == vd);
2717 
2718 	if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
2719 		return (B_FALSE);
2720 
2721 	if (zio->io_type == ZIO_TYPE_READ)
2722 		return (!vd->vdev_cant_read);
2723 
2724 	if (zio->io_type == ZIO_TYPE_WRITE)
2725 		return (!vd->vdev_cant_write);
2726 
2727 	return (B_TRUE);
2728 }
2729 
2730 /*
2731  * Get statistics for the given vdev.
2732  */
2733 void
2734 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
2735 {
2736 	spa_t *spa = vd->vdev_spa;
2737 	vdev_t *rvd = spa->spa_root_vdev;
2738 	vdev_t *tvd = vd->vdev_top;
2739 
2740 	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
2741 
2742 	mutex_enter(&vd->vdev_stat_lock);
2743 	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
2744 	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
2745 	vs->vs_state = vd->vdev_state;
2746 	vs->vs_rsize = vdev_get_min_asize(vd);
2747 	if (vd->vdev_ops->vdev_op_leaf)
2748 		vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
2749 	/*
2750 	 * Report expandable space on top-level, non-auxiliary devices only.
2751 	 * The expandable space is reported in terms of metaslab sized units
2752 	 * since that determines how much space the pool can expand.
2753 	 */
2754 	if (vd->vdev_aux == NULL && tvd != NULL) {
2755 		vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize,
2756 		    1ULL << tvd->vdev_ms_shift);
2757 	}
2758 	if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) {
2759 		vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
2760 	}
2761 
2762 	/*
2763 	 * If we're getting stats on the root vdev, aggregate the I/O counts
2764 	 * over all top-level vdevs (i.e. the direct children of the root).
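	 * Only the per-type operation and byte counts are summed here;
	 * error counters are left alone because, as noted in
	 * vdev_stat_update(), errors are not additive across the tree.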
2765 	 */
2766 	if (vd == rvd) {
2767 		for (int c = 0; c < rvd->vdev_children; c++) {
2768 			vdev_t *cvd = rvd->vdev_child[c];
2769 			vdev_stat_t *cvs = &cvd->vdev_stat;
2770 
2771 			for (int t = 0; t < ZIO_TYPES; t++) {
2772 				vs->vs_ops[t] += cvs->vs_ops[t];
2773 				vs->vs_bytes[t] += cvs->vs_bytes[t];
2774 			}
2775 			cvs->vs_scan_removing = cvd->vdev_removing;
2776 		}
2777 	}
2778 	mutex_exit(&vd->vdev_stat_lock);
2779 }
2780 
2781 void
2782 vdev_clear_stats(vdev_t *vd)
2783 {
2784 	mutex_enter(&vd->vdev_stat_lock);
2785 	vd->vdev_stat.vs_space = 0;
2786 	vd->vdev_stat.vs_dspace = 0;
2787 	vd->vdev_stat.vs_alloc = 0;
2788 	mutex_exit(&vd->vdev_stat_lock);
2789 }
2790 
2791 void
2792 vdev_scan_stat_init(vdev_t *vd)
2793 {
2794 	vdev_stat_t *vs = &vd->vdev_stat;
2795 
2796 	for (int c = 0; c < vd->vdev_children; c++)
2797 		vdev_scan_stat_init(vd->vdev_child[c]);
2798 
2799 	mutex_enter(&vd->vdev_stat_lock);
2800 	vs->vs_scan_processed = 0;
2801 	mutex_exit(&vd->vdev_stat_lock);
2802 }
2803 
2804 void
2805 vdev_stat_update(zio_t *zio, uint64_t psize)
2806 {
2807 	spa_t *spa = zio->io_spa;
2808 	vdev_t *rvd = spa->spa_root_vdev;
2809 	vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
2810 	vdev_t *pvd;
2811 	uint64_t txg = zio->io_txg;
2812 	vdev_stat_t *vs = &vd->vdev_stat;
2813 	zio_type_t type = zio->io_type;
2814 	int flags = zio->io_flags;
2815 
2816 	/*
2817 	 * If this i/o is a gang leader, it didn't do any actual work.
2818 	 */
2819 	if (zio->io_gang_tree)
2820 		return;
2821 
2822 	if (zio->io_error == 0) {
2823 		/*
2824 		 * If this is a root i/o, don't count it -- we've already
2825 		 * counted the top-level vdevs, and vdev_get_stats() will
2826 		 * aggregate them when asked. This reduces contention on
2827 		 * the root vdev_stat_lock and implicitly handles blocks
2828 		 * that compress away to holes, for which there is no i/o.
2829 		 * (Holes never create vdev children, so all the counters
2830 		 * remain zero, which is what we want.)
2831 		 *
2832 		 * Note: this only applies to successful i/o (io_error == 0)
2833 		 * because unlike i/o counts, errors are not additive.
2834 		 * When reading a ditto block, for example, failure of
2835 		 * one top-level vdev does not imply a root-level error.
2836 		 */
2837 		if (vd == rvd)
2838 			return;
2839 
2840 		ASSERT(vd == zio->io_vd);
2841 
2842 		if (flags & ZIO_FLAG_IO_BYPASS)
2843 			return;
2844 
2845 		mutex_enter(&vd->vdev_stat_lock);
2846 
2847 		if (flags & ZIO_FLAG_IO_REPAIR) {
2848 			if (flags & ZIO_FLAG_SCAN_THREAD) {
2849 				dsl_scan_phys_t *scn_phys =
2850 				    &spa->spa_dsl_pool->dp_scan->scn_phys;
2851 				uint64_t *processed = &scn_phys->scn_processed;
2852 
2853 				/* XXX cleanup? */
2854 				if (vd->vdev_ops->vdev_op_leaf)
2855 					atomic_add_64(processed, psize);
2856 				vs->vs_scan_processed += psize;
2857 			}
2858 
2859 			if (flags & ZIO_FLAG_SELF_HEAL)
2860 				vs->vs_self_healed += psize;
2861 		}
2862 
2863 		vs->vs_ops[type]++;
2864 		vs->vs_bytes[type] += psize;
2865 
2866 		mutex_exit(&vd->vdev_stat_lock);
2867 		return;
2868 	}
2869 
2870 	if (flags & ZIO_FLAG_SPECULATIVE)
2871 		return;
2872 
2873 	/*
2874 	 * If this is an I/O error that is going to be retried, then ignore the
2875 	 * error. Otherwise, the user may interpret B_FAILFAST I/O errors as
2876 	 * hard errors, when in reality they can happen for any number of
2877 	 * innocuous reasons (bus resets, MPxIO link failure, etc).
2878 	 */
2879 	if (zio->io_error == EIO &&
2880 	    !(zio->io_flags & ZIO_FLAG_IO_RETRY))
2881 		return;
2882 
2883 	/*
2884 	 * Intent log writes won't propagate their error to the root
2885 	 * I/O so don't mark these types of failures as pool-level
2886 	 * errors.
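	 * (Such zios reach this point with io_vd == NULL and
	 * ZIO_FLAG_DONT_PROPAGATE set, which is what the check below keys
	 * on; the ZIL presumably recovers by falling back to
	 * txg_wait_synced() instead.)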
2887 	 */
2888 	if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
2889 		return;
2890 
2891 	mutex_enter(&vd->vdev_stat_lock);
2892 	if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) {
2893 		if (zio->io_error == ECKSUM)
2894 			vs->vs_checksum_errors++;
2895 		else
2896 			vs->vs_read_errors++;
2897 	}
2898 	if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd))
2899 		vs->vs_write_errors++;
2900 	mutex_exit(&vd->vdev_stat_lock);
2901 
2902 	if (type == ZIO_TYPE_WRITE && txg != 0 &&
2903 	    (!(flags & ZIO_FLAG_IO_REPAIR) ||
2904 	    (flags & ZIO_FLAG_SCAN_THREAD) ||
2905 	    spa->spa_claiming)) {
2906 		/*
2907 		 * This is either a normal write (not a repair), or it's
2908 		 * a repair induced by the scrub thread, or it's a repair
2909 		 * made by zil_claim() during spa_load() in the first txg.
2910 		 * In the normal case, we commit the DTL change in the same
2911 		 * txg as the block was born. In the scrub-induced repair
2912 		 * case, we know that scrubs run in first-pass syncing context,
2913 		 * so we commit the DTL change in spa_syncing_txg(spa).
2914 		 * In the zil_claim() case, we commit in spa_first_txg(spa).
2915 		 *
2916 		 * We currently do not make DTL entries for failed spontaneous
2917 		 * self-healing writes triggered by normal (non-scrubbing)
2918 		 * reads, because we have no transactional context in which to
2919 		 * do so -- and it's not clear that it'd be desirable anyway.
2920 		 */
2921 		if (vd->vdev_ops->vdev_op_leaf) {
2922 			uint64_t commit_txg = txg;
2923 			if (flags & ZIO_FLAG_SCAN_THREAD) {
2924 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
2925 				ASSERT(spa_sync_pass(spa) == 1);
2926 				vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
2927 				commit_txg = spa_syncing_txg(spa);
2928 			} else if (spa->spa_claiming) {
2929 				ASSERT(flags & ZIO_FLAG_IO_REPAIR);
2930 				commit_txg = spa_first_txg(spa);
2931 			}
2932 			ASSERT(commit_txg >= spa_syncing_txg(spa));
2933 			if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
2934 				return;
2935 			for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
2936 				vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
2937 			vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
2938 		}
2939 		if (vd != rvd)
2940 			vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
2941 	}
2942 }
2943 
2944 /*
2945  * Update the in-core space usage stats for this vdev, its metaslab class,
2946  * and the root vdev.
2947  */
2948 void
2949 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
2950     int64_t space_delta)
2951 {
2952 	int64_t dspace_delta = space_delta;
2953 	spa_t *spa = vd->vdev_spa;
2954 	vdev_t *rvd = spa->spa_root_vdev;
2955 	metaslab_group_t *mg = vd->vdev_mg;
2956 	metaslab_class_t *mc = mg ? mg->mg_class : NULL;
2957 
2958 	ASSERT(vd == vd->vdev_top);
2959 
2960 	/*
2961 	 * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion
2962 	 * factor. We must calculate this here and not at the root vdev
2963 	 * because the root vdev's psize-to-asize is simply the max of its
2964 	 * children's, thus not accurate enough for us.
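	 *
	 * A rough illustration with hypothetical geometry: the deflate
	 * ratio is derived at open time from how a 128K psize block
	 * expands on this top-level vdev, expressed in SPA_MINBLOCKSIZE
	 * (512-byte) units.  On a plain mirror 128K allocates 128K, giving
	 * a ratio of 131072 / 256 = 512, so dspace_delta below equals
	 * space_delta.  On a 4+1 RAID-Z with ashift 9, 128K allocates
	 * roughly 160K, giving 131072 / 320 = 409 and a dspace_delta of
	 * about 4/5 of space_delta, i.e. the parity overhead is discounted
	 * from deflated space.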
2965 	 */
2966 	ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
2967 	ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
2968 	dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
2969 	    vd->vdev_deflate_ratio;
2970 
2971 	mutex_enter(&vd->vdev_stat_lock);
2972 	vd->vdev_stat.vs_alloc += alloc_delta;
2973 	vd->vdev_stat.vs_space += space_delta;
2974 	vd->vdev_stat.vs_dspace += dspace_delta;
2975 	mutex_exit(&vd->vdev_stat_lock);
2976 
2977 	if (mc == spa_normal_class(spa)) {
2978 		mutex_enter(&rvd->vdev_stat_lock);
2979 		rvd->vdev_stat.vs_alloc += alloc_delta;
2980 		rvd->vdev_stat.vs_space += space_delta;
2981 		rvd->vdev_stat.vs_dspace += dspace_delta;
2982 		mutex_exit(&rvd->vdev_stat_lock);
2983 	}
2984 
2985 	if (mc != NULL) {
2986 		ASSERT(rvd == vd->vdev_parent);
2987 		ASSERT(vd->vdev_ms_count != 0);
2988 
2989 		metaslab_class_space_update(mc,
2990 		    alloc_delta, defer_delta, space_delta, dspace_delta);
2991 	}
2992 }
2993 
2994 /*
2995  * Mark a top-level vdev's config as dirty, placing it on the dirty list
2996  * so that it will be written out next time the vdev configuration is synced.
2997  * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
2998  */
2999 void
3000 vdev_config_dirty(vdev_t *vd)
3001 {
3002 	spa_t *spa = vd->vdev_spa;
3003 	vdev_t *rvd = spa->spa_root_vdev;
3004 	int c;
3005 
3006 	ASSERT(spa_writeable(spa));
3007 
3008 	/*
3009 	 * If this is an aux vdev (as with l2cache and spare devices), then we
3010 	 * update the vdev config manually and set the sync flag.
3011 	 */
3012 	if (vd->vdev_aux != NULL) {
3013 		spa_aux_vdev_t *sav = vd->vdev_aux;
3014 		nvlist_t **aux;
3015 		uint_t naux;
3016 
3017 		for (c = 0; c < sav->sav_count; c++) {
3018 			if (sav->sav_vdevs[c] == vd)
3019 				break;
3020 		}
3021 
3022 		if (c == sav->sav_count) {
3023 			/*
3024 			 * We're being removed. There's nothing more to do.
3025 			 */
3026 			ASSERT(sav->sav_sync == B_TRUE);
3027 			return;
3028 		}
3029 
3030 		sav->sav_sync = B_TRUE;
3031 
3032 		if (nvlist_lookup_nvlist_array(sav->sav_config,
3033 		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
3034 			VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
3035 			    ZPOOL_CONFIG_SPARES, &aux, &naux) == 0);
3036 		}
3037 
3038 		ASSERT(c < naux);
3039 
3040 		/*
3041 		 * Setting the nvlist in the middle of the array is a little
3042 		 * sketchy, but it will work.
3043 		 */
3044 		nvlist_free(aux[c]);
3045 		aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
3046 
3047 		return;
3048 	}
3049 
3050 	/*
3051 	 * The dirty list is protected by the SCL_CONFIG lock. The caller
3052 	 * must either hold SCL_CONFIG as writer, or must be the sync thread
3053 	 * (which holds SCL_CONFIG as reader). There's only one sync thread,
3054 	 * so this is sufficient to ensure mutual exclusion.
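	 * (Writer vs. writer and writer vs. sync thread are excluded by
	 * the lock itself; sync thread vs. sync thread cannot happen
	 * because there is only one, hence its reader hold is enough.)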
3055 */ 3056 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 3057 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3058 spa_config_held(spa, SCL_CONFIG, RW_READER))); 3059 3060 if (vd == rvd) { 3061 for (c = 0; c < rvd->vdev_children; c++) 3062 vdev_config_dirty(rvd->vdev_child[c]); 3063 } else { 3064 ASSERT(vd == vd->vdev_top); 3065 3066 if (!list_link_active(&vd->vdev_config_dirty_node) && 3067 !vd->vdev_ishole) 3068 list_insert_head(&spa->spa_config_dirty_list, vd); 3069 } 3070 } 3071 3072 void 3073 vdev_config_clean(vdev_t *vd) 3074 { 3075 spa_t *spa = vd->vdev_spa; 3076 3077 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || 3078 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3079 spa_config_held(spa, SCL_CONFIG, RW_READER))); 3080 3081 ASSERT(list_link_active(&vd->vdev_config_dirty_node)); 3082 list_remove(&spa->spa_config_dirty_list, vd); 3083 } 3084 3085 /* 3086 * Mark a top-level vdev's state as dirty, so that the next pass of 3087 * spa_sync() can convert this into vdev_config_dirty(). We distinguish 3088 * the state changes from larger config changes because they require 3089 * much less locking, and are often needed for administrative actions. 3090 */ 3091 void 3092 vdev_state_dirty(vdev_t *vd) 3093 { 3094 spa_t *spa = vd->vdev_spa; 3095 3096 ASSERT(spa_writeable(spa)); 3097 ASSERT(vd == vd->vdev_top); 3098 3099 /* 3100 * The state list is protected by the SCL_STATE lock. The caller 3101 * must either hold SCL_STATE as writer, or must be the sync thread 3102 * (which holds SCL_STATE as reader). There's only one sync thread, 3103 * so this is sufficient to ensure mutual exclusion. 3104 */ 3105 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 3106 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3107 spa_config_held(spa, SCL_STATE, RW_READER))); 3108 3109 if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) 3110 list_insert_head(&spa->spa_state_dirty_list, vd); 3111 } 3112 3113 void 3114 vdev_state_clean(vdev_t *vd) 3115 { 3116 spa_t *spa = vd->vdev_spa; 3117 3118 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || 3119 (dsl_pool_sync_context(spa_get_dsl(spa)) && 3120 spa_config_held(spa, SCL_STATE, RW_READER))); 3121 3122 ASSERT(list_link_active(&vd->vdev_state_dirty_node)); 3123 list_remove(&spa->spa_state_dirty_list, vd); 3124 } 3125 3126 /* 3127 * Propagate vdev state up from children to parent. 3128 */ 3129 void 3130 vdev_propagate_state(vdev_t *vd) 3131 { 3132 spa_t *spa = vd->vdev_spa; 3133 vdev_t *rvd = spa->spa_root_vdev; 3134 int degraded = 0, faulted = 0; 3135 int corrupted = 0; 3136 vdev_t *child; 3137 3138 if (vd->vdev_children > 0) { 3139 for (int c = 0; c < vd->vdev_children; c++) { 3140 child = vd->vdev_child[c]; 3141 3142 /* 3143 * Don't factor holes into the decision. 3144 */ 3145 if (child->vdev_ishole) 3146 continue; 3147 3148 if (!vdev_readable(child) || 3149 (!vdev_writeable(child) && spa_writeable(spa))) { 3150 /* 3151 * Root special: if there is a top-level log 3152 * device, treat the root vdev as if it were 3153 * degraded. 
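				 * (A pool can keep running without its
				 * separate log device, since the ZIL falls
				 * back to the main pool, so a dead slog
				 * alone should degrade rather than fault
				 * the root.)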
3154 */ 3155 if (child->vdev_islog && vd == rvd) 3156 degraded++; 3157 else 3158 faulted++; 3159 } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { 3160 degraded++; 3161 } 3162 3163 if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) 3164 corrupted++; 3165 } 3166 3167 vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); 3168 3169 /* 3170 * Root special: if there is a top-level vdev that cannot be 3171 * opened due to corrupted metadata, then propagate the root 3172 * vdev's aux state as 'corrupt' rather than 'insufficient 3173 * replicas'. 3174 */ 3175 if (corrupted && vd == rvd && 3176 rvd->vdev_state == VDEV_STATE_CANT_OPEN) 3177 vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, 3178 VDEV_AUX_CORRUPT_DATA); 3179 } 3180 3181 if (vd->vdev_parent) 3182 vdev_propagate_state(vd->vdev_parent); 3183 } 3184 3185 /* 3186 * Set a vdev's state. If this is during an open, we don't update the parent 3187 * state, because we're in the process of opening children depth-first. 3188 * Otherwise, we propagate the change to the parent. 3189 * 3190 * If this routine places a device in a faulted state, an appropriate ereport is 3191 * generated. 3192 */ 3193 void 3194 vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) 3195 { 3196 uint64_t save_state; 3197 spa_t *spa = vd->vdev_spa; 3198 3199 if (state == vd->vdev_state) { 3200 vd->vdev_stat.vs_aux = aux; 3201 return; 3202 } 3203 3204 save_state = vd->vdev_state; 3205 3206 vd->vdev_state = state; 3207 vd->vdev_stat.vs_aux = aux; 3208 3209 /* 3210 * If we are setting the vdev state to anything but an open state, then 3211 * always close the underlying device unless the device has requested 3212 * a delayed close (i.e. we're about to remove or fault the device). 3213 * Otherwise, we keep accessible but invalid devices open forever. 3214 * We don't call vdev_close() itself, because that implies some extra 3215 * checks (offline, etc) that we don't want here. This is limited to 3216 * leaf devices, because otherwise closing the device will affect other 3217 * children. 3218 */ 3219 if (!vd->vdev_delayed_close && vdev_is_dead(vd) && 3220 vd->vdev_ops->vdev_op_leaf) 3221 vd->vdev_ops->vdev_op_close(vd); 3222 3223 /* 3224 * If we have brought this vdev back into service, we need 3225 * to notify fmd so that it can gracefully repair any outstanding 3226 * cases due to a missing device. We do this in all cases, even those 3227 * that probably don't correlate to a repaired fault. This is sure to 3228 * catch all cases, and we let the zfs-retire agent sort it out. If 3229 * this is a transient state it's OK, as the retire agent will 3230 * double-check the state of the vdev before repairing it. 3231 */ 3232 if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf && 3233 vd->vdev_prevstate != state) 3234 zfs_post_state_change(spa, vd); 3235 3236 if (vd->vdev_removed && 3237 state == VDEV_STATE_CANT_OPEN && 3238 (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { 3239 /* 3240 * If the previous state is set to VDEV_STATE_REMOVED, then this 3241 * device was previously marked removed and someone attempted to 3242 * reopen it. If this failed due to a nonexistent device, then 3243 * keep the device in the REMOVED state. We also let this be if 3244 * it is one of our special test online cases, which is only 3245 * attempting to online the device and shouldn't generate an FMA 3246 * fault. 
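		 * (The "special test online" case is vdev_online() called
		 * with ZFS_ONLINE_CHECKREMOVE, which sets vdev_checkremove
		 * for the duration of the reopen.)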
3247 */ 3248 vd->vdev_state = VDEV_STATE_REMOVED; 3249 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 3250 } else if (state == VDEV_STATE_REMOVED) { 3251 vd->vdev_removed = B_TRUE; 3252 } else if (state == VDEV_STATE_CANT_OPEN) { 3253 /* 3254 * If we fail to open a vdev during an import or recovery, we 3255 * mark it as "not available", which signifies that it was 3256 * never there to begin with. Failure to open such a device 3257 * is not considered an error. 3258 */ 3259 if ((spa_load_state(spa) == SPA_LOAD_IMPORT || 3260 spa_load_state(spa) == SPA_LOAD_RECOVER) && 3261 vd->vdev_ops->vdev_op_leaf) 3262 vd->vdev_not_present = 1; 3263 3264 /* 3265 * Post the appropriate ereport. If the 'prevstate' field is 3266 * set to something other than VDEV_STATE_UNKNOWN, it indicates 3267 * that this is part of a vdev_reopen(). In this case, we don't 3268 * want to post the ereport if the device was already in the 3269 * CANT_OPEN state beforehand. 3270 * 3271 * If the 'checkremove' flag is set, then this is an attempt to 3272 * online the device in response to an insertion event. If we 3273 * hit this case, then we have detected an insertion event for a 3274 * faulted or offline device that wasn't in the removed state. 3275 * In this scenario, we don't post an ereport because we are 3276 * about to replace the device, or attempt an online with 3277 * vdev_forcefault, which will generate the fault for us. 3278 */ 3279 if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && 3280 !vd->vdev_not_present && !vd->vdev_checkremove && 3281 vd != spa->spa_root_vdev) { 3282 const char *class; 3283 3284 switch (aux) { 3285 case VDEV_AUX_OPEN_FAILED: 3286 class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; 3287 break; 3288 case VDEV_AUX_CORRUPT_DATA: 3289 class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; 3290 break; 3291 case VDEV_AUX_NO_REPLICAS: 3292 class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; 3293 break; 3294 case VDEV_AUX_BAD_GUID_SUM: 3295 class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; 3296 break; 3297 case VDEV_AUX_TOO_SMALL: 3298 class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; 3299 break; 3300 case VDEV_AUX_BAD_LABEL: 3301 class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; 3302 break; 3303 default: 3304 class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; 3305 } 3306 3307 zfs_ereport_post(class, spa, vd, NULL, save_state, 0); 3308 } 3309 3310 /* Erase any notion of persistent removed state */ 3311 vd->vdev_removed = B_FALSE; 3312 } else { 3313 vd->vdev_removed = B_FALSE; 3314 } 3315 3316 if (!isopen && vd->vdev_parent) 3317 vdev_propagate_state(vd->vdev_parent); 3318 } 3319 3320 /* 3321 * Check the vdev configuration to ensure that it's capable of supporting 3322 * a root pool. Currently, we do not support RAID-Z or partial configuration. 3323 * In addition, only a single top-level vdev is allowed and none of the leaves 3324 * can be wholedisks. 3325 */ 3326 boolean_t 3327 vdev_is_bootable(vdev_t *vd) 3328 { 3329 if (!vd->vdev_ops->vdev_op_leaf) { 3330 char *vdev_type = vd->vdev_ops->vdev_op_type; 3331 3332 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && 3333 vd->vdev_children > 1) { 3334 return (B_FALSE); 3335 } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || 3336 strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { 3337 return (B_FALSE); 3338 } 3339 } 3340 3341 for (int c = 0; c < vd->vdev_children; c++) { 3342 if (!vdev_is_bootable(vd->vdev_child[c])) 3343 return (B_FALSE); 3344 } 3345 return (B_TRUE); 3346 } 3347 3348 /* 3349 * Load the state from the original vdev tree (ovd) which 3350 * we've retrieved from the MOS config object. 
If the original 3351 * vdev was offline or faulted then we transfer that state to the 3352 * device in the current vdev tree (nvd). 3353 */ 3354 void 3355 vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) 3356 { 3357 spa_t *spa = nvd->vdev_spa; 3358 3359 ASSERT(nvd->vdev_top->vdev_islog); 3360 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); 3361 ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); 3362 3363 for (int c = 0; c < nvd->vdev_children; c++) 3364 vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); 3365 3366 if (nvd->vdev_ops->vdev_op_leaf) { 3367 /* 3368 * Restore the persistent vdev state 3369 */ 3370 nvd->vdev_offline = ovd->vdev_offline; 3371 nvd->vdev_faulted = ovd->vdev_faulted; 3372 nvd->vdev_degraded = ovd->vdev_degraded; 3373 nvd->vdev_removed = ovd->vdev_removed; 3374 } 3375 } 3376 3377 /* 3378 * Determine if a log device has valid content. If the vdev was 3379 * removed or faulted in the MOS config then we know that 3380 * the content on the log device has already been written to the pool. 3381 */ 3382 boolean_t 3383 vdev_log_state_valid(vdev_t *vd) 3384 { 3385 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && 3386 !vd->vdev_removed) 3387 return (B_TRUE); 3388 3389 for (int c = 0; c < vd->vdev_children; c++) 3390 if (vdev_log_state_valid(vd->vdev_child[c])) 3391 return (B_TRUE); 3392 3393 return (B_FALSE); 3394 } 3395 3396 /* 3397 * Expand a vdev if possible. 3398 */ 3399 void 3400 vdev_expand(vdev_t *vd, uint64_t txg) 3401 { 3402 ASSERT(vd->vdev_top == vd); 3403 ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); 3404 3405 if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { 3406 VERIFY(vdev_metaslab_init(vd, txg) == 0); 3407 vdev_config_dirty(vd); 3408 } 3409 } 3410 3411 /* 3412 * Split a vdev. 3413 */ 3414 void 3415 vdev_split(vdev_t *vd) 3416 { 3417 vdev_t *cvd, *pvd = vd->vdev_parent; 3418 3419 vdev_remove_child(pvd, vd); 3420 vdev_compact_children(pvd); 3421 3422 cvd = pvd->vdev_child[0]; 3423 if (pvd->vdev_children == 1) { 3424 vdev_remove_parent(cvd); 3425 cvd->vdev_splitting = B_TRUE; 3426 } 3427 vdev_propagate_state(cvd); 3428 } 3429 3430 void 3431 vdev_deadman(vdev_t *vd) 3432 { 3433 for (int c = 0; c < vd->vdev_children; c++) { 3434 vdev_t *cvd = vd->vdev_child[c]; 3435 3436 vdev_deadman(cvd); 3437 } 3438 3439 if (vd->vdev_ops->vdev_op_leaf) { 3440 vdev_queue_t *vq = &vd->vdev_queue; 3441 3442 mutex_enter(&vq->vq_lock); 3443 if (avl_numnodes(&vq->vq_active_tree) > 0) { 3444 spa_t *spa = vd->vdev_spa; 3445 zio_t *fio; 3446 uint64_t delta; 3447 3448 /* 3449 * Look at the head of all the pending queues, 3450 * if any I/O has been outstanding for longer than 3451 * the spa_deadman_synctime we panic the system. 3452 */ 3453 fio = avl_first(&vq->vq_active_tree); 3454 delta = gethrtime() - fio->io_timestamp; 3455 if (delta > spa_deadman_synctime(spa)) { 3456 zfs_dbgmsg("SLOW IO: zio timestamp %lluns, " 3457 "delta %lluns, last io %lluns", 3458 fio->io_timestamp, delta, 3459 vq->vq_io_complete_ts); 3460 fm_panic("I/O to pool '%s' appears to be " 3461 "hung.", spa_name(spa)); 3462 } 3463 } 3464 mutex_exit(&vq->vq_lock); 3465 } 3466 } 3467