/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>

/*
 * Virtual device management.
 */

/*
 * Every vdev ops vector that vdev_getops() can return.  The table is
 * searched linearly by type name; the trailing NULL ends the search.
 */
static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_spare_ops,
	&vdev_disk_ops,
	&vdev_file_ops,
	&vdev_missing_ops,
	NULL
};

/*
 * Maximum scrub/resilver I/O queue.  vdev_add_child() and
 * vdev_remove_child() adjust spa_scrub_maxinflight by this amount for
 * every leaf vdev that is attached or detached.
 */
int zfs_scrub_limit = 70;

/*
 * Given a vdev type, return the appropriate ops vector.
66 */ 67 static vdev_ops_t * 68 vdev_getops(const char *type) 69 { 70 vdev_ops_t *ops, **opspp; 71 72 for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++) 73 if (strcmp(ops->vdev_op_type, type) == 0) 74 break; 75 76 return (ops); 77 } 78 79 /* 80 * Default asize function: return the MAX of psize with the asize of 81 * all children. This is what's used by anything other than RAID-Z. 82 */ 83 uint64_t 84 vdev_default_asize(vdev_t *vd, uint64_t psize) 85 { 86 uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); 87 uint64_t csize; 88 uint64_t c; 89 90 for (c = 0; c < vd->vdev_children; c++) { 91 csize = vdev_psize_to_asize(vd->vdev_child[c], psize); 92 asize = MAX(asize, csize); 93 } 94 95 return (asize); 96 } 97 98 /* 99 * Get the replaceable or attachable device size. 100 * If the parent is a mirror or raidz, the replaceable size is the minimum 101 * psize of all its children. For the rest, just return our own psize. 102 * 103 * e.g. 104 * psize rsize 105 * root - - 106 * mirror/raidz - - 107 * disk1 20g 20g 108 * disk2 40g 20g 109 * disk3 80g 80g 110 */ 111 uint64_t 112 vdev_get_rsize(vdev_t *vd) 113 { 114 vdev_t *pvd, *cvd; 115 uint64_t c, rsize; 116 117 pvd = vd->vdev_parent; 118 119 /* 120 * If our parent is NULL or the root, just return our own psize. 
121 */ 122 if (pvd == NULL || pvd->vdev_parent == NULL) 123 return (vd->vdev_psize); 124 125 rsize = 0; 126 127 for (c = 0; c < pvd->vdev_children; c++) { 128 cvd = pvd->vdev_child[c]; 129 rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1; 130 } 131 132 return (rsize); 133 } 134 135 vdev_t * 136 vdev_lookup_top(spa_t *spa, uint64_t vdev) 137 { 138 vdev_t *rvd = spa->spa_root_vdev; 139 140 ASSERT(spa_config_held(spa, RW_READER) || 141 curthread == spa->spa_scrub_thread); 142 143 if (vdev < rvd->vdev_children) 144 return (rvd->vdev_child[vdev]); 145 146 return (NULL); 147 } 148 149 vdev_t * 150 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) 151 { 152 int c; 153 vdev_t *mvd; 154 155 if (vd->vdev_guid == guid) 156 return (vd); 157 158 for (c = 0; c < vd->vdev_children; c++) 159 if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != 160 NULL) 161 return (mvd); 162 163 return (NULL); 164 } 165 166 void 167 vdev_add_child(vdev_t *pvd, vdev_t *cvd) 168 { 169 size_t oldsize, newsize; 170 uint64_t id = cvd->vdev_id; 171 vdev_t **newchild; 172 173 ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER)); 174 ASSERT(cvd->vdev_parent == NULL); 175 176 cvd->vdev_parent = pvd; 177 178 if (pvd == NULL) 179 return; 180 181 ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); 182 183 oldsize = pvd->vdev_children * sizeof (vdev_t *); 184 pvd->vdev_children = MAX(pvd->vdev_children, id + 1); 185 newsize = pvd->vdev_children * sizeof (vdev_t *); 186 187 newchild = kmem_zalloc(newsize, KM_SLEEP); 188 if (pvd->vdev_child != NULL) { 189 bcopy(pvd->vdev_child, newchild, oldsize); 190 kmem_free(pvd->vdev_child, oldsize); 191 } 192 193 pvd->vdev_child = newchild; 194 pvd->vdev_child[id] = cvd; 195 196 cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); 197 ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); 198 199 /* 200 * Walk up all ancestors to update guid sum. 
201 */ 202 for (; pvd != NULL; pvd = pvd->vdev_parent) 203 pvd->vdev_guid_sum += cvd->vdev_guid_sum; 204 205 if (cvd->vdev_ops->vdev_op_leaf) 206 cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit; 207 } 208 209 void 210 vdev_remove_child(vdev_t *pvd, vdev_t *cvd) 211 { 212 int c; 213 uint_t id = cvd->vdev_id; 214 215 ASSERT(cvd->vdev_parent == pvd); 216 217 if (pvd == NULL) 218 return; 219 220 ASSERT(id < pvd->vdev_children); 221 ASSERT(pvd->vdev_child[id] == cvd); 222 223 pvd->vdev_child[id] = NULL; 224 cvd->vdev_parent = NULL; 225 226 for (c = 0; c < pvd->vdev_children; c++) 227 if (pvd->vdev_child[c]) 228 break; 229 230 if (c == pvd->vdev_children) { 231 kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); 232 pvd->vdev_child = NULL; 233 pvd->vdev_children = 0; 234 } 235 236 /* 237 * Walk up all ancestors to update guid sum. 238 */ 239 for (; pvd != NULL; pvd = pvd->vdev_parent) 240 pvd->vdev_guid_sum -= cvd->vdev_guid_sum; 241 242 if (cvd->vdev_ops->vdev_op_leaf) 243 cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit; 244 } 245 246 /* 247 * Remove any holes in the child array. 248 */ 249 void 250 vdev_compact_children(vdev_t *pvd) 251 { 252 vdev_t **newchild, *cvd; 253 int oldc = pvd->vdev_children; 254 int newc, c; 255 256 ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER)); 257 258 for (c = newc = 0; c < oldc; c++) 259 if (pvd->vdev_child[c]) 260 newc++; 261 262 newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); 263 264 for (c = newc = 0; c < oldc; c++) { 265 if ((cvd = pvd->vdev_child[c]) != NULL) { 266 newchild[newc] = cvd; 267 cvd->vdev_id = newc++; 268 } 269 } 270 271 kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); 272 pvd->vdev_child = newchild; 273 pvd->vdev_children = newc; 274 } 275 276 /* 277 * Allocate and minimally initialize a vdev_t. 
278 */ 279 static vdev_t * 280 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) 281 { 282 vdev_t *vd; 283 284 vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); 285 286 if (spa->spa_root_vdev == NULL) { 287 ASSERT(ops == &vdev_root_ops); 288 spa->spa_root_vdev = vd; 289 } 290 291 if (guid == 0) { 292 if (spa->spa_root_vdev == vd) { 293 /* 294 * The root vdev's guid will also be the pool guid, 295 * which must be unique among all pools. 296 */ 297 while (guid == 0 || spa_guid_exists(guid, 0)) 298 guid = spa_get_random(-1ULL); 299 } else { 300 /* 301 * Any other vdev's guid must be unique within the pool. 302 */ 303 while (guid == 0 || 304 spa_guid_exists(spa_guid(spa), guid)) 305 guid = spa_get_random(-1ULL); 306 } 307 ASSERT(!spa_guid_exists(spa_guid(spa), guid)); 308 } 309 310 vd->vdev_spa = spa; 311 vd->vdev_id = id; 312 vd->vdev_guid = guid; 313 vd->vdev_guid_sum = guid; 314 vd->vdev_ops = ops; 315 vd->vdev_state = VDEV_STATE_CLOSED; 316 317 mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); 318 mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); 319 space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock); 320 space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock); 321 txg_list_create(&vd->vdev_ms_list, 322 offsetof(struct metaslab, ms_txg_node)); 323 txg_list_create(&vd->vdev_dtl_list, 324 offsetof(struct vdev, vdev_dtl_node)); 325 vd->vdev_stat.vs_timestamp = gethrtime(); 326 vdev_queue_init(vd); 327 vdev_cache_init(vd); 328 329 return (vd); 330 } 331 332 /* 333 * Allocate a new vdev. The 'alloctype' is used to control whether we are 334 * creating a new vdev or loading an existing one - the behavior is slightly 335 * different for each case. 
336 */ 337 int 338 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, 339 int alloctype) 340 { 341 vdev_ops_t *ops; 342 char *type; 343 uint64_t guid = 0, islog, nparity; 344 vdev_t *vd; 345 346 ASSERT(spa_config_held(spa, RW_WRITER)); 347 348 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) 349 return (EINVAL); 350 351 if ((ops = vdev_getops(type)) == NULL) 352 return (EINVAL); 353 354 /* 355 * If this is a load, get the vdev guid from the nvlist. 356 * Otherwise, vdev_alloc_common() will generate one for us. 357 */ 358 if (alloctype == VDEV_ALLOC_LOAD) { 359 uint64_t label_id; 360 361 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || 362 label_id != id) 363 return (EINVAL); 364 365 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 366 return (EINVAL); 367 } else if (alloctype == VDEV_ALLOC_SPARE) { 368 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 369 return (EINVAL); 370 } else if (alloctype == VDEV_ALLOC_L2CACHE) { 371 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) 372 return (EINVAL); 373 } 374 375 /* 376 * The first allocated vdev must be of type 'root'. 377 */ 378 if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) 379 return (EINVAL); 380 381 /* 382 * Determine whether we're a log vdev. 383 */ 384 islog = 0; 385 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); 386 if (islog && spa_version(spa) < SPA_VERSION_SLOGS) 387 return (ENOTSUP); 388 389 /* 390 * Set the nparity property for RAID-Z vdevs. 391 */ 392 nparity = -1ULL; 393 if (ops == &vdev_raidz_ops) { 394 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, 395 &nparity) == 0) { 396 /* 397 * Currently, we can only support 2 parity devices. 398 */ 399 if (nparity == 0 || nparity > 2) 400 return (EINVAL); 401 /* 402 * Older versions can only support 1 parity device. 
403 */ 404 if (nparity == 2 && 405 spa_version(spa) < SPA_VERSION_RAID6) 406 return (ENOTSUP); 407 } else { 408 /* 409 * We require the parity to be specified for SPAs that 410 * support multiple parity levels. 411 */ 412 if (spa_version(spa) >= SPA_VERSION_RAID6) 413 return (EINVAL); 414 /* 415 * Otherwise, we default to 1 parity device for RAID-Z. 416 */ 417 nparity = 1; 418 } 419 } else { 420 nparity = 0; 421 } 422 ASSERT(nparity != -1ULL); 423 424 vd = vdev_alloc_common(spa, id, guid, ops); 425 426 vd->vdev_islog = islog; 427 vd->vdev_nparity = nparity; 428 429 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) 430 vd->vdev_path = spa_strdup(vd->vdev_path); 431 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) 432 vd->vdev_devid = spa_strdup(vd->vdev_devid); 433 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, 434 &vd->vdev_physpath) == 0) 435 vd->vdev_physpath = spa_strdup(vd->vdev_physpath); 436 437 /* 438 * Set the whole_disk property. If it's not specified, leave the value 439 * as -1. 440 */ 441 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, 442 &vd->vdev_wholedisk) != 0) 443 vd->vdev_wholedisk = -1ULL; 444 445 /* 446 * Look for the 'not present' flag. This will only be set if the device 447 * was not present at the time of import. 448 */ 449 if (!spa->spa_import_faulted) 450 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 451 &vd->vdev_not_present); 452 453 /* 454 * Get the alignment requirement. 455 */ 456 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); 457 458 /* 459 * If we're a top-level vdev, try to load the allocation parameters. 
460 */ 461 if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { 462 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, 463 &vd->vdev_ms_array); 464 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, 465 &vd->vdev_ms_shift); 466 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, 467 &vd->vdev_asize); 468 } 469 470 /* 471 * If we're a leaf vdev, try to load the DTL object and other state. 472 */ 473 if (vd->vdev_ops->vdev_op_leaf && 474 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) { 475 if (alloctype == VDEV_ALLOC_LOAD) { 476 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, 477 &vd->vdev_dtl.smo_object); 478 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, 479 &vd->vdev_unspare); 480 } 481 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, 482 &vd->vdev_offline); 483 484 /* 485 * When importing a pool, we want to ignore the persistent fault 486 * state, as the diagnosis made on another system may not be 487 * valid in the current context. 488 */ 489 if (spa->spa_load_state == SPA_LOAD_OPEN) { 490 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, 491 &vd->vdev_faulted); 492 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, 493 &vd->vdev_degraded); 494 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, 495 &vd->vdev_removed); 496 } 497 } 498 499 /* 500 * Add ourselves to the parent's list of children. 501 */ 502 vdev_add_child(parent, vd); 503 504 *vdp = vd; 505 506 return (0); 507 } 508 509 void 510 vdev_free(vdev_t *vd) 511 { 512 int c; 513 spa_t *spa = vd->vdev_spa; 514 515 /* 516 * vdev_free() implies closing the vdev first. This is simpler than 517 * trying to ensure complicated semantics for all callers. 518 */ 519 vdev_close(vd); 520 521 522 ASSERT(!list_link_active(&vd->vdev_dirty_node)); 523 524 /* 525 * Free all children. 
526 */ 527 for (c = 0; c < vd->vdev_children; c++) 528 vdev_free(vd->vdev_child[c]); 529 530 ASSERT(vd->vdev_child == NULL); 531 ASSERT(vd->vdev_guid_sum == vd->vdev_guid); 532 533 /* 534 * Discard allocation state. 535 */ 536 if (vd == vd->vdev_top) 537 vdev_metaslab_fini(vd); 538 539 ASSERT3U(vd->vdev_stat.vs_space, ==, 0); 540 ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0); 541 ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); 542 543 /* 544 * Remove this vdev from its parent's child list. 545 */ 546 vdev_remove_child(vd->vdev_parent, vd); 547 548 ASSERT(vd->vdev_parent == NULL); 549 550 /* 551 * Clean up vdev structure. 552 */ 553 vdev_queue_fini(vd); 554 vdev_cache_fini(vd); 555 556 if (vd->vdev_path) 557 spa_strfree(vd->vdev_path); 558 if (vd->vdev_devid) 559 spa_strfree(vd->vdev_devid); 560 if (vd->vdev_physpath) 561 spa_strfree(vd->vdev_physpath); 562 563 if (vd->vdev_isspare) 564 spa_spare_remove(vd); 565 if (vd->vdev_isl2cache) 566 spa_l2cache_remove(vd); 567 568 txg_list_destroy(&vd->vdev_ms_list); 569 txg_list_destroy(&vd->vdev_dtl_list); 570 mutex_enter(&vd->vdev_dtl_lock); 571 space_map_unload(&vd->vdev_dtl_map); 572 space_map_destroy(&vd->vdev_dtl_map); 573 space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); 574 space_map_destroy(&vd->vdev_dtl_scrub); 575 mutex_exit(&vd->vdev_dtl_lock); 576 mutex_destroy(&vd->vdev_dtl_lock); 577 mutex_destroy(&vd->vdev_stat_lock); 578 579 if (vd == spa->spa_root_vdev) 580 spa->spa_root_vdev = NULL; 581 582 kmem_free(vd, sizeof (vdev_t)); 583 } 584 585 /* 586 * Transfer top-level vdev state from svd to tvd. 
587 */ 588 static void 589 vdev_top_transfer(vdev_t *svd, vdev_t *tvd) 590 { 591 spa_t *spa = svd->vdev_spa; 592 metaslab_t *msp; 593 vdev_t *vd; 594 int t; 595 596 ASSERT(tvd == tvd->vdev_top); 597 598 tvd->vdev_ms_array = svd->vdev_ms_array; 599 tvd->vdev_ms_shift = svd->vdev_ms_shift; 600 tvd->vdev_ms_count = svd->vdev_ms_count; 601 602 svd->vdev_ms_array = 0; 603 svd->vdev_ms_shift = 0; 604 svd->vdev_ms_count = 0; 605 606 tvd->vdev_mg = svd->vdev_mg; 607 tvd->vdev_ms = svd->vdev_ms; 608 609 svd->vdev_mg = NULL; 610 svd->vdev_ms = NULL; 611 612 if (tvd->vdev_mg != NULL) 613 tvd->vdev_mg->mg_vd = tvd; 614 615 tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; 616 tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; 617 tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; 618 619 svd->vdev_stat.vs_alloc = 0; 620 svd->vdev_stat.vs_space = 0; 621 svd->vdev_stat.vs_dspace = 0; 622 623 for (t = 0; t < TXG_SIZE; t++) { 624 while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) 625 (void) txg_list_add(&tvd->vdev_ms_list, msp, t); 626 while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) 627 (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); 628 if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) 629 (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); 630 } 631 632 if (list_link_active(&svd->vdev_dirty_node)) { 633 vdev_config_clean(svd); 634 vdev_config_dirty(tvd); 635 } 636 637 tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; 638 svd->vdev_deflate_ratio = 0; 639 640 tvd->vdev_islog = svd->vdev_islog; 641 svd->vdev_islog = 0; 642 } 643 644 static void 645 vdev_top_update(vdev_t *tvd, vdev_t *vd) 646 { 647 int c; 648 649 if (vd == NULL) 650 return; 651 652 vd->vdev_top = tvd; 653 654 for (c = 0; c < vd->vdev_children; c++) 655 vdev_top_update(tvd, vd->vdev_child[c]); 656 } 657 658 /* 659 * Add a mirror/replacing vdev above an existing vdev. 
660 */ 661 vdev_t * 662 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) 663 { 664 spa_t *spa = cvd->vdev_spa; 665 vdev_t *pvd = cvd->vdev_parent; 666 vdev_t *mvd; 667 668 ASSERT(spa_config_held(spa, RW_WRITER)); 669 670 mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); 671 672 mvd->vdev_asize = cvd->vdev_asize; 673 mvd->vdev_ashift = cvd->vdev_ashift; 674 mvd->vdev_state = cvd->vdev_state; 675 676 vdev_remove_child(pvd, cvd); 677 vdev_add_child(pvd, mvd); 678 cvd->vdev_id = mvd->vdev_children; 679 vdev_add_child(mvd, cvd); 680 vdev_top_update(cvd->vdev_top, cvd->vdev_top); 681 682 if (mvd == mvd->vdev_top) 683 vdev_top_transfer(cvd, mvd); 684 685 return (mvd); 686 } 687 688 /* 689 * Remove a 1-way mirror/replacing vdev from the tree. 690 */ 691 void 692 vdev_remove_parent(vdev_t *cvd) 693 { 694 vdev_t *mvd = cvd->vdev_parent; 695 vdev_t *pvd = mvd->vdev_parent; 696 697 ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER)); 698 699 ASSERT(mvd->vdev_children == 1); 700 ASSERT(mvd->vdev_ops == &vdev_mirror_ops || 701 mvd->vdev_ops == &vdev_replacing_ops || 702 mvd->vdev_ops == &vdev_spare_ops); 703 cvd->vdev_ashift = mvd->vdev_ashift; 704 705 vdev_remove_child(mvd, cvd); 706 vdev_remove_child(pvd, mvd); 707 cvd->vdev_id = mvd->vdev_id; 708 vdev_add_child(pvd, cvd); 709 /* 710 * If we created a new toplevel vdev, then we need to change the child's 711 * vdev GUID to match the old toplevel vdev. Otherwise, we could have 712 * detached an offline device, and when we go to import the pool we'll 713 * think we have two toplevel vdevs, instead of a different version of 714 * the same toplevel vdev. 
715 */ 716 if (cvd->vdev_top == cvd) { 717 pvd->vdev_guid_sum -= cvd->vdev_guid; 718 cvd->vdev_guid_sum -= cvd->vdev_guid; 719 cvd->vdev_guid = mvd->vdev_guid; 720 cvd->vdev_guid_sum += mvd->vdev_guid; 721 pvd->vdev_guid_sum += cvd->vdev_guid; 722 } 723 vdev_top_update(cvd->vdev_top, cvd->vdev_top); 724 725 if (cvd == cvd->vdev_top) 726 vdev_top_transfer(mvd, cvd); 727 728 ASSERT(mvd->vdev_children == 0); 729 vdev_free(mvd); 730 } 731 732 int 733 vdev_metaslab_init(vdev_t *vd, uint64_t txg) 734 { 735 spa_t *spa = vd->vdev_spa; 736 objset_t *mos = spa->spa_meta_objset; 737 metaslab_class_t *mc; 738 uint64_t m; 739 uint64_t oldc = vd->vdev_ms_count; 740 uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; 741 metaslab_t **mspp; 742 int error; 743 744 if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ 745 return (0); 746 747 dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc); 748 749 ASSERT(oldc <= newc); 750 751 if (vd->vdev_islog) 752 mc = spa->spa_log_class; 753 else 754 mc = spa->spa_normal_class; 755 756 if (vd->vdev_mg == NULL) 757 vd->vdev_mg = metaslab_group_create(mc, vd); 758 759 mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); 760 761 if (oldc != 0) { 762 bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); 763 kmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); 764 } 765 766 vd->vdev_ms = mspp; 767 vd->vdev_ms_count = newc; 768 769 for (m = oldc; m < newc; m++) { 770 space_map_obj_t smo = { 0, 0, 0 }; 771 if (txg == 0) { 772 uint64_t object = 0; 773 error = dmu_read(mos, vd->vdev_ms_array, 774 m * sizeof (uint64_t), sizeof (uint64_t), &object); 775 if (error) 776 return (error); 777 if (object != 0) { 778 dmu_buf_t *db; 779 error = dmu_bonus_hold(mos, object, FTAG, &db); 780 if (error) 781 return (error); 782 ASSERT3U(db->db_size, >=, sizeof (smo)); 783 bcopy(db->db_data, &smo, sizeof (smo)); 784 ASSERT3U(smo.smo_object, ==, object); 785 dmu_buf_rele(db, FTAG); 786 } 787 } 788 vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo, 789 
m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); 790 } 791 792 return (0); 793 } 794 795 void 796 vdev_metaslab_fini(vdev_t *vd) 797 { 798 uint64_t m; 799 uint64_t count = vd->vdev_ms_count; 800 801 if (vd->vdev_ms != NULL) { 802 for (m = 0; m < count; m++) 803 if (vd->vdev_ms[m] != NULL) 804 metaslab_fini(vd->vdev_ms[m]); 805 kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); 806 vd->vdev_ms = NULL; 807 } 808 } 809 810 int 811 vdev_probe(vdev_t *vd) 812 { 813 if (vd == NULL) 814 return (EINVAL); 815 816 /* 817 * Right now we only support status checks on the leaf vdevs. 818 */ 819 if (vd->vdev_ops->vdev_op_leaf) 820 return (vd->vdev_ops->vdev_op_probe(vd)); 821 822 return (0); 823 } 824 825 /* 826 * Prepare a virtual device for access. 827 */ 828 int 829 vdev_open(vdev_t *vd) 830 { 831 int error; 832 int c; 833 uint64_t osize = 0; 834 uint64_t asize, psize; 835 uint64_t ashift = 0; 836 837 ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || 838 vd->vdev_state == VDEV_STATE_CANT_OPEN || 839 vd->vdev_state == VDEV_STATE_OFFLINE); 840 841 if (vd->vdev_fault_mode == VDEV_FAULT_COUNT) 842 vd->vdev_fault_arg >>= 1; 843 else 844 vd->vdev_fault_mode = VDEV_FAULT_NONE; 845 846 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 847 848 if (!vd->vdev_removed && vd->vdev_faulted) { 849 ASSERT(vd->vdev_children == 0); 850 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, 851 VDEV_AUX_ERR_EXCEEDED); 852 return (ENXIO); 853 } else if (vd->vdev_offline) { 854 ASSERT(vd->vdev_children == 0); 855 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); 856 return (ENXIO); 857 } 858 859 error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); 860 861 if (zio_injection_enabled && error == 0) 862 error = zio_handle_device_injection(vd, ENXIO); 863 864 if (error) { 865 if (vd->vdev_removed && 866 vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) 867 vd->vdev_removed = B_FALSE; 868 869 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 870 vd->vdev_stat.vs_aux); 871 return (error); 872 } 873 874 
vd->vdev_removed = B_FALSE; 875 876 if (vd->vdev_degraded) { 877 ASSERT(vd->vdev_children == 0); 878 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 879 VDEV_AUX_ERR_EXCEEDED); 880 } else { 881 vd->vdev_state = VDEV_STATE_HEALTHY; 882 } 883 884 for (c = 0; c < vd->vdev_children; c++) 885 if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { 886 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, 887 VDEV_AUX_NONE); 888 break; 889 } 890 891 osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); 892 893 if (vd->vdev_children == 0) { 894 if (osize < SPA_MINDEVSIZE) { 895 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 896 VDEV_AUX_TOO_SMALL); 897 return (EOVERFLOW); 898 } 899 psize = osize; 900 asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); 901 } else { 902 if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - 903 (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { 904 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 905 VDEV_AUX_TOO_SMALL); 906 return (EOVERFLOW); 907 } 908 psize = 0; 909 asize = osize; 910 } 911 912 vd->vdev_psize = psize; 913 914 if (vd->vdev_asize == 0) { 915 /* 916 * This is the first-ever open, so use the computed values. 917 * For testing purposes, a higher ashift can be requested. 918 */ 919 vd->vdev_asize = asize; 920 vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); 921 } else { 922 /* 923 * Make sure the alignment requirement hasn't increased. 924 */ 925 if (ashift > vd->vdev_top->vdev_ashift) { 926 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 927 VDEV_AUX_BAD_LABEL); 928 return (EINVAL); 929 } 930 931 /* 932 * Make sure the device hasn't shrunk. 933 */ 934 if (asize < vd->vdev_asize) { 935 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 936 VDEV_AUX_BAD_LABEL); 937 return (EINVAL); 938 } 939 940 /* 941 * If all children are healthy and the asize has increased, 942 * then we've experienced dynamic LUN growth. 
943 */ 944 if (vd->vdev_state == VDEV_STATE_HEALTHY && 945 asize > vd->vdev_asize) { 946 vd->vdev_asize = asize; 947 } 948 } 949 950 /* 951 * Ensure we can issue some IO before declaring the 952 * vdev open for business. 953 */ 954 error = vdev_probe(vd); 955 if (error) { 956 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 957 VDEV_AUX_OPEN_FAILED); 958 return (error); 959 } 960 961 /* 962 * If this is a top-level vdev, compute the raidz-deflation 963 * ratio. Note, we hard-code in 128k (1<<17) because it is the 964 * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE 965 * changes, this algorithm must never change, or we will 966 * inconsistently account for existing bp's. 967 */ 968 if (vd->vdev_top == vd) { 969 vd->vdev_deflate_ratio = (1<<17) / 970 (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT); 971 } 972 973 return (0); 974 } 975 976 /* 977 * Called once the vdevs are all opened, this routine validates the label 978 * contents. This needs to be done before vdev_load() so that we don't 979 * inadvertently do repair I/Os to the wrong device. 980 * 981 * This function will only return failure if one of the vdevs indicates that it 982 * has since been destroyed or exported. This is only possible if 983 * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state 984 * will be updated but the function will return 0. 985 */ 986 int 987 vdev_validate(vdev_t *vd) 988 { 989 spa_t *spa = vd->vdev_spa; 990 int c; 991 nvlist_t *label; 992 uint64_t guid; 993 uint64_t state; 994 995 for (c = 0; c < vd->vdev_children; c++) 996 if (vdev_validate(vd->vdev_child[c]) != 0) 997 return (EBADF); 998 999 /* 1000 * If the device has already failed, or was marked offline, don't do 1001 * any further validation. Otherwise, label I/O will fail and we will 1002 * overwrite the previous state. 
1003 */ 1004 if (vd->vdev_ops->vdev_op_leaf && !vdev_is_dead(vd)) { 1005 1006 if ((label = vdev_label_read_config(vd)) == NULL) { 1007 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1008 VDEV_AUX_BAD_LABEL); 1009 return (0); 1010 } 1011 1012 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, 1013 &guid) != 0 || guid != spa_guid(spa)) { 1014 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1015 VDEV_AUX_CORRUPT_DATA); 1016 nvlist_free(label); 1017 return (0); 1018 } 1019 1020 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, 1021 &guid) != 0 || guid != vd->vdev_guid) { 1022 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1023 VDEV_AUX_CORRUPT_DATA); 1024 nvlist_free(label); 1025 return (0); 1026 } 1027 1028 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, 1029 &state) != 0) { 1030 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1031 VDEV_AUX_CORRUPT_DATA); 1032 nvlist_free(label); 1033 return (0); 1034 } 1035 1036 nvlist_free(label); 1037 1038 if (spa->spa_load_state == SPA_LOAD_OPEN && 1039 state != POOL_STATE_ACTIVE) 1040 return (EBADF); 1041 1042 /* 1043 * If we were able to open and validate a vdev that was 1044 * previously marked permanently unavailable, clear that state 1045 * now. 1046 */ 1047 if (vd->vdev_not_present) 1048 vd->vdev_not_present = 0; 1049 } 1050 1051 return (0); 1052 } 1053 1054 /* 1055 * Close a virtual device. 1056 */ 1057 void 1058 vdev_close(vdev_t *vd) 1059 { 1060 vd->vdev_ops->vdev_op_close(vd); 1061 1062 vdev_cache_purge(vd); 1063 1064 /* 1065 * We record the previous state before we close it, so that if we are 1066 * doing a reopen(), we don't generate FMA ereports if we notice that 1067 * it's still faulted. 
1068 */ 1069 vd->vdev_prevstate = vd->vdev_state; 1070 1071 if (vd->vdev_offline) 1072 vd->vdev_state = VDEV_STATE_OFFLINE; 1073 else 1074 vd->vdev_state = VDEV_STATE_CLOSED; 1075 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 1076 } 1077 1078 void 1079 vdev_reopen(vdev_t *vd) 1080 { 1081 spa_t *spa = vd->vdev_spa; 1082 1083 ASSERT(spa_config_held(spa, RW_WRITER)); 1084 1085 vdev_close(vd); 1086 (void) vdev_open(vd); 1087 1088 /* 1089 * Call vdev_validate() here to make sure we have the same device. 1090 * Otherwise, a device with an invalid label could be successfully 1091 * opened in response to vdev_reopen(). 1092 */ 1093 if (vd->vdev_aux) { 1094 (void) vdev_validate_aux(vd); 1095 if (!vdev_is_dead(vd) && 1096 !l2arc_vdev_present(vd)) { 1097 uint64_t size = vdev_get_rsize(vd); 1098 l2arc_add_vdev(spa, vd, 1099 VDEV_LABEL_START_SIZE, 1100 size - VDEV_LABEL_START_SIZE); 1101 } 1102 } else { 1103 (void) vdev_validate(vd); 1104 } 1105 1106 /* 1107 * Reassess parent vdev's health. 1108 */ 1109 vdev_propagate_state(vd); 1110 } 1111 1112 int 1113 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) 1114 { 1115 int error; 1116 1117 /* 1118 * Normally, partial opens (e.g. of a mirror) are allowed. 1119 * For a create, however, we want to fail the request if 1120 * there are any components we can't open. 1121 */ 1122 error = vdev_open(vd); 1123 1124 if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { 1125 vdev_close(vd); 1126 return (error ? error : ENXIO); 1127 } 1128 1129 /* 1130 * Recursively initialize all labels. 1131 */ 1132 if ((error = vdev_label_init(vd, txg, isreplacing ? 1133 VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { 1134 vdev_close(vd); 1135 return (error); 1136 } 1137 1138 return (0); 1139 } 1140 1141 /* 1142 * The is the latter half of vdev_create(). It is distinct because it 1143 * involves initiating transactions in order to do metaslab creation. 
1144 * For creation, we want to try to create all vdevs at once and then undo it 1145 * if anything fails; this is much harder if we have pending transactions. 1146 */ 1147 void 1148 vdev_init(vdev_t *vd, uint64_t txg) 1149 { 1150 /* 1151 * Aim for roughly 200 metaslabs per vdev. 1152 */ 1153 vd->vdev_ms_shift = highbit(vd->vdev_asize / 200); 1154 vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); 1155 1156 /* 1157 * Initialize the vdev's metaslabs. This can't fail because 1158 * there's nothing to read when creating all new metaslabs. 1159 */ 1160 VERIFY(vdev_metaslab_init(vd, txg) == 0); 1161 } 1162 1163 void 1164 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) 1165 { 1166 ASSERT(vd == vd->vdev_top); 1167 ASSERT(ISP2(flags)); 1168 1169 if (flags & VDD_METASLAB) 1170 (void) txg_list_add(&vd->vdev_ms_list, arg, txg); 1171 1172 if (flags & VDD_DTL) 1173 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg); 1174 1175 (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); 1176 } 1177 1178 void 1179 vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size) 1180 { 1181 mutex_enter(sm->sm_lock); 1182 if (!space_map_contains(sm, txg, size)) 1183 space_map_add(sm, txg, size); 1184 mutex_exit(sm->sm_lock); 1185 } 1186 1187 int 1188 vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size) 1189 { 1190 int dirty; 1191 1192 /* 1193 * Quick test without the lock -- covers the common case that 1194 * there are no dirty time segments. 1195 */ 1196 if (sm->sm_space == 0) 1197 return (0); 1198 1199 mutex_enter(sm->sm_lock); 1200 dirty = space_map_contains(sm, txg, size); 1201 mutex_exit(sm->sm_lock); 1202 1203 return (dirty); 1204 } 1205 1206 /* 1207 * Reassess DTLs after a config change or scrub completion. 
1208 */ 1209 void 1210 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) 1211 { 1212 spa_t *spa = vd->vdev_spa; 1213 int c; 1214 1215 ASSERT(spa_config_held(spa, RW_WRITER)); 1216 1217 if (vd->vdev_children == 0) { 1218 mutex_enter(&vd->vdev_dtl_lock); 1219 /* 1220 * We're successfully scrubbed everything up to scrub_txg. 1221 * Therefore, excise all old DTLs up to that point, then 1222 * fold in the DTLs for everything we couldn't scrub. 1223 */ 1224 if (scrub_txg != 0) { 1225 space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg); 1226 space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub); 1227 } 1228 if (scrub_done) 1229 space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); 1230 mutex_exit(&vd->vdev_dtl_lock); 1231 if (txg != 0) 1232 vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); 1233 return; 1234 } 1235 1236 /* 1237 * Make sure the DTLs are always correct under the scrub lock. 1238 */ 1239 if (vd == spa->spa_root_vdev) 1240 mutex_enter(&spa->spa_scrub_lock); 1241 1242 mutex_enter(&vd->vdev_dtl_lock); 1243 space_map_vacate(&vd->vdev_dtl_map, NULL, NULL); 1244 space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); 1245 mutex_exit(&vd->vdev_dtl_lock); 1246 1247 for (c = 0; c < vd->vdev_children; c++) { 1248 vdev_t *cvd = vd->vdev_child[c]; 1249 vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done); 1250 mutex_enter(&vd->vdev_dtl_lock); 1251 space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map); 1252 space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub); 1253 mutex_exit(&vd->vdev_dtl_lock); 1254 } 1255 1256 if (vd == spa->spa_root_vdev) 1257 mutex_exit(&spa->spa_scrub_lock); 1258 } 1259 1260 static int 1261 vdev_dtl_load(vdev_t *vd) 1262 { 1263 spa_t *spa = vd->vdev_spa; 1264 space_map_obj_t *smo = &vd->vdev_dtl; 1265 objset_t *mos = spa->spa_meta_objset; 1266 dmu_buf_t *db; 1267 int error; 1268 1269 ASSERT(vd->vdev_children == 0); 1270 1271 if (smo->smo_object == 0) 1272 return (0); 1273 1274 if ((error = dmu_bonus_hold(mos, smo->smo_object, 
FTAG, &db)) != 0) 1275 return (error); 1276 1277 ASSERT3U(db->db_size, >=, sizeof (*smo)); 1278 bcopy(db->db_data, smo, sizeof (*smo)); 1279 dmu_buf_rele(db, FTAG); 1280 1281 mutex_enter(&vd->vdev_dtl_lock); 1282 error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos); 1283 mutex_exit(&vd->vdev_dtl_lock); 1284 1285 return (error); 1286 } 1287 1288 void 1289 vdev_dtl_sync(vdev_t *vd, uint64_t txg) 1290 { 1291 spa_t *spa = vd->vdev_spa; 1292 space_map_obj_t *smo = &vd->vdev_dtl; 1293 space_map_t *sm = &vd->vdev_dtl_map; 1294 objset_t *mos = spa->spa_meta_objset; 1295 space_map_t smsync; 1296 kmutex_t smlock; 1297 dmu_buf_t *db; 1298 dmu_tx_t *tx; 1299 1300 dprintf("%s in txg %llu pass %d\n", 1301 vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); 1302 1303 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1304 1305 if (vd->vdev_detached) { 1306 if (smo->smo_object != 0) { 1307 int err = dmu_object_free(mos, smo->smo_object, tx); 1308 ASSERT3U(err, ==, 0); 1309 smo->smo_object = 0; 1310 } 1311 dmu_tx_commit(tx); 1312 dprintf("detach %s committed in txg %llu\n", 1313 vdev_description(vd), txg); 1314 return; 1315 } 1316 1317 if (smo->smo_object == 0) { 1318 ASSERT(smo->smo_objsize == 0); 1319 ASSERT(smo->smo_alloc == 0); 1320 smo->smo_object = dmu_object_alloc(mos, 1321 DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, 1322 DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); 1323 ASSERT(smo->smo_object != 0); 1324 vdev_config_dirty(vd->vdev_top); 1325 } 1326 1327 mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL); 1328 1329 space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift, 1330 &smlock); 1331 1332 mutex_enter(&smlock); 1333 1334 mutex_enter(&vd->vdev_dtl_lock); 1335 space_map_walk(sm, space_map_add, &smsync); 1336 mutex_exit(&vd->vdev_dtl_lock); 1337 1338 space_map_truncate(smo, mos, tx); 1339 space_map_sync(&smsync, SM_ALLOC, smo, mos, tx); 1340 1341 space_map_destroy(&smsync); 1342 1343 mutex_exit(&smlock); 1344 mutex_destroy(&smlock); 1345 
1346 VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)); 1347 dmu_buf_will_dirty(db, tx); 1348 ASSERT3U(db->db_size, >=, sizeof (*smo)); 1349 bcopy(smo, db->db_data, sizeof (*smo)); 1350 dmu_buf_rele(db, FTAG); 1351 1352 dmu_tx_commit(tx); 1353 } 1354 1355 void 1356 vdev_load(vdev_t *vd) 1357 { 1358 int c; 1359 1360 /* 1361 * Recursively load all children. 1362 */ 1363 for (c = 0; c < vd->vdev_children; c++) 1364 vdev_load(vd->vdev_child[c]); 1365 1366 /* 1367 * If this is a top-level vdev, initialize its metaslabs. 1368 */ 1369 if (vd == vd->vdev_top && 1370 (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || 1371 vdev_metaslab_init(vd, 0) != 0)) 1372 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1373 VDEV_AUX_CORRUPT_DATA); 1374 1375 /* 1376 * If this is a leaf vdev, load its DTL. 1377 */ 1378 if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0) 1379 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, 1380 VDEV_AUX_CORRUPT_DATA); 1381 } 1382 1383 /* 1384 * The special vdev case is used for hot spares and l2cache devices. Its 1385 * sole purpose it to set the vdev state for the associated vdev. To do this, 1386 * we make sure that we can open the underlying device, then try to read the 1387 * label, and make sure that the label is sane and that it hasn't been 1388 * repurposed to another pool. 
1389 */ 1390 int 1391 vdev_validate_aux(vdev_t *vd) 1392 { 1393 nvlist_t *label; 1394 uint64_t guid, version; 1395 uint64_t state; 1396 1397 if (vdev_is_dead(vd)) 1398 return (0); 1399 1400 if ((label = vdev_label_read_config(vd)) == NULL) { 1401 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1402 VDEV_AUX_CORRUPT_DATA); 1403 return (-1); 1404 } 1405 1406 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 || 1407 version > SPA_VERSION || 1408 nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || 1409 guid != vd->vdev_guid || 1410 nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) { 1411 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, 1412 VDEV_AUX_CORRUPT_DATA); 1413 nvlist_free(label); 1414 return (-1); 1415 } 1416 1417 /* 1418 * We don't actually check the pool state here. If it's in fact in 1419 * use by another pool, we update this fact on the fly when requested. 1420 */ 1421 nvlist_free(label); 1422 return (0); 1423 } 1424 1425 void 1426 vdev_sync_done(vdev_t *vd, uint64_t txg) 1427 { 1428 metaslab_t *msp; 1429 1430 dprintf("%s txg %llu\n", vdev_description(vd), txg); 1431 1432 while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) 1433 metaslab_sync_done(msp, txg); 1434 } 1435 1436 void 1437 vdev_sync(vdev_t *vd, uint64_t txg) 1438 { 1439 spa_t *spa = vd->vdev_spa; 1440 vdev_t *lvd; 1441 metaslab_t *msp; 1442 dmu_tx_t *tx; 1443 1444 dprintf("%s txg %llu pass %d\n", 1445 vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa)); 1446 1447 if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { 1448 ASSERT(vd == vd->vdev_top); 1449 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1450 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, 1451 DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); 1452 ASSERT(vd->vdev_ms_array != 0); 1453 vdev_config_dirty(vd); 1454 dmu_tx_commit(tx); 1455 } 1456 1457 while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { 1458 metaslab_sync(msp, txg); 1459 
(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); 1460 } 1461 1462 while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) 1463 vdev_dtl_sync(lvd, txg); 1464 1465 (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); 1466 } 1467 1468 uint64_t 1469 vdev_psize_to_asize(vdev_t *vd, uint64_t psize) 1470 { 1471 return (vd->vdev_ops->vdev_op_asize(vd, psize)); 1472 } 1473 1474 const char * 1475 vdev_description(vdev_t *vd) 1476 { 1477 if (vd == NULL || vd->vdev_ops == NULL) 1478 return ("<unknown>"); 1479 1480 if (vd->vdev_path != NULL) 1481 return (vd->vdev_path); 1482 1483 if (vd->vdev_parent == NULL) 1484 return (spa_name(vd->vdev_spa)); 1485 1486 return (vd->vdev_ops->vdev_op_type); 1487 } 1488 1489 /* 1490 * Mark the given vdev faulted. A faulted vdev behaves as if the device could 1491 * not be opened, and no I/O is attempted. 1492 */ 1493 int 1494 vdev_fault(spa_t *spa, uint64_t guid) 1495 { 1496 vdev_t *vd; 1497 uint64_t txg; 1498 1499 /* 1500 * Disregard a vdev fault request if the pool has 1501 * experienced a complete failure. 1502 * 1503 * XXX - We do this here so that we don't hold the 1504 * spa_namespace_lock in the event that we can't get 1505 * the RW_WRITER spa_config_lock. 1506 */ 1507 if (spa_state(spa) == POOL_STATE_IO_FAILURE) 1508 return (EIO); 1509 1510 txg = spa_vdev_enter(spa); 1511 1512 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 1513 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1514 if (!vd->vdev_ops->vdev_op_leaf) 1515 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1516 1517 /* 1518 * Faulted state takes precedence over degraded. 1519 */ 1520 vd->vdev_faulted = 1ULL; 1521 vd->vdev_degraded = 0ULL; 1522 vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, 1523 VDEV_AUX_ERR_EXCEEDED); 1524 1525 /* 1526 * If marking the vdev as faulted cause the toplevel vdev to become 1527 * unavailable, then back off and simply mark the vdev as degraded 1528 * instead. 
1529 */ 1530 if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { 1531 vd->vdev_degraded = 1ULL; 1532 vd->vdev_faulted = 0ULL; 1533 1534 /* 1535 * If we reopen the device and it's not dead, only then do we 1536 * mark it degraded. 1537 */ 1538 vdev_reopen(vd); 1539 1540 if (vdev_readable(vd)) { 1541 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 1542 VDEV_AUX_ERR_EXCEEDED); 1543 } 1544 } 1545 1546 vdev_config_dirty(vd->vdev_top); 1547 1548 (void) spa_vdev_exit(spa, NULL, txg, 0); 1549 1550 return (0); 1551 } 1552 1553 /* 1554 * Mark the given vdev degraded. A degraded vdev is purely an indication to the 1555 * user that something is wrong. The vdev continues to operate as normal as far 1556 * as I/O is concerned. 1557 */ 1558 int 1559 vdev_degrade(spa_t *spa, uint64_t guid) 1560 { 1561 vdev_t *vd; 1562 uint64_t txg; 1563 1564 /* 1565 * Disregard a vdev fault request if the pool has 1566 * experienced a complete failure. 1567 * 1568 * XXX - We do this here so that we don't hold the 1569 * spa_namespace_lock in the event that we can't get 1570 * the RW_WRITER spa_config_lock. 1571 */ 1572 if (spa_state(spa) == POOL_STATE_IO_FAILURE) 1573 return (EIO); 1574 1575 txg = spa_vdev_enter(spa); 1576 1577 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 1578 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1579 if (!vd->vdev_ops->vdev_op_leaf) 1580 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1581 1582 /* 1583 * If the vdev is already faulted, then don't do anything. 1584 */ 1585 if (vd->vdev_faulted || vd->vdev_degraded) { 1586 (void) spa_vdev_exit(spa, NULL, txg, 0); 1587 return (0); 1588 } 1589 1590 vd->vdev_degraded = 1ULL; 1591 if (!vdev_is_dead(vd)) 1592 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, 1593 VDEV_AUX_ERR_EXCEEDED); 1594 vdev_config_dirty(vd->vdev_top); 1595 1596 (void) spa_vdev_exit(spa, NULL, txg, 0); 1597 1598 return (0); 1599 } 1600 1601 /* 1602 * Online the given vdev. If 'unspare' is set, it implies two things. 
First, 1603 * any attached spare device should be detached when the device finishes 1604 * resilvering. Second, the online should be treated like a 'test' online case, 1605 * so no FMA events are generated if the device fails to open. 1606 */ 1607 int 1608 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, 1609 vdev_state_t *newstate) 1610 { 1611 vdev_t *vd; 1612 uint64_t txg; 1613 1614 /* 1615 * Disregard a vdev fault request if the pool has 1616 * experienced a complete failure. 1617 * 1618 * XXX - We do this here so that we don't hold the 1619 * spa_namespace_lock in the event that we can't get 1620 * the RW_WRITER spa_config_lock. 1621 */ 1622 if (spa_state(spa) == POOL_STATE_IO_FAILURE) 1623 return (EIO); 1624 1625 txg = spa_vdev_enter(spa); 1626 1627 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 1628 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1629 1630 if (!vd->vdev_ops->vdev_op_leaf) 1631 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1632 1633 vd->vdev_offline = B_FALSE; 1634 vd->vdev_tmpoffline = B_FALSE; 1635 vd->vdev_checkremove = (flags & ZFS_ONLINE_CHECKREMOVE) ? 1636 B_TRUE : B_FALSE; 1637 vd->vdev_forcefault = (flags & ZFS_ONLINE_FORCEFAULT) ? 1638 B_TRUE : B_FALSE; 1639 vdev_reopen(vd->vdev_top); 1640 vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; 1641 1642 if (newstate) 1643 *newstate = vd->vdev_state; 1644 if ((flags & ZFS_ONLINE_UNSPARE) && 1645 !vdev_is_dead(vd) && vd->vdev_parent && 1646 vd->vdev_parent->vdev_ops == &vdev_spare_ops && 1647 vd->vdev_parent->vdev_child[0] == vd) 1648 vd->vdev_unspare = B_TRUE; 1649 1650 vdev_config_dirty(vd->vdev_top); 1651 1652 (void) spa_vdev_exit(spa, NULL, txg, 0); 1653 1654 /* 1655 * Must hold spa_namespace_lock in order to post resilver sysevent 1656 * w/pool name. 
1657 */ 1658 mutex_enter(&spa_namespace_lock); 1659 VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); 1660 mutex_exit(&spa_namespace_lock); 1661 1662 return (0); 1663 } 1664 1665 int 1666 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) 1667 { 1668 vdev_t *vd; 1669 uint64_t txg; 1670 1671 /* 1672 * Disregard a vdev fault request if the pool has 1673 * experienced a complete failure. 1674 * 1675 * XXX - We do this here so that we don't hold the 1676 * spa_namespace_lock in the event that we can't get 1677 * the RW_WRITER spa_config_lock. 1678 */ 1679 if (spa_state(spa) == POOL_STATE_IO_FAILURE) 1680 return (EIO); 1681 1682 txg = spa_vdev_enter(spa); 1683 1684 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 1685 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 1686 1687 if (!vd->vdev_ops->vdev_op_leaf) 1688 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 1689 1690 /* 1691 * If the device isn't already offline, try to offline it. 1692 */ 1693 if (!vd->vdev_offline) { 1694 /* 1695 * If this device's top-level vdev has a non-empty DTL, 1696 * don't allow the device to be offlined. 1697 * 1698 * XXX -- make this more precise by allowing the offline 1699 * as long as the remaining devices don't have any DTL holes. 1700 */ 1701 if (vd->vdev_top->vdev_dtl_map.sm_space != 0) 1702 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1703 1704 /* 1705 * Offline this device and reopen its top-level vdev. 1706 * If this action results in the top-level vdev becoming 1707 * unusable, undo it and fail the request. 1708 */ 1709 vd->vdev_offline = B_TRUE; 1710 vdev_reopen(vd->vdev_top); 1711 if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { 1712 vd->vdev_offline = B_FALSE; 1713 vdev_reopen(vd->vdev_top); 1714 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 1715 } 1716 } 1717 1718 vd->vdev_tmpoffline = (flags & ZFS_OFFLINE_TEMPORARY) ? 
1719 B_TRUE : B_FALSE; 1720 1721 vdev_config_dirty(vd->vdev_top); 1722 1723 return (spa_vdev_exit(spa, NULL, txg, 0)); 1724 } 1725 1726 /* 1727 * Clear the error counts associated with this vdev. Unlike vdev_online() and 1728 * vdev_offline(), we assume the spa config is locked. We also clear all 1729 * children. If 'vd' is NULL, then the user wants to clear all vdevs. 1730 * If reopen is specified then attempt to reopen the vdev if the vdev is 1731 * faulted or degraded. 1732 */ 1733 void 1734 vdev_clear(spa_t *spa, vdev_t *vd, boolean_t reopen_wanted) 1735 { 1736 int c; 1737 1738 if (vd == NULL) 1739 vd = spa->spa_root_vdev; 1740 1741 vd->vdev_stat.vs_read_errors = 0; 1742 vd->vdev_stat.vs_write_errors = 0; 1743 vd->vdev_stat.vs_checksum_errors = 0; 1744 vd->vdev_is_failing = B_FALSE; 1745 1746 for (c = 0; c < vd->vdev_children; c++) 1747 vdev_clear(spa, vd->vdev_child[c], reopen_wanted); 1748 1749 /* 1750 * If we're in the FAULTED state or have experienced failed I/O, then 1751 * clear the persistent state and attempt to reopen the device. We 1752 * also mark the vdev config dirty, so that the new faulted state is 1753 * written out to disk. 
1754 */ 1755 if (reopen_wanted && (vd->vdev_faulted || vd->vdev_degraded || 1756 vd->vdev_stat.vs_aux == VDEV_AUX_IO_FAILURE)) { 1757 boolean_t resilver = (vd->vdev_faulted || vd->vdev_degraded); 1758 1759 vd->vdev_faulted = vd->vdev_degraded = 0; 1760 vdev_reopen(vd); 1761 vdev_config_dirty(vd->vdev_top); 1762 1763 if (resilver && vd->vdev_aux == NULL && !vdev_is_dead(vd)) 1764 spa_async_request(spa, SPA_ASYNC_RESILVER); 1765 1766 spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); 1767 } 1768 } 1769 1770 int 1771 vdev_readable(vdev_t *vd) 1772 { 1773 /* XXPOLICY */ 1774 return (!vdev_is_dead(vd)); 1775 } 1776 1777 int 1778 vdev_writeable(vdev_t *vd) 1779 { 1780 return (!vdev_is_dead(vd) && !vd->vdev_is_failing); 1781 } 1782 1783 int 1784 vdev_is_dead(vdev_t *vd) 1785 { 1786 /* 1787 * If the vdev experienced I/O failures, then the vdev is marked 1788 * as faulted (VDEV_STATE_FAULTED) for status output and FMA; however, 1789 * we need to allow access to the vdev for resumed I/Os (see 1790 * zio_vdev_resume_io() ). 1791 */ 1792 return (vd->vdev_state < VDEV_STATE_DEGRADED && 1793 vd->vdev_stat.vs_aux != VDEV_AUX_IO_FAILURE); 1794 } 1795 1796 int 1797 vdev_error_inject(vdev_t *vd, zio_t *zio) 1798 { 1799 int error = 0; 1800 1801 if (vd->vdev_fault_mode == VDEV_FAULT_NONE) 1802 return (0); 1803 1804 if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0) 1805 return (0); 1806 1807 switch (vd->vdev_fault_mode) { 1808 case VDEV_FAULT_RANDOM: 1809 if (spa_get_random(vd->vdev_fault_arg) == 0) 1810 error = EIO; 1811 break; 1812 1813 case VDEV_FAULT_COUNT: 1814 if ((int64_t)--vd->vdev_fault_arg <= 0) 1815 vd->vdev_fault_mode = VDEV_FAULT_NONE; 1816 error = EIO; 1817 break; 1818 } 1819 1820 return (error); 1821 } 1822 1823 /* 1824 * Get statistics for the given vdev. 
1825 */ 1826 void 1827 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) 1828 { 1829 vdev_t *rvd = vd->vdev_spa->spa_root_vdev; 1830 int c, t; 1831 1832 mutex_enter(&vd->vdev_stat_lock); 1833 bcopy(&vd->vdev_stat, vs, sizeof (*vs)); 1834 vs->vs_timestamp = gethrtime() - vs->vs_timestamp; 1835 vs->vs_state = vd->vdev_state; 1836 vs->vs_rsize = vdev_get_rsize(vd); 1837 mutex_exit(&vd->vdev_stat_lock); 1838 1839 /* 1840 * If we're getting stats on the root vdev, aggregate the I/O counts 1841 * over all top-level vdevs (i.e. the direct children of the root). 1842 */ 1843 if (vd == rvd) { 1844 for (c = 0; c < rvd->vdev_children; c++) { 1845 vdev_t *cvd = rvd->vdev_child[c]; 1846 vdev_stat_t *cvs = &cvd->vdev_stat; 1847 1848 mutex_enter(&vd->vdev_stat_lock); 1849 for (t = 0; t < ZIO_TYPES; t++) { 1850 vs->vs_ops[t] += cvs->vs_ops[t]; 1851 vs->vs_bytes[t] += cvs->vs_bytes[t]; 1852 } 1853 vs->vs_read_errors += cvs->vs_read_errors; 1854 vs->vs_write_errors += cvs->vs_write_errors; 1855 vs->vs_checksum_errors += cvs->vs_checksum_errors; 1856 vs->vs_scrub_examined += cvs->vs_scrub_examined; 1857 vs->vs_scrub_errors += cvs->vs_scrub_errors; 1858 mutex_exit(&vd->vdev_stat_lock); 1859 } 1860 } 1861 } 1862 1863 void 1864 vdev_clear_stats(vdev_t *vd) 1865 { 1866 mutex_enter(&vd->vdev_stat_lock); 1867 vd->vdev_stat.vs_space = 0; 1868 vd->vdev_stat.vs_dspace = 0; 1869 vd->vdev_stat.vs_alloc = 0; 1870 mutex_exit(&vd->vdev_stat_lock); 1871 } 1872 1873 void 1874 vdev_stat_update(zio_t *zio) 1875 { 1876 vdev_t *vd = zio->io_vd; 1877 vdev_t *pvd; 1878 uint64_t txg = zio->io_txg; 1879 vdev_stat_t *vs = &vd->vdev_stat; 1880 zio_type_t type = zio->io_type; 1881 int flags = zio->io_flags; 1882 1883 if (zio->io_error == 0) { 1884 if (!(flags & ZIO_FLAG_IO_BYPASS)) { 1885 mutex_enter(&vd->vdev_stat_lock); 1886 vs->vs_ops[type]++; 1887 vs->vs_bytes[type] += zio->io_size; 1888 mutex_exit(&vd->vdev_stat_lock); 1889 } 1890 if ((flags & ZIO_FLAG_IO_REPAIR) && 1891 zio->io_delegate_list == NULL) { 1892 
mutex_enter(&vd->vdev_stat_lock); 1893 if (flags & ZIO_FLAG_SCRUB_THREAD) 1894 vs->vs_scrub_repaired += zio->io_size; 1895 else 1896 vs->vs_self_healed += zio->io_size; 1897 mutex_exit(&vd->vdev_stat_lock); 1898 } 1899 return; 1900 } 1901 1902 if (flags & ZIO_FLAG_SPECULATIVE) 1903 return; 1904 1905 if (vdev_readable(vd)) { 1906 mutex_enter(&vd->vdev_stat_lock); 1907 if (type == ZIO_TYPE_READ) { 1908 if (zio->io_error == ECKSUM) 1909 vs->vs_checksum_errors++; 1910 else 1911 vs->vs_read_errors++; 1912 } 1913 if (type == ZIO_TYPE_WRITE) 1914 vs->vs_write_errors++; 1915 mutex_exit(&vd->vdev_stat_lock); 1916 } 1917 1918 if (type == ZIO_TYPE_WRITE) { 1919 if (txg == 0 || vd->vdev_children != 0) 1920 return; 1921 if (flags & ZIO_FLAG_SCRUB_THREAD) { 1922 ASSERT(flags & ZIO_FLAG_IO_REPAIR); 1923 for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) 1924 vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1); 1925 } 1926 if (!(flags & ZIO_FLAG_IO_REPAIR)) { 1927 if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1)) 1928 return; 1929 vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); 1930 for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) 1931 vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1); 1932 } 1933 } 1934 } 1935 1936 void 1937 vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) 1938 { 1939 int c; 1940 vdev_stat_t *vs = &vd->vdev_stat; 1941 1942 for (c = 0; c < vd->vdev_children; c++) 1943 vdev_scrub_stat_update(vd->vdev_child[c], type, complete); 1944 1945 mutex_enter(&vd->vdev_stat_lock); 1946 1947 if (type == POOL_SCRUB_NONE) { 1948 /* 1949 * Update completion and end time. Leave everything else alone 1950 * so we can report what happened during the previous scrub. 
1951 */ 1952 vs->vs_scrub_complete = complete; 1953 vs->vs_scrub_end = gethrestime_sec(); 1954 } else { 1955 vs->vs_scrub_type = type; 1956 vs->vs_scrub_complete = 0; 1957 vs->vs_scrub_examined = 0; 1958 vs->vs_scrub_repaired = 0; 1959 vs->vs_scrub_errors = 0; 1960 vs->vs_scrub_start = gethrestime_sec(); 1961 vs->vs_scrub_end = 0; 1962 } 1963 1964 mutex_exit(&vd->vdev_stat_lock); 1965 } 1966 1967 /* 1968 * Update the in-core space usage stats for this vdev and the root vdev. 1969 */ 1970 void 1971 vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, 1972 boolean_t update_root) 1973 { 1974 int64_t dspace_delta = space_delta; 1975 spa_t *spa = vd->vdev_spa; 1976 vdev_t *rvd = spa->spa_root_vdev; 1977 1978 ASSERT(vd == vd->vdev_top); 1979 1980 /* 1981 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion 1982 * factor. We must calculate this here and not at the root vdev 1983 * because the root vdev's psize-to-asize is simply the max of its 1984 * childrens', thus not accurate enough for us. 1985 */ 1986 ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); 1987 dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * 1988 vd->vdev_deflate_ratio; 1989 1990 mutex_enter(&vd->vdev_stat_lock); 1991 vd->vdev_stat.vs_space += space_delta; 1992 vd->vdev_stat.vs_alloc += alloc_delta; 1993 vd->vdev_stat.vs_dspace += dspace_delta; 1994 mutex_exit(&vd->vdev_stat_lock); 1995 1996 if (update_root) { 1997 ASSERT(rvd == vd->vdev_parent); 1998 ASSERT(vd->vdev_ms_count != 0); 1999 2000 /* 2001 * Don't count non-normal (e.g. intent log) space as part of 2002 * the pool's capacity. 
2003 */ 2004 if (vd->vdev_mg->mg_class != spa->spa_normal_class) 2005 return; 2006 2007 mutex_enter(&rvd->vdev_stat_lock); 2008 rvd->vdev_stat.vs_space += space_delta; 2009 rvd->vdev_stat.vs_alloc += alloc_delta; 2010 rvd->vdev_stat.vs_dspace += dspace_delta; 2011 mutex_exit(&rvd->vdev_stat_lock); 2012 } 2013 } 2014 2015 /* 2016 * Mark a top-level vdev's config as dirty, placing it on the dirty list 2017 * so that it will be written out next time the vdev configuration is synced. 2018 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. 2019 */ 2020 void 2021 vdev_config_dirty(vdev_t *vd) 2022 { 2023 spa_t *spa = vd->vdev_spa; 2024 vdev_t *rvd = spa->spa_root_vdev; 2025 int c; 2026 2027 /* 2028 * If this is an aux vdev (as with l2cache devices), then we update the 2029 * vdev config manually and set the sync flag. 2030 */ 2031 if (vd->vdev_aux != NULL) { 2032 spa_aux_vdev_t *sav = vd->vdev_aux; 2033 nvlist_t **aux; 2034 uint_t naux; 2035 2036 for (c = 0; c < sav->sav_count; c++) { 2037 if (sav->sav_vdevs[c] == vd) 2038 break; 2039 } 2040 2041 ASSERT(c < sav->sav_count); 2042 sav->sav_sync = B_TRUE; 2043 2044 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, 2045 ZPOOL_CONFIG_L2CACHE, &aux, &naux) == 0); 2046 2047 ASSERT(c < naux); 2048 2049 /* 2050 * Setting the nvlist in the middle if the array is a little 2051 * sketchy, but it will work. 2052 */ 2053 nvlist_free(aux[c]); 2054 aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE); 2055 2056 return; 2057 } 2058 2059 /* 2060 * The dirty list is protected by the config lock. The caller must 2061 * either hold the config lock as writer, or must be the sync thread 2062 * (which holds the lock as reader). There's only one sync thread, 2063 * so this is sufficient to ensure mutual exclusion. 
2064 */ 2065 ASSERT(spa_config_held(spa, RW_WRITER) || 2066 dsl_pool_sync_context(spa_get_dsl(spa))); 2067 2068 if (vd == rvd) { 2069 for (c = 0; c < rvd->vdev_children; c++) 2070 vdev_config_dirty(rvd->vdev_child[c]); 2071 } else { 2072 ASSERT(vd == vd->vdev_top); 2073 2074 if (!list_link_active(&vd->vdev_dirty_node)) 2075 list_insert_head(&spa->spa_dirty_list, vd); 2076 } 2077 } 2078 2079 void 2080 vdev_config_clean(vdev_t *vd) 2081 { 2082 spa_t *spa = vd->vdev_spa; 2083 2084 ASSERT(spa_config_held(spa, RW_WRITER) || 2085 dsl_pool_sync_context(spa_get_dsl(spa))); 2086 2087 ASSERT(list_link_active(&vd->vdev_dirty_node)); 2088 list_remove(&spa->spa_dirty_list, vd); 2089 } 2090 2091 /* 2092 * Propagate vdev state up from children to parent. 2093 */ 2094 void 2095 vdev_propagate_state(vdev_t *vd) 2096 { 2097 vdev_t *rvd = vd->vdev_spa->spa_root_vdev; 2098 int degraded = 0, faulted = 0; 2099 int corrupted = 0; 2100 int c; 2101 vdev_t *child; 2102 2103 if (vd->vdev_children > 0) { 2104 for (c = 0; c < vd->vdev_children; c++) { 2105 child = vd->vdev_child[c]; 2106 2107 if ((vdev_is_dead(child) && !vdev_readable(child)) || 2108 child->vdev_stat.vs_aux == VDEV_AUX_IO_FAILURE) { 2109 /* 2110 * Root special: if there is a top-level log 2111 * device, treat the root vdev as if it were 2112 * degraded. 2113 */ 2114 if (child->vdev_islog && vd == rvd) 2115 degraded++; 2116 else 2117 faulted++; 2118 } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { 2119 degraded++; 2120 } 2121 2122 if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) 2123 corrupted++; 2124 } 2125 2126 vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); 2127 2128 /* 2129 * Root special: if there is a toplevel vdev that cannot be 2130 * opened due to corrupted metadata, then propagate the root 2131 * vdev's aux state as 'corrupt' rather than 'insufficient 2132 * replicas'. 
2133 */ 2134 if (corrupted && vd == rvd && 2135 rvd->vdev_state == VDEV_STATE_CANT_OPEN) 2136 vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, 2137 VDEV_AUX_CORRUPT_DATA); 2138 } 2139 2140 if (vd->vdev_parent) 2141 vdev_propagate_state(vd->vdev_parent); 2142 } 2143 2144 /* 2145 * Set a vdev's state. If this is during an open, we don't update the parent 2146 * state, because we're in the process of opening children depth-first. 2147 * Otherwise, we propagate the change to the parent. 2148 * 2149 * If this routine places a device in a faulted state, an appropriate ereport is 2150 * generated. 2151 */ 2152 void 2153 vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) 2154 { 2155 uint64_t save_state; 2156 spa_t *spa = vd->vdev_spa; 2157 2158 if (state == vd->vdev_state) { 2159 vd->vdev_stat.vs_aux = aux; 2160 return; 2161 } 2162 2163 save_state = vd->vdev_state; 2164 2165 vd->vdev_state = state; 2166 vd->vdev_stat.vs_aux = aux; 2167 2168 /* 2169 * If we are setting the vdev state to anything but an open state, then 2170 * always close the underlying device. Otherwise, we keep accessible 2171 * but invalid devices open forever. We don't call vdev_close() itself, 2172 * because that implies some extra checks (offline, etc) that we don't 2173 * want here. This is limited to leaf devices, because otherwise 2174 * closing the device will affect other children. 2175 */ 2176 if (!vdev_readable(vd) && vd->vdev_ops->vdev_op_leaf) 2177 vd->vdev_ops->vdev_op_close(vd); 2178 2179 if (vd->vdev_removed && 2180 state == VDEV_STATE_CANT_OPEN && 2181 (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { 2182 /* 2183 * If the previous state is set to VDEV_STATE_REMOVED, then this 2184 * device was previously marked removed and someone attempted to 2185 * reopen it. If this failed due to a nonexistent device, then 2186 * keep the device in the REMOVED state. 
We also let this be if 2187 * it is one of our special test online cases, which is only 2188 * attempting to online the device and shouldn't generate an FMA 2189 * fault. 2190 */ 2191 vd->vdev_state = VDEV_STATE_REMOVED; 2192 vd->vdev_stat.vs_aux = VDEV_AUX_NONE; 2193 } else if (state == VDEV_STATE_REMOVED) { 2194 /* 2195 * Indicate to the ZFS DE that this device has been removed, and 2196 * any recent errors should be ignored. 2197 */ 2198 zfs_post_remove(spa, vd); 2199 vd->vdev_removed = B_TRUE; 2200 } else if (state == VDEV_STATE_CANT_OPEN) { 2201 /* 2202 * If we fail to open a vdev during an import, we mark it as 2203 * "not available", which signifies that it was never there to 2204 * begin with. Failure to open such a device is not considered 2205 * an error. 2206 */ 2207 if (spa->spa_load_state == SPA_LOAD_IMPORT && 2208 !spa->spa_import_faulted && 2209 vd->vdev_ops->vdev_op_leaf) 2210 vd->vdev_not_present = 1; 2211 2212 /* 2213 * Post the appropriate ereport. If the 'prevstate' field is 2214 * set to something other than VDEV_STATE_UNKNOWN, it indicates 2215 * that this is part of a vdev_reopen(). In this case, we don't 2216 * want to post the ereport if the device was already in the 2217 * CANT_OPEN state beforehand. 2218 * 2219 * If the 'checkremove' flag is set, then this is an attempt to 2220 * online the device in response to an insertion event. If we 2221 * hit this case, then we have detected an insertion event for a 2222 * faulted or offline device that wasn't in the removed state. 2223 * In this scenario, we don't post an ereport because we are 2224 * about to replace the device, or attempt an online with 2225 * vdev_forcefault, which will generate the fault for us. 
2226 */ 2227 if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && 2228 !vd->vdev_not_present && !vd->vdev_checkremove && 2229 vd != spa->spa_root_vdev) { 2230 const char *class; 2231 2232 switch (aux) { 2233 case VDEV_AUX_OPEN_FAILED: 2234 class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; 2235 break; 2236 case VDEV_AUX_CORRUPT_DATA: 2237 class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; 2238 break; 2239 case VDEV_AUX_NO_REPLICAS: 2240 class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; 2241 break; 2242 case VDEV_AUX_BAD_GUID_SUM: 2243 class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; 2244 break; 2245 case VDEV_AUX_TOO_SMALL: 2246 class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; 2247 break; 2248 case VDEV_AUX_BAD_LABEL: 2249 class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; 2250 break; 2251 default: 2252 class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; 2253 } 2254 2255 zfs_ereport_post(class, spa, vd, NULL, save_state, 0); 2256 } 2257 2258 /* Erase any notion of persistent removed state */ 2259 vd->vdev_removed = B_FALSE; 2260 } else { 2261 vd->vdev_removed = B_FALSE; 2262 } 2263 2264 if (!isopen) 2265 vdev_propagate_state(vd); 2266 } 2267 2268 /* 2269 * Check the vdev configuration to ensure that it's capable of supporting 2270 * a root pool. Currently, we do not support RAID-Z or partial configuration. 2271 * In addition, only a single top-level vdev is allowed and none of the leaves 2272 * can be wholedisks. 
2273 */ 2274 boolean_t 2275 vdev_is_bootable(vdev_t *vd) 2276 { 2277 int c; 2278 2279 if (!vd->vdev_ops->vdev_op_leaf) { 2280 char *vdev_type = vd->vdev_ops->vdev_op_type; 2281 2282 if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 && 2283 vd->vdev_children > 1) { 2284 return (B_FALSE); 2285 } else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 || 2286 strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) { 2287 return (B_FALSE); 2288 } 2289 } else if (vd->vdev_wholedisk == 1) { 2290 return (B_FALSE); 2291 } 2292 2293 for (c = 0; c < vd->vdev_children; c++) { 2294 if (!vdev_is_bootable(vd->vdev_child[c])) 2295 return (B_FALSE); 2296 } 2297 return (B_TRUE); 2298 } 2299