/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>

/*
 * Virtual device management.
 */

static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_spare_ops,
	&vdev_disk_ops,
	&vdev_file_ops,
	&vdev_missing_ops,
	NULL
};

/* maximum scrub/resilver I/O queue per leaf vdev */
int zfs_scrub_limit = 10;

/*
 * Given a vdev type, return the appropriate ops vector.
 */
static vdev_ops_t *
vdev_getops(const char *type)
{
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
			break;

	return (ops);
}

/*
 * Default asize function: return the MAX of psize with the asize of
 * all children.  This is what's used by anything other than RAID-Z.
 */
uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
	uint64_t csize;
	uint64_t c;

	for (c = 0; c < vd->vdev_children; c++) {
		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
		asize = MAX(asize, csize);
	}

	return (asize);
}

/*
 * Get the replaceable or attachable device size.
 * If the parent is a mirror or raidz, the replaceable size is the minimum
 * psize of all its children. For the rest, just return our own psize.
 *
 * e.g.
 *			psize	rsize
 * root			-	-
 *	mirror/raidz	-	-
 *	    disk1	20g	20g
 *	    disk2	40g	20g
 *	    disk3	80g	80g
 */
uint64_t
vdev_get_rsize(vdev_t *vd)
{
	vdev_t *pvd, *cvd;
	uint64_t c, rsize;

	pvd = vd->vdev_parent;

	/*
	 * If our parent is NULL or the root, just return our own psize.
	 */
	if (pvd == NULL || pvd->vdev_parent == NULL)
		return (vd->vdev_psize);

	rsize = 0;

	for (c = 0; c < pvd->vdev_children; c++) {
		cvd = pvd->vdev_child[c];
		rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
	}

	return (rsize);
}

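/*
 * Look up a top-level vdev by its index beneath the root vdev.
 */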
vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(spa_config_held(spa, RW_READER));

	if (vdev < rvd->vdev_children) {
		ASSERT(rvd->vdev_child[vdev] != NULL);
		return (rvd->vdev_child[vdev]);
	}

	return (NULL);
}

vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
	int c;
	vdev_t *mvd;

	if (vd->vdev_guid == guid)
		return (vd);

	for (c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
	size_t oldsize, newsize;
	uint64_t id = cvd->vdev_id;
	vdev_t **newchild;

	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
	ASSERT(cvd->vdev_parent == NULL);

	cvd->vdev_parent = pvd;

	if (pvd == NULL)
		return;

	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

	oldsize = pvd->vdev_children * sizeof (vdev_t *);
	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
	newsize = pvd->vdev_children * sizeof (vdev_t *);

	newchild = kmem_zalloc(newsize, KM_SLEEP);
	if (pvd->vdev_child != NULL) {
		bcopy(pvd->vdev_child, newchild, oldsize);
		kmem_free(pvd->vdev_child, oldsize);
	}

	pvd->vdev_child = newchild;
	pvd->vdev_child[id] = cvd;

	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum += cvd->vdev_guid_sum;

	if (cvd->vdev_ops->vdev_op_leaf)
		cvd->vdev_spa->spa_scrub_maxinflight += zfs_scrub_limit;
}

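/*
 * Remove a child from its parent, updating the guid sums of all ancestors
 * and freeing the child array if it becomes empty.
 */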
void
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
{
	int c;
	uint_t id = cvd->vdev_id;

	ASSERT(cvd->vdev_parent == pvd);

	if (pvd == NULL)
		return;

	ASSERT(id < pvd->vdev_children);
	ASSERT(pvd->vdev_child[id] == cvd);

	pvd->vdev_child[id] = NULL;
	cvd->vdev_parent = NULL;

	for (c = 0; c < pvd->vdev_children; c++)
		if (pvd->vdev_child[c])
			break;

	if (c == pvd->vdev_children) {
		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
		pvd->vdev_child = NULL;
		pvd->vdev_children = 0;
	}

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;

	if (cvd->vdev_ops->vdev_op_leaf)
		cvd->vdev_spa->spa_scrub_maxinflight -= zfs_scrub_limit;
}

/*
 * Remove any holes in the child array.
 */
void
vdev_compact_children(vdev_t *pvd)
{
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;
	int newc, c;

	ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));

	for (c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])
			newc++;

	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

	for (c = newc = 0; c < oldc; c++) {
		if ((cvd = pvd->vdev_child[c]) != NULL) {
			newchild[newc] = cvd;
			cvd->vdev_id = newc++;
		}
	}

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
}

/*
 * Allocate and minimally initialize a vdev_t.
 */
static vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);

	if (spa->spa_root_vdev == NULL) {
		ASSERT(ops == &vdev_root_ops);
		spa->spa_root_vdev = vd;
	}

	if (guid == 0) {
		if (spa->spa_root_vdev == vd) {
			/*
			 * The root vdev's guid will also be the pool guid,
			 * which must be unique among all pools.
			 */
			while (guid == 0 || spa_guid_exists(guid, 0))
				guid = spa_get_random(-1ULL);
		} else {
			/*
			 * Any other vdev's guid must be unique within the pool.
			 */
			while (guid == 0 ||
			    spa_guid_exists(spa_guid(spa), guid))
				guid = spa_get_random(-1ULL);
		}
		ASSERT(!spa_guid_exists(spa_guid(spa), guid));
	}

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;

	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	txg_list_create(&vd->vdev_ms_list,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();
	vdev_queue_init(vd);
	vdev_cache_init(vd);

	return (vd);
}

/*
 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 */
int
vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
    int alloctype)
{
	vdev_ops_t *ops;
	char *type;
	uint64_t guid = 0, islog, nparity;
	vdev_t *vd;

	ASSERT(spa_config_held(spa, RW_WRITER));

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (EINVAL);

	if ((ops = vdev_getops(type)) == NULL)
		return (EINVAL);

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (EINVAL);

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (EINVAL);
	} else if (alloctype == VDEV_ALLOC_SPARE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (EINVAL);
	} else if (alloctype == VDEV_ALLOC_L2CACHE) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (EINVAL);
	}

	/*
	 * The first allocated vdev must be of type 'root'.
	 */
	if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
		return (EINVAL);

	/*
	 * Determine whether we're a log vdev.
	 */
	islog = 0;
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
	if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
		return (ENOTSUP);

	/*
	 * Set the nparity property for RAID-Z vdevs.
	 */
	nparity = -1ULL;
	if (ops == &vdev_raidz_ops) {
		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
		    &nparity) == 0) {
			/*
			 * Currently, we can only support 2 parity devices.
			 */
			if (nparity == 0 || nparity > 2)
				return (EINVAL);
			/*
			 * Older versions can only support 1 parity device.
			 */
			if (nparity == 2 &&
			    spa_version(spa) < SPA_VERSION_RAID6)
				return (ENOTSUP);
		} else {
			/*
			 * We require the parity to be specified for SPAs that
			 * support multiple parity levels.
			 */
			if (spa_version(spa) >= SPA_VERSION_RAID6)
				return (EINVAL);
			/*
			 * Otherwise, we default to 1 parity device for RAID-Z.
			 */
			nparity = 1;
		}
	} else {
		nparity = 0;
	}
	ASSERT(nparity != -1ULL);

	vd = vdev_alloc_common(spa, id, guid, ops);

	vd->vdev_islog = islog;
	vd->vdev_nparity = nparity;

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
		vd->vdev_path = spa_strdup(vd->vdev_path);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
		vd->vdev_devid = spa_strdup(vd->vdev_devid);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH,
	    &vd->vdev_physpath) == 0)
		vd->vdev_physpath = spa_strdup(vd->vdev_physpath);

	/*
	 * Set the whole_disk property.  If it's not specified, leave the value
	 * as -1.
	 */
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &vd->vdev_wholedisk) != 0)
		vd->vdev_wholedisk = -1ULL;

	/*
	 * Look for the 'not present' flag.  This will only be set if the device
	 * was not present at the time of import.
	 */
	if (!spa->spa_import_faulted)
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
		    &vd->vdev_not_present);

	/*
	 * Get the alignment requirement.
	 */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift);

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object and other state.
	 */
	if (vd->vdev_ops->vdev_op_leaf &&
	    (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) {
		if (alloctype == VDEV_ALLOC_LOAD) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
			    &vd->vdev_dtl.smo_object);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
			    &vd->vdev_unspare);
		}
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
		    &vd->vdev_offline);

		/*
		 * When importing a pool, we want to ignore the persistent fault
		 * state, as the diagnosis made on another system may not be
		 * valid in the current context.
		 */
		if (spa->spa_load_state == SPA_LOAD_OPEN) {
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
			    &vd->vdev_faulted);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
			    &vd->vdev_degraded);
			(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
			    &vd->vdev_removed);
		}
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	*vdp = vd;

	return (0);
}

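/*
 * Free a vdev and all of its children.  The vdev is closed and detached
 * from its parent before its state is torn down.
 */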
void
vdev_free(vdev_t *vd)
{
	int c;
	spa_t *spa = vd->vdev_spa;

	/*
	 * vdev_free() implies closing the vdev first.  This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);


	ASSERT(!list_link_active(&vd->vdev_dirty_node));

	/*
	 * Free all children.
	 */
	for (c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	 * Discard allocation state.
	 */
	if (vd == vd->vdev_top)
		vdev_metaslab_fini(vd);

	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
	ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0);
	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	/*
	 * Clean up vdev structure.
	 */
	vdev_queue_fini(vd);
	vdev_cache_fini(vd);

	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);
	if (vd->vdev_physpath)
		spa_strfree(vd->vdev_physpath);

	if (vd->vdev_isspare)
		spa_spare_remove(vd);
	if (vd->vdev_isl2cache)
		spa_l2cache_remove(vd);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);
	mutex_enter(&vd->vdev_dtl_lock);
	space_map_unload(&vd->vdev_dtl_map);
	space_map_destroy(&vd->vdev_dtl_map);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	space_map_destroy(&vd->vdev_dtl_scrub);
	mutex_exit(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_stat_lock);

	if (vd == spa->spa_root_vdev)
		spa->spa_root_vdev = NULL;

	kmem_free(vd, sizeof (vdev_t));
}

/*
 * Transfer top-level vdev state from svd to tvd.
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;

	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_ms = svd->vdev_ms;

	svd->vdev_mg = NULL;
	svd->vdev_ms = NULL;

	if (tvd->vdev_mg != NULL)
		tvd->vdev_mg->mg_vd = tvd;

	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
	tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;
	svd->vdev_stat.vs_dspace = 0;

	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
	}

	if (list_link_active(&svd->vdev_dirty_node)) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
	svd->vdev_deflate_ratio = 0;

	tvd->vdev_islog = svd->vdev_islog;
	svd->vdev_islog = 0;
}

static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
	int c;

	if (vd == NULL)
		return;

	vd->vdev_top = tvd;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
}

/*
 * Add a mirror/replacing vdev above an existing vdev.
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, RW_WRITER));

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_state = cvd->vdev_state;

	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}

/*
 * Remove a 1-way mirror/replacing vdev from the tree.
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops ||
	    mvd->vdev_ops == &vdev_spare_ops);
	cvd->vdev_ashift = mvd->vdev_ashift;

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	/*
	 * If we created a new toplevel vdev, then we need to change the child's
	 * vdev GUID to match the old toplevel vdev.  Otherwise, we could have
	 * detached an offline device, and when we go to import the pool we'll
	 * think we have two toplevel vdevs, instead of a different version of
	 * the same toplevel vdev.
	 */
	if (cvd->vdev_top == cvd) {
		pvd->vdev_guid_sum -= cvd->vdev_guid;
		cvd->vdev_guid_sum -= cvd->vdev_guid;
		cvd->vdev_guid = mvd->vdev_guid;
		cvd->vdev_guid_sum += mvd->vdev_guid;
		pvd->vdev_guid_sum += cvd->vdev_guid;
	}
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}

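/*
 * Create or grow the metaslab array for a top-level vdev, reading the
 * space map object numbers from the metaslab array object when loading
 * an existing pool (txg 0).
 */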
int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_meta_objset;
	metaslab_class_t *mc;
	uint64_t m;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	metaslab_t **mspp;
	int error;

	if (vd->vdev_ms_shift == 0)	/* not being allocated from yet */
		return (0);

	dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);

	ASSERT(oldc <= newc);

	if (vd->vdev_islog)
		mc = spa->spa_log_class;
	else
		mc = spa->spa_normal_class;

	if (vd->vdev_mg == NULL)
		vd->vdev_mg = metaslab_group_create(mc, vd);

	mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);

	if (oldc != 0) {
		bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp));
		kmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
	}

	vd->vdev_ms = mspp;
	vd->vdev_ms_count = newc;

	for (m = oldc; m < newc; m++) {
		space_map_obj_t smo = { 0, 0, 0 };
		if (txg == 0) {
			uint64_t object = 0;
			error = dmu_read(mos, vd->vdev_ms_array,
			    m * sizeof (uint64_t), sizeof (uint64_t), &object);
			if (error)
				return (error);
			if (object != 0) {
				dmu_buf_t *db;
				error = dmu_bonus_hold(mos, object, FTAG, &db);
				if (error)
					return (error);
				ASSERT3U(db->db_size, >=, sizeof (smo));
				bcopy(db->db_data, &smo, sizeof (smo));
				ASSERT3U(smo.smo_object, ==, object);
				dmu_buf_rele(db, FTAG);
			}
		}
		vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
		    m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
	}

	return (0);
}

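/*
 * Tear down a top-level vdev's metaslab array.
 */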
void
vdev_metaslab_fini(vdev_t *vd)
{
	uint64_t m;
	uint64_t count = vd->vdev_ms_count;

	if (vd->vdev_ms != NULL) {
		for (m = 0; m < count; m++)
			if (vd->vdev_ms[m] != NULL)
				metaslab_fini(vd->vdev_ms[m]);
		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;
	}
}

int
vdev_probe(vdev_t *vd)
{
	if (vd == NULL)
		return (EINVAL);

	/*
	 * Right now we only support status checks on the leaf vdevs.
	 */
	if (vd->vdev_ops->vdev_op_leaf)
		return (vd->vdev_ops->vdev_op_probe(vd));

	return (0);
}

/*
 * Prepare a virtual device for access.
 */
int
vdev_open(vdev_t *vd)
{
	int error;
	int c;
	uint64_t osize = 0;
	uint64_t asize, psize;
	uint64_t ashift = 0;

	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	    vd->vdev_state == VDEV_STATE_OFFLINE);

	if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
		vd->vdev_fault_arg >>= 1;
	else
		vd->vdev_fault_mode = VDEV_FAULT_NONE;

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;

	if (!vd->vdev_removed && vd->vdev_faulted) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
		    VDEV_AUX_ERR_EXCEEDED);
		return (ENXIO);
	} else if (vd->vdev_offline) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
		return (ENXIO);
	}

	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);

	if (zio_injection_enabled && error == 0)
		error = zio_handle_device_injection(vd, ENXIO);

	if (error) {
		if (vd->vdev_removed &&
		    vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
			vd->vdev_removed = B_FALSE;

		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    vd->vdev_stat.vs_aux);
		return (error);
	}

	vd->vdev_removed = B_FALSE;

	if (vd->vdev_degraded) {
		ASSERT(vd->vdev_children == 0);
		vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
		    VDEV_AUX_ERR_EXCEEDED);
	} else {
		vd->vdev_state = VDEV_STATE_HEALTHY;
	}

	for (c = 0; c < vd->vdev_children; c++)
		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_NONE);
			break;
		}

	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));

	if (vd->vdev_children == 0) {
		if (osize < SPA_MINDEVSIZE) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (EOVERFLOW);
		}
		psize = osize;
		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
	} else {
		if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_TOO_SMALL);
			return (EOVERFLOW);
		}
		psize = 0;
		asize = osize;
	}

	vd->vdev_psize = psize;

	if (vd->vdev_asize == 0) {
		/*
		 * This is the first-ever open, so use the computed values.
		 * For testing purposes, a higher ashift can be requested.
		 */
		vd->vdev_asize = asize;
		vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
	} else {
		/*
		 * Make sure the alignment requirement hasn't increased.
		 */
		if (ashift > vd->vdev_top->vdev_ashift) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (EINVAL);
		}

		/*
		 * Make sure the device hasn't shrunk.
		 */
		if (asize < vd->vdev_asize) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (EINVAL);
		}

		/*
		 * If all children are healthy and the asize has increased,
		 * then we've experienced dynamic LUN growth.
		 */
		if (vd->vdev_state == VDEV_STATE_HEALTHY &&
		    asize > vd->vdev_asize) {
			vd->vdev_asize = asize;
		}
	}

	/*
	 * Ensure we can issue some IO before declaring the
	 * vdev open for business.
	 */
	error = vdev_probe(vd);
	if (error) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_OPEN_FAILED);
		return (error);
	}

	/*
	 * If this is a top-level vdev, compute the raidz-deflation
	 * ratio.  Note, we hard-code in 128k (1<<17) because it is the
	 * current "typical" blocksize.  Even if SPA_MAXBLOCKSIZE
	 * changes, this algorithm must never change, or we will
	 * inconsistently account for existing bp's.
	 */
	if (vd->vdev_top == vd) {
		vd->vdev_deflate_ratio = (1<<17) /
		    (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT);
	}

	/*
	 * If a leaf vdev has a DTL, and seems healthy, then kick off a
	 * resilver.  But don't do this if we are doing a reopen for a
	 * scrub, since this would just restart the scrub we are already
	 * doing.
	 */
	if (vd->vdev_children == 0 && !vd->vdev_spa->spa_scrub_reopen) {
		mutex_enter(&vd->vdev_dtl_lock);
		if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd))
			spa_async_request(vd->vdev_spa, SPA_ASYNC_RESILVER);
		mutex_exit(&vd->vdev_dtl_lock);
	}

	return (0);
}

/*
 * Called once the vdevs are all opened, this routine validates the label
 * contents.  This needs to be done before vdev_load() so that we don't
 * inadvertently do repair I/Os to the wrong device.
 *
 * This function will only return failure if one of the vdevs indicates that it
 * has since been destroyed or exported.  This is only possible if
 * /etc/zfs/zpool.cache was readonly at the time.  Otherwise, the vdev state
 * will be updated but the function will return 0.
 */
int
vdev_validate(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	int c;
	nvlist_t *label;
	uint64_t guid;
	uint64_t state;

	for (c = 0; c < vd->vdev_children; c++)
		if (vdev_validate(vd->vdev_child[c]) != 0)
			return (EBADF);

	/*
	 * If the device has already failed, or was marked offline, don't do
	 * any further validation.  Otherwise, label I/O will fail and we will
	 * overwrite the previous state.
	 */
	if (vd->vdev_ops->vdev_op_leaf && !vdev_is_dead(vd)) {

		if ((label = vdev_label_read_config(vd)) == NULL) {
			vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_BAD_LABEL);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
		    &guid) != 0 || guid != spa_guid(spa)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
		    &guid) != 0 || guid != vd->vdev_guid) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
		    &state) != 0) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		nvlist_free(label);

		if (spa->spa_load_state == SPA_LOAD_OPEN &&
		    state != POOL_STATE_ACTIVE)
			return (EBADF);

		/*
		 * If we were able to open and validate a vdev that was
		 * previously marked permanently unavailable, clear that state
		 * now.
		 */
		if (vd->vdev_not_present)
			vd->vdev_not_present = 0;
	}

	return (0);
}

/*
 * Close a virtual device.
 */
void
vdev_close(vdev_t *vd)
{
	vd->vdev_ops->vdev_op_close(vd);

	vdev_cache_purge(vd);

	/*
	 * We record the previous state before we close it, so that if we are
	 * doing a reopen(), we don't generate FMA ereports if we notice that
	 * it's still faulted.
	 */
	vd->vdev_prevstate = vd->vdev_state;

	if (vd->vdev_offline)
		vd->vdev_state = VDEV_STATE_OFFLINE;
	else
		vd->vdev_state = VDEV_STATE_CLOSED;
	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}

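/*
 * Close and reopen a vdev in place, then revalidate its label (or, for
 * aux vdevs such as l2cache devices, the aux label).
 */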
void
vdev_reopen(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, RW_WRITER));

	vdev_close(vd);
	(void) vdev_open(vd);

	/*
	 * Call vdev_validate() here to make sure we have the same device.
	 * Otherwise, a device with an invalid label could be successfully
	 * opened in response to vdev_reopen().
	 */
	if (vd->vdev_aux) {
		(void) vdev_validate_aux(vd);
		if (!vdev_is_dead(vd) &&
		    !l2arc_vdev_present(vd)) {
			uint64_t size = vdev_get_rsize(vd);
			l2arc_add_vdev(spa, vd,
			    VDEV_LABEL_START_SIZE,
			    size - VDEV_LABEL_START_SIZE);
		}
	} else {
		(void) vdev_validate(vd);
	}

	/*
	 * Reassess parent vdev's health.
	 */
	vdev_propagate_state(vd);
}

int
vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : ENXIO);
	}

	/*
	 * Recursively initialize all labels.
	 */
	if ((error = vdev_label_init(vd, txg, isreplacing ?
	    VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}

/*
 * This is the latter half of vdev_create().  It is distinct because it
 * involves initiating transactions in order to do metaslab creation.
 * For creation, we want to try to create all vdevs at once and then undo it
 * if anything fails; this is much harder if we have pending transactions.
 */
void
vdev_init(vdev_t *vd, uint64_t txg)
{
	/*
	 * Aim for roughly 200 metaslabs per vdev.
	 */
	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);

	/*
	 * Initialize the vdev's metaslabs.  This can't fail because
	 * there's nothing to read when creating all new metaslabs.
	 */
	VERIFY(vdev_metaslab_init(vd, txg) == 0);
}

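/*
 * Mark a metaslab or DTL of this top-level vdev, and the vdev itself,
 * dirty in the given txg so that it gets synced out.
 */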
void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
	ASSERT(vd == vd->vdev_top);
	ASSERT(ISP2(flags));

	if (flags & VDD_METASLAB)
		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);

	if (flags & VDD_DTL)
		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);

	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}

void
vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
{
	mutex_enter(sm->sm_lock);
	if (!space_map_contains(sm, txg, size))
		space_map_add(sm, txg, size);
	mutex_exit(sm->sm_lock);
}

int
vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
{
	int dirty;

	/*
	 * Quick test without the lock -- covers the common case that
	 * there are no dirty time segments.
	 */
	if (sm->sm_space == 0)
		return (0);

	mutex_enter(sm->sm_lock);
	dirty = space_map_contains(sm, txg, size);
	mutex_exit(sm->sm_lock);

	return (dirty);
}

/*
 * Reassess DTLs after a config change or scrub completion.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	spa_t *spa = vd->vdev_spa;
	int c;

	ASSERT(spa_config_held(spa, RW_READER));

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		if (scrub_txg != 0 &&
		    (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) {
			/* XXX should check scrub_done? */
			/*
			 * We completed a scrub up to scrub_txg.  If we
			 * did it without rebooting, then the scrub dtl
			 * will be valid, so excise the old region and
			 * fold in the scrub dtl.  Otherwise, leave the
			 * dtl as-is if there was an error.
			 */
			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
		}
		if (scrub_done)
			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
		mutex_exit(&vd->vdev_dtl_lock);

		if (txg != 0)
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
		return;
	}

	/*
	 * Make sure the DTLs are always correct under the scrub lock.
	 */
	if (vd == spa->spa_root_vdev)
		mutex_enter(&spa->spa_scrub_lock);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	mutex_exit(&vd->vdev_dtl_lock);

	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
		mutex_enter(&vd->vdev_dtl_lock);
		space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
		space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
		mutex_exit(&vd->vdev_dtl_lock);
	}

	if (vd == spa->spa_root_vdev)
		mutex_exit(&spa->spa_scrub_lock);
}

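/*
 * Load a leaf vdev's DTL space map from the MOS.
 */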
static int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	objset_t *mos = spa->spa_meta_objset;
	dmu_buf_t *db;
	int error;

	ASSERT(vd->vdev_children == 0);

	if (smo->smo_object == 0)
		return (0);

	if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
		return (error);

	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(db->db_data, smo, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	mutex_enter(&vd->vdev_dtl_lock);
	error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos);
	mutex_exit(&vd->vdev_dtl_lock);

	return (error);
}

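/*
 * Write a leaf vdev's DTL space map to disk for the given txg, allocating
 * or freeing the on-disk space map object as needed.
 */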
void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	space_map_t *sm = &vd->vdev_dtl_map;
	objset_t *mos = spa->spa_meta_objset;
	space_map_t smsync;
	kmutex_t smlock;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	dprintf("%s in txg %llu pass %d\n",
	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	if (vd->vdev_detached) {
		if (smo->smo_object != 0) {
			int err = dmu_object_free(mos, smo->smo_object, tx);
			ASSERT3U(err, ==, 0);
			smo->smo_object = 0;
		}
		dmu_tx_commit(tx);
		dprintf("detach %s committed in txg %llu\n",
		    vdev_description(vd), txg);
		return;
	}

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		vdev_config_dirty(vd->vdev_top);
	}

	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);

	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
	    &smlock);

	mutex_enter(&smlock);

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_walk(sm, space_map_add, &smsync);
	mutex_exit(&vd->vdev_dtl_lock);

	space_map_truncate(smo, mos, tx);
	space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);

	space_map_destroy(&smsync);

	mutex_exit(&smlock);
	mutex_destroy(&smlock);

	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}

/*
 * Determine if resilver is needed, and if so the txg range.
 */
boolean_t
vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
{
	boolean_t needed = B_FALSE;
	uint64_t thismin = UINT64_MAX;
	uint64_t thismax = 0;

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) {
			space_seg_t *ss;

			ss = avl_first(&vd->vdev_dtl_map.sm_root);
			thismin = ss->ss_start - 1;
			ss = avl_last(&vd->vdev_dtl_map.sm_root);
			thismax = ss->ss_end;
			needed = B_TRUE;
		}
		mutex_exit(&vd->vdev_dtl_lock);
	} else {
		int c;
		for (c = 0; c < vd->vdev_children; c++) {
			vdev_t *cvd = vd->vdev_child[c];
			uint64_t cmin, cmax;

			if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
				thismin = MIN(thismin, cmin);
				thismax = MAX(thismax, cmax);
				needed = B_TRUE;
			}
		}
	}

	if (needed && minp) {
		*minp = thismin;
		*maxp = thismax;
	}
	return (needed);
}

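/*
 * Load the per-vdev on-disk state: metaslabs for top-level vdevs and DTLs
 * for leaves, recursing over all children.
 */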
void
vdev_load(vdev_t *vd)
{
	int c;

	/*
	 * Recursively load all children.
	 */
	for (c = 0; c < vd->vdev_children; c++)
		vdev_load(vd->vdev_child[c]);

	/*
	 * If this is a top-level vdev, initialize its metaslabs.
	 */
	if (vd == vd->vdev_top &&
	    (vd->vdev_ashift == 0 || vd->vdev_asize == 0 ||
	    vdev_metaslab_init(vd, 0) != 0))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);

	/*
	 * If this is a leaf vdev, load its DTL.
	 */
	if (vd->vdev_ops->vdev_op_leaf && vdev_dtl_load(vd) != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
}

/*
 * The special vdev case is used for hot spares and l2cache devices.  Its
 * sole purpose is to set the vdev state for the associated vdev.  To do this,
 * we make sure that we can open the underlying device, then try to read the
 * label, and make sure that the label is sane and that it hasn't been
 * repurposed to another pool.
 */
int
vdev_validate_aux(vdev_t *vd)
{
	nvlist_t *label;
	uint64_t guid, version;
	uint64_t state;

	if (vdev_is_dead(vd))
		return (0);

	if ((label = vdev_label_read_config(vd)) == NULL) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		return (-1);
	}

	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
	    version > SPA_VERSION ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
	    guid != vd->vdev_guid ||
	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_CORRUPT_DATA);
		nvlist_free(label);
		return (-1);
	}

	/*
	 * We don't actually check the pool state here.  If it's in fact in
	 * use by another pool, we update this fact on the fly when requested.
	 */
	nvlist_free(label);
	return (0);
}

void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
	metaslab_t *msp;

	dprintf("%s txg %llu\n", vdev_description(vd), txg);

	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
		metaslab_sync_done(msp, txg);
}

void
vdev_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *lvd;
	metaslab_t *msp;
	dmu_tx_t *tx;

	dprintf("%s txg %llu pass %d\n",
	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));

	if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) {
		ASSERT(vd == vd->vdev_top);
		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
		ASSERT(vd->vdev_ms_array != 0);
		vdev_config_dirty(vd);
		dmu_tx_commit(tx);
	}

	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
		metaslab_sync(msp, txg);
		(void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
	}

	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
		vdev_dtl_sync(lvd, txg);

	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
}

uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
	return (vd->vdev_ops->vdev_op_asize(vd, psize));
}

const char *
vdev_description(vdev_t *vd)
{
	if (vd == NULL || vd->vdev_ops == NULL)
		return ("<unknown>");

	if (vd->vdev_path != NULL)
		return (vd->vdev_path);

	if (vd->vdev_parent == NULL)
		return (spa_name(vd->vdev_spa));

	return (vd->vdev_ops->vdev_op_type);
}

/*
 * Mark the given vdev faulted.  A faulted vdev behaves as if the device could
 * not be opened, and no I/O is attempted.
 */
int
vdev_fault(spa_t *spa, uint64_t guid)
{
	vdev_t *vd;
	uint64_t txg;

	/*
	 * Disregard a vdev fault request if the pool has
	 * experienced a complete failure.
	 *
	 * XXX - We do this here so that we don't hold the
	 * spa_namespace_lock in the event that we can't get
	 * the RW_WRITER spa_config_lock.
	 */
	if (spa_state(spa) == POOL_STATE_IO_FAILURE)
		return (EIO);

	txg = spa_vdev_enter(spa);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * Faulted state takes precedence over degraded.
	 */
	vd->vdev_faulted = 1ULL;
	vd->vdev_degraded = 0ULL;
	vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED,
	    VDEV_AUX_ERR_EXCEEDED);

	/*
	 * If marking the vdev as faulted causes the toplevel vdev to become
	 * unavailable, then back off and simply mark the vdev as degraded
	 * instead.
	 */
	if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) {
		vd->vdev_degraded = 1ULL;
		vd->vdev_faulted = 0ULL;

		/*
		 * If we reopen the device and it's not dead, only then do we
		 * mark it degraded.
		 */
		vdev_reopen(vd);

		if (vdev_readable(vd)) {
			vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
			    VDEV_AUX_ERR_EXCEEDED);
		}
	}

	vdev_config_dirty(vd->vdev_top);

	(void) spa_vdev_exit(spa, NULL, txg, 0);

	return (0);
}

/*
 * Mark the given vdev degraded.  A degraded vdev is purely an indication to the
 * user that something is wrong.  The vdev continues to operate as normal as far
 * as I/O is concerned.
 */
int
vdev_degrade(spa_t *spa, uint64_t guid)
{
	vdev_t *vd;
	uint64_t txg;

	/*
	 * Disregard a vdev fault request if the pool has
	 * experienced a complete failure.
	 *
	 * XXX - We do this here so that we don't hold the
	 * spa_namespace_lock in the event that we can't get
	 * the RW_WRITER spa_config_lock.
	 */
	if (spa_state(spa) == POOL_STATE_IO_FAILURE)
		return (EIO);

	txg = spa_vdev_enter(spa);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If the vdev is already faulted, then don't do anything.
	 */
	if (vd->vdev_faulted || vd->vdev_degraded) {
		(void) spa_vdev_exit(spa, NULL, txg, 0);
		return (0);
	}

	vd->vdev_degraded = 1ULL;
	if (!vdev_is_dead(vd))
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
		    VDEV_AUX_ERR_EXCEEDED);
	vdev_config_dirty(vd->vdev_top);

	(void) spa_vdev_exit(spa, NULL, txg, 0);

	return (0);
}

/*
 * Online the given vdev.  If 'unspare' is set, it implies two things.  First,
 * any attached spare device should be detached when the device finishes
 * resilvering.  Second, the online should be treated like a 'test' online case,
 * so no FMA events are generated if the device fails to open.
 */
int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
    vdev_state_t *newstate)
{
	vdev_t *vd;
	uint64_t txg;

	/*
	 * Disregard a vdev fault request if the pool has
	 * experienced a complete failure.
	 *
	 * XXX - We do this here so that we don't hold the
	 * spa_namespace_lock in the event that we can't get
	 * the RW_WRITER spa_config_lock.
	 */
	if (spa_state(spa) == POOL_STATE_IO_FAILURE)
		return (EIO);

	txg = spa_vdev_enter(spa);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	vd->vdev_offline = B_FALSE;
	vd->vdev_tmpoffline = B_FALSE;
	vd->vdev_checkremove = (flags & ZFS_ONLINE_CHECKREMOVE) ?
	    B_TRUE : B_FALSE;
	vd->vdev_forcefault = (flags & ZFS_ONLINE_FORCEFAULT) ?
	    B_TRUE : B_FALSE;
	vdev_reopen(vd->vdev_top);
	vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;

	if (newstate)
		*newstate = vd->vdev_state;
	if ((flags & ZFS_ONLINE_UNSPARE) &&
	    !vdev_is_dead(vd) && vd->vdev_parent &&
	    vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
	    vd->vdev_parent->vdev_child[0] == vd)
		vd->vdev_unspare = B_TRUE;

	vdev_config_dirty(vd->vdev_top);

	(void) spa_vdev_exit(spa, NULL, txg, 0);

	/*
	 * Must hold spa_namespace_lock in order to post resilver sysevent
	 * w/pool name.
	 */
	mutex_enter(&spa_namespace_lock);
	VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0);
	mutex_exit(&spa_namespace_lock);

	return (0);
}

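/*
 * Take the given vdev offline, provided the top-level vdev it belongs to
 * can still function without it; otherwise the request is undone and fails.
 */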
int
vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
{
	vdev_t *vd;
	uint64_t txg;

	/*
	 * Disregard a vdev fault request if the pool has
	 * experienced a complete failure.
	 *
	 * XXX - We do this here so that we don't hold the
	 * spa_namespace_lock in the event that we can't get
	 * the RW_WRITER spa_config_lock.
	 */
	if (spa_state(spa) == POOL_STATE_IO_FAILURE)
		return (EIO);

	txg = spa_vdev_enter(spa);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_exit(spa, NULL, txg, ENODEV));

	if (!vd->vdev_ops->vdev_op_leaf)
		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));

	/*
	 * If the device isn't already offline, try to offline it.
	 */
	if (!vd->vdev_offline) {
		/*
		 * If this device's top-level vdev has a non-empty DTL,
		 * don't allow the device to be offlined.
		 *
		 * XXX -- make this more precise by allowing the offline
		 * as long as the remaining devices don't have any DTL holes.
		 */
		if (vd->vdev_top->vdev_dtl_map.sm_space != 0)
			return (spa_vdev_exit(spa, NULL, txg, EBUSY));

		/*
		 * Offline this device and reopen its top-level vdev.
		 * If this action results in the top-level vdev becoming
		 * unusable, undo it and fail the request.
		 */
		vd->vdev_offline = B_TRUE;
		vdev_reopen(vd->vdev_top);
		if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) {
			vd->vdev_offline = B_FALSE;
			vdev_reopen(vd->vdev_top);
			return (spa_vdev_exit(spa, NULL, txg, EBUSY));
		}
	}

	vd->vdev_tmpoffline = (flags & ZFS_OFFLINE_TEMPORARY) ?
	    B_TRUE : B_FALSE;

	vdev_config_dirty(vd->vdev_top);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}

/*
 * Clear the error counts associated with this vdev.  Unlike vdev_online() and
 * vdev_offline(), we assume the spa config is locked.  We also clear all
 * children.  If 'vd' is NULL, then the user wants to clear all vdevs.
 * If reopen is specified then attempt to reopen the vdev if the vdev is
 * faulted or degraded.
 */
void
vdev_clear(spa_t *spa, vdev_t *vd, boolean_t reopen_wanted)
{
	int c;

	if (vd == NULL)
		vd = spa->spa_root_vdev;

	vd->vdev_stat.vs_read_errors = 0;
	vd->vdev_stat.vs_write_errors = 0;
	vd->vdev_stat.vs_checksum_errors = 0;
	vd->vdev_is_failing = B_FALSE;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_clear(spa, vd->vdev_child[c], reopen_wanted);

	/*
	 * If we're in the FAULTED state or have experienced failed I/O, then
	 * clear the persistent state and attempt to reopen the device.  We
	 * also mark the vdev config dirty, so that the new faulted state is
	 * written out to disk.
	 */
	if (reopen_wanted && (vd->vdev_faulted || vd->vdev_degraded ||
	    vd->vdev_stat.vs_aux == VDEV_AUX_IO_FAILURE)) {
		boolean_t resilver = (vd->vdev_faulted || vd->vdev_degraded);

		vd->vdev_faulted = vd->vdev_degraded = 0;
		vdev_reopen(vd);
		vdev_config_dirty(vd->vdev_top);

		if (resilver && vd->vdev_aux == NULL && !vdev_is_dead(vd))
			spa_async_request(spa, SPA_ASYNC_RESILVER);

		spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR);
	}
}

int
vdev_readable(vdev_t *vd)
{
	/* XXPOLICY */
	return (!vdev_is_dead(vd));
}

int
vdev_writeable(vdev_t *vd)
{
	return (!vdev_is_dead(vd) && !vd->vdev_is_failing);
}

int
vdev_is_dead(vdev_t *vd)
{
	/*
	 * If the vdev experienced I/O failures, then the vdev is marked
	 * as faulted (VDEV_STATE_FAULTED) for status output and FMA; however,
	 * we need to allow access to the vdev for resumed I/Os (see
	 * zio_vdev_resume_io() ).
	 */
	return (vd->vdev_state < VDEV_STATE_DEGRADED &&
	    vd->vdev_stat.vs_aux != VDEV_AUX_IO_FAILURE);
}

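/*
 * Apply any artificial fault configured on this vdev to the given zio,
 * returning the error to inject (or 0 for none).
 */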
int
vdev_error_inject(vdev_t *vd, zio_t *zio)
{
	int error = 0;

	if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
		return (0);

	if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
		return (0);

	switch (vd->vdev_fault_mode) {
	case VDEV_FAULT_RANDOM:
		if (spa_get_random(vd->vdev_fault_arg) == 0)
			error = EIO;
		break;

	case VDEV_FAULT_COUNT:
		if ((int64_t)--vd->vdev_fault_arg <= 0)
			vd->vdev_fault_mode = VDEV_FAULT_NONE;
		error = EIO;
		break;
	}

	return (error);
}

/*
 * Get statistics for the given vdev.
 */
void
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
	int c, t;

	mutex_enter(&vd->vdev_stat_lock);
	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
	vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors;
	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
	vs->vs_state = vd->vdev_state;
	vs->vs_rsize = vdev_get_rsize(vd);
	mutex_exit(&vd->vdev_stat_lock);

	/*
	 * If we're getting stats on the root vdev, aggregate the I/O counts
	 * over all top-level vdevs (i.e. the direct children of the root).
	 */
	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++) {
			vdev_t *cvd = rvd->vdev_child[c];
			vdev_stat_t *cvs = &cvd->vdev_stat;

			mutex_enter(&vd->vdev_stat_lock);
			for (t = 0; t < ZIO_TYPES; t++) {
				vs->vs_ops[t] += cvs->vs_ops[t];
				vs->vs_bytes[t] += cvs->vs_bytes[t];
			}
			vs->vs_read_errors += cvs->vs_read_errors;
			vs->vs_write_errors += cvs->vs_write_errors;
			vs->vs_checksum_errors += cvs->vs_checksum_errors;
			vs->vs_scrub_examined += cvs->vs_scrub_examined;
			mutex_exit(&vd->vdev_stat_lock);
		}
	}
}

void
vdev_clear_stats(vdev_t *vd)
{
	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_space = 0;
	vd->vdev_stat.vs_dspace = 0;
	vd->vdev_stat.vs_alloc = 0;
	mutex_exit(&vd->vdev_stat_lock);
}

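/*
 * Update I/O, error, and DTL statistics in response to a completed zio.
 */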
void
vdev_stat_update(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *pvd;
	uint64_t txg = zio->io_txg;
	vdev_stat_t *vs = &vd->vdev_stat;
	zio_type_t type = zio->io_type;
	int flags = zio->io_flags;

	if (zio->io_error == 0) {
		if (!(flags & ZIO_FLAG_IO_BYPASS)) {
			mutex_enter(&vd->vdev_stat_lock);
			vs->vs_ops[type]++;
			vs->vs_bytes[type] += zio->io_size;
			mutex_exit(&vd->vdev_stat_lock);
		}
		if ((flags & ZIO_FLAG_IO_REPAIR) &&
		    zio->io_delegate_list == NULL) {
			mutex_enter(&vd->vdev_stat_lock);
			if (flags & ZIO_FLAG_SCRUB_THREAD)
				vs->vs_scrub_repaired += zio->io_size;
			else
				vs->vs_self_healed += zio->io_size;
			mutex_exit(&vd->vdev_stat_lock);
		}
		return;
	}

	if (flags & ZIO_FLAG_SPECULATIVE)
		return;

	if (vdev_readable(vd)) {
		mutex_enter(&vd->vdev_stat_lock);
		if (type == ZIO_TYPE_READ) {
			if (zio->io_error == ECKSUM)
				vs->vs_checksum_errors++;
			else
				vs->vs_read_errors++;
		}
		if (type == ZIO_TYPE_WRITE)
			vs->vs_write_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}

	if (type == ZIO_TYPE_WRITE) {
		if (txg == 0 || vd->vdev_children != 0)
			return;
		if (flags & ZIO_FLAG_SCRUB_THREAD) {
			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
		}
		if (!(flags & ZIO_FLAG_IO_REPAIR)) {
			if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
				return;
			vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
		}
	}
}

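/*
 * Update scrub statistics for this vdev and all of its children when a
 * scrub or resilver starts or completes.
 */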
void
vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
{
	int c;
	vdev_stat_t *vs = &vd->vdev_stat;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_scrub_stat_update(vd->vdev_child[c], type, complete);

	mutex_enter(&vd->vdev_stat_lock);

	if (type == POOL_SCRUB_NONE) {
		/*
		 * Update completion and end time.  Leave everything else alone
		 * so we can report what happened during the previous scrub.
		 */
		vs->vs_scrub_complete = complete;
		vs->vs_scrub_end = gethrestime_sec();
	} else {
		vs->vs_scrub_type = type;
		vs->vs_scrub_complete = 0;
		vs->vs_scrub_examined = 0;
		vs->vs_scrub_repaired = 0;
		vs->vs_scrub_start = gethrestime_sec();
		vs->vs_scrub_end = 0;
	}

	mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Update the in-core space usage stats for this vdev and the root vdev.
 */
void
vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
    boolean_t update_root)
{
	int64_t dspace_delta = space_delta;
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;

	ASSERT(vd == vd->vdev_top);

	/*
	 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
	 * factor.  We must calculate this here and not at the root vdev
	 * because the root vdev's psize-to-asize is simply the max of its
	 * children's, thus not accurate enough for us.
	 */
	ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0);
	dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) *
	    vd->vdev_deflate_ratio;

	mutex_enter(&vd->vdev_stat_lock);
	vd->vdev_stat.vs_space += space_delta;
	vd->vdev_stat.vs_alloc += alloc_delta;
	vd->vdev_stat.vs_dspace += dspace_delta;
	mutex_exit(&vd->vdev_stat_lock);

	if (update_root) {
		ASSERT(rvd == vd->vdev_parent);
		ASSERT(vd->vdev_ms_count != 0);

		/*
		 * Don't count non-normal (e.g. intent log) space as part of
		 * the pool's capacity.
		 */
		if (vd->vdev_mg->mg_class != spa->spa_normal_class)
			return;

		mutex_enter(&rvd->vdev_stat_lock);
		rvd->vdev_stat.vs_space += space_delta;
		rvd->vdev_stat.vs_alloc += alloc_delta;
		rvd->vdev_stat.vs_dspace += dspace_delta;
		mutex_exit(&rvd->vdev_stat_lock);
	}
}

/*
 * Mark a top-level vdev's config as dirty, placing it on the dirty list
 * so that it will be written out next time the vdev configuration is synced.
 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
 */
void
vdev_config_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int c;

	/*
	 * If this is an aux vdev (as with l2cache devices), then we update the
	 * vdev config manually and set the sync flag.
	 */
	if (vd->vdev_aux != NULL) {
		spa_aux_vdev_t *sav = vd->vdev_aux;
		nvlist_t **aux;
		uint_t naux;

		for (c = 0; c < sav->sav_count; c++) {
			if (sav->sav_vdevs[c] == vd)
				break;
		}

		ASSERT(c < sav->sav_count);
		sav->sav_sync = B_TRUE;

		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) == 0);

		ASSERT(c < naux);

		/*
		 * Setting the nvlist in the middle of the array is a little
		 * sketchy, but it will work.
		 */
		nvlist_free(aux[c]);
		aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE);

		return;
	}

	/*
	 * The dirty list is protected by the config lock.  The caller must
	 * either hold the config lock as writer, or must be the sync thread
	 * (which holds the lock as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, RW_WRITER) ||
	    dsl_pool_sync_context(spa_get_dsl(spa)));

	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_config_dirty(rvd->vdev_child[c]);
	} else {
		ASSERT(vd == vd->vdev_top);

		if (!list_link_active(&vd->vdev_dirty_node))
			list_insert_head(&spa->spa_dirty_list, vd);
	}
}

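/*
 * Remove a top-level vdev from the dirty config list.
 */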

/*
 * Mark a top-level vdev's config as dirty, placing it on the dirty list
 * so that it will be written out next time the vdev configuration is synced.
 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
 */
void
vdev_config_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int c;

	/*
	 * If this is an aux vdev (as with l2cache devices), then we update the
	 * vdev config manually and set the sync flag.
	 */
	if (vd->vdev_aux != NULL) {
		spa_aux_vdev_t *sav = vd->vdev_aux;
		nvlist_t **aux;
		uint_t naux;

		for (c = 0; c < sav->sav_count; c++) {
			if (sav->sav_vdevs[c] == vd)
				break;
		}

		ASSERT(c < sav->sav_count);
		sav->sav_sync = B_TRUE;

		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &aux, &naux) == 0);

		ASSERT(c < naux);

		/*
		 * Setting the nvlist in the middle of the array is a little
		 * sketchy, but it will work.
		 */
		nvlist_free(aux[c]);
		aux[c] = vdev_config_generate(spa, vd, B_TRUE, B_FALSE, B_TRUE);

		return;
	}

	/*
	 * The dirty list is protected by the config lock.  The caller must
	 * either hold the config lock as writer, or must be the sync thread
	 * (which holds the lock as reader).  There's only one sync thread,
	 * so this is sufficient to ensure mutual exclusion.
	 */
	ASSERT(spa_config_held(spa, RW_WRITER) ||
	    dsl_pool_sync_context(spa_get_dsl(spa)));

	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_config_dirty(rvd->vdev_child[c]);
	} else {
		ASSERT(vd == vd->vdev_top);

		if (!list_link_active(&vd->vdev_dirty_node))
			list_insert_head(&spa->spa_dirty_list, vd);
	}
}

void
vdev_config_clean(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(spa_config_held(spa, RW_WRITER) ||
	    dsl_pool_sync_context(spa_get_dsl(spa)));

	ASSERT(list_link_active(&vd->vdev_dirty_node));
	list_remove(&spa->spa_dirty_list, vd);
}
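
/*
 * Illustrative sketch (an assumption, not code taken from this file): a
 * hypothetical caller outside the sync thread would satisfy the locking
 * contract asserted in vdev_config_dirty() and vdev_config_clean() by
 * dirtying the config with the config lock held as writer, for example:
 *
 *	spa_config_enter(spa, RW_WRITER, FTAG);
 *	vdev_config_dirty(tvd);
 *	spa_config_exit(spa, FTAG);
 *
 * The sync thread, by contrast, runs in dsl_pool_sync_context() and may
 * call these routines while holding the config lock only as reader,
 * which is safe because there is exactly one sync thread.
 */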

/*
 * Propagate vdev state up from children to parent.
 */
void
vdev_propagate_state(vdev_t *vd)
{
	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
	int degraded = 0, faulted = 0;
	int corrupted = 0;
	int c;
	vdev_t *child;

	if (vd->vdev_children > 0) {
		for (c = 0; c < vd->vdev_children; c++) {
			child = vd->vdev_child[c];

			if ((vdev_is_dead(child) && !vdev_readable(child)) ||
			    child->vdev_stat.vs_aux == VDEV_AUX_IO_FAILURE) {
				/*
				 * Root special: if there is a top-level log
				 * device, treat the root vdev as if it were
				 * degraded.
				 */
				if (child->vdev_islog && vd == rvd)
					degraded++;
				else
					faulted++;
			} else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
				degraded++;
			}

			if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
				corrupted++;
		}

		vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);

		/*
		 * Root special: if there is a top-level vdev that cannot be
		 * opened due to corrupted metadata, then propagate the root
		 * vdev's aux state as 'corrupt' rather than 'insufficient
		 * replicas'.
		 */
		if (corrupted && vd == rvd &&
		    rvd->vdev_state == VDEV_STATE_CANT_OPEN)
			vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
	}

	if (vd->vdev_parent)
		vdev_propagate_state(vd->vdev_parent);
}

/*
 * Set a vdev's state.  If this is during an open, we don't update the parent
 * state, because we're in the process of opening children depth-first.
 * Otherwise, we propagate the change to the parent.
 *
 * If this routine places a device in a faulted state, an appropriate ereport
 * is generated.
 */
void
vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
	uint64_t save_state;
	spa_t *spa = vd->vdev_spa;

	if (state == vd->vdev_state) {
		vd->vdev_stat.vs_aux = aux;
		return;
	}

	save_state = vd->vdev_state;

	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	/*
	 * If we are setting the vdev state to anything but an open state, then
	 * always close the underlying device.  Otherwise, we keep accessible
	 * but invalid devices open forever.  We don't call vdev_close() itself,
	 * because that implies some extra checks (offline, etc) that we don't
	 * want here.  This is limited to leaf devices, because otherwise
	 * closing the device will affect other children.
	 */
	if (!vdev_readable(vd) && vd->vdev_ops->vdev_op_leaf)
		vd->vdev_ops->vdev_op_close(vd);

	if (vd->vdev_removed &&
	    state == VDEV_STATE_CANT_OPEN &&
	    (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
		/*
		 * If the previous state is set to VDEV_STATE_REMOVED, then this
		 * device was previously marked removed and someone attempted to
		 * reopen it.  If this failed due to a nonexistent device, then
		 * keep the device in the REMOVED state.  We also leave the
		 * device in this state if it is one of our special test online
		 * cases, which only attempts to online the device and shouldn't
		 * generate an FMA fault.
		 */
		vd->vdev_state = VDEV_STATE_REMOVED;
		vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
	} else if (state == VDEV_STATE_REMOVED) {
		/*
		 * Indicate to the ZFS DE that this device has been removed, and
		 * any recent errors should be ignored.
		 */
		zfs_post_remove(spa, vd);
		vd->vdev_removed = B_TRUE;
	} else if (state == VDEV_STATE_CANT_OPEN) {
		/*
		 * If we fail to open a vdev during an import, we mark it as
		 * "not available", which signifies that it was never there to
		 * begin with.  Failure to open such a device is not considered
		 * an error.
		 */
		if (spa->spa_load_state == SPA_LOAD_IMPORT &&
		    !spa->spa_import_faulted &&
		    vd->vdev_ops->vdev_op_leaf)
			vd->vdev_not_present = 1;

		/*
		 * Post the appropriate ereport.  If the 'prevstate' field is
		 * set to something other than VDEV_STATE_UNKNOWN, it indicates
		 * that this is part of a vdev_reopen().  In this case, we don't
		 * want to post the ereport if the device was already in the
		 * CANT_OPEN state beforehand.
		 *
		 * If the 'checkremove' flag is set, then this is an attempt to
		 * online the device in response to an insertion event.  If we
		 * hit this case, then we have detected an insertion event for a
		 * faulted or offline device that wasn't in the removed state.
		 * In this scenario, we don't post an ereport because we are
		 * about to replace the device, or attempt an online with
		 * vdev_forcefault, which will generate the fault for us.
		 */
		if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
		    !vd->vdev_not_present && !vd->vdev_checkremove &&
		    vd != spa->spa_root_vdev) {
			const char *class;

			switch (aux) {
			case VDEV_AUX_OPEN_FAILED:
				class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
				break;
			case VDEV_AUX_CORRUPT_DATA:
				class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
				break;
			case VDEV_AUX_NO_REPLICAS:
				class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
				break;
			case VDEV_AUX_BAD_GUID_SUM:
				class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
				break;
			case VDEV_AUX_TOO_SMALL:
				class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
				break;
			case VDEV_AUX_BAD_LABEL:
				class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
				break;
			default:
				class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
			}

			zfs_ereport_post(class, spa, vd, NULL, save_state, 0);
		}

		/* Erase any notion of persistent removed state */
		vd->vdev_removed = B_FALSE;
	} else {
		vd->vdev_removed = B_FALSE;
	}

	if (!isopen)
		vdev_propagate_state(vd);
}
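
/*
 * Illustrative sketch (an assumption, not code taken from this file): a
 * hypothetical caller that fails to open a leaf device might record the
 * failure as:
 *
 *	vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
 *	    VDEV_AUX_OPEN_FAILED);
 *
 * With isopen == B_TRUE the parent's state is left alone (the open code is
 * walking the tree depth-first and propagates afterwards); with
 * isopen == B_FALSE the change is pushed up immediately through
 * vdev_propagate_state().  Unless the device was already removed, marked
 * not present, or this is a reopen of a device that was already unopenable,
 * the transition above posts an FM_EREPORT_ZFS_DEVICE_OPEN_FAILED ereport.
 */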

/*
 * Check the vdev configuration to ensure that it's capable of supporting
 * a root pool.  Currently, we do not support RAID-Z or partial configuration.
 * In addition, only a single top-level vdev is allowed and none of the leaves
 * can be wholedisks.
 */
boolean_t
vdev_is_bootable(vdev_t *vd)
{
	int c;

	if (!vd->vdev_ops->vdev_op_leaf) {
		char *vdev_type = vd->vdev_ops->vdev_op_type;

		if (strcmp(vdev_type, VDEV_TYPE_ROOT) == 0 &&
		    vd->vdev_children > 1) {
			return (B_FALSE);
		} else if (strcmp(vdev_type, VDEV_TYPE_RAIDZ) == 0 ||
		    strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
			return (B_FALSE);
		}
	} else if (vd->vdev_wholedisk == 1) {
		return (B_FALSE);
	}

	for (c = 0; c < vd->vdev_children; c++) {
		if (!vdev_is_bootable(vd->vdev_child[c]))
			return (B_FALSE);
	}
	return (B_TRUE);
}
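
/*
 * Illustrative note (an example, not part of the original source): given
 * the checks above, a pool built from a single top-level mirror of disk
 * slices is considered bootable, while any of the following are rejected:
 * a root vdev with more than one top-level child (a striped pool), any
 * RAID-Z or missing top-level vdev, or a leaf that was added as a whole
 * disk (vdev_wholedisk == 1).
 */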