/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/vdev_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>

/*
 * Virtual device management.
 */

static vdev_ops_t *vdev_ops_table[] = {
	&vdev_root_ops,
	&vdev_raidz_ops,
	&vdev_mirror_ops,
	&vdev_replacing_ops,
	&vdev_disk_ops,
	&vdev_file_ops,
	&vdev_missing_ops,
	NULL
};

/*
 * Given a vdev type, return the appropriate ops vector.
 */
static vdev_ops_t *
vdev_getops(const char *type)
{
	vdev_ops_t *ops, **opspp;

	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->vdev_op_type, type) == 0)
			break;

	return (ops);
}

/*
 * Default asize function: return the MAX of psize with the asize of
 * all children.  This is what's used by anything other than RAID-Z.
 */
uint64_t
vdev_default_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_ashift);
	uint64_t csize;
	uint64_t c;

	for (c = 0; c < vd->vdev_children; c++) {
		csize = vdev_psize_to_asize(vd->vdev_child[c], psize);
		asize = MAX(asize, csize);
	}

	return (asize);
}

/*
 * Get the replaceable or attachable device size.
 * If the parent is a mirror or raidz, the replaceable size is the minimum
 * psize of all its children.  For the rest, just return our own psize.
 *
 * e.g.
 *			psize	rsize
 * root			-	-
 *	mirror/raidz	-	-
 *	    disk1	20g	20g
 *	    disk2	40g	20g
 *	    disk3	80g	80g
 */
uint64_t
vdev_get_rsize(vdev_t *vd)
{
	vdev_t *pvd, *cvd;
	uint64_t c, rsize;

	pvd = vd->vdev_parent;

	/*
	 * If our parent is NULL or the root, just return our own psize.
	 */
	if (pvd == NULL || pvd->vdev_parent == NULL)
		return (vd->vdev_psize);

	rsize = 0;

	for (c = 0; c < pvd->vdev_children; c++) {
		cvd = pvd->vdev_child[c];
		rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1;
	}

	return (rsize);
}

vdev_t *
vdev_lookup_top(spa_t *spa, uint64_t vdev)
{
	vdev_t *rvd = spa->spa_root_vdev;

	if (vdev < rvd->vdev_children)
		return (rvd->vdev_child[vdev]);

	return (NULL);
}

vdev_t *
vdev_lookup_by_path(vdev_t *vd, const char *path)
{
	int c;
	vdev_t *mvd;

	if (vd->vdev_path != NULL) {
		if (vd->vdev_wholedisk == 1) {
			/*
			 * For whole disks, the internal path has 's0', but the
			 * path passed in by the user doesn't.
			 */
			if (strlen(path) == strlen(vd->vdev_path) - 2 &&
			    strncmp(path, vd->vdev_path, strlen(path)) == 0)
				return (vd);
		} else if (strcmp(path, vd->vdev_path) == 0) {
			return (vd);
		}
	}

	for (c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
	int c;
	vdev_t *mvd;

	if (vd->vdev_children == 0 && vd->vdev_guid == guid)
		return (vd);

	for (c = 0; c < vd->vdev_children; c++)
		if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
		    NULL)
			return (mvd);

	return (NULL);
}

void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
	size_t oldsize, newsize;
	uint64_t id = cvd->vdev_id;
	vdev_t **newchild;

	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));
	ASSERT(cvd->vdev_parent == NULL);

	cvd->vdev_parent = pvd;

	if (pvd == NULL)
		return;

	ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);

	oldsize = pvd->vdev_children * sizeof (vdev_t *);
	pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
	newsize = pvd->vdev_children * sizeof (vdev_t *);

	newchild = kmem_zalloc(newsize, KM_SLEEP);
	if (pvd->vdev_child != NULL) {
		bcopy(pvd->vdev_child, newchild, oldsize);
		kmem_free(pvd->vdev_child, oldsize);
	}

	pvd->vdev_child = newchild;
	pvd->vdev_child[id] = cvd;

	cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top : cvd);
	ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL);

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum += cvd->vdev_guid_sum;
}

void
vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
{
	int c;
	uint_t id = cvd->vdev_id;

	ASSERT(cvd->vdev_parent == pvd);

	if (pvd == NULL)
		return;

	ASSERT(id < pvd->vdev_children);
	ASSERT(pvd->vdev_child[id] == cvd);

	pvd->vdev_child[id] = NULL;
	cvd->vdev_parent = NULL;

	for (c = 0; c < pvd->vdev_children; c++)
		if (pvd->vdev_child[c])
			break;

	if (c == pvd->vdev_children) {
		kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
		pvd->vdev_child = NULL;
		pvd->vdev_children = 0;
	}

	/*
	 * Walk up all ancestors to update guid sum.
	 */
	for (; pvd != NULL; pvd = pvd->vdev_parent)
		pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
}

/*
 * Remove any holes in the child array.
 */
void
vdev_compact_children(vdev_t *pvd)
{
	vdev_t **newchild, *cvd;
	int oldc = pvd->vdev_children;
	int newc, c;

	ASSERT(spa_config_held(pvd->vdev_spa, RW_WRITER));

	for (c = newc = 0; c < oldc; c++)
		if (pvd->vdev_child[c])
			newc++;

	newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP);

	for (c = newc = 0; c < oldc; c++) {
		if ((cvd = pvd->vdev_child[c]) != NULL) {
			newchild[newc] = cvd;
			cvd->vdev_id = newc++;
		}
	}

	kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
	pvd->vdev_child = newchild;
	pvd->vdev_children = newc;
}

/*
 * Allocate and minimally initialize a vdev_t.
 */
static vdev_t *
vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
{
	vdev_t *vd;

	while (guid == 0)
		guid = spa_get_random(-1ULL);

	vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);

	vd->vdev_spa = spa;
	vd->vdev_id = id;
	vd->vdev_guid = guid;
	vd->vdev_guid_sum = guid;
	vd->vdev_ops = ops;
	vd->vdev_state = VDEV_STATE_CLOSED;

	mutex_init(&vd->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vd->vdev_io_cv, NULL, CV_DEFAULT, NULL);
	list_create(&vd->vdev_io_pending, sizeof (zio_t),
	    offsetof(zio_t, io_pending));
	mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
	space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock);
	txg_list_create(&vd->vdev_ms_list,
	    offsetof(struct metaslab, ms_txg_node));
	txg_list_create(&vd->vdev_dtl_list,
	    offsetof(struct vdev, vdev_dtl_node));
	vd->vdev_stat.vs_timestamp = gethrtime();

	return (vd);
}

/*
 * Free a vdev_t that has been removed from service.
 */
static void
vdev_free_common(vdev_t *vd)
{
	if (vd->vdev_path)
		spa_strfree(vd->vdev_path);
	if (vd->vdev_devid)
		spa_strfree(vd->vdev_devid);

	txg_list_destroy(&vd->vdev_ms_list);
	txg_list_destroy(&vd->vdev_dtl_list);
	mutex_enter(&vd->vdev_dtl_lock);
	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
	space_map_destroy(&vd->vdev_dtl_map);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	space_map_destroy(&vd->vdev_dtl_scrub);
	mutex_exit(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_dtl_lock);
	mutex_destroy(&vd->vdev_dirty_lock);
	list_destroy(&vd->vdev_io_pending);
	mutex_destroy(&vd->vdev_io_lock);
	cv_destroy(&vd->vdev_io_cv);

	kmem_free(vd, sizeof (vdev_t));
}

/*
 * Allocate a new vdev.  The 'alloctype' is used to control whether we are
 * creating a new vdev or loading an existing one - the behavior is slightly
 * different for each case.
 */
vdev_t *
vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype)
{
	vdev_ops_t *ops;
	char *type;
	uint64_t guid = 0;
	vdev_t *vd;

	ASSERT(spa_config_held(spa, RW_WRITER));

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
		return (NULL);

	if ((ops = vdev_getops(type)) == NULL)
		return (NULL);

	/*
	 * If this is a load, get the vdev guid from the nvlist.
	 * Otherwise, vdev_alloc_common() will generate one for us.
	 */
	if (alloctype == VDEV_ALLOC_LOAD) {
		uint64_t label_id;

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
		    label_id != id)
			return (NULL);

		if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
			return (NULL);
	}

	vd = vdev_alloc_common(spa, id, guid, ops);

	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0)
		vd->vdev_path = spa_strdup(vd->vdev_path);
	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0)
		vd->vdev_devid = spa_strdup(vd->vdev_devid);

	/*
	 * Set the whole_disk property.  If it's not specified, leave the value
	 * as -1.
	 */
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
	    &vd->vdev_wholedisk) != 0)
		vd->vdev_wholedisk = -1ULL;

	/*
	 * If we're a top-level vdev, try to load the allocation parameters.
	 */
	if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
		    &vd->vdev_ms_array);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
		    &vd->vdev_ms_shift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
		    &vd->vdev_ashift);
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
		    &vd->vdev_asize);
	}

	/*
	 * If we're a leaf vdev, try to load the DTL object.
	 */
	if (vd->vdev_ops->vdev_op_leaf && alloctype == VDEV_ALLOC_LOAD) {
		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
		    &vd->vdev_dtl.smo_object);
	}

	/*
	 * Add ourselves to the parent's list of children.
	 */
	vdev_add_child(parent, vd);

	return (vd);
}

void
vdev_free(vdev_t *vd)
{
	int c;

	/*
	 * vdev_free() implies closing the vdev first.  This is simpler than
	 * trying to ensure complicated semantics for all callers.
	 */
	vdev_close(vd);

	/*
	 * It's possible to free a vdev that's been added to the dirty
	 * list when in the middle of spa_vdev_add().  Handle that case
	 * correctly here.
	 */
	if (vd->vdev_is_dirty)
		vdev_config_clean(vd);

	/*
	 * Free all children.
	 */
	for (c = 0; c < vd->vdev_children; c++)
		vdev_free(vd->vdev_child[c]);

	ASSERT(vd->vdev_child == NULL);
	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);

	/*
	 * Discard allocation state.
	 */
	if (vd == vd->vdev_top)
		vdev_metaslab_fini(vd);

	ASSERT3U(vd->vdev_stat.vs_space, ==, 0);
	ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);

	/*
	 * Remove this vdev from its parent's child list.
	 */
	vdev_remove_child(vd->vdev_parent, vd);

	ASSERT(vd->vdev_parent == NULL);

	vdev_free_common(vd);
}

/*
 * Transfer top-level vdev state from svd to tvd.
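 *
 * When an interposing vdev is added above an existing top-level vdev
 * (vdev_add_parent() below) or a now 1-way interior vdev is collapsed
 * away (vdev_remove_parent()), the identity of the top-level vdev
 * changes.  The metaslab array and group, the space accounting, and the
 * per-txg metaslab/DTL/dirty bookkeeping all hang off the top-level
 * vdev, so they must be handed from the old top-level vdev (svd) to the
 * new one (tvd) for allocation and syncing to keep working.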
 */
static void
vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
{
	spa_t *spa = svd->vdev_spa;
	metaslab_t *msp;
	vdev_t *vd;
	int t;

	ASSERT(tvd == tvd->vdev_top);

	tvd->vdev_ms_array = svd->vdev_ms_array;
	tvd->vdev_ms_shift = svd->vdev_ms_shift;
	tvd->vdev_ms_count = svd->vdev_ms_count;

	svd->vdev_ms_array = 0;
	svd->vdev_ms_shift = 0;
	svd->vdev_ms_count = 0;

	tvd->vdev_mg = svd->vdev_mg;
	tvd->vdev_mg->mg_vd = tvd;
	tvd->vdev_ms = svd->vdev_ms;
	tvd->vdev_smo = svd->vdev_smo;

	svd->vdev_mg = NULL;
	svd->vdev_ms = NULL;
	svd->vdev_smo = NULL;

	tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
	tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;

	svd->vdev_stat.vs_alloc = 0;
	svd->vdev_stat.vs_space = 0;

	for (t = 0; t < TXG_SIZE; t++) {
		while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_ms_list, msp, t);
		while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
		if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
			(void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
		tvd->vdev_dirty[t] = svd->vdev_dirty[t];
		svd->vdev_dirty[t] = 0;
	}

	if (svd->vdev_is_dirty) {
		vdev_config_clean(svd);
		vdev_config_dirty(tvd);
	}

	ASSERT(svd->vdev_io_retry == NULL);
	ASSERT(list_is_empty(&svd->vdev_io_pending));
}

static void
vdev_top_update(vdev_t *tvd, vdev_t *vd)
{
	int c;

	if (vd == NULL)
		return;

	vd->vdev_top = tvd;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_top_update(tvd, vd->vdev_child[c]);
}

/*
 * Add a mirror/replacing vdev above an existing vdev.
 */
vdev_t *
vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
{
	spa_t *spa = cvd->vdev_spa;
	vdev_t *pvd = cvd->vdev_parent;
	vdev_t *mvd;

	ASSERT(spa_config_held(spa, RW_WRITER));

	mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
	vdev_remove_child(pvd, cvd);
	vdev_add_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_children;
	vdev_add_child(mvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	mvd->vdev_asize = cvd->vdev_asize;
	mvd->vdev_ashift = cvd->vdev_ashift;
	mvd->vdev_state = cvd->vdev_state;

	if (mvd == mvd->vdev_top)
		vdev_top_transfer(cvd, mvd);

	return (mvd);
}

/*
 * Remove a 1-way mirror/replacing vdev from the tree.
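 *
 * This is the inverse of vdev_add_parent().  The pair is typically
 * driven by device attach/detach: attach interposes a replacing (or
 * mirror) vdev above an existing leaf, and once the leaf being
 * replaced has been detached, the remaining 1-way interior vdev is
 * collapsed back out of the tree here.  (The spa attach/detach call
 * sites, e.g. spa_vdev_attach() and spa_vdev_detach(), are assumed.)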
 */
void
vdev_remove_parent(vdev_t *cvd)
{
	vdev_t *mvd = cvd->vdev_parent;
	vdev_t *pvd = mvd->vdev_parent;

	ASSERT(spa_config_held(cvd->vdev_spa, RW_WRITER));

	ASSERT(mvd->vdev_children == 1);
	ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
	    mvd->vdev_ops == &vdev_replacing_ops);

	vdev_remove_child(mvd, cvd);
	vdev_remove_child(pvd, mvd);
	cvd->vdev_id = mvd->vdev_id;
	vdev_add_child(pvd, cvd);
	vdev_top_update(cvd->vdev_top, cvd->vdev_top);

	if (cvd == cvd->vdev_top)
		vdev_top_transfer(mvd, cvd);

	ASSERT(mvd->vdev_children == 0);
	vdev_free(mvd);
}

void
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	metaslab_class_t *mc = spa_metaslab_class_select(spa);
	uint64_t c;
	uint64_t oldc = vd->vdev_ms_count;
	uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
	space_map_obj_t *smo = vd->vdev_smo;
	metaslab_t **mspp = vd->vdev_ms;

	dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);

	ASSERT(oldc <= newc);

	vd->vdev_smo = kmem_zalloc(newc * sizeof (*smo), KM_SLEEP);
	vd->vdev_ms = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
	vd->vdev_ms_count = newc;

	if (vd->vdev_mg == NULL) {
		if (txg == 0) {
			dmu_buf_t *db;
			uint64_t *ms_array;

			ms_array = kmem_zalloc(newc * sizeof (uint64_t),
			    KM_SLEEP);

			dmu_read(spa->spa_meta_objset, vd->vdev_ms_array,
			    0, newc * sizeof (uint64_t), ms_array);

			for (c = 0; c < newc; c++) {
				if (ms_array[c] == 0)
					continue;
				db = dmu_bonus_hold(spa->spa_meta_objset,
				    ms_array[c]);
				dmu_buf_read(db);
				ASSERT3U(db->db_size, ==, sizeof (*smo));
				bcopy(db->db_data, &vd->vdev_smo[c],
				    db->db_size);
				ASSERT3U(vd->vdev_smo[c].smo_object, ==,
				    ms_array[c]);
				dmu_buf_rele(db);
			}
			kmem_free(ms_array, newc * sizeof (uint64_t));
		}
		vd->vdev_mg = metaslab_group_create(mc, vd);
	}

	for (c = 0; c < oldc; c++) {
		vd->vdev_smo[c] = smo[c];
		vd->vdev_ms[c] = mspp[c];
		mspp[c]->ms_smo = &vd->vdev_smo[c];
	}

	for (c = oldc; c < newc; c++)
		metaslab_init(vd->vdev_mg, &vd->vdev_smo[c], &vd->vdev_ms[c],
		    c << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);

	if (oldc != 0) {
		kmem_free(smo, oldc * sizeof (*smo));
		kmem_free(mspp, oldc * sizeof (*mspp));
	}
}

void
vdev_metaslab_fini(vdev_t *vd)
{
	uint64_t m;
	uint64_t count = vd->vdev_ms_count;

	if (vd->vdev_ms != NULL) {
		for (m = 0; m < count; m++)
			metaslab_fini(vd->vdev_ms[m]);
		kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
		vd->vdev_ms = NULL;
	}

	if (vd->vdev_smo != NULL) {
		kmem_free(vd->vdev_smo, count * sizeof (space_map_obj_t));
		vd->vdev_smo = NULL;
	}
}

/*
 * Prepare a virtual device for access.
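 *
 * Beyond invoking the ops vector's open routine, this applies the
 * per-vdev tunables from the vdev_knob table (defined later in this
 * file), enforces the minimum device size, derives psize/asize/ashift,
 * and refuses to reopen a device whose alignment requirement has grown
 * or whose size has shrunk, while accepting dynamic LUN growth.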
 */
int
vdev_open(vdev_t *vd)
{
	int error;
	vdev_knob_t *vk;
	int c;
	uint64_t osize = 0;
	uint64_t asize, psize;
	uint64_t ashift = -1ULL;

	ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
	    vd->vdev_state == VDEV_STATE_CANT_OPEN ||
	    vd->vdev_state == VDEV_STATE_OFFLINE);

	if (vd->vdev_fault_mode == VDEV_FAULT_COUNT)
		vd->vdev_fault_arg >>= 1;
	else
		vd->vdev_fault_mode = VDEV_FAULT_NONE;

	vd->vdev_stat.vs_aux = VDEV_AUX_NONE;

	for (vk = vdev_knob_next(NULL); vk != NULL; vk = vdev_knob_next(vk)) {
		uint64_t *valp = (uint64_t *)((char *)vd + vk->vk_offset);

		*valp = vk->vk_default;
		*valp = MAX(*valp, vk->vk_min);
		*valp = MIN(*valp, vk->vk_max);
	}

	if (vd->vdev_ops->vdev_op_leaf) {
		vdev_cache_init(vd);
		vdev_queue_init(vd);
		vd->vdev_cache_active = B_TRUE;
	}

	if (vd->vdev_offline) {
		ASSERT(vd->vdev_children == 0);
		dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd));
		vd->vdev_state = VDEV_STATE_OFFLINE;
		return (ENXIO);
	}

	error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);

	dprintf("%s = %d, osize %llu, state = %d\n",
	    vdev_description(vd), error, osize, vd->vdev_state);

	if (error) {
		dprintf("%s in %s failed to open, error %d, aux %d\n",
		    vdev_description(vd),
		    vdev_description(vd->vdev_parent),
		    error,
		    vd->vdev_stat.vs_aux);

		vd->vdev_state = VDEV_STATE_CANT_OPEN;
		return (error);
	}

	vd->vdev_state = VDEV_STATE_HEALTHY;

	for (c = 0; c < vd->vdev_children; c++)
		if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY)
			vd->vdev_state = VDEV_STATE_DEGRADED;

	osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));

	if (vd->vdev_children == 0) {
		if (osize < SPA_MINDEVSIZE) {
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
			return (EOVERFLOW);
		}
		psize = osize;
		asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
	} else {
		if (osize < SPA_MINDEVSIZE -
		    (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
			return (EOVERFLOW);
		}
		psize = 0;
		asize = osize;
	}

	vd->vdev_psize = psize;

	if (vd->vdev_asize == 0) {
		/*
		 * This is the first-ever open, so use the computed values.
		 */
		vd->vdev_asize = asize;
		vd->vdev_ashift = ashift;
	} else {
		/*
		 * Make sure the alignment requirement hasn't increased.
		 */
		if (ashift > vd->vdev_ashift) {
			dprintf("%s: ashift grew\n", vdev_description(vd));
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (EINVAL);
		}

		/*
		 * Make sure the device hasn't shrunk.
		 */
		if (asize < vd->vdev_asize) {
			dprintf("%s: device shrank\n", vdev_description(vd));
			vd->vdev_state = VDEV_STATE_CANT_OPEN;
			vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
			return (EINVAL);
		}

		/*
		 * If all children are healthy and the asize has increased,
		 * then we've experienced dynamic LUN growth.
		 */
		if (vd->vdev_state == VDEV_STATE_HEALTHY &&
		    asize > vd->vdev_asize) {
			dprintf("%s: device grew\n", vdev_description(vd));
			vd->vdev_asize = asize;
		}
	}

	return (0);
}

/*
 * Close a virtual device.
 */
void
vdev_close(vdev_t *vd)
{
	ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL);

	vd->vdev_ops->vdev_op_close(vd);

	if (vd->vdev_cache_active) {
		vdev_cache_fini(vd);
		vdev_queue_fini(vd);
		vd->vdev_cache_active = B_FALSE;
	}

	if (vd->vdev_offline)
		vd->vdev_state = VDEV_STATE_OFFLINE;
	else
		vd->vdev_state = VDEV_STATE_CLOSED;
}

void
vdev_reopen(vdev_t *vd, zio_t **rq)
{
	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
	int c;

	if (vd == rvd) {
		ASSERT(rq == NULL);
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_reopen(rvd->vdev_child[c], NULL);
		return;
	}

	/* only valid for top-level vdevs */
	ASSERT3P(vd, ==, vd->vdev_top);

	/*
	 * vdev_state can change when spa_config_lock is held as writer,
	 * or when it's held as reader and we're doing a vdev_reopen().
	 * To handle the latter case, we grab rvd's io_lock to serialize
	 * reopens.  This ensures that there's never more than one vdev
	 * state changer active at a time.
	 */
	mutex_enter(&rvd->vdev_io_lock);

	mutex_enter(&vd->vdev_io_lock);
	while (list_head(&vd->vdev_io_pending) != NULL)
		cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock);
	vdev_close(vd);
	(void) vdev_open(vd);
	if (rq != NULL) {
		*rq = vd->vdev_io_retry;
		vd->vdev_io_retry = NULL;
	}
	mutex_exit(&vd->vdev_io_lock);

	/*
	 * Reassess root vdev's health.
	 */
	rvd->vdev_state = VDEV_STATE_HEALTHY;
	for (c = 0; c < rvd->vdev_children; c++) {
		uint64_t state = rvd->vdev_child[c]->vdev_state;
		rvd->vdev_state = MIN(rvd->vdev_state, state);
	}

	mutex_exit(&rvd->vdev_io_lock);
}

int
vdev_create(vdev_t *vd, uint64_t txg)
{
	int error;

	/*
	 * Normally, partial opens (e.g. of a mirror) are allowed.
	 * For a create, however, we want to fail the request if
	 * there are any components we can't open.
	 */
	error = vdev_open(vd);

	if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
		vdev_close(vd);
		return (error ? error : ENXIO);
	}

	/*
	 * Recursively initialize all labels.
	 */
	if ((error = vdev_label_init(vd, txg)) != 0) {
		vdev_close(vd);
		return (error);
	}

	return (0);
}

/*
 * This is the latter half of vdev_create().  It is distinct because it
 * involves initiating transactions in order to do metaslab creation.
 * For creation, we want to try to create all vdevs at once and then undo it
 * if anything fails; this is much harder if we have pending transactions.
 */
void
vdev_init(vdev_t *vd, uint64_t txg)
{
	/*
	 * Aim for roughly 200 metaslabs per vdev.
	 */
	vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
	vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
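
	/*
	 * For example, on a 100GB top-level vdev, asize / 200 is about
	 * 512MB, so highbit() yields a metaslab shift of 30: 1GB
	 * metaslabs and roughly 100 of them.  Depending on where asize
	 * falls between powers of two this gives on the order of 100-200
	 * metaslabs, with SPA_MAXBLOCKSHIFT as a floor so a metaslab is
	 * never smaller than the largest possible block.
	 */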

	/*
	 * Initialize the vdev's metaslabs.
	 */
	vdev_metaslab_init(vd, txg);
}

void
vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg)
{
	vdev_t *tvd = vd->vdev_top;

	mutex_enter(&tvd->vdev_dirty_lock);
	if ((tvd->vdev_dirty[txg & TXG_MASK] & flags) != flags) {
		tvd->vdev_dirty[txg & TXG_MASK] |= flags;
		(void) txg_list_add(&tvd->vdev_spa->spa_vdev_txg_list,
		    tvd, txg);
	}
	mutex_exit(&tvd->vdev_dirty_lock);
}

void
vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size)
{
	mutex_enter(sm->sm_lock);
	if (!space_map_contains(sm, txg, size))
		space_map_add(sm, txg, size);
	mutex_exit(sm->sm_lock);
}

int
vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
{
	int dirty;

	/*
	 * Quick test without the lock -- covers the common case that
	 * there are no dirty time segments.
	 */
	if (sm->sm_space == 0)
		return (0);

	mutex_enter(sm->sm_lock);
	dirty = space_map_contains(sm, txg, size);
	mutex_exit(sm->sm_lock);

	return (dirty);
}

/*
 * Reassess DTLs after a config change or scrub completion.
 */
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
	int c;

	ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER));

	if (vd->vdev_children == 0) {
		mutex_enter(&vd->vdev_dtl_lock);
		/*
		 * We've successfully scrubbed everything up to scrub_txg.
		 * Therefore, excise all old DTLs up to that point, then
		 * fold in the DTLs for everything we couldn't scrub.
		 */
		if (scrub_txg != 0) {
			space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg);
			space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub);
		}
		if (scrub_done)
			space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
		mutex_exit(&vd->vdev_dtl_lock);
		if (txg != 0) {
			vdev_t *tvd = vd->vdev_top;
			vdev_dirty(tvd, VDD_DTL, txg);
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
		}
		return;
	}

	mutex_enter(&vd->vdev_dtl_lock);
	space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
	space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
	mutex_exit(&vd->vdev_dtl_lock);

	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];
		vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done);
		mutex_enter(&vd->vdev_dtl_lock);
		space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map);
		space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
		mutex_exit(&vd->vdev_dtl_lock);
	}
}

static int
vdev_dtl_load(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	dmu_buf_t *db;
	int error;

	ASSERT(vd->vdev_children == 0);

	if (smo->smo_object == 0)
		return (0);

	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
	dmu_buf_read(db);
	ASSERT3U(db->db_size, ==, sizeof (*smo));
	bcopy(db->db_data, smo, db->db_size);
	dmu_buf_rele(db);

	mutex_enter(&vd->vdev_dtl_lock);
	error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC,
	    spa->spa_meta_objset, smo->smo_objsize, smo->smo_alloc);
	mutex_exit(&vd->vdev_dtl_lock);

	return (error);
}
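
/*
 * Write the vdev's in-core DTL (dirty time log) to its on-disk space
 * map object for this txg: free the object if the vdev has been
 * detached, allocate it on first use, and otherwise rewrite it from a
 * snapshot of the current DTL taken under vdev_dtl_lock.
 */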
void
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	space_map_obj_t *smo = &vd->vdev_dtl;
	space_map_t *sm = &vd->vdev_dtl_map;
	space_map_t smsync;
	kmutex_t smlock;
	avl_tree_t *t = &sm->sm_root;
	space_seg_t *ss;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	dprintf("%s in txg %llu pass %d\n",
	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	if (vd->vdev_detached) {
		if (smo->smo_object != 0) {
			int err = dmu_object_free(spa->spa_meta_objset,
			    smo->smo_object, tx);
			ASSERT3U(err, ==, 0);
			smo->smo_object = 0;
		}
		dmu_tx_commit(tx);
		return;
	}

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		vdev_config_dirty(vd->vdev_top);
	}

	dmu_free_range(spa->spa_meta_objset, smo->smo_object,
	    0, smo->smo_objsize, tx);

	mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);

	space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
	    &smlock);

	mutex_enter(&smlock);

	mutex_enter(&vd->vdev_dtl_lock);
	for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss))
		space_map_add(&smsync, ss->ss_start, ss->ss_end - ss->ss_start);
	mutex_exit(&vd->vdev_dtl_lock);

	smo->smo_objsize = 0;
	smo->smo_alloc = smsync.sm_space;

	space_map_sync(&smsync, NULL, smo, SM_ALLOC, spa->spa_meta_objset, tx);
	space_map_destroy(&smsync);

	mutex_exit(&smlock);
	mutex_destroy(&smlock);

	db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, ==, sizeof (*smo));
	bcopy(smo, db->db_data, db->db_size);
	dmu_buf_rele(db);

	dmu_tx_commit(tx);
}

int
vdev_load(vdev_t *vd, int import)
{
	spa_t *spa = vd->vdev_spa;
	int c, error;
	nvlist_t *label;
	uint64_t guid, state;

	dprintf("loading %s\n", vdev_description(vd));

	/*
	 * Recursively load all children.
	 */
	for (c = 0; c < vd->vdev_children; c++)
		if ((error = vdev_load(vd->vdev_child[c], import)) != 0)
			return (error);

	/*
	 * If this is a leaf vdev, make sure it agrees with its disk labels.
	 */
	if (vd->vdev_ops->vdev_op_leaf) {

		if (vdev_is_dead(vd))
			return (0);

		/*
		 * XXX state transitions don't propagate to parent here.
		 * Also, merely setting the state isn't sufficient because
		 * it's not persistent; a vdev_reopen() would make us
		 * forget all about it.
		 */
		if ((label = vdev_label_read_config(vd)) == NULL) {
			dprintf("can't load label config\n");
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
		    &guid) != 0 || guid != spa_guid(spa)) {
			dprintf("bad or missing pool GUID (%llu)\n", guid);
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) ||
		    guid != vd->vdev_guid) {
			dprintf("bad or missing vdev guid (%llu != %llu)\n",
			    guid, vd->vdev_guid);
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		/*
		 * If we find a vdev with a matching pool guid and vdev guid,
		 * but the pool state is not active, it indicates that the user
		 * exported or destroyed the pool without affecting the config
		 * cache (if / was mounted readonly, for example).  In this
		 * case, immediately return EBADF so the caller can remove it
		 * from the config.
		 */
		if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
		    &state)) {
			dprintf("missing pool state\n");
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			nvlist_free(label);
			return (0);
		}

		if (state != POOL_STATE_ACTIVE &&
		    (!import || state != POOL_STATE_EXPORTED)) {
			dprintf("pool state not active (%llu)\n", state);
			nvlist_free(label);
			return (EBADF);
		}

		nvlist_free(label);
	}

	/*
	 * If this is a top-level vdev, make sure its allocation parameters
	 * exist and initialize its metaslabs.
	 */
	if (vd == vd->vdev_top) {

		if (vd->vdev_ms_array == 0 ||
		    vd->vdev_ms_shift == 0 ||
		    vd->vdev_ashift == 0 ||
		    vd->vdev_asize == 0) {
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			return (0);
		}

		vdev_metaslab_init(vd, 0);
	}

	/*
	 * If this is a leaf vdev, load its DTL.
	 */
	if (vd->vdev_ops->vdev_op_leaf) {
		error = vdev_dtl_load(vd);
		if (error) {
			dprintf("can't load DTL for %s, error %d\n",
			    vdev_description(vd), error);
			vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
			    VDEV_AUX_CORRUPT_DATA);
			return (0);
		}
	}

	return (0);
}

void
vdev_sync_done(vdev_t *vd, uint64_t txg)
{
	metaslab_t *msp;

	dprintf("%s txg %llu\n", vdev_description(vd), txg);

	while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) !=
	    NULL)
		metaslab_sync_done(msp, txg);
}

void
vdev_add_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	ASSERT(vd == vd->vdev_top);

	if (vd->vdev_ms_array == 0)
		vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
		    DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);

	ASSERT(vd->vdev_ms_array != 0);

	vdev_config_dirty(vd);

	dmu_tx_commit(tx);
}

void
vdev_sync(vdev_t *vd, uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *lvd;
	metaslab_t *msp;
	uint8_t *dirtyp = &vd->vdev_dirty[txg & TXG_MASK];
	uint8_t dirty = *dirtyp;

	mutex_enter(&vd->vdev_dirty_lock);
	*dirtyp &= ~(VDD_ALLOC | VDD_FREE | VDD_ADD | VDD_DTL);
	mutex_exit(&vd->vdev_dirty_lock);

	dprintf("%s txg %llu pass %d\n",
	    vdev_description(vd), (u_longlong_t)txg, spa_sync_pass(spa));

	if (dirty & VDD_ADD)
		vdev_add_sync(vd, txg);

	while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL)
		metaslab_sync(msp, txg);

	while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
		vdev_dtl_sync(lvd, txg);

	(void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
}

uint64_t
vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
{
	return (vd->vdev_ops->vdev_op_asize(vd, psize));
}

void
vdev_io_start(zio_t *zio)
{
	zio->io_vd->vdev_ops->vdev_op_io_start(zio);
}

void
vdev_io_done(zio_t *zio)
{
	zio->io_vd->vdev_ops->vdev_op_io_done(zio);
}

const char *
vdev_description(vdev_t *vd)
{
	if (vd == NULL || vd->vdev_ops == NULL)
		return ("<unknown>");

	if (vd->vdev_path != NULL)
		return (vd->vdev_path);

	if (vd->vdev_parent == NULL)
		return (spa_name(vd->vdev_spa));

	return (vd->vdev_ops->vdev_op_type);
}
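
/*
 * Administrative entry points to bring a device online or take it
 * offline by path; presumably reached from the zpool online/offline
 * subcommands by way of the spa ioctl layer.
 */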
int
vdev_online(spa_t *spa, const char *path)
{
	vdev_t *vd;

	spa_config_enter(spa, RW_WRITER);

	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
		spa_config_exit(spa);
		return (ENODEV);
	}

	dprintf("ONLINE: %s\n", vdev_description(vd));

	vd->vdev_offline = B_FALSE;

	/*
	 * Clear the error counts.  The idea is that you expect to see all
	 * zeroes when everything is working, so if you've just onlined a
	 * device, you don't want to keep hearing about errors from before.
	 */
	vd->vdev_stat.vs_read_errors = 0;
	vd->vdev_stat.vs_write_errors = 0;
	vd->vdev_stat.vs_checksum_errors = 0;

	vdev_reopen(vd->vdev_top, NULL);

	spa_config_exit(spa);

	VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);

	return (0);
}

int
vdev_offline(spa_t *spa, const char *path)
{
	vdev_t *vd;

	spa_config_enter(spa, RW_WRITER);

	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
		spa_config_exit(spa);
		return (ENODEV);
	}

	dprintf("OFFLINE: %s\n", vdev_description(vd));

	/*
	 * If this device's top-level vdev has a non-empty DTL,
	 * don't allow the device to be offlined.
	 *
	 * XXX -- we should make this more precise by allowing the offline
	 * as long as the remaining devices don't have any DTL holes.
	 */
	if (vd->vdev_top->vdev_dtl_map.sm_space != 0) {
		spa_config_exit(spa);
		return (EBUSY);
	}

	/*
	 * Set this device to offline state and reopen its top-level vdev.
	 * If this action results in the top-level vdev becoming unusable,
	 * undo it and fail the request.
	 */
	vd->vdev_offline = B_TRUE;
	vdev_reopen(vd->vdev_top, NULL);
	if (vdev_is_dead(vd->vdev_top)) {
		vd->vdev_offline = B_FALSE;
		vdev_reopen(vd->vdev_top, NULL);
		spa_config_exit(spa);
		return (EBUSY);
	}

	spa_config_exit(spa);

	return (0);
}

int
vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg)
{
	vdev_t *vd;

	spa_config_enter(spa, RW_WRITER);

	if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
		spa_config_exit(spa);
		return (ENODEV);
	}

	vd->vdev_fault_mode = mode;
	vd->vdev_fault_mask = mask;
	vd->vdev_fault_arg = arg;

	spa_config_exit(spa);

	return (0);
}

int
vdev_is_dead(vdev_t *vd)
{
	return (vd->vdev_state <= VDEV_STATE_CANT_OPEN);
}

int
vdev_error_inject(vdev_t *vd, zio_t *zio)
{
	int error = 0;

	if (vd->vdev_fault_mode == VDEV_FAULT_NONE)
		return (0);

	if (((1ULL << zio->io_type) & vd->vdev_fault_mask) == 0)
		return (0);

	switch (vd->vdev_fault_mode) {
	case VDEV_FAULT_RANDOM:
		if (spa_get_random(vd->vdev_fault_arg) == 0)
			error = EIO;
		break;

	case VDEV_FAULT_COUNT:
		if ((int64_t)--vd->vdev_fault_arg <= 0)
			vd->vdev_fault_mode = VDEV_FAULT_NONE;
		error = EIO;
		break;
	}

	if (error != 0) {
		dprintf("returning %d for type %d on %s state %d offset %llx\n",
		    error, zio->io_type, vdev_description(vd),
		    vd->vdev_state, zio->io_offset);
	}

	return (error);
}

/*
 * Get statistics for the given vdev.
 */
void
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
	vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
	int c, t;

	mutex_enter(&vd->vdev_stat_lock);
	bcopy(&vd->vdev_stat, vs, sizeof (*vs));
	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
	vs->vs_state = vd->vdev_state;
	vs->vs_rsize = vdev_get_rsize(vd);
	mutex_exit(&vd->vdev_stat_lock);

	/*
	 * If we're getting stats on the root vdev, aggregate the I/O counts
	 * over all top-level vdevs (i.e. the direct children of the root).
	 */
	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++) {
			vdev_t *cvd = rvd->vdev_child[c];
			vdev_stat_t *cvs = &cvd->vdev_stat;

			mutex_enter(&vd->vdev_stat_lock);
			for (t = 0; t < ZIO_TYPES; t++) {
				vs->vs_ops[t] += cvs->vs_ops[t];
				vs->vs_bytes[t] += cvs->vs_bytes[t];
			}
			vs->vs_read_errors += cvs->vs_read_errors;
			vs->vs_write_errors += cvs->vs_write_errors;
			vs->vs_checksum_errors += cvs->vs_checksum_errors;
			vs->vs_scrub_examined += cvs->vs_scrub_examined;
			vs->vs_scrub_errors += cvs->vs_scrub_errors;
			mutex_exit(&vd->vdev_stat_lock);
		}
	}
}

void
vdev_stat_update(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *pvd;
	uint64_t txg = zio->io_txg;
	vdev_stat_t *vs = &vd->vdev_stat;
	zio_type_t type = zio->io_type;
	int flags = zio->io_flags;

	if (zio->io_error == 0) {
		if (!(flags & ZIO_FLAG_IO_BYPASS)) {
			mutex_enter(&vd->vdev_stat_lock);
			vs->vs_ops[type]++;
			vs->vs_bytes[type] += zio->io_size;
			mutex_exit(&vd->vdev_stat_lock);
		}
		if ((flags & ZIO_FLAG_IO_REPAIR) &&
		    zio->io_delegate_list == NULL) {
			mutex_enter(&vd->vdev_stat_lock);
			if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))
				vs->vs_scrub_repaired += zio->io_size;
			else
				vs->vs_self_healed += zio->io_size;
			mutex_exit(&vd->vdev_stat_lock);
		}
		return;
	}

	if (flags & ZIO_FLAG_SPECULATIVE)
		return;

	if (!vdev_is_dead(vd)) {
		mutex_enter(&vd->vdev_stat_lock);
		if (type == ZIO_TYPE_READ) {
			if (zio->io_error == ECKSUM)
				vs->vs_checksum_errors++;
			else
				vs->vs_read_errors++;
		}
		if (type == ZIO_TYPE_WRITE)
			vs->vs_write_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}

	if (type == ZIO_TYPE_WRITE) {
		if (txg == 0 || vd->vdev_children != 0)
			return;
		if (flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
			ASSERT(flags & ZIO_FLAG_IO_REPAIR);
			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1);
		}
		if (!(flags & ZIO_FLAG_IO_REPAIR)) {
			vdev_t *tvd = vd->vdev_top;
			if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1))
				return;
			vdev_dirty(tvd, VDD_DTL, txg);
			(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
			for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
				vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1);
		}
	}
}

void
vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
{
	int c;
	vdev_stat_t *vs = &vd->vdev_stat;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_scrub_stat_update(vd->vdev_child[c], type, complete);

	mutex_enter(&vd->vdev_stat_lock);

	if (type == POOL_SCRUB_NONE) {
		/*
		 * Update completion and end time.  Leave everything else alone
		 * so we can report what happened during the previous scrub.
		 */
		vs->vs_scrub_complete = complete;
		vs->vs_scrub_end = gethrestime_sec();
	} else {
		vs->vs_scrub_type = type;
		vs->vs_scrub_complete = 0;
		vs->vs_scrub_examined = 0;
		vs->vs_scrub_repaired = 0;
		vs->vs_scrub_errors = 0;
		vs->vs_scrub_start = gethrestime_sec();
		vs->vs_scrub_end = 0;
	}

	mutex_exit(&vd->vdev_stat_lock);
}

/*
 * Report checksum errors that a vdev didn't realize it made.
 * This can happen, for example, when RAID-Z combinatorial reconstruction
 * infers that one of its components returned bad data.
 */
void
vdev_checksum_error(zio_t *zio, vdev_t *vd)
{
	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
	    vdev_description(vd));

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}
}

/*
 * Update the in-core space usage stats for this vdev and the root vdev.
 */
void
vdev_space_update(vdev_t *vd, uint64_t space_delta, uint64_t alloc_delta)
{
	ASSERT(vd == vd->vdev_top);

	do {
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_space += space_delta;
		vd->vdev_stat.vs_alloc += alloc_delta;
		mutex_exit(&vd->vdev_stat_lock);
	} while ((vd = vd->vdev_parent) != NULL);
}

/*
 * Various knobs to tune a vdev.
 */
static vdev_knob_t vdev_knob[] = {
	{
		"cache_size",
		"size of the read-ahead cache",
		0,
		1ULL << 30,
		10ULL << 20,
		offsetof(struct vdev, vdev_cache.vc_size)
	},
	{
		"cache_bshift",
		"log2 of cache blocksize",
		SPA_MINBLOCKSHIFT,
		SPA_MAXBLOCKSHIFT,
		16,
		offsetof(struct vdev, vdev_cache.vc_bshift)
	},
	{
		"cache_max",
		"largest block size to cache",
		0,
		SPA_MAXBLOCKSIZE,
		1ULL << 14,
		offsetof(struct vdev, vdev_cache.vc_max)
	},
	{
		"min_pending",
		"minimum pending I/Os to the disk",
		1,
		10000,
		2,
		offsetof(struct vdev, vdev_queue.vq_min_pending)
	},
	{
		"max_pending",
		"maximum pending I/Os to the disk",
		1,
		10000,
		35,
		offsetof(struct vdev, vdev_queue.vq_max_pending)
	},
	{
		"agg_limit",
		"maximum size of aggregated I/Os",
		0,
		SPA_MAXBLOCKSIZE,
		SPA_MAXBLOCKSIZE,
		offsetof(struct vdev, vdev_queue.vq_agg_limit)
	},
	{
		"time_shift",
		"deadline = pri + (lbolt >> time_shift)",
		0,
		63,
		4,
		offsetof(struct vdev, vdev_queue.vq_time_shift)
	},
	{
		"ramp_rate",
		"exponential I/O issue ramp-up rate",
		1,
		10000,
		2,
		offsetof(struct vdev, vdev_queue.vq_ramp_rate)
	},
};

vdev_knob_t *
vdev_knob_next(vdev_knob_t *vk)
{
	if (vk == NULL)
		return (vdev_knob);

	if (++vk == vdev_knob + sizeof (vdev_knob) / sizeof (vdev_knob_t))
		return (NULL);

	return (vk);
}

/*
 * Mark a top-level vdev's config as dirty, placing it on the dirty list
 * so that it will be written out next time the vdev configuration is synced.
 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
 */
void
vdev_config_dirty(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	int c;

	if (vd == rvd) {
		for (c = 0; c < rvd->vdev_children; c++)
			vdev_config_dirty(rvd->vdev_child[c]);
	} else {
		ASSERT(vd == vd->vdev_top);

		if (!vd->vdev_is_dirty) {
			list_insert_head(&spa->spa_dirty_list, vd);
			vd->vdev_is_dirty = B_TRUE;
		}
	}
}

void
vdev_config_clean(vdev_t *vd)
{
	ASSERT(vd->vdev_is_dirty);

	list_remove(&vd->vdev_spa->spa_dirty_list, vd);
	vd->vdev_is_dirty = B_FALSE;
}

/*
 * Set a vdev's state, updating any parent's state as well.
 */
void
vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux)
{
	if (state == vd->vdev_state)
		return;

	vd->vdev_state = state;
	vd->vdev_stat.vs_aux = aux;

	if (vd->vdev_parent != NULL) {
		int c;
		int degraded = 0, faulted = 0;
		vdev_t *parent, *child;

		parent = vd->vdev_parent;
		for (c = 0; c < parent->vdev_children; c++) {
			child = parent->vdev_child[c];
			if (child->vdev_state <= VDEV_STATE_CANT_OPEN)
				faulted++;
			else if (child->vdev_state == VDEV_STATE_DEGRADED)
				degraded++;
		}

		vd->vdev_parent->vdev_ops->vdev_op_state_change(
		    vd->vdev_parent, faulted, degraded);
	}
}