/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2011 by Delphix. All rights reserved.
 */

/*
 * This file contains all the routines used when modifying on-disk SPA state.
 * This includes opening, importing, destroying, exporting a pool, and syncing a
 * pool.
 */

#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/ddt.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_objset.h>
#include <sys/unique.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/spa_boot.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_scan.h>

#ifdef	_KERNEL
#include <sys/bootprops.h>
#include <sys/callb.h>
#include <sys/cpupart.h>
#include <sys/pool.h>
#include <sys/sysdc.h>
#include <sys/zone.h>
#endif	/* _KERNEL */

#include "zfs_prop.h"
#include "zfs_comutil.h"

typedef enum zti_modes {
	zti_mode_fixed,			/* value is # of threads (min 1) */
	zti_mode_online_percent,	/* value is % of online CPUs */
	zti_mode_batch,			/* cpu-intensive; value is ignored */
	zti_mode_null,			/* don't create a taskq */
	zti_nmodes
} zti_modes_t;

#define	ZTI_FIX(n)	{ zti_mode_fixed, (n) }
#define	ZTI_PCT(n)	{ zti_mode_online_percent, (n) }
#define	ZTI_BATCH	{ zti_mode_batch, 0 }
#define	ZTI_NULL	{ zti_mode_null, 0 }

#define	ZTI_ONE		ZTI_FIX(1)

typedef struct zio_taskq_info {
	enum zti_modes zti_mode;
	uint_t zti_value;
} zio_taskq_info_t;

static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
	"issue", "issue_high", "intr", "intr_high"
};

/*
 * Define the taskq threads for the following I/O types:
 * NULL, READ, WRITE, FREE, CLAIM, and IOCTL
 */
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_FIX(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL },
	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) },
	{ ZTI_FIX(100),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL },
};
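
/*
 * Illustrative note (added commentary): the rows above follow zio_type_name[]
 * order (null, read, write, free, claim, ioctl) and the columns follow
 * zio_taskq_types[] above.  The WRITE row, for example,
 *
 *	{ ZTI_BATCH,	ZTI_FIX(5),	ZTI_FIX(8),	ZTI_FIX(5) }
 *
 * requests a CPU-percentage "batch" taskq for normal issue and small fixed
 * taskqs for the high-priority and interrupt variants; spa_taskq_create()
 * below turns each entry into a taskq_create_proc() or taskq_create_sysdc()
 * call, and ZTI_NULL entries create no taskq at all.
 */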

static dsl_syncfunc_t spa_sync_props;
static boolean_t spa_has_active_shared_spare(spa_t *spa);
static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
    spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
    char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);

uint_t		zio_taskq_batch_pct = 100;	/* 1 thread per cpu in pset */
id_t		zio_taskq_psrset_bind = PS_NONE;
boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
uint_t		zio_taskq_basedc = 80;		/* base duty cycle */

boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */

/*
 * This (illegal) pool name is used when temporarily importing a spa_t in order
 * to get the vdev stats associated with the imported devices.
 */
#define	TRYIMPORT_NAME	"$import"

/*
 * ==========================================================================
 * SPA properties routines
 * ==========================================================================
 */

/*
 * Add a (source=src, propname=propval) list to an nvlist.
 */
static void
spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
    uint64_t intval, zprop_source_t src)
{
	const char *propname = zpool_prop_to_name(prop);
	nvlist_t *propval;

	VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);

	if (strval != NULL)
		VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
	else
		VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);

	VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
	nvlist_free(propval);
}
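
/*
 * Illustrative example (added commentary): a call such as
 *
 *	spa_prop_add_list(nvl, ZPOOL_PROP_SIZE, NULL, size, ZPROP_SRC_NONE);
 *
 * adds an entry of the form
 *
 *	"size" -> { ZPROP_SOURCE = ZPROP_SRC_NONE, ZPROP_VALUE = size }
 *
 * to nvl, which is the nested-nvlist layout the zprop code expects when
 * property lists are handed back to userland.
 */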

/*
 * Get property values from the spa configuration.
 */
static void
spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
{
	uint64_t size;
	uint64_t alloc;
	uint64_t cap, version;
	zprop_source_t src = ZPROP_SRC_NONE;
	spa_config_dirent_t *dp;

	ASSERT(MUTEX_HELD(&spa->spa_props_lock));

	if (spa->spa_root_vdev != NULL) {
		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
		size = metaslab_class_get_space(spa_normal_class(spa));
		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
		    size - alloc, src);
		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
		    (spa_mode(spa) == FREAD), src);

		cap = (size == 0) ? 0 : (alloc * 100 / size);
		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
		    ddt_get_pool_dedup_ratio(spa), src);

		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
		    spa->spa_root_vdev->vdev_state, src);

		version = spa_version(spa);
		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
			src = ZPROP_SRC_DEFAULT;
		else
			src = ZPROP_SRC_LOCAL;
		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
	}

	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);

	if (spa->spa_comment != NULL) {
		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
		    0, ZPROP_SRC_LOCAL);
	}

	if (spa->spa_root != NULL)
		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
		    0, ZPROP_SRC_LOCAL);

	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
		if (dp->scd_path == NULL) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    "none", 0, ZPROP_SRC_LOCAL);
		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
		}
	}
}

/*
 * Get zpool property values.
 */
int
spa_prop_get(spa_t *spa, nvlist_t **nvp)
{
	objset_t *mos = spa->spa_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;
	int err;

	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);

	mutex_enter(&spa->spa_props_lock);

	/*
	 * Get properties from the spa config.
	 */
	spa_prop_get_config(spa, nvp);

	/* If no pool property object, no more prop to get. */
	if (mos == NULL || spa->spa_pool_props_object == 0) {
		mutex_exit(&spa->spa_props_lock);
		return (0);
	}

	/*
	 * Get properties from the MOS pool property object.
	 */
	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
	    zap_cursor_advance(&zc)) {
		uint64_t intval = 0;
		char *strval = NULL;
		zprop_source_t src = ZPROP_SRC_DEFAULT;
		zpool_prop_t prop;

		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
			continue;

		switch (za.za_integer_length) {
		case 8:
			/* integer property */
			if (za.za_first_integer !=
			    zpool_prop_default_numeric(prop))
				src = ZPROP_SRC_LOCAL;

			if (prop == ZPOOL_PROP_BOOTFS) {
				dsl_pool_t *dp;
				dsl_dataset_t *ds = NULL;

				dp = spa_get_dsl(spa);
				rw_enter(&dp->dp_config_rwlock, RW_READER);
				if (err = dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &ds)) {
					rw_exit(&dp->dp_config_rwlock);
					break;
				}

				strval = kmem_alloc(
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
				    KM_SLEEP);
				dsl_dataset_name(ds, strval);
				dsl_dataset_rele(ds, FTAG);
				rw_exit(&dp->dp_config_rwlock);
			} else {
				strval = NULL;
				intval = za.za_first_integer;
			}

			spa_prop_add_list(*nvp, prop, strval, intval, src);

			if (strval != NULL)
				kmem_free(strval,
				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);

			break;

		case 1:
			/* string property */
			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
			err = zap_lookup(mos, spa->spa_pool_props_object,
			    za.za_name, 1, za.za_num_integers, strval);
			if (err) {
				kmem_free(strval, za.za_num_integers);
				break;
			}
			spa_prop_add_list(*nvp, prop, strval, 0, src);
			kmem_free(strval, za.za_num_integers);
			break;

		default:
			break;
		}
	}
	zap_cursor_fini(&zc);
	mutex_exit(&spa->spa_props_lock);
out:
	if (err && err != ENOENT) {
		nvlist_free(*nvp);
		*nvp = NULL;
		return (err);
	}

	return (0);
}
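
/*
 * Usage sketch (illustrative, added commentary): a caller interested in a
 * single value can pull it out of the nvlist returned above, e.g.
 *
 *	nvlist_t *props, *propval;
 *	uint64_t cap;
 *
 *	if (spa_prop_get(spa, &props) == 0) {
 *		if (nvlist_lookup_nvlist(props,
 *		    zpool_prop_to_name(ZPOOL_PROP_CAPACITY), &propval) == 0 &&
 *		    nvlist_lookup_uint64(propval, ZPROP_VALUE, &cap) == 0)
 *			zfs_dbgmsg("capacity %llu%%", (u_longlong_t)cap);
 *		nvlist_free(props);
 *	}
 */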

/*
 * Validate the given pool properties nvlist and modify the list
 * for the property values to be set.
 */
static int
spa_prop_validate(spa_t *spa, nvlist_t *props)
{
	nvpair_t *elem;
	int error = 0, reset_bootfs = 0;
	uint64_t objnum;

	elem = NULL;
	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
		zpool_prop_t prop;
		char *propname, *strval;
		uint64_t intval;
		objset_t *os;
		char *slash, *check;

		propname = nvpair_name(elem);

		if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
			return (EINVAL);

		switch (prop) {
		case ZPOOL_PROP_VERSION:
			error = nvpair_value_uint64(elem, &intval);
			if (!error &&
			    (intval < spa_version(spa) || intval > SPA_VERSION))
				error = EINVAL;
			break;

		case ZPOOL_PROP_DELEGATION:
		case ZPOOL_PROP_AUTOREPLACE:
		case ZPOOL_PROP_LISTSNAPS:
		case ZPOOL_PROP_AUTOEXPAND:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && intval > 1)
				error = EINVAL;
			break;

		case ZPOOL_PROP_BOOTFS:
			/*
			 * If the pool version is less than SPA_VERSION_BOOTFS,
			 * or the pool is still being created (version == 0),
			 * the bootfs property cannot be set.
			 */
			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
				error = ENOTSUP;
				break;
			}

			/*
			 * Make sure the vdev config is bootable
			 */
			if (!vdev_is_bootable(spa->spa_root_vdev)) {
				error = ENOTSUP;
				break;
			}

			reset_bootfs = 1;

			error = nvpair_value_string(elem, &strval);

			if (!error) {
				uint64_t compress;

				if (strval == NULL || strval[0] == '\0') {
					objnum = zpool_prop_default_numeric(
					    ZPOOL_PROP_BOOTFS);
					break;
				}

				if (error = dmu_objset_hold(strval, FTAG, &os))
					break;

				/* Must be ZPL and not gzip compressed. */

				if (dmu_objset_type(os) != DMU_OST_ZFS) {
					error = ENOTSUP;
				} else if ((error = dsl_prop_get_integer(strval,
				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
				    &compress, NULL)) == 0 &&
				    !BOOTFS_COMPRESS_VALID(compress)) {
					error = ENOTSUP;
				} else {
					objnum = dmu_objset_id(os);
				}
				dmu_objset_rele(os, FTAG);
			}
			break;

		case ZPOOL_PROP_FAILUREMODE:
			error = nvpair_value_uint64(elem, &intval);
			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
			    intval > ZIO_FAILURE_MODE_PANIC))
				error = EINVAL;

			/*
			 * This is a special case which only occurs when
			 * the pool has completely failed. This allows
			 * the user to change the in-core failmode property
			 * without syncing it out to disk (I/Os might
			 * currently be blocked). We do this by returning
			 * EIO to the caller (spa_prop_set) to trick it
			 * into thinking we encountered a property validation
			 * error.
			 */
			if (!error && spa_suspended(spa)) {
				spa->spa_failmode = intval;
				error = EIO;
			}
			break;

		case ZPOOL_PROP_CACHEFILE:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;

			if (strval[0] == '\0')
				break;

			if (strcmp(strval, "none") == 0)
				break;

			if (strval[0] != '/') {
				error = EINVAL;
				break;
			}

			slash = strrchr(strval, '/');
			ASSERT(slash != NULL);

			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
			    strcmp(slash, "/..") == 0)
				error = EINVAL;
			break;

		case ZPOOL_PROP_COMMENT:
			if ((error = nvpair_value_string(elem, &strval)) != 0)
				break;
			for (check = strval; *check != '\0'; check++) {
				/*
				 * The kernel doesn't have an easy isprint()
				 * check.  For this kernel check, we merely
				 * check ASCII apart from DEL.  Fix this if
				 * there is an easy-to-use kernel isprint().
				 */
				if (*check >= 0x7f) {
					error = EINVAL;
					break;
				}
			}
			if (strlen(strval) > ZPROP_MAX_COMMENT)
				error = E2BIG;
			break;

		case ZPOOL_PROP_DEDUPDITTO:
			if (spa_version(spa) < SPA_VERSION_DEDUP)
				error = ENOTSUP;
			else
				error = nvpair_value_uint64(elem, &intval);
			if (error == 0 &&
			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
				error = EINVAL;
			break;
		}

		if (error)
			break;
	}

	if (!error && reset_bootfs) {
		error = nvlist_remove(props,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);

		if (!error) {
			error = nvlist_add_uint64(props,
			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
		}
	}

	return (error);
}
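
/*
 * For illustration (added commentary), given the checks above a caller of
 * spa_prop_set() would see, for example:
 *
 *	"comment" = "rack 12, shelf 3"		accepted (printable ASCII)
 *	"comment" = string with a 0x7f byte	rejected with EINVAL
 *	"version" = SPA_VERSION + 1		rejected with EINVAL
 *	"bootfs" on a pool older than		rejected with ENOTSUP
 *	    SPA_VERSION_BOOTFS
 *
 * Only after every pair validates does spa_prop_set() schedule the sync task
 * that actually writes the values out.
 */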

void
spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
{
	char *cachefile;
	spa_config_dirent_t *dp;

	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
	    &cachefile) != 0)
		return;

	dp = kmem_alloc(sizeof (spa_config_dirent_t),
	    KM_SLEEP);

	if (cachefile[0] == '\0')
		dp->scd_path = spa_strdup(spa_config_path);
	else if (strcmp(cachefile, "none") == 0)
		dp->scd_path = NULL;
	else
		dp->scd_path = spa_strdup(cachefile);

	list_insert_head(&spa->spa_config_list, dp);
	if (need_sync)
		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

int
spa_prop_set(spa_t *spa, nvlist_t *nvp)
{
	int error;
	nvpair_t *elem;
	boolean_t need_sync = B_FALSE;
	zpool_prop_t prop;

	if ((error = spa_prop_validate(spa, nvp)) != 0)
		return (error);

	elem = NULL;
	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
		if ((prop = zpool_name_to_prop(
		    nvpair_name(elem))) == ZPROP_INVAL)
			return (EINVAL);

		if (prop == ZPOOL_PROP_CACHEFILE ||
		    prop == ZPOOL_PROP_ALTROOT ||
		    prop == ZPOOL_PROP_READONLY)
			continue;

		need_sync = B_TRUE;
		break;
	}

	if (need_sync)
		return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
		    spa, nvp, 3));
	else
		return (0);
}

/*
 * If the bootfs property value is dsobj, clear it.
 */
void
spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
{
	if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
		VERIFY(zap_remove(spa->spa_meta_objset,
		    spa->spa_pool_props_object,
		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
		spa->spa_bootfs = 0;
	}
}

/*
 * Change the GUID for the pool.  This is done so that we can later
 * re-import a pool built from a clone of our own vdevs.  We will modify
 * the root vdev's guid, our own pool guid, and then mark all of our
 * vdevs dirty.  Note that we must make sure that all our vdevs are
 * online when we do this, or else any vdevs that weren't present
 * would be orphaned from our pool.  We are also going to issue a
 * sysevent to update any watchers.
 */
int
spa_change_guid(spa_t *spa)
{
	uint64_t	oldguid, newguid;
	uint64_t	txg;

	if (!(spa_mode_global & FWRITE))
		return (EROFS);

	txg = spa_vdev_enter(spa);

	if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY)
		return (spa_vdev_exit(spa, NULL, txg, ENXIO));

	oldguid = spa_guid(spa);
	newguid = spa_generate_guid(NULL);
	ASSERT3U(oldguid, !=, newguid);

	spa->spa_root_vdev->vdev_guid = newguid;
	spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid);

	vdev_config_dirty(spa->spa_root_vdev);

	spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);

	return (spa_vdev_exit(spa, NULL, txg, 0));
}

/*
 * ==========================================================================
 * SPA state manipulation (open/create/destroy/import/export)
 * ==========================================================================
 */

static int
spa_error_entry_compare(const void *a, const void *b)
{
	spa_error_entry_t *sa = (spa_error_entry_t *)a;
	spa_error_entry_t *sb = (spa_error_entry_t *)b;
	int ret;

	ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
	    sizeof (zbookmark_t));

	if (ret < 0)
		return (-1);
	else if (ret > 0)
		return (1);
	else
		return (0);
}

/*
 * Utility function which retrieves copies of the current logs and
 * re-initializes them in the process.
 */
void
spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
{
	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));

	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

static taskq_t *
spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
    uint_t value)
{
	uint_t flags = 0;
	boolean_t batch = B_FALSE;

	switch (mode) {
	case zti_mode_null:
		return (NULL);		/* no taskq needed */

	case zti_mode_fixed:
		ASSERT3U(value, >=, 1);
		value = MAX(value, 1);
		break;

	case zti_mode_batch:
		batch = B_TRUE;
		flags |= TASKQ_THREADS_CPU_PCT;
		value = zio_taskq_batch_pct;
		break;

	case zti_mode_online_percent:
		flags |= TASKQ_THREADS_CPU_PCT;
		break;

	default:
		panic("unrecognized mode for %s taskq (%u:%u) in "
		    "spa_activate()",
		    name, mode, value);
		break;
	}

	if (zio_taskq_sysdc && spa->spa_proc != &p0) {
		if (batch)
			flags |= TASKQ_DC_BATCH;

		return (taskq_create_sysdc(name, value, 50, INT_MAX,
		    spa->spa_proc, zio_taskq_basedc, flags));
	}
	return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
	    spa->spa_proc, flags));
}

static void
spa_create_zio_taskqs(spa_t *spa)
{
	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
			enum zti_modes mode = ztip->zti_mode;
			uint_t value = ztip->zti_value;
			char name[32];

			(void) snprintf(name, sizeof (name),
			    "%s_%s", zio_type_name[t], zio_taskq_types[q]);

			spa->spa_zio_taskq[t][q] =
			    spa_taskq_create(spa, name, mode, value);
		}
	}
}
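
/*
 * Illustrative example (added commentary): with the zio_taskqs[][] table
 * defined at the top of this file, the loops above create taskqs with names
 * like
 *
 *	"zio_write_issue"	batch taskq, sized by zio_taskq_batch_pct
 *	"zio_write_issue_high"	5 fixed threads
 *	"zio_write_intr"	8 fixed threads
 *
 * while ZTI_NULL entries (e.g. the high-priority queues for ZIO_TYPE_NULL)
 * leave the corresponding spa_zio_taskq[][] slot NULL.
 */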

#ifdef _KERNEL
static void
spa_thread(void *arg)
{
	callb_cpr_t cprinfo;

	spa_t *spa = arg;
	user_t *pu = PTOU(curproc);

	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
	    spa->spa_name);

	ASSERT(curproc != &p0);
	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
	    "zpool-%s", spa->spa_name);
	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));

	/* bind this thread to the requested psrset */
	if (zio_taskq_psrset_bind != PS_NONE) {
		pool_lock();
		mutex_enter(&cpu_lock);
		mutex_enter(&pidlock);
		mutex_enter(&curproc->p_lock);

		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
		    0, NULL, NULL) == 0) {
			curthread->t_bind_pset = zio_taskq_psrset_bind;
		} else {
			cmn_err(CE_WARN,
			    "Couldn't bind process for zfs pool \"%s\" to "
			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
		}

		mutex_exit(&curproc->p_lock);
		mutex_exit(&pidlock);
		mutex_exit(&cpu_lock);
		pool_unlock();
	}

	if (zio_taskq_sysdc) {
		sysdc_thread_enter(curthread, 100, 0);
	}

	spa->spa_proc = curproc;
	spa->spa_did = curthread->t_did;

	spa_create_zio_taskqs(spa);

	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);

	spa->spa_proc_state = SPA_PROC_ACTIVE;
	cv_broadcast(&spa->spa_proc_cv);

	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);

	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
	spa->spa_proc_state = SPA_PROC_GONE;
	spa->spa_proc = &p0;
	cv_broadcast(&spa->spa_proc_cv);
	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */

	mutex_enter(&curproc->p_lock);
	lwp_exit();
}
#endif

/*
 * Activate an uninitialized pool.
 */
static void
spa_activate(spa_t *spa, int mode)
{
	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);

	spa->spa_state = POOL_STATE_ACTIVE;
	spa->spa_mode = mode;

	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);

	/* Try to create a covering process */
	mutex_enter(&spa->spa_proc_lock);
	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
	ASSERT(spa->spa_proc == &p0);
	spa->spa_did = 0;

	/* Only create a process if we're going to be around a while. */
	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
		    NULL, 0) == 0) {
			spa->spa_proc_state = SPA_PROC_CREATED;
			while (spa->spa_proc_state == SPA_PROC_CREATED) {
				cv_wait(&spa->spa_proc_cv,
				    &spa->spa_proc_lock);
			}
			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
			ASSERT(spa->spa_proc != &p0);
			ASSERT(spa->spa_did != 0);
		} else {
#ifdef _KERNEL
			cmn_err(CE_WARN,
			    "Couldn't create process for zfs pool \"%s\"\n",
			    spa->spa_name);
#endif
		}
	}
	mutex_exit(&spa->spa_proc_lock);

	/* If we didn't create a process, we need to create our taskqs. */
	if (spa->spa_proc == &p0) {
		spa_create_zio_taskqs(spa);
	}

	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_config_dirty_node));
	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
	    offsetof(vdev_t, vdev_state_dirty_node));

	txg_list_create(&spa->spa_vdev_txg_list,
	    offsetof(struct vdev, vdev_txg_node));

	avl_create(&spa->spa_errlist_scrub,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
	avl_create(&spa->spa_errlist_last,
	    spa_error_entry_compare, sizeof (spa_error_entry_t),
	    offsetof(spa_error_entry_t, se_avl));
}

/*
 * Opposite of spa_activate().
 */
static void
spa_deactivate(spa_t *spa)
{
	ASSERT(spa->spa_sync_on == B_FALSE);
	ASSERT(spa->spa_dsl_pool == NULL);
	ASSERT(spa->spa_root_vdev == NULL);
	ASSERT(spa->spa_async_zio_root == NULL);
	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);

	txg_list_destroy(&spa->spa_vdev_txg_list);

	list_destroy(&spa->spa_config_dirty_list);
	list_destroy(&spa->spa_state_dirty_list);

	for (int t = 0; t < ZIO_TYPES; t++) {
		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
			if (spa->spa_zio_taskq[t][q] != NULL)
				taskq_destroy(spa->spa_zio_taskq[t][q]);
			spa->spa_zio_taskq[t][q] = NULL;
		}
	}

	metaslab_class_destroy(spa->spa_normal_class);
	spa->spa_normal_class = NULL;

	metaslab_class_destroy(spa->spa_log_class);
	spa->spa_log_class = NULL;

	/*
	 * If this was part of an import or the open otherwise failed, we may
	 * still have errors left in the queues.  Empty them just in case.
	 */
	spa_errlog_drain(spa);

	avl_destroy(&spa->spa_errlist_scrub);
	avl_destroy(&spa->spa_errlist_last);

	spa->spa_state = POOL_STATE_UNINITIALIZED;

	mutex_enter(&spa->spa_proc_lock);
	if (spa->spa_proc_state != SPA_PROC_NONE) {
		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
		cv_broadcast(&spa->spa_proc_cv);
		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
			ASSERT(spa->spa_proc != &p0);
			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
		}
		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
		spa->spa_proc_state = SPA_PROC_NONE;
	}
	ASSERT(spa->spa_proc == &p0);
	mutex_exit(&spa->spa_proc_lock);

	/*
	 * We want to make sure spa_thread() has actually exited the ZFS
	 * module, so that the module can't be unloaded out from underneath
	 * it.
	 */
	if (spa->spa_did != 0) {
		thread_join(spa->spa_did);
		spa->spa_did = 0;
	}
}
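
/*
 * Summary sketch (added commentary): the spa_proc_state handshake between
 * spa_activate(), spa_thread() and spa_deactivate() proceeds as
 *
 *	SPA_PROC_NONE -> SPA_PROC_CREATED	spa_activate(), after newproc()
 *	SPA_PROC_CREATED -> SPA_PROC_ACTIVE	spa_thread(), taskqs created
 *	SPA_PROC_ACTIVE -> SPA_PROC_DEACTIVATE	spa_deactivate()
 *	SPA_PROC_DEACTIVATE -> SPA_PROC_GONE	spa_thread(), before lwp_exit()
 *	SPA_PROC_GONE -> SPA_PROC_NONE		spa_deactivate()
 *
 * with each transition announced via cv_broadcast() on spa_proc_cv under
 * spa_proc_lock.
 */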

/*
 * Verify a pool configuration, and construct the vdev tree appropriately.  This
 * will create all the necessary vdevs in the appropriate layout, with each vdev
 * in the CLOSED state.  This will prep the pool before open/creation/import.
 * All vdev validation is done by the vdev_alloc() routine.
 */
static int
spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
    uint_t id, int atype)
{
	nvlist_t **child;
	uint_t children;
	int error;

	if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
		return (error);

	if ((*vdp)->vdev_ops->vdev_op_leaf)
		return (0);

	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children);

	if (error == ENOENT)
		return (0);

	if (error) {
		vdev_free(*vdp);
		*vdp = NULL;
		return (EINVAL);
	}

	for (int c = 0; c < children; c++) {
		vdev_t *vd;
		if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
		    atype)) != 0) {
			vdev_free(*vdp);
			*vdp = NULL;
			return (error);
		}
	}

	ASSERT(*vdp != NULL);

	return (0);
}

/*
 * Opposite of spa_load().
 */
static void
spa_unload(spa_t *spa)
{
	int i;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	/*
	 * Stop async tasks.
	 */
	spa_async_suspend(spa);

	/*
	 * Stop syncing.
	 */
	if (spa->spa_sync_on) {
		txg_sync_stop(spa->spa_dsl_pool);
		spa->spa_sync_on = B_FALSE;
	}

	/*
	 * Wait for any outstanding async I/O to complete.
	 */
	if (spa->spa_async_zio_root != NULL) {
		(void) zio_wait(spa->spa_async_zio_root);
		spa->spa_async_zio_root = NULL;
	}

	bpobj_close(&spa->spa_deferred_bpobj);

	/*
	 * Close the dsl pool.
	 */
	if (spa->spa_dsl_pool) {
		dsl_pool_close(spa->spa_dsl_pool);
		spa->spa_dsl_pool = NULL;
		spa->spa_meta_objset = NULL;
	}

	ddt_unload(spa);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);

	/*
	 * Drop and purge level 2 cache
	 */
	spa_l2cache_drop(spa);

	/*
	 * Close all vdevs.
	 */
	if (spa->spa_root_vdev)
		vdev_free(spa->spa_root_vdev);
	ASSERT(spa->spa_root_vdev == NULL);

	for (i = 0; i < spa->spa_spares.sav_count; i++)
		vdev_free(spa->spa_spares.sav_vdevs[i]);
	if (spa->spa_spares.sav_vdevs) {
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));
		spa->spa_spares.sav_vdevs = NULL;
	}
	if (spa->spa_spares.sav_config) {
		nvlist_free(spa->spa_spares.sav_config);
		spa->spa_spares.sav_config = NULL;
	}
	spa->spa_spares.sav_count = 0;

	for (i = 0; i < spa->spa_l2cache.sav_count; i++)
		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
	if (spa->spa_l2cache.sav_vdevs) {
		kmem_free(spa->spa_l2cache.sav_vdevs,
		    spa->spa_l2cache.sav_count * sizeof (void *));
		spa->spa_l2cache.sav_vdevs = NULL;
	}
	if (spa->spa_l2cache.sav_config) {
		nvlist_free(spa->spa_l2cache.sav_config);
		spa->spa_l2cache.sav_config = NULL;
	}
	spa->spa_l2cache.sav_count = 0;

	spa->spa_async_suspended = 0;

	if (spa->spa_comment != NULL) {
		spa_strfree(spa->spa_comment);
		spa->spa_comment = NULL;
	}

	spa_config_exit(spa, SCL_ALL, FTAG);
}
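
/*
 * Teardown-order note (added commentary): spa_unload() unwinds roughly in the
 * reverse order of spa_load()/spa_activate(): async tasks are suspended and
 * the sync thread stopped before the "godfather" async zio is waited on, the
 * deferred bpobj and DSL pool are closed before the DDT is unloaded, and only
 * then, under SCL_ALL, are the root vdev tree, spares and l2cache devices
 * freed.
 */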

/*
 * Load (or re-load) the current list of vdevs describing the active spares for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 */
static void
spa_load_spares(spa_t *spa)
{
	nvlist_t **spares;
	uint_t nspares;
	int i;
	vdev_t *vd, *tvd;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/*
	 * First, close and free any existing spare vdevs.
	 */
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		vd = spa->spa_spares.sav_vdevs[i];

		/* Undo the call to spa_activate() below */
		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL && tvd->vdev_isspare)
			spa_spare_remove(tvd);
		vdev_close(vd);
		vdev_free(vd);
	}

	if (spa->spa_spares.sav_vdevs)
		kmem_free(spa->spa_spares.sav_vdevs,
		    spa->spa_spares.sav_count * sizeof (void *));

	if (spa->spa_spares.sav_config == NULL)
		nspares = 0;
	else
		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);

	spa->spa_spares.sav_count = (int)nspares;
	spa->spa_spares.sav_vdevs = NULL;

	if (nspares == 0)
		return;

	/*
	 * Construct the array of vdevs, opening them to get status in the
	 * process.  For each spare, there are potentially two different vdev_t
	 * structures associated with it: one in the list of spares (used only
	 * for basic validation purposes) and one in the active vdev
	 * configuration (if it's spared in).  During this phase we open and
	 * validate each vdev on the spare list.  If the vdev also exists in the
	 * active configuration, then we also mark this vdev as an active spare.
	 */
	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++) {
		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
		    VDEV_ALLOC_SPARE) == 0);
		ASSERT(vd != NULL);

		spa->spa_spares.sav_vdevs[i] = vd;

		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
		    B_FALSE)) != NULL) {
			if (!tvd->vdev_isspare)
				spa_spare_add(tvd);

			/*
			 * We only mark the spare active if we were successfully
			 * able to load the vdev.  Otherwise, importing a pool
			 * with a bad active spare would result in strange
			 * behavior, because multiple pools would think the
			 * spare is actively in use.
			 *
			 * There is a vulnerability here to an equally bizarre
			 * circumstance, where a dead active spare is later
			 * brought back to life (onlined or otherwise).  Given
			 * the rarity of this scenario, and the extra complexity
			 * it adds, we ignore the possibility.
			 */
			if (!vdev_is_dead(tvd))
				spa_spare_activate(tvd);
		}

		vd->vdev_top = vd;
		vd->vdev_aux = &spa->spa_spares;

		if (vdev_open(vd) != 0)
			continue;

		if (vdev_validate_aux(vd) == 0)
			spa_spare_add(vd);
	}

	/*
	 * Recompute the stashed list of spares, with status information
	 * this time.
	 */
	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
	    KM_SLEEP);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		spares[i] = vdev_config_generate(spa,
		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
	for (i = 0; i < spa->spa_spares.sav_count; i++)
		nvlist_free(spares[i]);
	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
}
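
/*
 * For illustration (added commentary): after the rebuild above,
 * spa_spares.sav_config contains one regenerated nvlist per spare, roughly
 *
 *	spares[0] = { type, path, guid, ... state from vdev_open() }
 *	spares[1] = ...
 *
 * which is what is later synced to the MOS spares object and reported back
 * to userland.
 */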

/*
 * Load (or re-load) the current list of vdevs describing the active l2cache for
 * this pool.  When this is called, we have some form of basic information in
 * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
 * then re-generate a more complete list including status information.
 * Devices which are already active have their details maintained, and are
 * not re-opened.
 */
static void
spa_load_l2cache(spa_t *spa)
{
	nvlist_t **l2cache;
	uint_t nl2cache;
	int i, j, oldnvdevs;
	uint64_t guid;
	vdev_t *vd, **oldvdevs, **newvdevs;
	spa_aux_vdev_t *sav = &spa->spa_l2cache;

	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	if (sav->sav_config != NULL) {
		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
	} else {
		nl2cache = 0;
	}

	oldvdevs = sav->sav_vdevs;
	oldnvdevs = sav->sav_count;
	sav->sav_vdevs = NULL;
	sav->sav_count = 0;

	/*
	 * Process new nvlist of vdevs.
	 */
	for (i = 0; i < nl2cache; i++) {
		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
		    &guid) == 0);

		newvdevs[i] = NULL;
		for (j = 0; j < oldnvdevs; j++) {
			vd = oldvdevs[j];
			if (vd != NULL && guid == vd->vdev_guid) {
				/*
				 * Retain previous vdev for add/remove ops.
				 */
				newvdevs[i] = vd;
				oldvdevs[j] = NULL;
				break;
			}
		}

		if (newvdevs[i] == NULL) {
			/*
			 * Create new vdev
			 */
			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
			    VDEV_ALLOC_L2CACHE) == 0);
			ASSERT(vd != NULL);
			newvdevs[i] = vd;

			/*
			 * Commit this vdev as an l2cache device,
			 * even if it fails to open.
			 */
			spa_l2cache_add(vd);

			vd->vdev_top = vd;
			vd->vdev_aux = sav;

			spa_l2cache_activate(vd);

			if (vdev_open(vd) != 0)
				continue;

			(void) vdev_validate_aux(vd);

			if (!vdev_is_dead(vd))
				l2arc_add_vdev(spa, vd);
		}
	}

	/*
	 * Purge vdevs that were dropped
	 */
	for (i = 0; i < oldnvdevs; i++) {
		uint64_t pool;

		vd = oldvdevs[i];
		if (vd != NULL) {
			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
			    pool != 0ULL && l2arc_vdev_present(vd))
				l2arc_remove_vdev(vd);
			(void) vdev_close(vd);
			spa_l2cache_remove(vd);
		}
	}

	if (oldvdevs)
		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));

	if (sav->sav_config == NULL)
		goto out;

	sav->sav_vdevs = newvdevs;
	sav->sav_count = (int)nl2cache;

	/*
	 * Recompute the stashed list of l2cache devices, with status
	 * information this time.
	 */
	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
	    DATA_TYPE_NVLIST_ARRAY) == 0);

	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
	for (i = 0; i < sav->sav_count; i++)
		l2cache[i] = vdev_config_generate(spa,
		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
out:
	for (i = 0; i < sav->sav_count; i++)
		nvlist_free(l2cache[i]);
	if (sav->sav_count)
		kmem_free(l2cache, sav->sav_count * sizeof (void *));
}

static int
load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
{
	dmu_buf_t *db;
	char *packed = NULL;
	size_t nvsize = 0;
	int error;
	*value = NULL;

	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
	nvsize = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);

	packed = kmem_alloc(nvsize, KM_SLEEP);
	error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
	    DMU_READ_PREFETCH);
	if (error == 0)
		error = nvlist_unpack(packed, nvsize, value, 0);
	kmem_free(packed, nvsize);

	return (error);
}

/*
 * Checks to see if the given vdev could not be opened, in which case we post a
 * sysevent to notify the autoreplace code that the device has been removed.
 */
static void
spa_check_removed(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		spa_check_removed(vd->vdev_child[c]);

	if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
		zfs_post_autoreplace(vd->vdev_spa, vd);
		spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
	}
}

/*
 * Validate the current config against the MOS config
 */
static boolean_t
spa_config_valid(spa_t *spa, nvlist_t *config)
{
	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
	nvlist_t *nv;

	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);

	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);

	/*
	 * If we're doing a normal import, then build up any additional
	 * diagnostic information about missing devices in this config.
	 * We'll pass this up to the user for further processing.
1380 */ 1381 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { 1382 nvlist_t **child, *nv; 1383 uint64_t idx = 0; 1384 1385 child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), 1386 KM_SLEEP); 1387 VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); 1388 1389 for (int c = 0; c < rvd->vdev_children; c++) { 1390 vdev_t *tvd = rvd->vdev_child[c]; 1391 vdev_t *mtvd = mrvd->vdev_child[c]; 1392 1393 if (tvd->vdev_ops == &vdev_missing_ops && 1394 mtvd->vdev_ops != &vdev_missing_ops && 1395 mtvd->vdev_islog) 1396 child[idx++] = vdev_config_generate(spa, mtvd, 1397 B_FALSE, 0); 1398 } 1399 1400 if (idx) { 1401 VERIFY(nvlist_add_nvlist_array(nv, 1402 ZPOOL_CONFIG_CHILDREN, child, idx) == 0); 1403 VERIFY(nvlist_add_nvlist(spa->spa_load_info, 1404 ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); 1405 1406 for (int i = 0; i < idx; i++) 1407 nvlist_free(child[i]); 1408 } 1409 nvlist_free(nv); 1410 kmem_free(child, rvd->vdev_children * sizeof (char **)); 1411 } 1412 1413 /* 1414 * Compare the root vdev tree with the information we have 1415 * from the MOS config (mrvd). Check each top-level vdev 1416 * with the corresponding MOS config top-level (mtvd). 1417 */ 1418 for (int c = 0; c < rvd->vdev_children; c++) { 1419 vdev_t *tvd = rvd->vdev_child[c]; 1420 vdev_t *mtvd = mrvd->vdev_child[c]; 1421 1422 /* 1423 * Resolve any "missing" vdevs in the current configuration. 1424 * If we find that the MOS config has more accurate information 1425 * about the top-level vdev then use that vdev instead. 1426 */ 1427 if (tvd->vdev_ops == &vdev_missing_ops && 1428 mtvd->vdev_ops != &vdev_missing_ops) { 1429 1430 if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) 1431 continue; 1432 1433 /* 1434 * Device specific actions. 1435 */ 1436 if (mtvd->vdev_islog) { 1437 spa_set_log_state(spa, SPA_LOG_CLEAR); 1438 } else { 1439 /* 1440 * XXX - once we have 'readonly' pool 1441 * support we should be able to handle 1442 * missing data devices by transitioning 1443 * the pool to readonly. 1444 */ 1445 continue; 1446 } 1447 1448 /* 1449 * Swap the missing vdev with the data we were 1450 * able to obtain from the MOS config. 1451 */ 1452 vdev_remove_child(rvd, tvd); 1453 vdev_remove_child(mrvd, mtvd); 1454 1455 vdev_add_child(rvd, mtvd); 1456 vdev_add_child(mrvd, tvd); 1457 1458 spa_config_exit(spa, SCL_ALL, FTAG); 1459 vdev_load(mtvd); 1460 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1461 1462 vdev_reopen(rvd); 1463 } else if (mtvd->vdev_islog) { 1464 /* 1465 * Load the slog device's state from the MOS config 1466 * since it's possible that the label does not 1467 * contain the most up-to-date information. 1468 */ 1469 vdev_load_log_state(tvd, mtvd); 1470 vdev_reopen(tvd); 1471 } 1472 } 1473 vdev_free(mrvd); 1474 spa_config_exit(spa, SCL_ALL, FTAG); 1475 1476 /* 1477 * Ensure we were able to validate the config. 
1478 */ 1479 return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); 1480 } 1481 1482 /* 1483 * Check for missing log devices 1484 */ 1485 static int 1486 spa_check_logs(spa_t *spa) 1487 { 1488 switch (spa->spa_log_state) { 1489 case SPA_LOG_MISSING: 1490 /* need to recheck in case slog has been restored */ 1491 case SPA_LOG_UNKNOWN: 1492 if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, 1493 DS_FIND_CHILDREN)) { 1494 spa_set_log_state(spa, SPA_LOG_MISSING); 1495 return (1); 1496 } 1497 break; 1498 } 1499 return (0); 1500 } 1501 1502 static boolean_t 1503 spa_passivate_log(spa_t *spa) 1504 { 1505 vdev_t *rvd = spa->spa_root_vdev; 1506 boolean_t slog_found = B_FALSE; 1507 1508 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1509 1510 if (!spa_has_slogs(spa)) 1511 return (B_FALSE); 1512 1513 for (int c = 0; c < rvd->vdev_children; c++) { 1514 vdev_t *tvd = rvd->vdev_child[c]; 1515 metaslab_group_t *mg = tvd->vdev_mg; 1516 1517 if (tvd->vdev_islog) { 1518 metaslab_group_passivate(mg); 1519 slog_found = B_TRUE; 1520 } 1521 } 1522 1523 return (slog_found); 1524 } 1525 1526 static void 1527 spa_activate_log(spa_t *spa) 1528 { 1529 vdev_t *rvd = spa->spa_root_vdev; 1530 1531 ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); 1532 1533 for (int c = 0; c < rvd->vdev_children; c++) { 1534 vdev_t *tvd = rvd->vdev_child[c]; 1535 metaslab_group_t *mg = tvd->vdev_mg; 1536 1537 if (tvd->vdev_islog) 1538 metaslab_group_activate(mg); 1539 } 1540 } 1541 1542 int 1543 spa_offline_log(spa_t *spa) 1544 { 1545 int error = 0; 1546 1547 if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, 1548 NULL, DS_FIND_CHILDREN)) == 0) { 1549 1550 /* 1551 * We successfully offlined the log device, sync out the 1552 * current txg so that the "stubby" block can be removed 1553 * by zil_sync(). 
1554 */ 1555 txg_wait_synced(spa->spa_dsl_pool, 0); 1556 } 1557 return (error); 1558 } 1559 1560 static void 1561 spa_aux_check_removed(spa_aux_vdev_t *sav) 1562 { 1563 for (int i = 0; i < sav->sav_count; i++) 1564 spa_check_removed(sav->sav_vdevs[i]); 1565 } 1566 1567 void 1568 spa_claim_notify(zio_t *zio) 1569 { 1570 spa_t *spa = zio->io_spa; 1571 1572 if (zio->io_error) 1573 return; 1574 1575 mutex_enter(&spa->spa_props_lock); /* any mutex will do */ 1576 if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) 1577 spa->spa_claim_max_txg = zio->io_bp->blk_birth; 1578 mutex_exit(&spa->spa_props_lock); 1579 } 1580 1581 typedef struct spa_load_error { 1582 uint64_t sle_meta_count; 1583 uint64_t sle_data_count; 1584 } spa_load_error_t; 1585 1586 static void 1587 spa_load_verify_done(zio_t *zio) 1588 { 1589 blkptr_t *bp = zio->io_bp; 1590 spa_load_error_t *sle = zio->io_private; 1591 dmu_object_type_t type = BP_GET_TYPE(bp); 1592 int error = zio->io_error; 1593 1594 if (error) { 1595 if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && 1596 type != DMU_OT_INTENT_LOG) 1597 atomic_add_64(&sle->sle_meta_count, 1); 1598 else 1599 atomic_add_64(&sle->sle_data_count, 1); 1600 } 1601 zio_data_buf_free(zio->io_data, zio->io_size); 1602 } 1603 1604 /*ARGSUSED*/ 1605 static int 1606 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, 1607 arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) 1608 { 1609 if (bp != NULL) { 1610 zio_t *rio = arg; 1611 size_t size = BP_GET_PSIZE(bp); 1612 void *data = zio_data_buf_alloc(size); 1613 1614 zio_nowait(zio_read(rio, spa, bp, data, size, 1615 spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, 1616 ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | 1617 ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); 1618 } 1619 return (0); 1620 } 1621 1622 static int 1623 spa_load_verify(spa_t *spa) 1624 { 1625 zio_t *rio; 1626 spa_load_error_t sle = { 0 }; 1627 zpool_rewind_policy_t policy; 1628 boolean_t verify_ok = B_FALSE; 1629 int error; 1630 1631 zpool_get_rewind_policy(spa->spa_config, &policy); 1632 1633 if (policy.zrp_request & ZPOOL_NEVER_REWIND) 1634 return (0); 1635 1636 rio = zio_root(spa, NULL, &sle, 1637 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1638 1639 error = traverse_pool(spa, spa->spa_verify_min_txg, 1640 TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); 1641 1642 (void) zio_wait(rio); 1643 1644 spa->spa_load_meta_errors = sle.sle_meta_count; 1645 spa->spa_load_data_errors = sle.sle_data_count; 1646 1647 if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && 1648 sle.sle_data_count <= policy.zrp_maxdata) { 1649 int64_t loss = 0; 1650 1651 verify_ok = B_TRUE; 1652 spa->spa_load_txg = spa->spa_uberblock.ub_txg; 1653 spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; 1654 1655 loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; 1656 VERIFY(nvlist_add_uint64(spa->spa_load_info, 1657 ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); 1658 VERIFY(nvlist_add_int64(spa->spa_load_info, 1659 ZPOOL_CONFIG_REWIND_TIME, loss) == 0); 1660 VERIFY(nvlist_add_uint64(spa->spa_load_info, 1661 ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); 1662 } else { 1663 spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; 1664 } 1665 1666 if (error) { 1667 if (error != ENXIO && error != EIO) 1668 error = EIO; 1669 return (error); 1670 } 1671 1672 return (verify_ok ? 0 : EIO); 1673 } 1674 1675 /* 1676 * Find a value in the pool props object. 
1677 */ 1678 static void 1679 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) 1680 { 1681 (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, 1682 zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); 1683 } 1684 1685 /* 1686 * Find a value in the pool directory object. 1687 */ 1688 static int 1689 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) 1690 { 1691 return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 1692 name, sizeof (uint64_t), 1, val)); 1693 } 1694 1695 static int 1696 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) 1697 { 1698 vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); 1699 return (err); 1700 } 1701 1702 /* 1703 * Fix up config after a partly-completed split. This is done with the 1704 * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off 1705 * pool have that entry in their config, but only the splitting one contains 1706 * a list of all the guids of the vdevs that are being split off. 1707 * 1708 * This function determines what to do with that list: either rejoin 1709 * all the disks to the pool, or complete the splitting process. To attempt 1710 * the rejoin, each disk that is offlined is marked online again, and 1711 * we do a reopen() call. If the vdev label for every disk that was 1712 * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) 1713 * then we call vdev_split() on each disk, and complete the split. 1714 * 1715 * Otherwise we leave the config alone, with all the vdevs in place in 1716 * the original pool. 1717 */ 1718 static void 1719 spa_try_repair(spa_t *spa, nvlist_t *config) 1720 { 1721 uint_t extracted; 1722 uint64_t *glist; 1723 uint_t i, gcount; 1724 nvlist_t *nvl; 1725 vdev_t **vd; 1726 boolean_t attempt_reopen; 1727 1728 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) 1729 return; 1730 1731 /* check that the config is complete */ 1732 if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 1733 &glist, &gcount) != 0) 1734 return; 1735 1736 vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); 1737 1738 /* attempt to online all the vdevs & validate */ 1739 attempt_reopen = B_TRUE; 1740 for (i = 0; i < gcount; i++) { 1741 if (glist[i] == 0) /* vdev is hole */ 1742 continue; 1743 1744 vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); 1745 if (vd[i] == NULL) { 1746 /* 1747 * Don't bother attempting to reopen the disks; 1748 * just do the split. 1749 */ 1750 attempt_reopen = B_FALSE; 1751 } else { 1752 /* attempt to re-online it */ 1753 vd[i]->vdev_offline = B_FALSE; 1754 } 1755 } 1756 1757 if (attempt_reopen) { 1758 vdev_reopen(spa->spa_root_vdev); 1759 1760 /* check each device to see what state it's in */ 1761 for (extracted = 0, i = 0; i < gcount; i++) { 1762 if (vd[i] != NULL && 1763 vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) 1764 break; 1765 ++extracted; 1766 } 1767 } 1768 1769 /* 1770 * If every disk has been moved to the new pool, or if we never 1771 * even attempted to look at them, then we split them off for 1772 * good. 
1773 */ 1774 if (!attempt_reopen || gcount == extracted) { 1775 for (i = 0; i < gcount; i++) 1776 if (vd[i] != NULL) 1777 vdev_split(vd[i]); 1778 vdev_reopen(spa->spa_root_vdev); 1779 } 1780 1781 kmem_free(vd, gcount * sizeof (vdev_t *)); 1782 } 1783 1784 static int 1785 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, 1786 boolean_t mosconfig) 1787 { 1788 nvlist_t *config = spa->spa_config; 1789 char *ereport = FM_EREPORT_ZFS_POOL; 1790 char *comment; 1791 int error; 1792 uint64_t pool_guid; 1793 nvlist_t *nvl; 1794 1795 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) 1796 return (EINVAL); 1797 1798 ASSERT(spa->spa_comment == NULL); 1799 if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) 1800 spa->spa_comment = spa_strdup(comment); 1801 1802 /* 1803 * Versioning wasn't explicitly added to the label until later, so if 1804 * it's not present treat it as the initial version. 1805 */ 1806 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, 1807 &spa->spa_ubsync.ub_version) != 0) 1808 spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; 1809 1810 (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, 1811 &spa->spa_config_txg); 1812 1813 if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && 1814 spa_guid_exists(pool_guid, 0)) { 1815 error = EEXIST; 1816 } else { 1817 spa->spa_config_guid = pool_guid; 1818 1819 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, 1820 &nvl) == 0) { 1821 VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, 1822 KM_SLEEP) == 0); 1823 } 1824 1825 gethrestime(&spa->spa_loaded_ts); 1826 error = spa_load_impl(spa, pool_guid, config, state, type, 1827 mosconfig, &ereport); 1828 } 1829 1830 spa->spa_minref = refcount_count(&spa->spa_refcount); 1831 if (error) { 1832 if (error != EEXIST) { 1833 spa->spa_loaded_ts.tv_sec = 0; 1834 spa->spa_loaded_ts.tv_nsec = 0; 1835 } 1836 if (error != EBADF) { 1837 zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); 1838 } 1839 } 1840 spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; 1841 spa->spa_ena = 0; 1842 1843 return (error); 1844 } 1845 1846 /* 1847 * Load an existing storage pool, using the pool's builtin spa_config as a 1848 * source of configuration information. 1849 */ 1850 static int 1851 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, 1852 spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, 1853 char **ereport) 1854 { 1855 int error = 0; 1856 nvlist_t *nvroot = NULL; 1857 vdev_t *rvd; 1858 uberblock_t *ub = &spa->spa_uberblock; 1859 uint64_t children, config_cache_txg = spa->spa_config_txg; 1860 int orig_mode = spa->spa_mode; 1861 int parse; 1862 uint64_t obj; 1863 1864 /* 1865 * If this is an untrusted config, access the pool in read-only mode. 1866 * This prevents things like resilvering recently removed devices. 1867 */ 1868 if (!mosconfig) 1869 spa->spa_mode = FREAD; 1870 1871 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 1872 1873 spa->spa_load_state = state; 1874 1875 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) 1876 return (EINVAL); 1877 1878 parse = (type == SPA_IMPORT_EXISTING ? 1879 VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); 1880 1881 /* 1882 * Create "The Godfather" zio to hold all async IOs 1883 */ 1884 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 1885 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 1886 1887 /* 1888 * Parse the configuration into a vdev tree. 
We explicitly set the 1889 * value that will be returned by spa_version() since parsing the 1890 * configuration requires knowing the version number. 1891 */ 1892 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1893 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); 1894 spa_config_exit(spa, SCL_ALL, FTAG); 1895 1896 if (error != 0) 1897 return (error); 1898 1899 ASSERT(spa->spa_root_vdev == rvd); 1900 1901 if (type != SPA_IMPORT_ASSEMBLE) { 1902 ASSERT(spa_guid(spa) == pool_guid); 1903 } 1904 1905 /* 1906 * Try to open all vdevs, loading each label in the process. 1907 */ 1908 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1909 error = vdev_open(rvd); 1910 spa_config_exit(spa, SCL_ALL, FTAG); 1911 if (error != 0) 1912 return (error); 1913 1914 /* 1915 * We need to validate the vdev labels against the configuration that 1916 * we have in hand, which is dependent on the setting of mosconfig. If 1917 * mosconfig is true then we're validating the vdev labels based on 1918 * that config. Otherwise, we're validating against the cached config 1919 * (zpool.cache) that was read when we loaded the zfs module, and then 1920 * later we will recursively call spa_load() and validate against 1921 * the vdev config. 1922 * 1923 * If we're assembling a new pool that's been split off from an 1924 * existing pool, the labels haven't yet been updated so we skip 1925 * validation for now. 1926 */ 1927 if (type != SPA_IMPORT_ASSEMBLE) { 1928 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1929 error = vdev_validate(rvd); 1930 spa_config_exit(spa, SCL_ALL, FTAG); 1931 1932 if (error != 0) 1933 return (error); 1934 1935 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 1936 return (ENXIO); 1937 } 1938 1939 /* 1940 * Find the best uberblock. 1941 */ 1942 vdev_uberblock_load(NULL, rvd, ub); 1943 1944 /* 1945 * If we weren't able to find a single valid uberblock, return failure. 1946 */ 1947 if (ub->ub_txg == 0) 1948 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); 1949 1950 /* 1951 * If the pool is newer than the code, we can't open it. 1952 */ 1953 if (ub->ub_version > SPA_VERSION) 1954 return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); 1955 1956 /* 1957 * If the vdev guid sum doesn't match the uberblock, we have an 1958 * incomplete configuration. We first check to see if the pool 1959 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). 1960 * If it is, defer the vdev_guid_sum check till later so we 1961 * can handle missing vdevs. 1962 */ 1963 if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, 1964 &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && 1965 rvd->vdev_guid_sum != ub->ub_guid_sum) 1966 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); 1967 1968 if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { 1969 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 1970 spa_try_repair(spa, config); 1971 spa_config_exit(spa, SCL_ALL, FTAG); 1972 nvlist_free(spa->spa_config_splitting); 1973 spa->spa_config_splitting = NULL; 1974 } 1975 1976 /* 1977 * Initialize internal SPA structures. 1978 */ 1979 spa->spa_state = POOL_STATE_ACTIVE; 1980 spa->spa_ubsync = spa->spa_uberblock; 1981 spa->spa_verify_min_txg = spa->spa_extreme_rewind ? 1982 TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; 1983 spa->spa_first_txg = spa->spa_last_ubsync_txg ? 
1984 spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; 1985 spa->spa_claim_max_txg = spa->spa_first_txg; 1986 spa->spa_prev_software_version = ub->ub_software_version; 1987 1988 error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); 1989 if (error) 1990 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1991 spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; 1992 1993 if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) 1994 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 1995 1996 if (!mosconfig) { 1997 uint64_t hostid; 1998 nvlist_t *policy = NULL, *nvconfig; 1999 2000 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2001 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2002 2003 if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, 2004 ZPOOL_CONFIG_HOSTID, &hostid) == 0) { 2005 char *hostname; 2006 unsigned long myhostid = 0; 2007 2008 VERIFY(nvlist_lookup_string(nvconfig, 2009 ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); 2010 2011 #ifdef _KERNEL 2012 myhostid = zone_get_hostid(NULL); 2013 #else /* _KERNEL */ 2014 /* 2015 * We're emulating the system's hostid in userland, so 2016 * we can't use zone_get_hostid(). 2017 */ 2018 (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); 2019 #endif /* _KERNEL */ 2020 if (hostid != 0 && myhostid != 0 && 2021 hostid != myhostid) { 2022 nvlist_free(nvconfig); 2023 cmn_err(CE_WARN, "pool '%s' could not be " 2024 "loaded as it was last accessed by " 2025 "another system (host: %s hostid: 0x%lx). " 2026 "See: http://www.sun.com/msg/ZFS-8000-EY", 2027 spa_name(spa), hostname, 2028 (unsigned long)hostid); 2029 return (EBADF); 2030 } 2031 } 2032 if (nvlist_lookup_nvlist(spa->spa_config, 2033 ZPOOL_REWIND_POLICY, &policy) == 0) 2034 VERIFY(nvlist_add_nvlist(nvconfig, 2035 ZPOOL_REWIND_POLICY, policy) == 0); 2036 2037 spa_config_set(spa, nvconfig); 2038 spa_unload(spa); 2039 spa_deactivate(spa); 2040 spa_activate(spa, orig_mode); 2041 2042 return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); 2043 } 2044 2045 if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) 2046 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2047 error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); 2048 if (error != 0) 2049 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2050 2051 /* 2052 * Load the bit that tells us to use the new accounting function 2053 * (raid-z deflation). If we have an older pool, this will not 2054 * be present. 2055 */ 2056 error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); 2057 if (error != 0 && error != ENOENT) 2058 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2059 2060 error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, 2061 &spa->spa_creation_version); 2062 if (error != 0 && error != ENOENT) 2063 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2064 2065 /* 2066 * Load the persistent error log. If we have an older pool, this will 2067 * not be present. 2068 */ 2069 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); 2070 if (error != 0 && error != ENOENT) 2071 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2072 2073 error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, 2074 &spa->spa_errlog_scrub); 2075 if (error != 0 && error != ENOENT) 2076 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2077 2078 /* 2079 * Load the history object. If we have an older pool, this 2080 * will not be present. 
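 *
 * This lookup, like the deflate, creation-version and error-log lookups
 * above, tolerates ENOENT because the directory entry may simply predate
 * the pool.  A hypothetical wrapper (not part of this file) that captures
 * the recurring pattern would be:
 *
 *     static int
 *     spa_dir_prop_optional(spa_t *spa, const char *name, uint64_t *valp)
 *     {
 *             int err = spa_dir_prop(spa, name, valp);
 *
 *             return (err == ENOENT ? 0 : err);
 *     }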
2081 */ 2082 error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); 2083 if (error != 0 && error != ENOENT) 2084 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2085 2086 /* 2087 * If we're assembling the pool from the split-off vdevs of 2088 * an existing pool, we don't want to attach the spares & cache 2089 * devices. 2090 */ 2091 2092 /* 2093 * Load any hot spares for this pool. 2094 */ 2095 error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); 2096 if (error != 0 && error != ENOENT) 2097 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2098 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2099 ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); 2100 if (load_nvlist(spa, spa->spa_spares.sav_object, 2101 &spa->spa_spares.sav_config) != 0) 2102 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2103 2104 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2105 spa_load_spares(spa); 2106 spa_config_exit(spa, SCL_ALL, FTAG); 2107 } else if (error == 0) { 2108 spa->spa_spares.sav_sync = B_TRUE; 2109 } 2110 2111 /* 2112 * Load any level 2 ARC devices for this pool. 2113 */ 2114 error = spa_dir_prop(spa, DMU_POOL_L2CACHE, 2115 &spa->spa_l2cache.sav_object); 2116 if (error != 0 && error != ENOENT) 2117 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2118 if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { 2119 ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); 2120 if (load_nvlist(spa, spa->spa_l2cache.sav_object, 2121 &spa->spa_l2cache.sav_config) != 0) 2122 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2123 2124 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2125 spa_load_l2cache(spa); 2126 spa_config_exit(spa, SCL_ALL, FTAG); 2127 } else if (error == 0) { 2128 spa->spa_l2cache.sav_sync = B_TRUE; 2129 } 2130 2131 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 2132 2133 error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); 2134 if (error && error != ENOENT) 2135 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2136 2137 if (error == 0) { 2138 uint64_t autoreplace; 2139 2140 spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); 2141 spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); 2142 spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); 2143 spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); 2144 spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); 2145 spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, 2146 &spa->spa_dedup_ditto); 2147 2148 spa->spa_autoreplace = (autoreplace != 0); 2149 } 2150 2151 /* 2152 * If the 'autoreplace' property is set, then post a resource notifying 2153 * the ZFS DE that it should not issue any faults for unopenable 2154 * devices. We also iterate over the vdevs, and post a sysevent for any 2155 * unopenable vdevs so that the normal autoreplace handler can take 2156 * over. 2157 */ 2158 if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { 2159 spa_check_removed(spa->spa_root_vdev); 2160 /* 2161 * For the import case, this is done in spa_import(), because 2162 * at this point we're using the spare definitions from 2163 * the MOS config, not necessarily from the userland config. 2164 */ 2165 if (state != SPA_LOAD_IMPORT) { 2166 spa_aux_check_removed(&spa->spa_spares); 2167 spa_aux_check_removed(&spa->spa_l2cache); 2168 } 2169 } 2170 2171 /* 2172 * Load the vdev state for all toplevel vdevs. 2173 */ 2174 vdev_load(rvd); 2175 2176 /* 2177 * Propagate the leaf DTLs we just loaded all the way up the tree. 
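 *
 * (A DTL, or dirty time log, is the set of txgs for which a vdev is known
 * to be missing writes; for example, a disk that was offline from txg 100
 * through txg 150 carries DTL_MISSING = [100, 150].  Calling
 * vdev_dtl_reassess() with txg 0 and scrub_txg 0, as below, roughly just
 * recomputes and propagates these ranges bottom-up without crediting any
 * completed scrub or resilver.)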
2178 */ 2179 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2180 vdev_dtl_reassess(rvd, 0, 0, B_FALSE); 2181 spa_config_exit(spa, SCL_ALL, FTAG); 2182 2183 /* 2184 * Load the DDTs (dedup tables). 2185 */ 2186 error = ddt_load(spa); 2187 if (error != 0) 2188 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2189 2190 spa_update_dspace(spa); 2191 2192 /* 2193 * Validate the config, using the MOS config to fill in any 2194 * information which might be missing. If we fail to validate 2195 * the config then declare the pool unfit for use. If we're 2196 * assembling a pool from a split, the log is not transferred 2197 * over. 2198 */ 2199 if (type != SPA_IMPORT_ASSEMBLE) { 2200 nvlist_t *nvconfig; 2201 2202 if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) 2203 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); 2204 2205 if (!spa_config_valid(spa, nvconfig)) { 2206 nvlist_free(nvconfig); 2207 return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, 2208 ENXIO)); 2209 } 2210 nvlist_free(nvconfig); 2211 2212 /* 2213 * Now that we've validate the config, check the state of the 2214 * root vdev. If it can't be opened, it indicates one or 2215 * more toplevel vdevs are faulted. 2216 */ 2217 if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) 2218 return (ENXIO); 2219 2220 if (spa_check_logs(spa)) { 2221 *ereport = FM_EREPORT_ZFS_LOG_REPLAY; 2222 return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); 2223 } 2224 } 2225 2226 /* 2227 * We've successfully opened the pool, verify that we're ready 2228 * to start pushing transactions. 2229 */ 2230 if (state != SPA_LOAD_TRYIMPORT) { 2231 if (error = spa_load_verify(spa)) 2232 return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, 2233 error)); 2234 } 2235 2236 if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || 2237 spa->spa_load_max_txg == UINT64_MAX)) { 2238 dmu_tx_t *tx; 2239 int need_update = B_FALSE; 2240 2241 ASSERT(state != SPA_LOAD_TRYIMPORT); 2242 2243 /* 2244 * Claim log blocks that haven't been committed yet. 2245 * This must all happen in a single txg. 2246 * Note: spa_claim_max_txg is updated by spa_claim_notify(), 2247 * invoked from zil_claim_log_block()'s i/o done callback. 2248 * Price of rollback is that we abandon the log. 2249 */ 2250 spa->spa_claiming = B_TRUE; 2251 2252 tx = dmu_tx_create_assigned(spa_get_dsl(spa), 2253 spa_first_txg(spa)); 2254 (void) dmu_objset_find(spa_name(spa), 2255 zil_claim, tx, DS_FIND_CHILDREN); 2256 dmu_tx_commit(tx); 2257 2258 spa->spa_claiming = B_FALSE; 2259 2260 spa_set_log_state(spa, SPA_LOG_GOOD); 2261 spa->spa_sync_on = B_TRUE; 2262 txg_sync_start(spa->spa_dsl_pool); 2263 2264 /* 2265 * Wait for all claims to sync. We sync up to the highest 2266 * claimed log block birth time so that claimed log blocks 2267 * don't appear to be from the future. spa_claim_max_txg 2268 * will have been set for us by either zil_check_log_chain() 2269 * (invoked from spa_check_logs()) or zil_claim() above. 2270 */ 2271 txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); 2272 2273 /* 2274 * If the config cache is stale, or we have uninitialized 2275 * metaslabs (see spa_vdev_add()), then update the config. 2276 * 2277 * If this is a verbatim import, trust the current 2278 * in-core spa_config and update the disk labels. 
2279 */ 2280 if (config_cache_txg != spa->spa_config_txg || 2281 state == SPA_LOAD_IMPORT || 2282 state == SPA_LOAD_RECOVER || 2283 (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) 2284 need_update = B_TRUE; 2285 2286 for (int c = 0; c < rvd->vdev_children; c++) 2287 if (rvd->vdev_child[c]->vdev_ms_array == 0) 2288 need_update = B_TRUE; 2289 2290 /* 2291 * Update the config cache asychronously in case we're the 2292 * root pool, in which case the config cache isn't writable yet. 2293 */ 2294 if (need_update) 2295 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); 2296 2297 /* 2298 * Check all DTLs to see if anything needs resilvering. 2299 */ 2300 if (!dsl_scan_resilvering(spa->spa_dsl_pool) && 2301 vdev_resilver_needed(rvd, NULL, NULL)) 2302 spa_async_request(spa, SPA_ASYNC_RESILVER); 2303 2304 /* 2305 * Delete any inconsistent datasets. 2306 */ 2307 (void) dmu_objset_find(spa_name(spa), 2308 dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); 2309 2310 /* 2311 * Clean up any stale temporary dataset userrefs. 2312 */ 2313 dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); 2314 } 2315 2316 return (0); 2317 } 2318 2319 static int 2320 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) 2321 { 2322 int mode = spa->spa_mode; 2323 2324 spa_unload(spa); 2325 spa_deactivate(spa); 2326 2327 spa->spa_load_max_txg--; 2328 2329 spa_activate(spa, mode); 2330 spa_async_suspend(spa); 2331 2332 return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); 2333 } 2334 2335 static int 2336 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, 2337 uint64_t max_request, int rewind_flags) 2338 { 2339 nvlist_t *config = NULL; 2340 int load_error, rewind_error; 2341 uint64_t safe_rewind_txg; 2342 uint64_t min_txg; 2343 2344 if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { 2345 spa->spa_load_max_txg = spa->spa_load_txg; 2346 spa_set_log_state(spa, SPA_LOG_CLEAR); 2347 } else { 2348 spa->spa_load_max_txg = max_request; 2349 } 2350 2351 load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, 2352 mosconfig); 2353 if (load_error == 0) 2354 return (0); 2355 2356 if (spa->spa_root_vdev != NULL) 2357 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2358 2359 spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; 2360 spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; 2361 2362 if (rewind_flags & ZPOOL_NEVER_REWIND) { 2363 nvlist_free(config); 2364 return (load_error); 2365 } 2366 2367 /* Price of rolling back is discarding txgs, including log */ 2368 if (state == SPA_LOAD_RECOVER) 2369 spa_set_log_state(spa, SPA_LOG_CLEAR); 2370 2371 spa->spa_load_max_txg = spa->spa_last_ubsync_txg; 2372 safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; 2373 min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? 2374 TXG_INITIAL : safe_rewind_txg; 2375 2376 /* 2377 * Continue as long as we're finding errors, we're still within 2378 * the acceptable rewind range, and we're still finding uberblocks 2379 */ 2380 while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && 2381 spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { 2382 if (spa->spa_load_max_txg < safe_rewind_txg) 2383 spa->spa_extreme_rewind = B_TRUE; 2384 rewind_error = spa_load_retry(spa, state, mosconfig); 2385 } 2386 2387 spa->spa_extreme_rewind = B_FALSE; 2388 spa->spa_load_max_txg = UINT64_MAX; 2389 2390 if (config && (rewind_error || state != SPA_LOAD_RECOVER)) 2391 spa_config_set(spa, config); 2392 2393 return (state == SPA_LOAD_RECOVER ? 
rewind_error : load_error); 2394 } 2395 2396 /* 2397 * Pool Open/Import 2398 * 2399 * The import case is identical to an open except that the configuration is sent 2400 * down from userland, instead of grabbed from the configuration cache. For the 2401 * case of an open, the pool configuration will exist in the 2402 * POOL_STATE_UNINITIALIZED state. 2403 * 2404 * The stats information (gen/count/ustats) is used to gather vdev statistics at 2405 * the same time open the pool, without having to keep around the spa_t in some 2406 * ambiguous state. 2407 */ 2408 static int 2409 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, 2410 nvlist_t **config) 2411 { 2412 spa_t *spa; 2413 spa_load_state_t state = SPA_LOAD_OPEN; 2414 int error; 2415 int locked = B_FALSE; 2416 2417 *spapp = NULL; 2418 2419 /* 2420 * As disgusting as this is, we need to support recursive calls to this 2421 * function because dsl_dir_open() is called during spa_load(), and ends 2422 * up calling spa_open() again. The real fix is to figure out how to 2423 * avoid dsl_dir_open() calling this in the first place. 2424 */ 2425 if (mutex_owner(&spa_namespace_lock) != curthread) { 2426 mutex_enter(&spa_namespace_lock); 2427 locked = B_TRUE; 2428 } 2429 2430 if ((spa = spa_lookup(pool)) == NULL) { 2431 if (locked) 2432 mutex_exit(&spa_namespace_lock); 2433 return (ENOENT); 2434 } 2435 2436 if (spa->spa_state == POOL_STATE_UNINITIALIZED) { 2437 zpool_rewind_policy_t policy; 2438 2439 zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, 2440 &policy); 2441 if (policy.zrp_request & ZPOOL_DO_REWIND) 2442 state = SPA_LOAD_RECOVER; 2443 2444 spa_activate(spa, spa_mode_global); 2445 2446 if (state != SPA_LOAD_RECOVER) 2447 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 2448 2449 error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, 2450 policy.zrp_request); 2451 2452 if (error == EBADF) { 2453 /* 2454 * If vdev_validate() returns failure (indicated by 2455 * EBADF), it indicates that one of the vdevs indicates 2456 * that the pool has been exported or destroyed. If 2457 * this is the case, the config cache is out of sync and 2458 * we should remove the pool from the namespace. 2459 */ 2460 spa_unload(spa); 2461 spa_deactivate(spa); 2462 spa_config_sync(spa, B_TRUE, B_TRUE); 2463 spa_remove(spa); 2464 if (locked) 2465 mutex_exit(&spa_namespace_lock); 2466 return (ENOENT); 2467 } 2468 2469 if (error) { 2470 /* 2471 * We can't open the pool, but we still have useful 2472 * information: the state of each vdev after the 2473 * attempted vdev_open(). Return this to the user. 2474 */ 2475 if (config != NULL && spa->spa_config) { 2476 VERIFY(nvlist_dup(spa->spa_config, config, 2477 KM_SLEEP) == 0); 2478 VERIFY(nvlist_add_nvlist(*config, 2479 ZPOOL_CONFIG_LOAD_INFO, 2480 spa->spa_load_info) == 0); 2481 } 2482 spa_unload(spa); 2483 spa_deactivate(spa); 2484 spa->spa_last_open_failed = error; 2485 if (locked) 2486 mutex_exit(&spa_namespace_lock); 2487 *spapp = NULL; 2488 return (error); 2489 } 2490 } 2491 2492 spa_open_ref(spa, tag); 2493 2494 if (config != NULL) 2495 *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 2496 2497 /* 2498 * If we've recovered the pool, pass back any information we 2499 * gathered while doing the load. 
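 *
 * For example, a caller that supplied a non-NULL 'config' can later pick
 * the load details back out with something like this (sketch only; the
 * local name is illustrative):
 *
 *     nvlist_t *info;
 *
 *     if (nvlist_lookup_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
 *         &info) == 0) {
 *             ... rewind and missing-device details live in 'info' ...
 *     }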
2500 */ 2501 if (state == SPA_LOAD_RECOVER) { 2502 VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, 2503 spa->spa_load_info) == 0); 2504 } 2505 2506 if (locked) { 2507 spa->spa_last_open_failed = 0; 2508 spa->spa_last_ubsync_txg = 0; 2509 spa->spa_load_txg = 0; 2510 mutex_exit(&spa_namespace_lock); 2511 } 2512 2513 *spapp = spa; 2514 2515 return (0); 2516 } 2517 2518 int 2519 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, 2520 nvlist_t **config) 2521 { 2522 return (spa_open_common(name, spapp, tag, policy, config)); 2523 } 2524 2525 int 2526 spa_open(const char *name, spa_t **spapp, void *tag) 2527 { 2528 return (spa_open_common(name, spapp, tag, NULL, NULL)); 2529 } 2530 2531 /* 2532 * Lookup the given spa_t, incrementing the inject count in the process, 2533 * preventing it from being exported or destroyed. 2534 */ 2535 spa_t * 2536 spa_inject_addref(char *name) 2537 { 2538 spa_t *spa; 2539 2540 mutex_enter(&spa_namespace_lock); 2541 if ((spa = spa_lookup(name)) == NULL) { 2542 mutex_exit(&spa_namespace_lock); 2543 return (NULL); 2544 } 2545 spa->spa_inject_ref++; 2546 mutex_exit(&spa_namespace_lock); 2547 2548 return (spa); 2549 } 2550 2551 void 2552 spa_inject_delref(spa_t *spa) 2553 { 2554 mutex_enter(&spa_namespace_lock); 2555 spa->spa_inject_ref--; 2556 mutex_exit(&spa_namespace_lock); 2557 } 2558 2559 /* 2560 * Add spares device information to the nvlist. 2561 */ 2562 static void 2563 spa_add_spares(spa_t *spa, nvlist_t *config) 2564 { 2565 nvlist_t **spares; 2566 uint_t i, nspares; 2567 nvlist_t *nvroot; 2568 uint64_t guid; 2569 vdev_stat_t *vs; 2570 uint_t vsc; 2571 uint64_t pool; 2572 2573 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2574 2575 if (spa->spa_spares.sav_count == 0) 2576 return; 2577 2578 VERIFY(nvlist_lookup_nvlist(config, 2579 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2580 VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 2581 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2582 if (nspares != 0) { 2583 VERIFY(nvlist_add_nvlist_array(nvroot, 2584 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 2585 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2586 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); 2587 2588 /* 2589 * Go through and find any spares which have since been 2590 * repurposed as an active spare. If this is the case, update 2591 * their status appropriately. 2592 */ 2593 for (i = 0; i < nspares; i++) { 2594 VERIFY(nvlist_lookup_uint64(spares[i], 2595 ZPOOL_CONFIG_GUID, &guid) == 0); 2596 if (spa_spare_exists(guid, &pool, NULL) && 2597 pool != 0ULL) { 2598 VERIFY(nvlist_lookup_uint64_array( 2599 spares[i], ZPOOL_CONFIG_VDEV_STATS, 2600 (uint64_t **)&vs, &vsc) == 0); 2601 vs->vs_state = VDEV_STATE_CANT_OPEN; 2602 vs->vs_aux = VDEV_AUX_SPARED; 2603 } 2604 } 2605 } 2606 } 2607 2608 /* 2609 * Add l2cache device information to the nvlist, including vdev stats. 
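 *
 * As with the spares above, a consumer of the resulting config finds these
 * devices hanging off the vdev tree; a rough sketch of the lookup, using
 * the same 'config' argument:
 *
 *     nvlist_t *nvroot;
 *     nvlist_t **l2cache;
 *     uint_t nl2cache;
 *
 *     VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 *         &nvroot) == 0);
 *     if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 *         &l2cache, &nl2cache) == 0) {
 *             ... each entry carries a GUID and freshly updated stats ...
 *     }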
2610 */ 2611 static void 2612 spa_add_l2cache(spa_t *spa, nvlist_t *config) 2613 { 2614 nvlist_t **l2cache; 2615 uint_t i, j, nl2cache; 2616 nvlist_t *nvroot; 2617 uint64_t guid; 2618 vdev_t *vd; 2619 vdev_stat_t *vs; 2620 uint_t vsc; 2621 2622 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); 2623 2624 if (spa->spa_l2cache.sav_count == 0) 2625 return; 2626 2627 VERIFY(nvlist_lookup_nvlist(config, 2628 ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); 2629 VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 2630 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2631 if (nl2cache != 0) { 2632 VERIFY(nvlist_add_nvlist_array(nvroot, 2633 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 2634 VERIFY(nvlist_lookup_nvlist_array(nvroot, 2635 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); 2636 2637 /* 2638 * Update level 2 cache device stats. 2639 */ 2640 2641 for (i = 0; i < nl2cache; i++) { 2642 VERIFY(nvlist_lookup_uint64(l2cache[i], 2643 ZPOOL_CONFIG_GUID, &guid) == 0); 2644 2645 vd = NULL; 2646 for (j = 0; j < spa->spa_l2cache.sav_count; j++) { 2647 if (guid == 2648 spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { 2649 vd = spa->spa_l2cache.sav_vdevs[j]; 2650 break; 2651 } 2652 } 2653 ASSERT(vd != NULL); 2654 2655 VERIFY(nvlist_lookup_uint64_array(l2cache[i], 2656 ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) 2657 == 0); 2658 vdev_get_stats(vd, vs); 2659 } 2660 } 2661 } 2662 2663 int 2664 spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) 2665 { 2666 int error; 2667 spa_t *spa; 2668 2669 *config = NULL; 2670 error = spa_open_common(name, &spa, FTAG, NULL, config); 2671 2672 if (spa != NULL) { 2673 /* 2674 * This still leaves a window of inconsistency where the spares 2675 * or l2cache devices could change and the config would be 2676 * self-inconsistent. 2677 */ 2678 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 2679 2680 if (*config != NULL) { 2681 uint64_t loadtimes[2]; 2682 2683 loadtimes[0] = spa->spa_loaded_ts.tv_sec; 2684 loadtimes[1] = spa->spa_loaded_ts.tv_nsec; 2685 VERIFY(nvlist_add_uint64_array(*config, 2686 ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); 2687 2688 VERIFY(nvlist_add_uint64(*config, 2689 ZPOOL_CONFIG_ERRCOUNT, 2690 spa_get_errlog_size(spa)) == 0); 2691 2692 if (spa_suspended(spa)) 2693 VERIFY(nvlist_add_uint64(*config, 2694 ZPOOL_CONFIG_SUSPENDED, 2695 spa->spa_failmode) == 0); 2696 2697 spa_add_spares(spa, *config); 2698 spa_add_l2cache(spa, *config); 2699 } 2700 } 2701 2702 /* 2703 * We want to get the alternate root even for faulted pools, so we cheat 2704 * and call spa_lookup() directly. 2705 */ 2706 if (altroot) { 2707 if (spa == NULL) { 2708 mutex_enter(&spa_namespace_lock); 2709 spa = spa_lookup(name); 2710 if (spa) 2711 spa_altroot(spa, altroot, buflen); 2712 else 2713 altroot[0] = '\0'; 2714 spa = NULL; 2715 mutex_exit(&spa_namespace_lock); 2716 } else { 2717 spa_altroot(spa, altroot, buflen); 2718 } 2719 } 2720 2721 if (spa != NULL) { 2722 spa_config_exit(spa, SCL_CONFIG, FTAG); 2723 spa_close(spa, FTAG); 2724 } 2725 2726 return (error); 2727 } 2728 2729 /* 2730 * Validate that the auxiliary device array is well formed. We must have an 2731 * array of nvlists, each which describes a valid leaf vdev. If this is an 2732 * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be 2733 * specified, as long as they are well-formed. 
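 *
 * A well-formed entry is just a leaf vdev nvlist.  One might be built with
 * libnvpair roughly as follows (assuming 'nvroot' is the enclosing vdev
 * tree nvlist; the device path is purely illustrative):
 *
 *     nvlist_t *spare;
 *
 *     VERIFY(nvlist_alloc(&spare, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *     VERIFY(nvlist_add_string(spare, ZPOOL_CONFIG_TYPE,
 *         VDEV_TYPE_DISK) == 0);
 *     VERIFY(nvlist_add_string(spare, ZPOOL_CONFIG_PATH,
 *         "/dev/dsk/c1t2d0s0") == 0);
 *     VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 *         &spare, 1) == 0);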
2734 */ 2735 static int 2736 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, 2737 spa_aux_vdev_t *sav, const char *config, uint64_t version, 2738 vdev_labeltype_t label) 2739 { 2740 nvlist_t **dev; 2741 uint_t i, ndev; 2742 vdev_t *vd; 2743 int error; 2744 2745 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2746 2747 /* 2748 * It's acceptable to have no devs specified. 2749 */ 2750 if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) 2751 return (0); 2752 2753 if (ndev == 0) 2754 return (EINVAL); 2755 2756 /* 2757 * Make sure the pool is formatted with a version that supports this 2758 * device type. 2759 */ 2760 if (spa_version(spa) < version) 2761 return (ENOTSUP); 2762 2763 /* 2764 * Set the pending device list so we correctly handle device in-use 2765 * checking. 2766 */ 2767 sav->sav_pending = dev; 2768 sav->sav_npending = ndev; 2769 2770 for (i = 0; i < ndev; i++) { 2771 if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, 2772 mode)) != 0) 2773 goto out; 2774 2775 if (!vd->vdev_ops->vdev_op_leaf) { 2776 vdev_free(vd); 2777 error = EINVAL; 2778 goto out; 2779 } 2780 2781 /* 2782 * The L2ARC currently only supports disk devices in 2783 * kernel context. For user-level testing, we allow it. 2784 */ 2785 #ifdef _KERNEL 2786 if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && 2787 strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { 2788 error = ENOTBLK; 2789 goto out; 2790 } 2791 #endif 2792 vd->vdev_top = vd; 2793 2794 if ((error = vdev_open(vd)) == 0 && 2795 (error = vdev_label_init(vd, crtxg, label)) == 0) { 2796 VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, 2797 vd->vdev_guid) == 0); 2798 } 2799 2800 vdev_free(vd); 2801 2802 if (error && 2803 (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) 2804 goto out; 2805 else 2806 error = 0; 2807 } 2808 2809 out: 2810 sav->sav_pending = NULL; 2811 sav->sav_npending = 0; 2812 return (error); 2813 } 2814 2815 static int 2816 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) 2817 { 2818 int error; 2819 2820 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 2821 2822 if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2823 &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, 2824 VDEV_LABEL_SPARE)) != 0) { 2825 return (error); 2826 } 2827 2828 return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, 2829 &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, 2830 VDEV_LABEL_L2CACHE)); 2831 } 2832 2833 static void 2834 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, 2835 const char *config) 2836 { 2837 int i; 2838 2839 if (sav->sav_config != NULL) { 2840 nvlist_t **olddevs; 2841 uint_t oldndevs; 2842 nvlist_t **newdevs; 2843 2844 /* 2845 * Generate new dev list by concatentating with the 2846 * current dev list. 
2847 */ 2848 VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, 2849 &olddevs, &oldndevs) == 0); 2850 2851 newdevs = kmem_alloc(sizeof (void *) * 2852 (ndevs + oldndevs), KM_SLEEP); 2853 for (i = 0; i < oldndevs; i++) 2854 VERIFY(nvlist_dup(olddevs[i], &newdevs[i], 2855 KM_SLEEP) == 0); 2856 for (i = 0; i < ndevs; i++) 2857 VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], 2858 KM_SLEEP) == 0); 2859 2860 VERIFY(nvlist_remove(sav->sav_config, config, 2861 DATA_TYPE_NVLIST_ARRAY) == 0); 2862 2863 VERIFY(nvlist_add_nvlist_array(sav->sav_config, 2864 config, newdevs, ndevs + oldndevs) == 0); 2865 for (i = 0; i < oldndevs + ndevs; i++) 2866 nvlist_free(newdevs[i]); 2867 kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); 2868 } else { 2869 /* 2870 * Generate a new dev list. 2871 */ 2872 VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, 2873 KM_SLEEP) == 0); 2874 VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, 2875 devs, ndevs) == 0); 2876 } 2877 } 2878 2879 /* 2880 * Stop and drop level 2 ARC devices 2881 */ 2882 void 2883 spa_l2cache_drop(spa_t *spa) 2884 { 2885 vdev_t *vd; 2886 int i; 2887 spa_aux_vdev_t *sav = &spa->spa_l2cache; 2888 2889 for (i = 0; i < sav->sav_count; i++) { 2890 uint64_t pool; 2891 2892 vd = sav->sav_vdevs[i]; 2893 ASSERT(vd != NULL); 2894 2895 if (spa_l2cache_exists(vd->vdev_guid, &pool) && 2896 pool != 0ULL && l2arc_vdev_present(vd)) 2897 l2arc_remove_vdev(vd); 2898 if (vd->vdev_isl2cache) 2899 spa_l2cache_remove(vd); 2900 vdev_clear_stats(vd); 2901 (void) vdev_close(vd); 2902 } 2903 } 2904 2905 /* 2906 * Pool Creation 2907 */ 2908 int 2909 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, 2910 const char *history_str, nvlist_t *zplprops) 2911 { 2912 spa_t *spa; 2913 char *altroot = NULL; 2914 vdev_t *rvd; 2915 dsl_pool_t *dp; 2916 dmu_tx_t *tx; 2917 int error = 0; 2918 uint64_t txg = TXG_INITIAL; 2919 nvlist_t **spares, **l2cache; 2920 uint_t nspares, nl2cache; 2921 uint64_t version, obj; 2922 2923 /* 2924 * If this pool already exists, return failure. 2925 */ 2926 mutex_enter(&spa_namespace_lock); 2927 if (spa_lookup(pool) != NULL) { 2928 mutex_exit(&spa_namespace_lock); 2929 return (EEXIST); 2930 } 2931 2932 /* 2933 * Allocate a new spa_t structure. 2934 */ 2935 (void) nvlist_lookup_string(props, 2936 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 2937 spa = spa_add(pool, NULL, altroot); 2938 spa_activate(spa, spa_mode_global); 2939 2940 if (props && (error = spa_prop_validate(spa, props))) { 2941 spa_deactivate(spa); 2942 spa_remove(spa); 2943 mutex_exit(&spa_namespace_lock); 2944 return (error); 2945 } 2946 2947 if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), 2948 &version) != 0) 2949 version = SPA_VERSION; 2950 ASSERT(version <= SPA_VERSION); 2951 2952 spa->spa_first_txg = txg; 2953 spa->spa_uberblock.ub_txg = txg - 1; 2954 spa->spa_uberblock.ub_version = version; 2955 spa->spa_ubsync = spa->spa_uberblock; 2956 2957 /* 2958 * Create "The Godfather" zio to hold all async IOs 2959 */ 2960 spa->spa_async_zio_root = zio_root(spa, NULL, NULL, 2961 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); 2962 2963 /* 2964 * Create the root vdev. 
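 *
 * The nvroot being parsed is a root-type nvlist whose ZPOOL_CONFIG_CHILDREN
 * array names the top-level vdevs.  A minimal single-disk sketch follows
 * (the path is illustrative; userland normally assembles this for us):
 *
 *     nvlist_t *disk, *root;
 *
 *     VERIFY(nvlist_alloc(&disk, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *     VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_TYPE,
 *         VDEV_TYPE_DISK) == 0);
 *     VERIFY(nvlist_add_string(disk, ZPOOL_CONFIG_PATH,
 *         "/dev/dsk/c0t0d0s0") == 0);
 *     VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 *     VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE,
 *         VDEV_TYPE_ROOT) == 0);
 *     VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
 *         &disk, 1) == 0);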
2965 */ 2966 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 2967 2968 error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); 2969 2970 ASSERT(error != 0 || rvd != NULL); 2971 ASSERT(error != 0 || spa->spa_root_vdev == rvd); 2972 2973 if (error == 0 && !zfs_allocatable_devs(nvroot)) 2974 error = EINVAL; 2975 2976 if (error == 0 && 2977 (error = vdev_create(rvd, txg, B_FALSE)) == 0 && 2978 (error = spa_validate_aux(spa, nvroot, txg, 2979 VDEV_ALLOC_ADD)) == 0) { 2980 for (int c = 0; c < rvd->vdev_children; c++) { 2981 vdev_metaslab_set_size(rvd->vdev_child[c]); 2982 vdev_expand(rvd->vdev_child[c], txg); 2983 } 2984 } 2985 2986 spa_config_exit(spa, SCL_ALL, FTAG); 2987 2988 if (error != 0) { 2989 spa_unload(spa); 2990 spa_deactivate(spa); 2991 spa_remove(spa); 2992 mutex_exit(&spa_namespace_lock); 2993 return (error); 2994 } 2995 2996 /* 2997 * Get the list of spares, if specified. 2998 */ 2999 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3000 &spares, &nspares) == 0) { 3001 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, 3002 KM_SLEEP) == 0); 3003 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3004 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3005 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3006 spa_load_spares(spa); 3007 spa_config_exit(spa, SCL_ALL, FTAG); 3008 spa->spa_spares.sav_sync = B_TRUE; 3009 } 3010 3011 /* 3012 * Get the list of level 2 cache devices, if specified. 3013 */ 3014 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3015 &l2cache, &nl2cache) == 0) { 3016 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3017 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3018 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3019 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3020 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3021 spa_load_l2cache(spa); 3022 spa_config_exit(spa, SCL_ALL, FTAG); 3023 spa->spa_l2cache.sav_sync = B_TRUE; 3024 } 3025 3026 spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); 3027 spa->spa_meta_objset = dp->dp_meta_objset; 3028 3029 /* 3030 * Create DDTs (dedup tables). 3031 */ 3032 ddt_create(spa); 3033 3034 spa_update_dspace(spa); 3035 3036 tx = dmu_tx_create_assigned(dp, txg); 3037 3038 /* 3039 * Create the pool config object. 3040 */ 3041 spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, 3042 DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, 3043 DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); 3044 3045 if (zap_add(spa->spa_meta_objset, 3046 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, 3047 sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { 3048 cmn_err(CE_PANIC, "failed to add pool config"); 3049 } 3050 3051 if (zap_add(spa->spa_meta_objset, 3052 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, 3053 sizeof (uint64_t), 1, &version, tx) != 0) { 3054 cmn_err(CE_PANIC, "failed to add pool version"); 3055 } 3056 3057 /* Newly created pools with the right version are always deflated. */ 3058 if (version >= SPA_VERSION_RAIDZ_DEFLATE) { 3059 spa->spa_deflate = TRUE; 3060 if (zap_add(spa->spa_meta_objset, 3061 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 3062 sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { 3063 cmn_err(CE_PANIC, "failed to add deflate"); 3064 } 3065 } 3066 3067 /* 3068 * Create the deferred-free bpobj. Turn off compression 3069 * because sync-to-convergence takes longer if the blocksize 3070 * keeps changing. 
3071 */ 3072 obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); 3073 dmu_object_set_compress(spa->spa_meta_objset, obj, 3074 ZIO_COMPRESS_OFF, tx); 3075 if (zap_add(spa->spa_meta_objset, 3076 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, 3077 sizeof (uint64_t), 1, &obj, tx) != 0) { 3078 cmn_err(CE_PANIC, "failed to add bpobj"); 3079 } 3080 VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, 3081 spa->spa_meta_objset, obj)); 3082 3083 /* 3084 * Create the pool's history object. 3085 */ 3086 if (version >= SPA_VERSION_ZPOOL_HISTORY) 3087 spa_history_create_obj(spa, tx); 3088 3089 /* 3090 * Set pool properties. 3091 */ 3092 spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); 3093 spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); 3094 spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); 3095 spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); 3096 3097 if (props != NULL) { 3098 spa_configfile_set(spa, props, B_FALSE); 3099 spa_sync_props(spa, props, tx); 3100 } 3101 3102 dmu_tx_commit(tx); 3103 3104 spa->spa_sync_on = B_TRUE; 3105 txg_sync_start(spa->spa_dsl_pool); 3106 3107 /* 3108 * We explicitly wait for the first transaction to complete so that our 3109 * bean counters are appropriately updated. 3110 */ 3111 txg_wait_synced(spa->spa_dsl_pool, txg); 3112 3113 spa_config_sync(spa, B_FALSE, B_TRUE); 3114 3115 if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) 3116 (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); 3117 spa_history_log_version(spa, LOG_POOL_CREATE); 3118 3119 spa->spa_minref = refcount_count(&spa->spa_refcount); 3120 3121 mutex_exit(&spa_namespace_lock); 3122 3123 return (0); 3124 } 3125 3126 #ifdef _KERNEL 3127 /* 3128 * Get the root pool information from the root disk, then import the root pool 3129 * during the system boot up time. 3130 */ 3131 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); 3132 3133 static nvlist_t * 3134 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) 3135 { 3136 nvlist_t *config; 3137 nvlist_t *nvtop, *nvroot; 3138 uint64_t pgid; 3139 3140 if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) 3141 return (NULL); 3142 3143 /* 3144 * Add this top-level vdev to the child array. 3145 */ 3146 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3147 &nvtop) == 0); 3148 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, 3149 &pgid) == 0); 3150 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); 3151 3152 /* 3153 * Put this pool's top-level vdevs into a root vdev. 3154 */ 3155 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 3156 VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, 3157 VDEV_TYPE_ROOT) == 0); 3158 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); 3159 VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); 3160 VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, 3161 &nvtop, 1) == 0); 3162 3163 /* 3164 * Replace the existing vdev_tree with the new root vdev in 3165 * this pool's configuration (remove the old, add the new). 3166 */ 3167 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); 3168 nvlist_free(nvroot); 3169 return (config); 3170 } 3171 3172 /* 3173 * Walk the vdev tree and see if we can find a device with "better" 3174 * configuration. A configuration is "better" if the label on that 3175 * device has a more recent txg. 
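 *
 * For example, if one half of a root mirror was detached at txg 1200 while
 * the other half kept running to txg 1300, both labels name the pool, but
 * only the txg-1300 device reflects the current configuration, so it is
 * the better device to boot from.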
3176 */ 3177 static void 3178 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) 3179 { 3180 for (int c = 0; c < vd->vdev_children; c++) 3181 spa_alt_rootvdev(vd->vdev_child[c], avd, txg); 3182 3183 if (vd->vdev_ops->vdev_op_leaf) { 3184 nvlist_t *label; 3185 uint64_t label_txg; 3186 3187 if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, 3188 &label) != 0) 3189 return; 3190 3191 VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, 3192 &label_txg) == 0); 3193 3194 /* 3195 * Do we have a better boot device? 3196 */ 3197 if (label_txg > *txg) { 3198 *txg = label_txg; 3199 *avd = vd; 3200 } 3201 nvlist_free(label); 3202 } 3203 } 3204 3205 /* 3206 * Import a root pool. 3207 * 3208 * For x86. devpath_list will consist of devid and/or physpath name of 3209 * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). 3210 * The GRUB "findroot" command will return the vdev we should boot. 3211 * 3212 * For Sparc, devpath_list consists the physpath name of the booting device 3213 * no matter the rootpool is a single device pool or a mirrored pool. 3214 * e.g. 3215 * "/pci@1f,0/ide@d/disk@0,0:a" 3216 */ 3217 int 3218 spa_import_rootpool(char *devpath, char *devid) 3219 { 3220 spa_t *spa; 3221 vdev_t *rvd, *bvd, *avd = NULL; 3222 nvlist_t *config, *nvtop; 3223 uint64_t guid, txg; 3224 char *pname; 3225 int error; 3226 3227 /* 3228 * Read the label from the boot device and generate a configuration. 3229 */ 3230 config = spa_generate_rootconf(devpath, devid, &guid); 3231 #if defined(_OBP) && defined(_KERNEL) 3232 if (config == NULL) { 3233 if (strstr(devpath, "/iscsi/ssd") != NULL) { 3234 /* iscsi boot */ 3235 get_iscsi_bootpath_phy(devpath); 3236 config = spa_generate_rootconf(devpath, devid, &guid); 3237 } 3238 } 3239 #endif 3240 if (config == NULL) { 3241 cmn_err(CE_NOTE, "Can not read the pool label from '%s'", 3242 devpath); 3243 return (EIO); 3244 } 3245 3246 VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, 3247 &pname) == 0); 3248 VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); 3249 3250 mutex_enter(&spa_namespace_lock); 3251 if ((spa = spa_lookup(pname)) != NULL) { 3252 /* 3253 * Remove the existing root pool from the namespace so that we 3254 * can replace it with the correct config we just read in. 3255 */ 3256 spa_remove(spa); 3257 } 3258 3259 spa = spa_add(pname, config, NULL); 3260 spa->spa_is_root = B_TRUE; 3261 spa->spa_import_flags = ZFS_IMPORT_VERBATIM; 3262 3263 /* 3264 * Build up a vdev tree based on the boot device's label config. 3265 */ 3266 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3267 &nvtop) == 0); 3268 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3269 error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, 3270 VDEV_ALLOC_ROOTPOOL); 3271 spa_config_exit(spa, SCL_ALL, FTAG); 3272 if (error) { 3273 mutex_exit(&spa_namespace_lock); 3274 nvlist_free(config); 3275 cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", 3276 pname); 3277 return (error); 3278 } 3279 3280 /* 3281 * Get the boot vdev. 3282 */ 3283 if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { 3284 cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", 3285 (u_longlong_t)guid); 3286 error = ENOENT; 3287 goto out; 3288 } 3289 3290 /* 3291 * Determine if there is a better boot device. 3292 */ 3293 avd = bvd; 3294 spa_alt_rootvdev(rvd, &avd, &txg); 3295 if (avd != bvd) { 3296 cmn_err(CE_NOTE, "The boot device is 'degraded'. 
Please " 3297 "try booting from '%s'", avd->vdev_path); 3298 error = EINVAL; 3299 goto out; 3300 } 3301 3302 /* 3303 * If the boot device is part of a spare vdev then ensure that 3304 * we're booting off the active spare. 3305 */ 3306 if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && 3307 !bvd->vdev_isspare) { 3308 cmn_err(CE_NOTE, "The boot device is currently spared. Please " 3309 "try booting from '%s'", 3310 bvd->vdev_parent-> 3311 vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); 3312 error = EINVAL; 3313 goto out; 3314 } 3315 3316 error = 0; 3317 spa_history_log_version(spa, LOG_POOL_IMPORT); 3318 out: 3319 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3320 vdev_free(rvd); 3321 spa_config_exit(spa, SCL_ALL, FTAG); 3322 mutex_exit(&spa_namespace_lock); 3323 3324 nvlist_free(config); 3325 return (error); 3326 } 3327 3328 #endif 3329 3330 /* 3331 * Import a non-root pool into the system. 3332 */ 3333 int 3334 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) 3335 { 3336 spa_t *spa; 3337 char *altroot = NULL; 3338 spa_load_state_t state = SPA_LOAD_IMPORT; 3339 zpool_rewind_policy_t policy; 3340 uint64_t mode = spa_mode_global; 3341 uint64_t readonly = B_FALSE; 3342 int error; 3343 nvlist_t *nvroot; 3344 nvlist_t **spares, **l2cache; 3345 uint_t nspares, nl2cache; 3346 3347 /* 3348 * If a pool with this name exists, return failure. 3349 */ 3350 mutex_enter(&spa_namespace_lock); 3351 if (spa_lookup(pool) != NULL) { 3352 mutex_exit(&spa_namespace_lock); 3353 return (EEXIST); 3354 } 3355 3356 /* 3357 * Create and initialize the spa structure. 3358 */ 3359 (void) nvlist_lookup_string(props, 3360 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 3361 (void) nvlist_lookup_uint64(props, 3362 zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); 3363 if (readonly) 3364 mode = FREAD; 3365 spa = spa_add(pool, config, altroot); 3366 spa->spa_import_flags = flags; 3367 3368 /* 3369 * Verbatim import - Take a pool and insert it into the namespace 3370 * as if it had been loaded at boot. 3371 */ 3372 if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { 3373 if (props != NULL) 3374 spa_configfile_set(spa, props, B_FALSE); 3375 3376 spa_config_sync(spa, B_FALSE, B_TRUE); 3377 3378 mutex_exit(&spa_namespace_lock); 3379 spa_history_log_version(spa, LOG_POOL_IMPORT); 3380 3381 return (0); 3382 } 3383 3384 spa_activate(spa, mode); 3385 3386 /* 3387 * Don't start async tasks until we know everything is healthy. 3388 */ 3389 spa_async_suspend(spa); 3390 3391 zpool_get_rewind_policy(config, &policy); 3392 if (policy.zrp_request & ZPOOL_DO_REWIND) 3393 state = SPA_LOAD_RECOVER; 3394 3395 /* 3396 * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig 3397 * because the user-supplied config is actually the one to trust when 3398 * doing an import. 3399 */ 3400 if (state != SPA_LOAD_RECOVER) 3401 spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; 3402 3403 error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, 3404 policy.zrp_request); 3405 3406 /* 3407 * Propagate anything learned while loading the pool and pass it 3408 * back to caller (i.e. rewind info, missing devices, etc). 3409 */ 3410 VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, 3411 spa->spa_load_info) == 0); 3412 3413 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3414 /* 3415 * Toss any existing sparelist, as it doesn't have any validity 3416 * anymore, and conflicts with spa_has_spare(). 
3417 */ 3418 if (spa->spa_spares.sav_config) { 3419 nvlist_free(spa->spa_spares.sav_config); 3420 spa->spa_spares.sav_config = NULL; 3421 spa_load_spares(spa); 3422 } 3423 if (spa->spa_l2cache.sav_config) { 3424 nvlist_free(spa->spa_l2cache.sav_config); 3425 spa->spa_l2cache.sav_config = NULL; 3426 spa_load_l2cache(spa); 3427 } 3428 3429 VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, 3430 &nvroot) == 0); 3431 if (error == 0) 3432 error = spa_validate_aux(spa, nvroot, -1ULL, 3433 VDEV_ALLOC_SPARE); 3434 if (error == 0) 3435 error = spa_validate_aux(spa, nvroot, -1ULL, 3436 VDEV_ALLOC_L2CACHE); 3437 spa_config_exit(spa, SCL_ALL, FTAG); 3438 3439 if (props != NULL) 3440 spa_configfile_set(spa, props, B_FALSE); 3441 3442 if (error != 0 || (props && spa_writeable(spa) && 3443 (error = spa_prop_set(spa, props)))) { 3444 spa_unload(spa); 3445 spa_deactivate(spa); 3446 spa_remove(spa); 3447 mutex_exit(&spa_namespace_lock); 3448 return (error); 3449 } 3450 3451 spa_async_resume(spa); 3452 3453 /* 3454 * Override any spares and level 2 cache devices as specified by 3455 * the user, as these may have correct device names/devids, etc. 3456 */ 3457 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, 3458 &spares, &nspares) == 0) { 3459 if (spa->spa_spares.sav_config) 3460 VERIFY(nvlist_remove(spa->spa_spares.sav_config, 3461 ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); 3462 else 3463 VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, 3464 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3465 VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, 3466 ZPOOL_CONFIG_SPARES, spares, nspares) == 0); 3467 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3468 spa_load_spares(spa); 3469 spa_config_exit(spa, SCL_ALL, FTAG); 3470 spa->spa_spares.sav_sync = B_TRUE; 3471 } 3472 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, 3473 &l2cache, &nl2cache) == 0) { 3474 if (spa->spa_l2cache.sav_config) 3475 VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, 3476 ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); 3477 else 3478 VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, 3479 NV_UNIQUE_NAME, KM_SLEEP) == 0); 3480 VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, 3481 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); 3482 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3483 spa_load_l2cache(spa); 3484 spa_config_exit(spa, SCL_ALL, FTAG); 3485 spa->spa_l2cache.sav_sync = B_TRUE; 3486 } 3487 3488 /* 3489 * Check for any removed devices. 3490 */ 3491 if (spa->spa_autoreplace) { 3492 spa_aux_check_removed(&spa->spa_spares); 3493 spa_aux_check_removed(&spa->spa_l2cache); 3494 } 3495 3496 if (spa_writeable(spa)) { 3497 /* 3498 * Update the config cache to include the newly-imported pool. 3499 */ 3500 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3501 } 3502 3503 /* 3504 * It's possible that the pool was expanded while it was exported. 3505 * We kick off an async task to handle this for us. 
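 *
 * For reference, the caller-side view of this routine is small: userland
 * assembles the pool config, hands it down through the ioctl, and the
 * kernel side reduces to a single call.  A condensed sketch, where
 * 'config' and 'props' are the unpacked nvlists and 'flags' is zero for a
 * normal import or e.g. ZFS_IMPORT_VERBATIM as handled above:
 *
 *     error = spa_import("tank", config, props, flags);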
3506 */ 3507 spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); 3508 3509 mutex_exit(&spa_namespace_lock); 3510 spa_history_log_version(spa, LOG_POOL_IMPORT); 3511 3512 return (0); 3513 } 3514 3515 nvlist_t * 3516 spa_tryimport(nvlist_t *tryconfig) 3517 { 3518 nvlist_t *config = NULL; 3519 char *poolname; 3520 spa_t *spa; 3521 uint64_t state; 3522 int error; 3523 3524 if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) 3525 return (NULL); 3526 3527 if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) 3528 return (NULL); 3529 3530 /* 3531 * Create and initialize the spa structure. 3532 */ 3533 mutex_enter(&spa_namespace_lock); 3534 spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); 3535 spa_activate(spa, FREAD); 3536 3537 /* 3538 * Pass off the heavy lifting to spa_load(). 3539 * Pass TRUE for mosconfig because the user-supplied config 3540 * is actually the one to trust when doing an import. 3541 */ 3542 error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); 3543 3544 /* 3545 * If 'tryconfig' was at least parsable, return the current config. 3546 */ 3547 if (spa->spa_root_vdev != NULL) { 3548 config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); 3549 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, 3550 poolname) == 0); 3551 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 3552 state) == 0); 3553 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, 3554 spa->spa_uberblock.ub_timestamp) == 0); 3555 3556 /* 3557 * If the bootfs property exists on this pool then we 3558 * copy it out so that external consumers can tell which 3559 * pools are bootable. 3560 */ 3561 if ((!error || error == EEXIST) && spa->spa_bootfs) { 3562 char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3563 3564 /* 3565 * We have to play games with the name since the 3566 * pool was opened as TRYIMPORT_NAME. 3567 */ 3568 if (dsl_dsobj_to_dsname(spa_name(spa), 3569 spa->spa_bootfs, tmpname) == 0) { 3570 char *cp; 3571 char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); 3572 3573 cp = strchr(tmpname, '/'); 3574 if (cp == NULL) { 3575 (void) strlcpy(dsname, tmpname, 3576 MAXPATHLEN); 3577 } else { 3578 (void) snprintf(dsname, MAXPATHLEN, 3579 "%s/%s", poolname, ++cp); 3580 } 3581 VERIFY(nvlist_add_string(config, 3582 ZPOOL_CONFIG_BOOTFS, dsname) == 0); 3583 kmem_free(dsname, MAXPATHLEN); 3584 } 3585 kmem_free(tmpname, MAXPATHLEN); 3586 } 3587 3588 /* 3589 * Add the list of hot spares and level 2 cache devices. 3590 */ 3591 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 3592 spa_add_spares(spa, config); 3593 spa_add_l2cache(spa, config); 3594 spa_config_exit(spa, SCL_CONFIG, FTAG); 3595 } 3596 3597 spa_unload(spa); 3598 spa_deactivate(spa); 3599 spa_remove(spa); 3600 mutex_exit(&spa_namespace_lock); 3601 3602 return (config); 3603 } 3604 3605 /* 3606 * Pool export/destroy 3607 * 3608 * The act of destroying or exporting a pool is very simple. We make sure there 3609 * is no more pending I/O and any references to the pool are gone. Then, we 3610 * update the pool state and sync all the labels to disk, removing the 3611 * configuration from the cache afterwards. If the 'hardforce' flag is set, then 3612 * we don't sync the labels or remove the configuration cache. 
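 *
 * A caller-level sketch, with error handling omitted.  An ordinary export,
 * which fails with EBUSY or EXDEV while active references or an active
 * shared spare remain:
 *
 *     nvlist_t *oldconfig = NULL;
 *
 *     error = spa_export("tank", &oldconfig, B_FALSE, B_FALSE);
 *
 * and a forced export, which skips only the shared-spare check:
 *
 *     error = spa_export("tank", &oldconfig, B_TRUE, B_FALSE);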
3613 */ 3614 static int 3615 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, 3616 boolean_t force, boolean_t hardforce) 3617 { 3618 spa_t *spa; 3619 3620 if (oldconfig) 3621 *oldconfig = NULL; 3622 3623 if (!(spa_mode_global & FWRITE)) 3624 return (EROFS); 3625 3626 mutex_enter(&spa_namespace_lock); 3627 if ((spa = spa_lookup(pool)) == NULL) { 3628 mutex_exit(&spa_namespace_lock); 3629 return (ENOENT); 3630 } 3631 3632 /* 3633 * Put a hold on the pool, drop the namespace lock, stop async tasks, 3634 * reacquire the namespace lock, and see if we can export. 3635 */ 3636 spa_open_ref(spa, FTAG); 3637 mutex_exit(&spa_namespace_lock); 3638 spa_async_suspend(spa); 3639 mutex_enter(&spa_namespace_lock); 3640 spa_close(spa, FTAG); 3641 3642 /* 3643 * The pool will be in core if it's openable, 3644 * in which case we can modify its state. 3645 */ 3646 if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { 3647 /* 3648 * Objsets may be open only because they're dirty, so we 3649 * have to force it to sync before checking spa_refcnt. 3650 */ 3651 txg_wait_synced(spa->spa_dsl_pool, 0); 3652 3653 /* 3654 * A pool cannot be exported or destroyed if there are active 3655 * references. If we are resetting a pool, allow references by 3656 * fault injection handlers. 3657 */ 3658 if (!spa_refcount_zero(spa) || 3659 (spa->spa_inject_ref != 0 && 3660 new_state != POOL_STATE_UNINITIALIZED)) { 3661 spa_async_resume(spa); 3662 mutex_exit(&spa_namespace_lock); 3663 return (EBUSY); 3664 } 3665 3666 /* 3667 * A pool cannot be exported if it has an active shared spare. 3668 * This is to prevent other pools stealing the active spare 3669 * from an exported pool. At user's own will, such pool can 3670 * be forcedly exported. 3671 */ 3672 if (!force && new_state == POOL_STATE_EXPORTED && 3673 spa_has_active_shared_spare(spa)) { 3674 spa_async_resume(spa); 3675 mutex_exit(&spa_namespace_lock); 3676 return (EXDEV); 3677 } 3678 3679 /* 3680 * We want this to be reflected on every label, 3681 * so mark them all dirty. spa_unload() will do the 3682 * final sync that pushes these changes out. 3683 */ 3684 if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { 3685 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 3686 spa->spa_state = new_state; 3687 spa->spa_final_txg = spa_last_synced_txg(spa) + 3688 TXG_DEFER_SIZE + 1; 3689 vdev_config_dirty(spa->spa_root_vdev); 3690 spa_config_exit(spa, SCL_ALL, FTAG); 3691 } 3692 } 3693 3694 spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); 3695 3696 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 3697 spa_unload(spa); 3698 spa_deactivate(spa); 3699 } 3700 3701 if (oldconfig && spa->spa_config) 3702 VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); 3703 3704 if (new_state != POOL_STATE_UNINITIALIZED) { 3705 if (!hardforce) 3706 spa_config_sync(spa, B_TRUE, B_TRUE); 3707 spa_remove(spa); 3708 } 3709 mutex_exit(&spa_namespace_lock); 3710 3711 return (0); 3712 } 3713 3714 /* 3715 * Destroy a storage pool. 3716 */ 3717 int 3718 spa_destroy(char *pool) 3719 { 3720 return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, 3721 B_FALSE, B_FALSE)); 3722 } 3723 3724 /* 3725 * Export a storage pool. 3726 */ 3727 int 3728 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, 3729 boolean_t hardforce) 3730 { 3731 return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, 3732 force, hardforce)); 3733 } 3734 3735 /* 3736 * Similar to spa_export(), this unloads the spa_t without actually removing it 3737 * from the namespace in any way. 
3738 */ 3739 int 3740 spa_reset(char *pool) 3741 { 3742 return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, 3743 B_FALSE, B_FALSE)); 3744 } 3745 3746 /* 3747 * ========================================================================== 3748 * Device manipulation 3749 * ========================================================================== 3750 */ 3751 3752 /* 3753 * Add a device to a storage pool. 3754 */ 3755 int 3756 spa_vdev_add(spa_t *spa, nvlist_t *nvroot) 3757 { 3758 uint64_t txg, id; 3759 int error; 3760 vdev_t *rvd = spa->spa_root_vdev; 3761 vdev_t *vd, *tvd; 3762 nvlist_t **spares, **l2cache; 3763 uint_t nspares, nl2cache; 3764 3765 ASSERT(spa_writeable(spa)); 3766 3767 txg = spa_vdev_enter(spa); 3768 3769 if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, 3770 VDEV_ALLOC_ADD)) != 0) 3771 return (spa_vdev_exit(spa, NULL, txg, error)); 3772 3773 spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ 3774 3775 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, 3776 &nspares) != 0) 3777 nspares = 0; 3778 3779 if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, 3780 &nl2cache) != 0) 3781 nl2cache = 0; 3782 3783 if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) 3784 return (spa_vdev_exit(spa, vd, txg, EINVAL)); 3785 3786 if (vd->vdev_children != 0 && 3787 (error = vdev_create(vd, txg, B_FALSE)) != 0) 3788 return (spa_vdev_exit(spa, vd, txg, error)); 3789 3790 /* 3791 * We must validate the spares and l2cache devices after checking the 3792 * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 3793 */ 3794 if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) 3795 return (spa_vdev_exit(spa, vd, txg, error)); 3796 3797 /* 3798 * Transfer each new top-level vdev from vd to rvd. 3799 */ 3800 for (int c = 0; c < vd->vdev_children; c++) { 3801 3802 /* 3803 * Set the vdev id to the first hole, if one exists. 3804 */ 3805 for (id = 0; id < rvd->vdev_children; id++) { 3806 if (rvd->vdev_child[id]->vdev_ishole) { 3807 vdev_free(rvd->vdev_child[id]); 3808 break; 3809 } 3810 } 3811 tvd = vd->vdev_child[c]; 3812 vdev_remove_child(vd, tvd); 3813 tvd->vdev_id = id; 3814 vdev_add_child(rvd, tvd); 3815 vdev_config_dirty(tvd); 3816 } 3817 3818 if (nspares != 0) { 3819 spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, 3820 ZPOOL_CONFIG_SPARES); 3821 spa_load_spares(spa); 3822 spa->spa_spares.sav_sync = B_TRUE; 3823 } 3824 3825 if (nl2cache != 0) { 3826 spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, 3827 ZPOOL_CONFIG_L2CACHE); 3828 spa_load_l2cache(spa); 3829 spa->spa_l2cache.sav_sync = B_TRUE; 3830 } 3831 3832 /* 3833 * We have to be careful when adding new vdevs to an existing pool. 3834 * If other threads start allocating from these vdevs before we 3835 * sync the config cache, and we lose power, then upon reboot we may 3836 * fail to open the pool because there are DVAs that the config cache 3837 * can't translate. Therefore, we first add the vdevs without 3838 * initializing metaslabs; sync the config cache (via spa_vdev_exit()); 3839 * and then let spa_config_update() initialize the new metaslabs. 3840 * 3841 * spa_load() checks for added-but-not-initialized vdevs, so that 3842 * if we lose power at any point in this sequence, the remaining 3843 * steps will be completed the next time we load the pool. 
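 *
 * The enter/exit pair used here is the standard bracket for every change
 * in this "Device manipulation" section; condensed into a sketch (not a
 * real routine in this file), it is simply:
 *
 *     txg = spa_vdev_enter(spa);
 *     ... modify the vdev tree ...
 *     return (spa_vdev_exit(spa, vd_to_free, txg, error));
 *
 * where spa_vdev_enter() takes the namespace and config locks, and
 * spa_vdev_exit() frees 'vd_to_free' (if any), syncs the updated config
 * out, and drops the locks again.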
3844 */ 3845 (void) spa_vdev_exit(spa, vd, txg, 0); 3846 3847 mutex_enter(&spa_namespace_lock); 3848 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 3849 mutex_exit(&spa_namespace_lock); 3850 3851 return (0); 3852 } 3853 3854 /* 3855 * Attach a device to a mirror. The arguments are the path to any device 3856 * in the mirror, and the nvroot for the new device. If the path specifies 3857 * a device that is not mirrored, we automatically insert the mirror vdev. 3858 * 3859 * If 'replacing' is specified, the new device is intended to replace the 3860 * existing device; in this case the two devices are made into their own 3861 * mirror using the 'replacing' vdev, which is functionally identical to 3862 * the mirror vdev (it actually reuses all the same ops) but has a few 3863 * extra rules: you can't attach to it after it's been created, and upon 3864 * completion of resilvering, the first disk (the one being replaced) 3865 * is automatically detached. 3866 */ 3867 int 3868 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) 3869 { 3870 uint64_t txg, dtl_max_txg; 3871 vdev_t *rvd = spa->spa_root_vdev; 3872 vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; 3873 vdev_ops_t *pvops; 3874 char *oldvdpath, *newvdpath; 3875 int newvd_isspare; 3876 int error; 3877 3878 ASSERT(spa_writeable(spa)); 3879 3880 txg = spa_vdev_enter(spa); 3881 3882 oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); 3883 3884 if (oldvd == NULL) 3885 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 3886 3887 if (!oldvd->vdev_ops->vdev_op_leaf) 3888 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 3889 3890 pvd = oldvd->vdev_parent; 3891 3892 if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, 3893 VDEV_ALLOC_ADD)) != 0) 3894 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 3895 3896 if (newrootvd->vdev_children != 1) 3897 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3898 3899 newvd = newrootvd->vdev_child[0]; 3900 3901 if (!newvd->vdev_ops->vdev_op_leaf) 3902 return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); 3903 3904 if ((error = vdev_create(newrootvd, txg, replacing)) != 0) 3905 return (spa_vdev_exit(spa, newrootvd, txg, error)); 3906 3907 /* 3908 * Spares can't replace logs 3909 */ 3910 if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) 3911 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3912 3913 if (!replacing) { 3914 /* 3915 * For attach, the only allowable parent is a mirror or the root 3916 * vdev. 3917 */ 3918 if (pvd->vdev_ops != &vdev_mirror_ops && 3919 pvd->vdev_ops != &vdev_root_ops) 3920 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3921 3922 pvops = &vdev_mirror_ops; 3923 } else { 3924 /* 3925 * Active hot spares can only be replaced by inactive hot 3926 * spares. 3927 */ 3928 if (pvd->vdev_ops == &vdev_spare_ops && 3929 oldvd->vdev_isspare && 3930 !spa_has_spare(spa, newvd->vdev_guid)) 3931 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3932 3933 /* 3934 * If the source is a hot spare, and the parent isn't already a 3935 * spare, then we want to create a new hot spare. Otherwise, we 3936 * want to create a replacing vdev. The user is not allowed to 3937 * attach to a spared vdev child unless the 'isspare' state is 3938 * the same (spare replaces spare, non-spare replaces 3939 * non-spare). 
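 *
 * From the caller's point of view the two operations differ only in the
 * last argument ('guid' names the existing disk, 'nvroot' describes the
 * new one).  A plain attach, which creates or extends a mirror:
 *
 *     error = spa_vdev_attach(spa, guid, nvroot, B_FALSE);
 *
 * and a replacement, which interposes a 'replacing' vdev and detaches the
 * old disk once resilvering completes:
 *
 *     error = spa_vdev_attach(spa, guid, nvroot, B_TRUE);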
3940 */ 3941 if (pvd->vdev_ops == &vdev_replacing_ops && 3942 spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { 3943 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3944 } else if (pvd->vdev_ops == &vdev_spare_ops && 3945 newvd->vdev_isspare != oldvd->vdev_isspare) { 3946 return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); 3947 } 3948 3949 if (newvd->vdev_isspare) 3950 pvops = &vdev_spare_ops; 3951 else 3952 pvops = &vdev_replacing_ops; 3953 } 3954 3955 /* 3956 * Make sure the new device is big enough. 3957 */ 3958 if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) 3959 return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); 3960 3961 /* 3962 * The new device cannot have a higher alignment requirement 3963 * than the top-level vdev. 3964 */ 3965 if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) 3966 return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); 3967 3968 /* 3969 * If this is an in-place replacement, update oldvd's path and devid 3970 * to make it distinguishable from newvd, and unopenable from now on. 3971 */ 3972 if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { 3973 spa_strfree(oldvd->vdev_path); 3974 oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, 3975 KM_SLEEP); 3976 (void) sprintf(oldvd->vdev_path, "%s/%s", 3977 newvd->vdev_path, "old"); 3978 if (oldvd->vdev_devid != NULL) { 3979 spa_strfree(oldvd->vdev_devid); 3980 oldvd->vdev_devid = NULL; 3981 } 3982 } 3983 3984 /* mark the device being resilvered */ 3985 newvd->vdev_resilvering = B_TRUE; 3986 3987 /* 3988 * If the parent is not a mirror, or if we're replacing, insert the new 3989 * mirror/replacing/spare vdev above oldvd. 3990 */ 3991 if (pvd->vdev_ops != pvops) 3992 pvd = vdev_add_parent(oldvd, pvops); 3993 3994 ASSERT(pvd->vdev_top->vdev_parent == rvd); 3995 ASSERT(pvd->vdev_ops == pvops); 3996 ASSERT(oldvd->vdev_parent == pvd); 3997 3998 /* 3999 * Extract the new device from its root and add it to pvd. 4000 */ 4001 vdev_remove_child(newrootvd, newvd); 4002 newvd->vdev_id = pvd->vdev_children; 4003 newvd->vdev_crtxg = oldvd->vdev_crtxg; 4004 vdev_add_child(pvd, newvd); 4005 4006 tvd = newvd->vdev_top; 4007 ASSERT(pvd->vdev_top == tvd); 4008 ASSERT(tvd->vdev_parent == rvd); 4009 4010 vdev_config_dirty(tvd); 4011 4012 /* 4013 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account 4014 * for any dmu_sync-ed blocks. It will propagate upward when 4015 * spa_vdev_exit() calls vdev_dtl_reassess(). 4016 */ 4017 dtl_max_txg = txg + TXG_CONCURRENT_STATES; 4018 4019 vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, 4020 dtl_max_txg - TXG_INITIAL); 4021 4022 if (newvd->vdev_isspare) { 4023 spa_spare_activate(newvd); 4024 spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); 4025 } 4026 4027 oldvdpath = spa_strdup(oldvd->vdev_path); 4028 newvdpath = spa_strdup(newvd->vdev_path); 4029 newvd_isspare = newvd->vdev_isspare; 4030 4031 /* 4032 * Mark newvd's DTL dirty in this txg. 4033 */ 4034 vdev_dirty(tvd, VDD_DTL, newvd, txg); 4035 4036 /* 4037 * Restart the resilver 4038 */ 4039 dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); 4040 4041 /* 4042 * Commit the config 4043 */ 4044 (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); 4045 4046 spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL, 4047 "%s vdev=%s %s vdev=%s", 4048 replacing && newvd_isspare ? "spare in" : 4049 replacing ? "replace" : "attach", newvdpath, 4050 replacing ? 
"for" : "to", oldvdpath); 4051 4052 spa_strfree(oldvdpath); 4053 spa_strfree(newvdpath); 4054 4055 if (spa->spa_bootfs) 4056 spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); 4057 4058 return (0); 4059 } 4060 4061 /* 4062 * Detach a device from a mirror or replacing vdev. 4063 * If 'replace_done' is specified, only detach if the parent 4064 * is a replacing vdev. 4065 */ 4066 int 4067 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) 4068 { 4069 uint64_t txg; 4070 int error; 4071 vdev_t *rvd = spa->spa_root_vdev; 4072 vdev_t *vd, *pvd, *cvd, *tvd; 4073 boolean_t unspare = B_FALSE; 4074 uint64_t unspare_guid; 4075 char *vdpath; 4076 4077 ASSERT(spa_writeable(spa)); 4078 4079 txg = spa_vdev_enter(spa); 4080 4081 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4082 4083 if (vd == NULL) 4084 return (spa_vdev_exit(spa, NULL, txg, ENODEV)); 4085 4086 if (!vd->vdev_ops->vdev_op_leaf) 4087 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4088 4089 pvd = vd->vdev_parent; 4090 4091 /* 4092 * If the parent/child relationship is not as expected, don't do it. 4093 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing 4094 * vdev that's replacing B with C. The user's intent in replacing 4095 * is to go from M(A,B) to M(A,C). If the user decides to cancel 4096 * the replace by detaching C, the expected behavior is to end up 4097 * M(A,B). But suppose that right after deciding to detach C, 4098 * the replacement of B completes. We would have M(A,C), and then 4099 * ask to detach C, which would leave us with just A -- not what 4100 * the user wanted. To prevent this, we make sure that the 4101 * parent/child relationship hasn't changed -- in this example, 4102 * that C's parent is still the replacing vdev R. 4103 */ 4104 if (pvd->vdev_guid != pguid && pguid != 0) 4105 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4106 4107 /* 4108 * Only 'replacing' or 'spare' vdevs can be replaced. 4109 */ 4110 if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && 4111 pvd->vdev_ops != &vdev_spare_ops) 4112 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4113 4114 ASSERT(pvd->vdev_ops != &vdev_spare_ops || 4115 spa_version(spa) >= SPA_VERSION_SPARES); 4116 4117 /* 4118 * Only mirror, replacing, and spare vdevs support detach. 4119 */ 4120 if (pvd->vdev_ops != &vdev_replacing_ops && 4121 pvd->vdev_ops != &vdev_mirror_ops && 4122 pvd->vdev_ops != &vdev_spare_ops) 4123 return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); 4124 4125 /* 4126 * If this device has the only valid copy of some data, 4127 * we cannot safely detach it. 4128 */ 4129 if (vdev_dtl_required(vd)) 4130 return (spa_vdev_exit(spa, NULL, txg, EBUSY)); 4131 4132 ASSERT(pvd->vdev_children >= 2); 4133 4134 /* 4135 * If we are detaching the second disk from a replacing vdev, then 4136 * check to see if we changed the original vdev's path to have "/old" 4137 * at the end in spa_vdev_attach(). If so, undo that change now. 
4138 */ 4139 if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && 4140 vd->vdev_path != NULL) { 4141 size_t len = strlen(vd->vdev_path); 4142 4143 for (int c = 0; c < pvd->vdev_children; c++) { 4144 cvd = pvd->vdev_child[c]; 4145 4146 if (cvd == vd || cvd->vdev_path == NULL) 4147 continue; 4148 4149 if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && 4150 strcmp(cvd->vdev_path + len, "/old") == 0) { 4151 spa_strfree(cvd->vdev_path); 4152 cvd->vdev_path = spa_strdup(vd->vdev_path); 4153 break; 4154 } 4155 } 4156 } 4157 4158 /* 4159 * If we are detaching the original disk from a spare, then it implies 4160 * that the spare should become a real disk, and be removed from the 4161 * active spare list for the pool. 4162 */ 4163 if (pvd->vdev_ops == &vdev_spare_ops && 4164 vd->vdev_id == 0 && 4165 pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) 4166 unspare = B_TRUE; 4167 4168 /* 4169 * Erase the disk labels so the disk can be used for other things. 4170 * This must be done after all other error cases are handled, 4171 * but before we disembowel vd (so we can still do I/O to it). 4172 * But if we can't do it, don't treat the error as fatal -- 4173 * it may be that the unwritability of the disk is the reason 4174 * it's being detached! 4175 */ 4176 error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4177 4178 /* 4179 * Remove vd from its parent and compact the parent's children. 4180 */ 4181 vdev_remove_child(pvd, vd); 4182 vdev_compact_children(pvd); 4183 4184 /* 4185 * Remember one of the remaining children so we can get tvd below. 4186 */ 4187 cvd = pvd->vdev_child[pvd->vdev_children - 1]; 4188 4189 /* 4190 * If we need to remove the remaining child from the list of hot spares, 4191 * do it now, marking the vdev as no longer a spare in the process. 4192 * We must do this before vdev_remove_parent(), because that can 4193 * change the GUID if it creates a new toplevel GUID. For a similar 4194 * reason, we must remove the spare now, in the same txg as the detach; 4195 * otherwise someone could attach a new sibling, change the GUID, and 4196 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. 4197 */ 4198 if (unspare) { 4199 ASSERT(cvd->vdev_isspare); 4200 spa_spare_remove(cvd); 4201 unspare_guid = cvd->vdev_guid; 4202 (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); 4203 cvd->vdev_unspare = B_TRUE; 4204 } 4205 4206 /* 4207 * If the parent mirror/replacing vdev only has one child, 4208 * the parent is no longer needed. Remove it from the tree. 4209 */ 4210 if (pvd->vdev_children == 1) { 4211 if (pvd->vdev_ops == &vdev_spare_ops) 4212 cvd->vdev_unspare = B_FALSE; 4213 vdev_remove_parent(cvd); 4214 cvd->vdev_resilvering = B_FALSE; 4215 } 4216 4217 4218 /* 4219 * We don't set tvd until now because the parent we just removed 4220 * may have been the previous top-level vdev. 4221 */ 4222 tvd = cvd->vdev_top; 4223 ASSERT(tvd->vdev_parent == rvd); 4224 4225 /* 4226 * Reevaluate the parent vdev state. 4227 */ 4228 vdev_propagate_state(cvd); 4229 4230 /* 4231 * If the 'autoexpand' property is set on the pool then automatically 4232 * try to expand the size of the pool. For example if the device we 4233 * just detached was smaller than the others, it may be possible to 4234 * add metaslabs (i.e. grow the pool). We need to reopen the vdev 4235 * first so that we can obtain the updated sizes of the leaf vdevs. 
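 * (vdev_reopen() below refreshes the leaf vdev sizes; vdev_expand()
 * then adds metaslabs in this txg if the top-level vdev has grown.)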
4236 */ 4237 if (spa->spa_autoexpand) { 4238 vdev_reopen(tvd); 4239 vdev_expand(tvd, txg); 4240 } 4241 4242 vdev_config_dirty(tvd); 4243 4244 /* 4245 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that 4246 * vd->vdev_detached is set and free vd's DTL object in syncing context. 4247 * But first make sure we're not on any *other* txg's DTL list, to 4248 * prevent vd from being accessed after it's freed. 4249 */ 4250 vdpath = spa_strdup(vd->vdev_path); 4251 for (int t = 0; t < TXG_SIZE; t++) 4252 (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); 4253 vd->vdev_detached = B_TRUE; 4254 vdev_dirty(tvd, VDD_DTL, vd, txg); 4255 4256 spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); 4257 4258 /* hang on to the spa before we release the lock */ 4259 spa_open_ref(spa, FTAG); 4260 4261 error = spa_vdev_exit(spa, vd, txg, 0); 4262 4263 spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL, 4264 "vdev=%s", vdpath); 4265 spa_strfree(vdpath); 4266 4267 /* 4268 * If this was the removal of the original device in a hot spare vdev, 4269 * then we want to go through and remove the device from the hot spare 4270 * list of every other pool. 4271 */ 4272 if (unspare) { 4273 spa_t *altspa = NULL; 4274 4275 mutex_enter(&spa_namespace_lock); 4276 while ((altspa = spa_next(altspa)) != NULL) { 4277 if (altspa->spa_state != POOL_STATE_ACTIVE || 4278 altspa == spa) 4279 continue; 4280 4281 spa_open_ref(altspa, FTAG); 4282 mutex_exit(&spa_namespace_lock); 4283 (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); 4284 mutex_enter(&spa_namespace_lock); 4285 spa_close(altspa, FTAG); 4286 } 4287 mutex_exit(&spa_namespace_lock); 4288 4289 /* search the rest of the vdevs for spares to remove */ 4290 spa_vdev_resilver_done(spa); 4291 } 4292 4293 /* all done with the spa; OK to release */ 4294 mutex_enter(&spa_namespace_lock); 4295 spa_close(spa, FTAG); 4296 mutex_exit(&spa_namespace_lock); 4297 4298 return (error); 4299 } 4300 4301 /* 4302 * Split a set of devices from their mirrors, and create a new pool from them. 
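 * Roughly, the sequence below is: quiesce and empty the intent log;
 * verify that 'config' names one healthy, writeable leaf (by GUID) for
 * every non-log top-level mirror; offline those leaves and record them
 * under ZPOOL_CONFIG_SPLIT in the original config; assemble the new
 * pool from them via spa_load(SPA_IMPORT_ASSEMBLE); and finally
 * vdev_split() and free the leaves out of the original pool. If 'exp'
 * is set, the new pool is exported rather than left imported.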
4303 */ 4304 int 4305 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, 4306 nvlist_t *props, boolean_t exp) 4307 { 4308 int error = 0; 4309 uint64_t txg, *glist; 4310 spa_t *newspa; 4311 uint_t c, children, lastlog; 4312 nvlist_t **child, *nvl, *tmp; 4313 dmu_tx_t *tx; 4314 char *altroot = NULL; 4315 vdev_t *rvd, **vml = NULL; /* vdev modify list */ 4316 boolean_t activate_slog; 4317 4318 ASSERT(spa_writeable(spa)); 4319 4320 txg = spa_vdev_enter(spa); 4321 4322 /* clear the log and flush everything up to now */ 4323 activate_slog = spa_passivate_log(spa); 4324 (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4325 error = spa_offline_log(spa); 4326 txg = spa_vdev_config_enter(spa); 4327 4328 if (activate_slog) 4329 spa_activate_log(spa); 4330 4331 if (error != 0) 4332 return (spa_vdev_exit(spa, NULL, txg, error)); 4333 4334 /* check new spa name before going any further */ 4335 if (spa_lookup(newname) != NULL) 4336 return (spa_vdev_exit(spa, NULL, txg, EEXIST)); 4337 4338 /* 4339 * scan through all the children to ensure they're all mirrors 4340 */ 4341 if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || 4342 nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, 4343 &children) != 0) 4344 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4345 4346 /* first, check to ensure we've got the right child count */ 4347 rvd = spa->spa_root_vdev; 4348 lastlog = 0; 4349 for (c = 0; c < rvd->vdev_children; c++) { 4350 vdev_t *vd = rvd->vdev_child[c]; 4351 4352 /* don't count the holes & logs as children */ 4353 if (vd->vdev_islog || vd->vdev_ishole) { 4354 if (lastlog == 0) 4355 lastlog = c; 4356 continue; 4357 } 4358 4359 lastlog = 0; 4360 } 4361 if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) 4362 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4363 4364 /* next, ensure no spare or cache devices are part of the split */ 4365 if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || 4366 nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) 4367 return (spa_vdev_exit(spa, NULL, txg, EINVAL)); 4368 4369 vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); 4370 glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); 4371 4372 /* then, loop over each vdev and validate it */ 4373 for (c = 0; c < children; c++) { 4374 uint64_t is_hole = 0; 4375 4376 (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, 4377 &is_hole); 4378 4379 if (is_hole != 0) { 4380 if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || 4381 spa->spa_root_vdev->vdev_child[c]->vdev_islog) { 4382 continue; 4383 } else { 4384 error = EINVAL; 4385 break; 4386 } 4387 } 4388 4389 /* which disk is going to be split? 
*/ 4390 if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, 4391 &glist[c]) != 0) { 4392 error = EINVAL; 4393 break; 4394 } 4395 4396 /* look it up in the spa */ 4397 vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); 4398 if (vml[c] == NULL) { 4399 error = ENODEV; 4400 break; 4401 } 4402 4403 /* make sure there's nothing stopping the split */ 4404 if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || 4405 vml[c]->vdev_islog || 4406 vml[c]->vdev_ishole || 4407 vml[c]->vdev_isspare || 4408 vml[c]->vdev_isl2cache || 4409 !vdev_writeable(vml[c]) || 4410 vml[c]->vdev_children != 0 || 4411 vml[c]->vdev_state != VDEV_STATE_HEALTHY || 4412 c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { 4413 error = EINVAL; 4414 break; 4415 } 4416 4417 if (vdev_dtl_required(vml[c])) { 4418 error = EBUSY; 4419 break; 4420 } 4421 4422 /* we need certain info from the top level */ 4423 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, 4424 vml[c]->vdev_top->vdev_ms_array) == 0); 4425 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, 4426 vml[c]->vdev_top->vdev_ms_shift) == 0); 4427 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, 4428 vml[c]->vdev_top->vdev_asize) == 0); 4429 VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, 4430 vml[c]->vdev_top->vdev_ashift) == 0); 4431 } 4432 4433 if (error != 0) { 4434 kmem_free(vml, children * sizeof (vdev_t *)); 4435 kmem_free(glist, children * sizeof (uint64_t)); 4436 return (spa_vdev_exit(spa, NULL, txg, error)); 4437 } 4438 4439 /* stop writers from using the disks */ 4440 for (c = 0; c < children; c++) { 4441 if (vml[c] != NULL) 4442 vml[c]->vdev_offline = B_TRUE; 4443 } 4444 vdev_reopen(spa->spa_root_vdev); 4445 4446 /* 4447 * Temporarily record the splitting vdevs in the spa config. This 4448 * will disappear once the config is regenerated. 4449 */ 4450 VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); 4451 VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, 4452 glist, children) == 0); 4453 kmem_free(glist, children * sizeof (uint64_t)); 4454 4455 mutex_enter(&spa->spa_props_lock); 4456 VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, 4457 nvl) == 0); 4458 mutex_exit(&spa->spa_props_lock); 4459 spa->spa_config_splitting = nvl; 4460 vdev_config_dirty(spa->spa_root_vdev); 4461 4462 /* configure and create the new pool */ 4463 VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); 4464 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, 4465 exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); 4466 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, 4467 spa_version(spa)) == 0); 4468 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, 4469 spa->spa_config_txg) == 0); 4470 VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, 4471 spa_generate_guid(NULL)) == 0); 4472 (void) nvlist_lookup_string(props, 4473 zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); 4474 4475 /* add the new pool to the namespace */ 4476 newspa = spa_add(newname, config, altroot); 4477 newspa->spa_config_txg = spa->spa_config_txg; 4478 spa_set_log_state(newspa, SPA_LOG_CLEAR); 4479 4480 /* release the spa config lock, retaining the namespace lock */ 4481 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4482 4483 if (zio_injection_enabled) 4484 zio_handle_panic_injection(spa, FTAG, 1); 4485 4486 spa_activate(newspa, spa_mode_global); 4487 spa_async_suspend(newspa); 4488 4489 /* create the new pool from the disks of the original pool */ 4490 error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); 4491 if (error) 4492 goto out; 4493 4494 /* if that worked, generate a real config for the new pool */ 4495 if (newspa->spa_root_vdev != NULL) { 4496 VERIFY(nvlist_alloc(&newspa->spa_config_splitting, 4497 NV_UNIQUE_NAME, KM_SLEEP) == 0); 4498 VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, 4499 ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); 4500 spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, 4501 B_TRUE)); 4502 } 4503 4504 /* set the props */ 4505 if (props != NULL) { 4506 spa_configfile_set(newspa, props, B_FALSE); 4507 error = spa_prop_set(newspa, props); 4508 if (error) 4509 goto out; 4510 } 4511 4512 /* flush everything */ 4513 txg = spa_vdev_config_enter(newspa); 4514 vdev_config_dirty(newspa->spa_root_vdev); 4515 (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); 4516 4517 if (zio_injection_enabled) 4518 zio_handle_panic_injection(spa, FTAG, 2); 4519 4520 spa_async_resume(newspa); 4521 4522 /* finally, update the original pool's config */ 4523 txg = spa_vdev_config_enter(spa); 4524 tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); 4525 error = dmu_tx_assign(tx, TXG_WAIT); 4526 if (error != 0) 4527 dmu_tx_abort(tx); 4528 for (c = 0; c < children; c++) { 4529 if (vml[c] != NULL) { 4530 vdev_split(vml[c]); 4531 if (error == 0) 4532 spa_history_log_internal(LOG_POOL_VDEV_DETACH, 4533 spa, tx, "vdev=%s", 4534 vml[c]->vdev_path); 4535 vdev_free(vml[c]); 4536 } 4537 } 4538 vdev_config_dirty(spa->spa_root_vdev); 4539 spa->spa_config_splitting = NULL; 4540 nvlist_free(nvl); 4541 if (error == 0) 4542 dmu_tx_commit(tx); 4543 (void) spa_vdev_exit(spa, NULL, txg, 0); 4544 4545 if (zio_injection_enabled) 4546 zio_handle_panic_injection(spa, FTAG, 3); 4547 4548 /* split is complete; log a history record */ 4549 spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL, 4550 "split new pool %s from pool %s", newname, spa_name(spa)); 4551 4552 kmem_free(vml, children * sizeof (vdev_t *)); 4553 4554 /* if we're not going to mount the filesystems in userland, export */ 4555 if (exp) 4556 error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, 4557 B_FALSE, B_FALSE); 4558 4559 return (error); 4560 4561 out: 4562 spa_unload(newspa); 4563 spa_deactivate(newspa); 4564 spa_remove(newspa); 4565 4566 txg = spa_vdev_config_enter(spa); 4567 4568 /* re-online all offlined disks */ 4569 for (c = 0; c < children; c++) { 4570 if (vml[c] != NULL) 4571 vml[c]->vdev_offline = B_FALSE; 4572 } 4573 vdev_reopen(spa->spa_root_vdev); 4574 4575 
nvlist_free(spa->spa_config_splitting); 4576 spa->spa_config_splitting = NULL; 4577 (void) spa_vdev_exit(spa, NULL, txg, error); 4578 4579 kmem_free(vml, children * sizeof (vdev_t *)); 4580 return (error); 4581 } 4582 4583 static nvlist_t * 4584 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) 4585 { 4586 for (int i = 0; i < count; i++) { 4587 uint64_t guid; 4588 4589 VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, 4590 &guid) == 0); 4591 4592 if (guid == target_guid) 4593 return (nvpp[i]); 4594 } 4595 4596 return (NULL); 4597 } 4598 4599 static void 4600 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, 4601 nvlist_t *dev_to_remove) 4602 { 4603 nvlist_t **newdev = NULL; 4604 4605 if (count > 1) 4606 newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); 4607 4608 for (int i = 0, j = 0; i < count; i++) { 4609 if (dev[i] == dev_to_remove) 4610 continue; 4611 VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); 4612 } 4613 4614 VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); 4615 VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); 4616 4617 for (int i = 0; i < count - 1; i++) 4618 nvlist_free(newdev[i]); 4619 4620 if (count > 1) 4621 kmem_free(newdev, (count - 1) * sizeof (void *)); 4622 } 4623 4624 /* 4625 * Evacuate the device. 4626 */ 4627 static int 4628 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) 4629 { 4630 uint64_t txg; 4631 int error = 0; 4632 4633 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4634 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 4635 ASSERT(vd == vd->vdev_top); 4636 4637 /* 4638 * Evacuate the device. We don't hold the config lock as writer 4639 * since we need to do I/O but we do keep the 4640 * spa_namespace_lock held. Once this completes the device 4641 * should no longer have any blocks allocated on it. 4642 */ 4643 if (vd->vdev_islog) { 4644 if (vd->vdev_stat.vs_alloc != 0) 4645 error = spa_offline_log(spa); 4646 } else { 4647 error = ENOTSUP; 4648 } 4649 4650 if (error) 4651 return (error); 4652 4653 /* 4654 * The evacuation succeeded. Remove any remaining MOS metadata 4655 * associated with this vdev, and wait for these changes to sync. 4656 */ 4657 ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0); 4658 txg = spa_vdev_config_enter(spa); 4659 vd->vdev_removing = B_TRUE; 4660 vdev_dirty(vd, 0, NULL, txg); 4661 vdev_config_dirty(vd); 4662 spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); 4663 4664 return (0); 4665 } 4666 4667 /* 4668 * Complete the removal by cleaning up the namespace. 4669 */ 4670 static void 4671 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) 4672 { 4673 vdev_t *rvd = spa->spa_root_vdev; 4674 uint64_t id = vd->vdev_id; 4675 boolean_t last_vdev = (id == (rvd->vdev_children - 1)); 4676 4677 ASSERT(MUTEX_HELD(&spa_namespace_lock)); 4678 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); 4679 ASSERT(vd == vd->vdev_top); 4680 4681 /* 4682 * Only remove any devices which are empty. 
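 * (i.e. vs_alloc == 0; a vdev that still has space allocated is left
 * in place)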
4683 */ 4684 if (vd->vdev_stat.vs_alloc != 0) 4685 return; 4686 4687 (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); 4688 4689 if (list_link_active(&vd->vdev_state_dirty_node)) 4690 vdev_state_clean(vd); 4691 if (list_link_active(&vd->vdev_config_dirty_node)) 4692 vdev_config_clean(vd); 4693 4694 vdev_free(vd); 4695 4696 if (last_vdev) { 4697 vdev_compact_children(rvd); 4698 } else { 4699 vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); 4700 vdev_add_child(rvd, vd); 4701 } 4702 vdev_config_dirty(rvd); 4703 4704 /* 4705 * Reassess the health of our root vdev. 4706 */ 4707 vdev_reopen(rvd); 4708 } 4709 4710 /* 4711 * Remove a device from the pool - 4712 * 4713 * Removing a device from the vdev namespace requires several steps 4714 * and can take a significant amount of time. As a result we use 4715 * the spa_vdev_config_[enter/exit] functions which allow us to 4716 * grab and release the spa_config_lock while still holding the namespace 4717 * lock. During each step the configuration is synced out. 4718 */ 4719 4720 /* 4721 * Remove a device from the pool. Currently, this supports removing only hot 4722 * spares, slogs, and level 2 ARC devices. 4723 */ 4724 int 4725 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) 4726 { 4727 vdev_t *vd; 4728 metaslab_group_t *mg; 4729 nvlist_t **spares, **l2cache, *nv; 4730 uint64_t txg = 0; 4731 uint_t nspares, nl2cache; 4732 int error = 0; 4733 boolean_t locked = MUTEX_HELD(&spa_namespace_lock); 4734 4735 ASSERT(spa_writeable(spa)); 4736 4737 if (!locked) 4738 txg = spa_vdev_enter(spa); 4739 4740 vd = spa_lookup_by_guid(spa, guid, B_FALSE); 4741 4742 if (spa->spa_spares.sav_vdevs != NULL && 4743 nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, 4744 ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && 4745 (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { 4746 /* 4747 * Only remove the hot spare if it's not currently in use 4748 * in this pool. 4749 */ 4750 if (vd == NULL || unspare) { 4751 spa_vdev_remove_aux(spa->spa_spares.sav_config, 4752 ZPOOL_CONFIG_SPARES, spares, nspares, nv); 4753 spa_load_spares(spa); 4754 spa->spa_spares.sav_sync = B_TRUE; 4755 } else { 4756 error = EBUSY; 4757 } 4758 } else if (spa->spa_l2cache.sav_vdevs != NULL && 4759 nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, 4760 ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && 4761 (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { 4762 /* 4763 * Cache devices can always be removed. 4764 */ 4765 spa_vdev_remove_aux(spa->spa_l2cache.sav_config, 4766 ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); 4767 spa_load_l2cache(spa); 4768 spa->spa_l2cache.sav_sync = B_TRUE; 4769 } else if (vd != NULL && vd->vdev_islog) { 4770 ASSERT(!locked); 4771 ASSERT(vd == vd->vdev_top); 4772 4773 /* 4774 * XXX - Once we have bp-rewrite this should 4775 * become the common case. 4776 */ 4777 4778 mg = vd->vdev_mg; 4779 4780 /* 4781 * Stop allocating from this vdev. 4782 */ 4783 metaslab_group_passivate(mg); 4784 4785 /* 4786 * Wait for the youngest allocations and frees to sync, 4787 * and then wait for the deferral of those frees to finish. 4788 */ 4789 spa_vdev_config_exit(spa, NULL, 4790 txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); 4791 4792 /* 4793 * Attempt to evacuate the vdev. 4794 */ 4795 error = spa_vdev_remove_evacuate(spa, vd); 4796 4797 txg = spa_vdev_config_enter(spa); 4798 4799 /* 4800 * If we couldn't evacuate the vdev, unwind. 
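 * (Reactivate the metaslab group so the log device can be allocated
 * from again.)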
4801 */
4802 if (error) {
4803 metaslab_group_activate(mg);
4804 return (spa_vdev_exit(spa, NULL, txg, error));
4805 }
4806
4807 /*
4808 * Clean up the vdev namespace.
4809 */
4810 spa_vdev_remove_from_namespace(spa, vd);
4811
4812 } else if (vd != NULL) {
4813 /*
4814 * Normal vdevs cannot be removed (yet).
4815 */
4816 error = ENOTSUP;
4817 } else {
4818 /*
4819 * There is no vdev of any kind with the specified guid.
4820 */
4821 error = ENOENT;
4822 }
4823
4824 if (!locked)
4825 return (spa_vdev_exit(spa, NULL, txg, error));
4826
4827 return (error);
4828 }
4829
4830 /*
4831 * Find any device that's done replacing, or a vdev marked 'unspare' that's
4832 * currently spared, so we can detach it.
4833 */
4834 static vdev_t *
4835 spa_vdev_resilver_done_hunt(vdev_t *vd)
4836 {
4837 vdev_t *newvd, *oldvd;
4838
4839 for (int c = 0; c < vd->vdev_children; c++) {
4840 oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
4841 if (oldvd != NULL)
4842 return (oldvd);
4843 }
4844
4845 /*
4846 * Check for a completed replacement. We always consider the first
4847 * vdev in the list to be the oldest vdev, and the last one to be
4848 * the newest (see spa_vdev_attach() for how that works). In
4849 * the case where the newest vdev is faulted, we will not automatically
4850 * remove it after a resilver completes. This is OK as it will require
4851 * user intervention to determine which disk the admin wishes to keep.
4852 */
4853 if (vd->vdev_ops == &vdev_replacing_ops) {
4854 ASSERT(vd->vdev_children > 1);
4855
4856 newvd = vd->vdev_child[vd->vdev_children - 1];
4857 oldvd = vd->vdev_child[0];
4858
4859 if (vdev_dtl_empty(newvd, DTL_MISSING) &&
4860 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
4861 !vdev_dtl_required(oldvd))
4862 return (oldvd);
4863 }
4864
4865 /*
4866 * Check for a completed resilver with the 'unspare' flag set.
4867 */
4868 if (vd->vdev_ops == &vdev_spare_ops) {
4869 vdev_t *first = vd->vdev_child[0];
4870 vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
4871
4872 if (last->vdev_unspare) {
4873 oldvd = first;
4874 newvd = last;
4875 } else if (first->vdev_unspare) {
4876 oldvd = last;
4877 newvd = first;
4878 } else {
4879 oldvd = NULL;
4880 }
4881
4882 if (oldvd != NULL &&
4883 vdev_dtl_empty(newvd, DTL_MISSING) &&
4884 vdev_dtl_empty(newvd, DTL_OUTAGE) &&
4885 !vdev_dtl_required(oldvd))
4886 return (oldvd);
4887
4888 /*
4889 * If there are more than two spares attached to a disk,
4890 * and those spares are not required, then we want to
4891 * attempt to free them up now so that they can be used
4892 * by other pools. Once we're back down to a single
4893 * disk+spare, we stop removing them.
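 * E.g. (illustrative) given spare(A,s1,s2) with the data fully
 * resilvered, one of the extra spares is detached and returned to the
 * pool's list of available spares.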
4894 */ 4895 if (vd->vdev_children > 2) { 4896 newvd = vd->vdev_child[1]; 4897 4898 if (newvd->vdev_isspare && last->vdev_isspare && 4899 vdev_dtl_empty(last, DTL_MISSING) && 4900 vdev_dtl_empty(last, DTL_OUTAGE) && 4901 !vdev_dtl_required(newvd)) 4902 return (newvd); 4903 } 4904 } 4905 4906 return (NULL); 4907 } 4908 4909 static void 4910 spa_vdev_resilver_done(spa_t *spa) 4911 { 4912 vdev_t *vd, *pvd, *ppvd; 4913 uint64_t guid, sguid, pguid, ppguid; 4914 4915 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4916 4917 while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { 4918 pvd = vd->vdev_parent; 4919 ppvd = pvd->vdev_parent; 4920 guid = vd->vdev_guid; 4921 pguid = pvd->vdev_guid; 4922 ppguid = ppvd->vdev_guid; 4923 sguid = 0; 4924 /* 4925 * If we have just finished replacing a hot spared device, then 4926 * we need to detach the parent's first child (the original hot 4927 * spare) as well. 4928 */ 4929 if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && 4930 ppvd->vdev_children == 2) { 4931 ASSERT(pvd->vdev_ops == &vdev_replacing_ops); 4932 sguid = ppvd->vdev_child[1]->vdev_guid; 4933 } 4934 spa_config_exit(spa, SCL_ALL, FTAG); 4935 if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) 4936 return; 4937 if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) 4938 return; 4939 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 4940 } 4941 4942 spa_config_exit(spa, SCL_ALL, FTAG); 4943 } 4944 4945 /* 4946 * Update the stored path or FRU for this vdev. 4947 */ 4948 int 4949 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, 4950 boolean_t ispath) 4951 { 4952 vdev_t *vd; 4953 boolean_t sync = B_FALSE; 4954 4955 ASSERT(spa_writeable(spa)); 4956 4957 spa_vdev_state_enter(spa, SCL_ALL); 4958 4959 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) 4960 return (spa_vdev_state_exit(spa, NULL, ENOENT)); 4961 4962 if (!vd->vdev_ops->vdev_op_leaf) 4963 return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); 4964 4965 if (ispath) { 4966 if (strcmp(value, vd->vdev_path) != 0) { 4967 spa_strfree(vd->vdev_path); 4968 vd->vdev_path = spa_strdup(value); 4969 sync = B_TRUE; 4970 } 4971 } else { 4972 if (vd->vdev_fru == NULL) { 4973 vd->vdev_fru = spa_strdup(value); 4974 sync = B_TRUE; 4975 } else if (strcmp(value, vd->vdev_fru) != 0) { 4976 spa_strfree(vd->vdev_fru); 4977 vd->vdev_fru = spa_strdup(value); 4978 sync = B_TRUE; 4979 } 4980 } 4981 4982 return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); 4983 } 4984 4985 int 4986 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) 4987 { 4988 return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); 4989 } 4990 4991 int 4992 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) 4993 { 4994 return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); 4995 } 4996 4997 /* 4998 * ========================================================================== 4999 * SPA Scanning 5000 * ========================================================================== 5001 */ 5002 5003 int 5004 spa_scan_stop(spa_t *spa) 5005 { 5006 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5007 if (dsl_scan_resilvering(spa->spa_dsl_pool)) 5008 return (EBUSY); 5009 return (dsl_scan_cancel(spa->spa_dsl_pool)); 5010 } 5011 5012 int 5013 spa_scan(spa_t *spa, pool_scan_func_t func) 5014 { 5015 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); 5016 5017 if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) 5018 return (ENOTSUP); 5019 5020 /* 5021 * If a resilver was requested, but there is no DTL on a 5022 * writeable leaf device, we have nothing to do. 5023 */ 5024 if (func == POOL_SCAN_RESILVER && 5025 !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { 5026 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); 5027 return (0); 5028 } 5029 5030 return (dsl_scan(spa->spa_dsl_pool, func)); 5031 } 5032 5033 /* 5034 * ========================================================================== 5035 * SPA async task processing 5036 * ========================================================================== 5037 */ 5038 5039 static void 5040 spa_async_remove(spa_t *spa, vdev_t *vd) 5041 { 5042 if (vd->vdev_remove_wanted) { 5043 vd->vdev_remove_wanted = B_FALSE; 5044 vd->vdev_delayed_close = B_FALSE; 5045 vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); 5046 5047 /* 5048 * We want to clear the stats, but we don't want to do a full 5049 * vdev_clear() as that will cause us to throw away 5050 * degraded/faulted state as well as attempt to reopen the 5051 * device, all of which is a waste. 
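 * Instead, just zero the error counters by hand below.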
5052 */ 5053 vd->vdev_stat.vs_read_errors = 0; 5054 vd->vdev_stat.vs_write_errors = 0; 5055 vd->vdev_stat.vs_checksum_errors = 0; 5056 5057 vdev_state_dirty(vd->vdev_top); 5058 } 5059 5060 for (int c = 0; c < vd->vdev_children; c++) 5061 spa_async_remove(spa, vd->vdev_child[c]); 5062 } 5063 5064 static void 5065 spa_async_probe(spa_t *spa, vdev_t *vd) 5066 { 5067 if (vd->vdev_probe_wanted) { 5068 vd->vdev_probe_wanted = B_FALSE; 5069 vdev_reopen(vd); /* vdev_open() does the actual probe */ 5070 } 5071 5072 for (int c = 0; c < vd->vdev_children; c++) 5073 spa_async_probe(spa, vd->vdev_child[c]); 5074 } 5075 5076 static void 5077 spa_async_autoexpand(spa_t *spa, vdev_t *vd) 5078 { 5079 sysevent_id_t eid; 5080 nvlist_t *attr; 5081 char *physpath; 5082 5083 if (!spa->spa_autoexpand) 5084 return; 5085 5086 for (int c = 0; c < vd->vdev_children; c++) { 5087 vdev_t *cvd = vd->vdev_child[c]; 5088 spa_async_autoexpand(spa, cvd); 5089 } 5090 5091 if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) 5092 return; 5093 5094 physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); 5095 (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); 5096 5097 VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5098 VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); 5099 5100 (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, 5101 ESC_DEV_DLE, attr, &eid, DDI_SLEEP); 5102 5103 nvlist_free(attr); 5104 kmem_free(physpath, MAXPATHLEN); 5105 } 5106 5107 static void 5108 spa_async_thread(spa_t *spa) 5109 { 5110 int tasks; 5111 5112 ASSERT(spa->spa_sync_on); 5113 5114 mutex_enter(&spa->spa_async_lock); 5115 tasks = spa->spa_async_tasks; 5116 spa->spa_async_tasks = 0; 5117 mutex_exit(&spa->spa_async_lock); 5118 5119 /* 5120 * See if the config needs to be updated. 5121 */ 5122 if (tasks & SPA_ASYNC_CONFIG_UPDATE) { 5123 uint64_t old_space, new_space; 5124 5125 mutex_enter(&spa_namespace_lock); 5126 old_space = metaslab_class_get_space(spa_normal_class(spa)); 5127 spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); 5128 new_space = metaslab_class_get_space(spa_normal_class(spa)); 5129 mutex_exit(&spa_namespace_lock); 5130 5131 /* 5132 * If the pool grew as a result of the config update, 5133 * then log an internal history event. 5134 */ 5135 if (new_space != old_space) { 5136 spa_history_log_internal(LOG_POOL_VDEV_ONLINE, 5137 spa, NULL, 5138 "pool '%s' size: %llu(+%llu)", 5139 spa_name(spa), new_space, new_space - old_space); 5140 } 5141 } 5142 5143 /* 5144 * See if any devices need to be marked REMOVED. 5145 */ 5146 if (tasks & SPA_ASYNC_REMOVE) { 5147 spa_vdev_state_enter(spa, SCL_NONE); 5148 spa_async_remove(spa, spa->spa_root_vdev); 5149 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) 5150 spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); 5151 for (int i = 0; i < spa->spa_spares.sav_count; i++) 5152 spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); 5153 (void) spa_vdev_state_exit(spa, NULL, 0); 5154 } 5155 5156 if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { 5157 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5158 spa_async_autoexpand(spa, spa->spa_root_vdev); 5159 spa_config_exit(spa, SCL_CONFIG, FTAG); 5160 } 5161 5162 /* 5163 * See if any devices need to be probed. 5164 */ 5165 if (tasks & SPA_ASYNC_PROBE) { 5166 spa_vdev_state_enter(spa, SCL_NONE); 5167 spa_async_probe(spa, spa->spa_root_vdev); 5168 (void) spa_vdev_state_exit(spa, NULL, 0); 5169 } 5170 5171 /* 5172 * If any devices are done replacing, detach them. 
5173 */ 5174 if (tasks & SPA_ASYNC_RESILVER_DONE) 5175 spa_vdev_resilver_done(spa); 5176 5177 /* 5178 * Kick off a resilver. 5179 */ 5180 if (tasks & SPA_ASYNC_RESILVER) 5181 dsl_resilver_restart(spa->spa_dsl_pool, 0); 5182 5183 /* 5184 * Let the world know that we're done. 5185 */ 5186 mutex_enter(&spa->spa_async_lock); 5187 spa->spa_async_thread = NULL; 5188 cv_broadcast(&spa->spa_async_cv); 5189 mutex_exit(&spa->spa_async_lock); 5190 thread_exit(); 5191 } 5192 5193 void 5194 spa_async_suspend(spa_t *spa) 5195 { 5196 mutex_enter(&spa->spa_async_lock); 5197 spa->spa_async_suspended++; 5198 while (spa->spa_async_thread != NULL) 5199 cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); 5200 mutex_exit(&spa->spa_async_lock); 5201 } 5202 5203 void 5204 spa_async_resume(spa_t *spa) 5205 { 5206 mutex_enter(&spa->spa_async_lock); 5207 ASSERT(spa->spa_async_suspended != 0); 5208 spa->spa_async_suspended--; 5209 mutex_exit(&spa->spa_async_lock); 5210 } 5211 5212 static void 5213 spa_async_dispatch(spa_t *spa) 5214 { 5215 mutex_enter(&spa->spa_async_lock); 5216 if (spa->spa_async_tasks && !spa->spa_async_suspended && 5217 spa->spa_async_thread == NULL && 5218 rootdir != NULL && !vn_is_readonly(rootdir)) 5219 spa->spa_async_thread = thread_create(NULL, 0, 5220 spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); 5221 mutex_exit(&spa->spa_async_lock); 5222 } 5223 5224 void 5225 spa_async_request(spa_t *spa, int task) 5226 { 5227 zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); 5228 mutex_enter(&spa->spa_async_lock); 5229 spa->spa_async_tasks |= task; 5230 mutex_exit(&spa->spa_async_lock); 5231 } 5232 5233 /* 5234 * ========================================================================== 5235 * SPA syncing routines 5236 * ========================================================================== 5237 */ 5238 5239 static int 5240 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5241 { 5242 bpobj_t *bpo = arg; 5243 bpobj_enqueue(bpo, bp, tx); 5244 return (0); 5245 } 5246 5247 static int 5248 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) 5249 { 5250 zio_t *zio = arg; 5251 5252 zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, 5253 zio->io_flags)); 5254 return (0); 5255 } 5256 5257 static void 5258 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) 5259 { 5260 char *packed = NULL; 5261 size_t bufsize; 5262 size_t nvsize = 0; 5263 dmu_buf_t *db; 5264 5265 VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); 5266 5267 /* 5268 * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration 5269 * information. This avoids the dbuf_will_dirty() path and 5270 * saves us a pre-read to get data we don't actually care about. 
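 * The packed XDR buffer is zero-padded out to a multiple of
 * SPA_CONFIG_BLOCKSIZE before dmu_write(); the true (unpadded) nvlist
 * size is recorded in the object's bonus buffer afterwards.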
5271 */ 5272 bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE); 5273 packed = kmem_alloc(bufsize, KM_SLEEP); 5274 5275 VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, 5276 KM_SLEEP) == 0); 5277 bzero(packed + nvsize, bufsize - nvsize); 5278 5279 dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); 5280 5281 kmem_free(packed, bufsize); 5282 5283 VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); 5284 dmu_buf_will_dirty(db, tx); 5285 *(uint64_t *)db->db_data = nvsize; 5286 dmu_buf_rele(db, FTAG); 5287 } 5288 5289 static void 5290 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, 5291 const char *config, const char *entry) 5292 { 5293 nvlist_t *nvroot; 5294 nvlist_t **list; 5295 int i; 5296 5297 if (!sav->sav_sync) 5298 return; 5299 5300 /* 5301 * Update the MOS nvlist describing the list of available devices. 5302 * spa_validate_aux() will have already made sure this nvlist is 5303 * valid and the vdevs are labeled appropriately. 5304 */ 5305 if (sav->sav_object == 0) { 5306 sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, 5307 DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, 5308 sizeof (uint64_t), tx); 5309 VERIFY(zap_update(spa->spa_meta_objset, 5310 DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, 5311 &sav->sav_object, tx) == 0); 5312 } 5313 5314 VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); 5315 if (sav->sav_count == 0) { 5316 VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); 5317 } else { 5318 list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); 5319 for (i = 0; i < sav->sav_count; i++) 5320 list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], 5321 B_FALSE, VDEV_CONFIG_L2CACHE); 5322 VERIFY(nvlist_add_nvlist_array(nvroot, config, list, 5323 sav->sav_count) == 0); 5324 for (i = 0; i < sav->sav_count; i++) 5325 nvlist_free(list[i]); 5326 kmem_free(list, sav->sav_count * sizeof (void *)); 5327 } 5328 5329 spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); 5330 nvlist_free(nvroot); 5331 5332 sav->sav_sync = B_FALSE; 5333 } 5334 5335 static void 5336 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) 5337 { 5338 nvlist_t *config; 5339 5340 if (list_is_empty(&spa->spa_config_dirty_list)) 5341 return; 5342 5343 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5344 5345 config = spa_config_generate(spa, spa->spa_root_vdev, 5346 dmu_tx_get_txg(tx), B_FALSE); 5347 5348 spa_config_exit(spa, SCL_STATE, FTAG); 5349 5350 if (spa->spa_config_syncing) 5351 nvlist_free(spa->spa_config_syncing); 5352 spa->spa_config_syncing = config; 5353 5354 spa_sync_nvlist(spa, spa->spa_config_object, config, tx); 5355 } 5356 5357 /* 5358 * Set zpool properties. 5359 */ 5360 static void 5361 spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx) 5362 { 5363 spa_t *spa = arg1; 5364 objset_t *mos = spa->spa_meta_objset; 5365 nvlist_t *nvp = arg2; 5366 nvpair_t *elem; 5367 uint64_t intval; 5368 char *strval; 5369 zpool_prop_t prop; 5370 const char *propname; 5371 zprop_type_t proptype; 5372 5373 mutex_enter(&spa->spa_props_lock); 5374 5375 elem = NULL; 5376 while ((elem = nvlist_next_nvpair(nvp, elem))) { 5377 switch (prop = zpool_name_to_prop(nvpair_name(elem))) { 5378 case ZPOOL_PROP_VERSION: 5379 /* 5380 * Only set version for non-zpool-creation cases 5381 * (set/import). spa_create() needs special care 5382 * for version setting. 
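 * (At pool creation time tx_txg == TXG_INITIAL and the version has
 * already been chosen by spa_create(), so it is deliberately skipped
 * here.)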
5383 */
5384 if (tx->tx_txg != TXG_INITIAL) {
5385 VERIFY(nvpair_value_uint64(elem,
5386 &intval) == 0);
5387 ASSERT(intval <= SPA_VERSION);
5388 ASSERT(intval >= spa_version(spa));
5389 spa->spa_uberblock.ub_version = intval;
5390 vdev_config_dirty(spa->spa_root_vdev);
5391 }
5392 break;
5393
5394 case ZPOOL_PROP_ALTROOT:
5395 /*
5396 * 'altroot' is a non-persistent property. It should
5397 * have been set temporarily at creation or import time.
5398 */
5399 ASSERT(spa->spa_root != NULL);
5400 break;
5401
5402 case ZPOOL_PROP_READONLY:
5403 case ZPOOL_PROP_CACHEFILE:
5404 /*
5405 * 'readonly' and 'cachefile' are also non-persistent
5406 * properties.
5407 */
5408 break;
5409 case ZPOOL_PROP_COMMENT:
5410 VERIFY(nvpair_value_string(elem, &strval) == 0);
5411 if (spa->spa_comment != NULL)
5412 spa_strfree(spa->spa_comment);
5413 spa->spa_comment = spa_strdup(strval);
5414 /*
5415 * We need to dirty the configuration on all the vdevs
5416 * so that their labels get updated. It's unnecessary
5417 * to do this for pool creation since the vdev's
5418 * configuration has already been dirtied.
5419 */
5420 if (tx->tx_txg != TXG_INITIAL)
5421 vdev_config_dirty(spa->spa_root_vdev);
5422 break;
5423 default:
5424 /*
5425 * Set pool property values in the poolprops mos object.
5426 */
5427 if (spa->spa_pool_props_object == 0) {
5428 VERIFY((spa->spa_pool_props_object =
5429 zap_create(mos, DMU_OT_POOL_PROPS,
5430 DMU_OT_NONE, 0, tx)) > 0);
5431
5432 VERIFY(zap_update(mos,
5433 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
5434 8, 1, &spa->spa_pool_props_object, tx)
5435 == 0);
5436 }
5437
5438 /* normalize the property name */
5439 propname = zpool_prop_to_name(prop);
5440 proptype = zpool_prop_get_type(prop);
5441
5442 if (nvpair_type(elem) == DATA_TYPE_STRING) {
5443 ASSERT(proptype == PROP_TYPE_STRING);
5444 VERIFY(nvpair_value_string(elem, &strval) == 0);
5445 VERIFY(zap_update(mos,
5446 spa->spa_pool_props_object, propname,
5447 1, strlen(strval) + 1, strval, tx) == 0);
5448
5449 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
5450 VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5451
5452 if (proptype == PROP_TYPE_INDEX) {
5453 const char *unused;
5454 VERIFY(zpool_prop_index_to_string(
5455 prop, intval, &unused) == 0);
5456 }
5457 VERIFY(zap_update(mos,
5458 spa->spa_pool_props_object, propname,
5459 8, 1, &intval, tx) == 0);
5460 } else {
5461 ASSERT(0); /* not allowed */
5462 }
5463
5464 switch (prop) {
5465 case ZPOOL_PROP_DELEGATION:
5466 spa->spa_delegation = intval;
5467 break;
5468 case ZPOOL_PROP_BOOTFS:
5469 spa->spa_bootfs = intval;
5470 break;
5471 case ZPOOL_PROP_FAILUREMODE:
5472 spa->spa_failmode = intval;
5473 break;
5474 case ZPOOL_PROP_AUTOEXPAND:
5475 spa->spa_autoexpand = intval;
5476 if (tx->tx_txg != TXG_INITIAL)
5477 spa_async_request(spa,
5478 SPA_ASYNC_AUTOEXPAND);
5479 break;
5480 case ZPOOL_PROP_DEDUPDITTO:
5481 spa->spa_dedup_ditto = intval;
5482 break;
5483 default:
5484 break;
5485 }
5486 }
5487
5488 /* log internal history if this is not a zpool create */
5489 if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
5490 tx->tx_txg != TXG_INITIAL) {
5491 spa_history_log_internal(LOG_POOL_PROPSET,
5492 spa, tx, "%s %lld %s",
5493 nvpair_name(elem), intval, spa_name(spa));
5494 }
5495 }
5496
5497 mutex_exit(&spa->spa_props_lock);
5498 }
5499
5500 /*
5501 * Perform one-time upgrade on-disk changes. spa_version() does not
5502 * reflect the new version this txg, so there must be no changes this
5503 * txg to anything that the upgrade code depends on after it executes.
5504 * Therefore this must be called after dsl_pool_sync() does the sync 5505 * tasks. 5506 */ 5507 static void 5508 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) 5509 { 5510 dsl_pool_t *dp = spa->spa_dsl_pool; 5511 5512 ASSERT(spa->spa_sync_pass == 1); 5513 5514 if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && 5515 spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { 5516 dsl_pool_create_origin(dp, tx); 5517 5518 /* Keeping the origin open increases spa_minref */ 5519 spa->spa_minref += 3; 5520 } 5521 5522 if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && 5523 spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { 5524 dsl_pool_upgrade_clones(dp, tx); 5525 } 5526 5527 if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && 5528 spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { 5529 dsl_pool_upgrade_dir_clones(dp, tx); 5530 5531 /* Keeping the freedir open increases spa_minref */ 5532 spa->spa_minref += 3; 5533 } 5534 } 5535 5536 /* 5537 * Sync the specified transaction group. New blocks may be dirtied as 5538 * part of the process, so we iterate until it converges. 5539 */ 5540 void 5541 spa_sync(spa_t *spa, uint64_t txg) 5542 { 5543 dsl_pool_t *dp = spa->spa_dsl_pool; 5544 objset_t *mos = spa->spa_meta_objset; 5545 bpobj_t *defer_bpo = &spa->spa_deferred_bpobj; 5546 bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; 5547 vdev_t *rvd = spa->spa_root_vdev; 5548 vdev_t *vd; 5549 dmu_tx_t *tx; 5550 int error; 5551 5552 VERIFY(spa_writeable(spa)); 5553 5554 /* 5555 * Lock out configuration changes. 5556 */ 5557 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); 5558 5559 spa->spa_syncing_txg = txg; 5560 spa->spa_sync_pass = 0; 5561 5562 /* 5563 * If there are any pending vdev state changes, convert them 5564 * into config changes that go out with this transaction group. 5565 */ 5566 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5567 while (list_head(&spa->spa_state_dirty_list) != NULL) { 5568 /* 5569 * We need the write lock here because, for aux vdevs, 5570 * calling vdev_config_dirty() modifies sav_config. 5571 * This is ugly and will become unnecessary when we 5572 * eliminate the aux vdev wart by integrating all vdevs 5573 * into the root vdev tree. 5574 */ 5575 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 5576 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); 5577 while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { 5578 vdev_state_clean(vd); 5579 vdev_config_dirty(vd); 5580 } 5581 spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); 5582 spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); 5583 } 5584 spa_config_exit(spa, SCL_STATE, FTAG); 5585 5586 tx = dmu_tx_create_assigned(dp, txg); 5587 5588 /* 5589 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, 5590 * set spa_deflate if we have no raid-z vdevs. 
5591 */ 5592 if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && 5593 spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { 5594 int i; 5595 5596 for (i = 0; i < rvd->vdev_children; i++) { 5597 vd = rvd->vdev_child[i]; 5598 if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) 5599 break; 5600 } 5601 if (i == rvd->vdev_children) { 5602 spa->spa_deflate = TRUE; 5603 VERIFY(0 == zap_add(spa->spa_meta_objset, 5604 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, 5605 sizeof (uint64_t), 1, &spa->spa_deflate, tx)); 5606 } 5607 } 5608 5609 /* 5610 * If anything has changed in this txg, or if someone is waiting 5611 * for this txg to sync (eg, spa_vdev_remove()), push the 5612 * deferred frees from the previous txg. If not, leave them 5613 * alone so that we don't generate work on an otherwise idle 5614 * system. 5615 */ 5616 if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || 5617 !txg_list_empty(&dp->dp_dirty_dirs, txg) || 5618 !txg_list_empty(&dp->dp_sync_tasks, txg) || 5619 ((dsl_scan_active(dp->dp_scan) || 5620 txg_sync_waiting(dp)) && !spa_shutting_down(spa))) { 5621 zio_t *zio = zio_root(spa, NULL, NULL, 0); 5622 VERIFY3U(bpobj_iterate(defer_bpo, 5623 spa_free_sync_cb, zio, tx), ==, 0); 5624 VERIFY3U(zio_wait(zio), ==, 0); 5625 } 5626 5627 /* 5628 * Iterate to convergence. 5629 */ 5630 do { 5631 int pass = ++spa->spa_sync_pass; 5632 5633 spa_sync_config_object(spa, tx); 5634 spa_sync_aux_dev(spa, &spa->spa_spares, tx, 5635 ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); 5636 spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, 5637 ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); 5638 spa_errlog_sync(spa, txg); 5639 dsl_pool_sync(dp, txg); 5640 5641 if (pass <= SYNC_PASS_DEFERRED_FREE) { 5642 zio_t *zio = zio_root(spa, NULL, NULL, 0); 5643 bplist_iterate(free_bpl, spa_free_sync_cb, 5644 zio, tx); 5645 VERIFY(zio_wait(zio) == 0); 5646 } else { 5647 bplist_iterate(free_bpl, bpobj_enqueue_cb, 5648 defer_bpo, tx); 5649 } 5650 5651 ddt_sync(spa, txg); 5652 dsl_scan_sync(dp, tx); 5653 5654 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) 5655 vdev_sync(vd, txg); 5656 5657 if (pass == 1) 5658 spa_sync_upgrades(spa, tx); 5659 5660 } while (dmu_objset_is_dirty(mos, txg)); 5661 5662 /* 5663 * Rewrite the vdev configuration (which includes the uberblock) 5664 * to commit the transaction group. 5665 * 5666 * If there are no dirty vdevs, we sync the uberblock to a few 5667 * random top-level vdevs that are known to be visible in the 5668 * config cache (see spa_vdev_add() for a complete description). 5669 * If there *are* dirty vdevs, sync the uberblock to all vdevs. 5670 */ 5671 for (;;) { 5672 /* 5673 * We hold SCL_STATE to prevent vdev open/close/etc. 5674 * while we're attempting to write the vdev labels. 
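 * If writing the labels and uberblock fails, it is retried once; if
 * that also fails we suspend the pool's I/O (zio_suspend()) and wait
 * for it to resume before trying again.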
5675 */ 5676 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); 5677 5678 if (list_is_empty(&spa->spa_config_dirty_list)) { 5679 vdev_t *svd[SPA_DVAS_PER_BP]; 5680 int svdcount = 0; 5681 int children = rvd->vdev_children; 5682 int c0 = spa_get_random(children); 5683 5684 for (int c = 0; c < children; c++) { 5685 vd = rvd->vdev_child[(c0 + c) % children]; 5686 if (vd->vdev_ms_array == 0 || vd->vdev_islog) 5687 continue; 5688 svd[svdcount++] = vd; 5689 if (svdcount == SPA_DVAS_PER_BP) 5690 break; 5691 } 5692 error = vdev_config_sync(svd, svdcount, txg, B_FALSE); 5693 if (error != 0) 5694 error = vdev_config_sync(svd, svdcount, txg, 5695 B_TRUE); 5696 } else { 5697 error = vdev_config_sync(rvd->vdev_child, 5698 rvd->vdev_children, txg, B_FALSE); 5699 if (error != 0) 5700 error = vdev_config_sync(rvd->vdev_child, 5701 rvd->vdev_children, txg, B_TRUE); 5702 } 5703 5704 spa_config_exit(spa, SCL_STATE, FTAG); 5705 5706 if (error == 0) 5707 break; 5708 zio_suspend(spa, NULL); 5709 zio_resume_wait(spa); 5710 } 5711 dmu_tx_commit(tx); 5712 5713 /* 5714 * Clear the dirty config list. 5715 */ 5716 while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) 5717 vdev_config_clean(vd); 5718 5719 /* 5720 * Now that the new config has synced transactionally, 5721 * let it become visible to the config cache. 5722 */ 5723 if (spa->spa_config_syncing != NULL) { 5724 spa_config_set(spa, spa->spa_config_syncing); 5725 spa->spa_config_txg = txg; 5726 spa->spa_config_syncing = NULL; 5727 } 5728 5729 spa->spa_ubsync = spa->spa_uberblock; 5730 5731 dsl_pool_sync_done(dp, txg); 5732 5733 /* 5734 * Update usable space statistics. 5735 */ 5736 while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) 5737 vdev_sync_done(vd, txg); 5738 5739 spa_update_dspace(spa); 5740 5741 /* 5742 * It had better be the case that we didn't dirty anything 5743 * since vdev_config_sync(). 5744 */ 5745 ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); 5746 ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); 5747 ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); 5748 5749 spa->spa_sync_pass = 0; 5750 5751 spa_config_exit(spa, SCL_CONFIG, FTAG); 5752 5753 spa_handle_ignored_writes(spa); 5754 5755 /* 5756 * If any async tasks have been requested, kick them off. 5757 */ 5758 spa_async_dispatch(spa); 5759 } 5760 5761 /* 5762 * Sync all pools. We don't want to hold the namespace lock across these 5763 * operations, so we take a reference on the spa_t and drop the lock during the 5764 * sync. 5765 */ 5766 void 5767 spa_sync_allpools(void) 5768 { 5769 spa_t *spa = NULL; 5770 mutex_enter(&spa_namespace_lock); 5771 while ((spa = spa_next(spa)) != NULL) { 5772 if (spa_state(spa) != POOL_STATE_ACTIVE || 5773 !spa_writeable(spa) || spa_suspended(spa)) 5774 continue; 5775 spa_open_ref(spa, FTAG); 5776 mutex_exit(&spa_namespace_lock); 5777 txg_wait_synced(spa_get_dsl(spa), 0); 5778 mutex_enter(&spa_namespace_lock); 5779 spa_close(spa, FTAG); 5780 } 5781 mutex_exit(&spa_namespace_lock); 5782 } 5783 5784 /* 5785 * ========================================================================== 5786 * Miscellaneous routines 5787 * ========================================================================== 5788 */ 5789 5790 /* 5791 * Remove all pools in the system. 5792 */ 5793 void 5794 spa_evict_all(void) 5795 { 5796 spa_t *spa; 5797 5798 /* 5799 * Remove all cached state. All pools should be closed now, 5800 * so every spa in the AVL tree should be unreferenced. 
5801 */ 5802 mutex_enter(&spa_namespace_lock); 5803 while ((spa = spa_next(NULL)) != NULL) { 5804 /* 5805 * Stop async tasks. The async thread may need to detach 5806 * a device that's been replaced, which requires grabbing 5807 * spa_namespace_lock, so we must drop it here. 5808 */ 5809 spa_open_ref(spa, FTAG); 5810 mutex_exit(&spa_namespace_lock); 5811 spa_async_suspend(spa); 5812 mutex_enter(&spa_namespace_lock); 5813 spa_close(spa, FTAG); 5814 5815 if (spa->spa_state != POOL_STATE_UNINITIALIZED) { 5816 spa_unload(spa); 5817 spa_deactivate(spa); 5818 } 5819 spa_remove(spa); 5820 } 5821 mutex_exit(&spa_namespace_lock); 5822 } 5823 5824 vdev_t * 5825 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) 5826 { 5827 vdev_t *vd; 5828 int i; 5829 5830 if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) 5831 return (vd); 5832 5833 if (aux) { 5834 for (i = 0; i < spa->spa_l2cache.sav_count; i++) { 5835 vd = spa->spa_l2cache.sav_vdevs[i]; 5836 if (vd->vdev_guid == guid) 5837 return (vd); 5838 } 5839 5840 for (i = 0; i < spa->spa_spares.sav_count; i++) { 5841 vd = spa->spa_spares.sav_vdevs[i]; 5842 if (vd->vdev_guid == guid) 5843 return (vd); 5844 } 5845 } 5846 5847 return (NULL); 5848 } 5849 5850 void 5851 spa_upgrade(spa_t *spa, uint64_t version) 5852 { 5853 ASSERT(spa_writeable(spa)); 5854 5855 spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); 5856 5857 /* 5858 * This should only be called for a non-faulted pool, and since a 5859 * future version would result in an unopenable pool, this shouldn't be 5860 * possible. 5861 */ 5862 ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION); 5863 ASSERT(version >= spa->spa_uberblock.ub_version); 5864 5865 spa->spa_uberblock.ub_version = version; 5866 vdev_config_dirty(spa->spa_root_vdev); 5867 5868 spa_config_exit(spa, SCL_ALL, FTAG); 5869 5870 txg_wait_synced(spa_get_dsl(spa), 0); 5871 } 5872 5873 boolean_t 5874 spa_has_spare(spa_t *spa, uint64_t guid) 5875 { 5876 int i; 5877 uint64_t spareguid; 5878 spa_aux_vdev_t *sav = &spa->spa_spares; 5879 5880 for (i = 0; i < sav->sav_count; i++) 5881 if (sav->sav_vdevs[i]->vdev_guid == guid) 5882 return (B_TRUE); 5883 5884 for (i = 0; i < sav->sav_npending; i++) { 5885 if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, 5886 &spareguid) == 0 && spareguid == guid) 5887 return (B_TRUE); 5888 } 5889 5890 return (B_FALSE); 5891 } 5892 5893 /* 5894 * Check if a pool has an active shared spare device. 5895 * Note: reference count of an active spare is 2, as a spare and as a replace 5896 */ 5897 static boolean_t 5898 spa_has_active_shared_spare(spa_t *spa) 5899 { 5900 int i, refcnt; 5901 uint64_t pool; 5902 spa_aux_vdev_t *sav = &spa->spa_spares; 5903 5904 for (i = 0; i < sav->sav_count; i++) { 5905 if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, 5906 &refcnt) && pool != 0ULL && pool == spa_guid(spa) && 5907 refcnt > 2) 5908 return (B_TRUE); 5909 } 5910 5911 return (B_FALSE); 5912 } 5913 5914 /* 5915 * Post a sysevent corresponding to the given event. The 'name' must be one of 5916 * the event definitions in sys/sysevent/eventdefs.h. The payload will be 5917 * filled in from the spa and (optionally) the vdev. This doesn't do anything 5918 * in the userland libzpool, as we don't want consumers to misinterpret ztest 5919 * or zdb as real changes. 
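 * E.g. the ESC_ZFS_VDEV_SPARE and ESC_ZFS_VDEV_REMOVE notifications
 * posted above come through here; the payload carries the pool name
 * and guid and, when a vdev is supplied, its guid and path.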
5920 */ 5921 void 5922 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) 5923 { 5924 #ifdef _KERNEL 5925 sysevent_t *ev; 5926 sysevent_attr_list_t *attr = NULL; 5927 sysevent_value_t value; 5928 sysevent_id_t eid; 5929 5930 ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", 5931 SE_SLEEP); 5932 5933 value.value_type = SE_DATA_TYPE_STRING; 5934 value.value.sv_string = spa_name(spa); 5935 if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) 5936 goto done; 5937 5938 value.value_type = SE_DATA_TYPE_UINT64; 5939 value.value.sv_uint64 = spa_guid(spa); 5940 if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) 5941 goto done; 5942 5943 if (vd) { 5944 value.value_type = SE_DATA_TYPE_UINT64; 5945 value.value.sv_uint64 = vd->vdev_guid; 5946 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, 5947 SE_SLEEP) != 0) 5948 goto done; 5949 5950 if (vd->vdev_path) { 5951 value.value_type = SE_DATA_TYPE_STRING; 5952 value.value.sv_string = vd->vdev_path; 5953 if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, 5954 &value, SE_SLEEP) != 0) 5955 goto done; 5956 } 5957 } 5958 5959 if (sysevent_attach_attributes(ev, attr) != 0) 5960 goto done; 5961 attr = NULL; 5962 5963 (void) log_sysevent(ev, SE_SLEEP, &eid); 5964 5965 done: 5966 if (attr) 5967 sysevent_free_attr(attr); 5968 sysevent_free(ev); 5969 #endif 5970 } 5971