1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3 * CDDL HEADER START
4 *
5 * The contents of this file are subject to the terms of the
6 * Common Development and Distribution License (the "License").
7 * You may not use this file except in compliance with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or https://opensource.org/licenses/CDDL-1.0.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22
23 /*
24 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
26 * Copyright 2017 Nexenta Systems, Inc.
27 * Copyright (c) 2014 Integros [integros.com]
28 * Copyright 2016 Toomas Soome <tsoome@me.com>
29 * Copyright 2017 Joyent, Inc.
30 * Copyright (c) 2017, Intel Corporation.
31 * Copyright (c) 2019, Datto Inc. All rights reserved.
32 * Copyright (c) 2021, 2025, Klara, Inc.
33 * Copyright (c) 2021, 2023 Hewlett Packard Enterprise Development LP.
34 */
35
36 #include <sys/zfs_context.h>
37 #include <sys/fm/fs/zfs.h>
38 #include <sys/spa.h>
39 #include <sys/spa_impl.h>
40 #include <sys/bpobj.h>
41 #include <sys/dmu.h>
42 #include <sys/dmu_tx.h>
43 #include <sys/dsl_dir.h>
44 #include <sys/vdev_impl.h>
45 #include <sys/vdev_rebuild.h>
46 #include <sys/vdev_draid.h>
47 #include <sys/uberblock_impl.h>
48 #include <sys/metaslab.h>
49 #include <sys/metaslab_impl.h>
50 #include <sys/space_map.h>
51 #include <sys/space_reftree.h>
52 #include <sys/zio.h>
53 #include <sys/zap.h>
54 #include <sys/fs/zfs.h>
55 #include <sys/arc.h>
56 #include <sys/zil.h>
57 #include <sys/dsl_scan.h>
58 #include <sys/vdev_raidz.h>
59 #include <sys/abd.h>
60 #include <sys/vdev_initialize.h>
61 #include <sys/vdev_trim.h>
62 #include <sys/vdev_raidz.h>
63 #include <sys/zvol.h>
64 #include <sys/zfs_ratelimit.h>
65 #include "zfs_prop.h"
66
67 /*
68 * One metaslab from each (normal-class) vdev is used by the ZIL. These are
69 * called "embedded slog metaslabs", are referenced by vdev_log_mg, and are
70 * part of the spa_embedded_log_class. The metaslab with the most free space
71 * in each vdev is selected for this purpose when the pool is opened (or a
72 * vdev is added). See vdev_metaslab_init().
73 *
74 * Log blocks can be allocated from the following locations. Each one is tried
75 * in order until the allocation succeeds:
76 * 1. dedicated log vdevs, aka "slog" (spa_log_class)
77 * 2. embedded slog metaslabs (spa_embedded_log_class)
78 * 3. other metaslabs in normal vdevs (spa_normal_class)
79 *
80 * zfs_embedded_slog_min_ms disables the embedded slog if there are fewer
81 * than this number of metaslabs in the vdev. This ensures that we don't set
82 * aside an unreasonable amount of space for the ZIL. If set to less than
83 * 1 << (spa_slop_shift + 1), on small pools the usable space may be reduced
84 * (by more than 1<<spa_slop_shift) due to the embedded slog metaslab.
85 */
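/*
 * Illustrative sizing note (editorial example, assuming the default tunables
 * declared below): the smallest metaslab is 1 << zfs_vdev_default_ms_shift =
 * 512M, so a top-level vdev needs more than 64 such metaslabs -- roughly 32G
 * of allocatable space -- before one of them is set aside as an embedded
 * slog. Smaller vdevs contribute no embedded slog metaslab; their log blocks
 * come from a dedicated slog (if any) or from the normal class.
 */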
86 static uint_t zfs_embedded_slog_min_ms = 64;
87
88 /* default target for number of metaslabs per top-level vdev */
89 static uint_t zfs_vdev_default_ms_count = 200;
90
91 /* minimum number of metaslabs per top-level vdev */
92 static uint_t zfs_vdev_min_ms_count = 16;
93
94 /* practical upper limit of total metaslabs per top-level vdev */
95 static uint_t zfs_vdev_ms_count_limit = 1ULL << 17;
96
97 /* lower limit for metaslab size (512M) */
98 static uint_t zfs_vdev_default_ms_shift = 29;
99
100 /* upper limit for metaslab size (16G) */
101 static uint_t zfs_vdev_max_ms_shift = 34;
102
103 static int vdev_validate_skip = B_FALSE;
104
105 /*
106 * Since the DTL space map of a vdev is not expected to have a lot of
107 * entries, we default its block size to 4K.
108 */
109 int zfs_vdev_dtl_sm_blksz = (1 << 12);
110
111 /*
112 * Rate limit slow IO (delay) events to this many per second.
113 */
114 static unsigned int zfs_slow_io_events_per_second = 20;
115
116 /*
117 * Rate limit deadman "hung IO" events to this many per second.
118 */
119 static unsigned int zfs_deadman_events_per_second = 1;
120
121 /*
122  * Rate limit direct write IO verify failures to this many per second.
123 */
124 static unsigned int zfs_dio_write_verify_events_per_second = 20;
125
126 /*
127 * Rate limit checksum events after this many checksum errors per second.
128 */
129 static unsigned int zfs_checksum_events_per_second = 20;
130
131 /*
132  * Ignore errors during scrub/resilver. This allows a resilver triggered
133  * upon import to work around pool errors.
134 */
135 static int zfs_scan_ignore_errors = 0;
136
137 /*
138 * vdev-wide space maps that have lots of entries written to them at
139 * the end of each transaction can benefit from a higher I/O bandwidth
140 * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
141 */
142 int zfs_vdev_standard_sm_blksz = (1 << 17);
143
144 /*
145 * Tunable parameter for debugging or performance analysis. Setting this
146 * will cause pool corruption on power loss if a volatile out-of-order
147 * write cache is enabled.
148 */
149 int zfs_nocacheflush = 0;
150
151 /*
152 * Maximum and minimum ashift values that can be automatically set based on
153 * vdev's physical ashift (disk's physical sector size). While ASHIFT_MAX
154  * is higher than this maximum, it is intentionally limited here to not
155 * excessively impact pool space efficiency. Higher ashift values may still
156 * be forced by vdev logical ashift or by user via ashift property, but won't
157 * be set automatically as a performance optimization.
158 */
159 uint_t zfs_vdev_max_auto_ashift = 14;
160 uint_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
161
162 /*
163  * VDEV checksum verification for Direct I/O writes. This is necessary for
164  * Linux, because anonymous pages cannot be placed under write protection
165 * during Direct I/O writes.
166 */
167 #if !defined(__FreeBSD__)
168 uint_t zfs_vdev_direct_write_verify = 1;
169 #else
170 uint_t zfs_vdev_direct_write_verify = 0;
171 #endif
172
173 void
174 vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
175 {
176 va_list adx;
177 char buf[256];
178
179 va_start(adx, fmt);
180 (void) vsnprintf(buf, sizeof (buf), fmt, adx);
181 va_end(adx);
182
183 if (vd->vdev_path != NULL) {
184 zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
185 vd->vdev_path, buf);
186 } else {
187 zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
188 vd->vdev_ops->vdev_op_type,
189 (u_longlong_t)vd->vdev_id,
190 (u_longlong_t)vd->vdev_guid, buf);
191 }
192 }
193
194 void
195 vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
196 {
197 char state[20];
198
199 if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
200 zfs_dbgmsg("%*svdev %llu: %s", indent, "",
201 (u_longlong_t)vd->vdev_id,
202 vd->vdev_ops->vdev_op_type);
203 return;
204 }
205
206 switch (vd->vdev_state) {
207 case VDEV_STATE_UNKNOWN:
208 (void) snprintf(state, sizeof (state), "unknown");
209 break;
210 case VDEV_STATE_CLOSED:
211 (void) snprintf(state, sizeof (state), "closed");
212 break;
213 case VDEV_STATE_OFFLINE:
214 (void) snprintf(state, sizeof (state), "offline");
215 break;
216 case VDEV_STATE_REMOVED:
217 (void) snprintf(state, sizeof (state), "removed");
218 break;
219 case VDEV_STATE_CANT_OPEN:
220 (void) snprintf(state, sizeof (state), "can't open");
221 break;
222 case VDEV_STATE_FAULTED:
223 (void) snprintf(state, sizeof (state), "faulted");
224 break;
225 case VDEV_STATE_DEGRADED:
226 (void) snprintf(state, sizeof (state), "degraded");
227 break;
228 case VDEV_STATE_HEALTHY:
229 (void) snprintf(state, sizeof (state), "healthy");
230 break;
231 default:
232 (void) snprintf(state, sizeof (state), "<state %u>",
233 (uint_t)vd->vdev_state);
234 }
235
236 zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
237 "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type,
238 vd->vdev_islog ? " (log)" : "",
239 (u_longlong_t)vd->vdev_guid,
240 vd->vdev_path ? vd->vdev_path : "N/A", state);
241
242 for (uint64_t i = 0; i < vd->vdev_children; i++)
243 vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
244 }
245
246 char *
247 vdev_rt_name(vdev_t *vd, const char *name)
248 {
249 return (kmem_asprintf("{spa=%s vdev_guid=%llu %s}",
250 spa_name(vd->vdev_spa),
251 (u_longlong_t)vd->vdev_guid,
252 name));
253 }
254
255 static char *
256 vdev_rt_name_dtl(vdev_t *vd, const char *name, vdev_dtl_type_t dtl_type)
257 {
258 return (kmem_asprintf("{spa=%s vdev_guid=%llu %s[%d]}",
259 spa_name(vd->vdev_spa),
260 (u_longlong_t)vd->vdev_guid,
261 name,
262 dtl_type));
263 }
264
265 /*
266 * Virtual device management.
267 */
268
269 static vdev_ops_t *const vdev_ops_table[] = {
270 &vdev_root_ops,
271 &vdev_raidz_ops,
272 &vdev_draid_ops,
273 &vdev_draid_spare_ops,
274 &vdev_mirror_ops,
275 &vdev_replacing_ops,
276 &vdev_spare_ops,
277 &vdev_disk_ops,
278 &vdev_file_ops,
279 &vdev_missing_ops,
280 &vdev_hole_ops,
281 &vdev_indirect_ops,
282 NULL
283 };
284
285 /*
286 * Given a vdev type, return the appropriate ops vector.
287 */
288 static vdev_ops_t *
289 vdev_getops(const char *type)
290 {
291 vdev_ops_t *ops, *const *opspp;
292
293 for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
294 if (strcmp(ops->vdev_op_type, type) == 0)
295 break;
296
297 return (ops);
298 }
299
300 /*
301 * Given a vdev and a metaslab class, find which metaslab group we're
302  * interested in. A vdev may belong to two different metaslab classes.
303 * Dedicated slog devices use only the primary metaslab group, rather than a
304 * separate log group. For embedded slogs, vdev_log_mg will be non-NULL and
305 * will point to a metaslab group of either embedded_log_class (for normal
306 * vdevs) or special_embedded_log_class (for special vdevs).
307 */
308 metaslab_group_t *
309 vdev_get_mg(vdev_t *vd, metaslab_class_t *mc)
310 {
311 if ((mc == spa_embedded_log_class(vd->vdev_spa) ||
312 mc == spa_special_embedded_log_class(vd->vdev_spa)) &&
313 vd->vdev_log_mg != NULL)
314 return (vd->vdev_log_mg);
315 else
316 return (vd->vdev_mg);
317 }
318
319 void
320 vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
321 zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
322 {
323 (void) vd, (void) remain_rs;
324
325 physical_rs->rs_start = logical_rs->rs_start;
326 physical_rs->rs_end = logical_rs->rs_end;
327 }
328
329 /*
330 * Derive the enumerated allocation bias from string input.
331 * String origin is either the per-vdev zap or zpool(8).
332 */
333 static vdev_alloc_bias_t
334 vdev_derive_alloc_bias(const char *bias)
335 {
336 vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
337
338 if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
339 alloc_bias = VDEV_BIAS_LOG;
340 else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
341 alloc_bias = VDEV_BIAS_SPECIAL;
342 else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
343 alloc_bias = VDEV_BIAS_DEDUP;
344
345 return (alloc_bias);
346 }
347
348 uint64_t
349 vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg)
350 {
351 ASSERT0(asize % (1ULL << vd->vdev_top->vdev_ashift));
352 uint64_t csize, psize = asize;
353 for (int c = 0; c < vd->vdev_children; c++) {
354 csize = vdev_asize_to_psize_txg(vd->vdev_child[c], asize, txg);
355 psize = MIN(psize, csize);
356 }
357
358 return (psize);
359 }
360
361 /*
362 * Default asize function: return the MAX of psize with the asize of
363 * all children. This is what's used by anything other than RAID-Z.
364 */
365 uint64_t
366 vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
367 {
368 uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift);
369 uint64_t csize;
370
371 for (int c = 0; c < vd->vdev_children; c++) {
372 csize = vdev_psize_to_asize_txg(vd->vdev_child[c], psize, txg);
373 asize = MAX(asize, csize);
374 }
375
376 return (asize);
377 }
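
/*
 * Worked example (illustrative numbers only): on a top-level vdev with
 * ashift 12, a psize of 5000 bytes first rounds up to
 * P2ROUNDUP(5000, 1ULL << 12) = 8192; the result is then the MAX of that
 * value and each child's vdev_psize_to_asize_txg(), so the child requiring
 * the most space determines the returned asize.
 */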
378
379 uint64_t
380 vdev_default_min_asize(vdev_t *vd)
381 {
382 return (vd->vdev_min_asize);
383 }
384
385 /*
386 * Get the minimum allocatable size. We define the allocatable size as
387 * the vdev's asize rounded to the nearest metaslab. This allows us to
388 * replace or attach devices which don't have the same physical size but
389 * can still satisfy the same number of allocations.
390 */
391 uint64_t
392 vdev_get_min_asize(vdev_t *vd)
393 {
394 vdev_t *pvd = vd->vdev_parent;
395
396 /*
397 * If our parent is NULL (inactive spare or cache) or is the root,
398 * just return our own asize.
399 */
400 if (pvd == NULL)
401 return (vd->vdev_asize);
402
403 /*
404 * The top-level vdev just returns the allocatable size rounded
405 * to the nearest metaslab.
406 */
407 if (vd == vd->vdev_top)
408 return (P2ALIGN_TYPED(vd->vdev_asize, 1ULL << vd->vdev_ms_shift,
409 uint64_t));
410
411 return (pvd->vdev_ops->vdev_op_min_asize(pvd));
412 }
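
/*
 * Worked example (illustrative numbers only): with 512M metaslabs
 * (vdev_ms_shift == 29), a top-level asize of 10.3G aligns down to 10G, so
 * a replacement or attached device only has to provide at least 10G of
 * allocatable space to back the same number of metaslabs.
 */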
413
414 void
415 vdev_set_min_asize(vdev_t *vd)
416 {
417 vd->vdev_min_asize = vdev_get_min_asize(vd);
418
419 for (int c = 0; c < vd->vdev_children; c++)
420 vdev_set_min_asize(vd->vdev_child[c]);
421 }
422
423 /*
424 * Get the minimal allocation size for the top-level vdev.
425 */
426 uint64_t
427 vdev_get_min_alloc(vdev_t *vd)
428 {
429 uint64_t min_alloc = 1ULL << vd->vdev_ashift;
430
431 if (vd->vdev_ops->vdev_op_min_alloc != NULL)
432 min_alloc = vd->vdev_ops->vdev_op_min_alloc(vd);
433
434 return (min_alloc);
435 }
436
437 /*
438 * Get the parity level for a top-level vdev.
439 */
440 uint64_t
441 vdev_get_nparity(vdev_t *vd)
442 {
443 uint64_t nparity = 0;
444
445 if (vd->vdev_ops->vdev_op_nparity != NULL)
446 nparity = vd->vdev_ops->vdev_op_nparity(vd);
447
448 return (nparity);
449 }
450
451 static int
452 vdev_prop_get_int(vdev_t *vd, vdev_prop_t prop, uint64_t *value)
453 {
454 spa_t *spa = vd->vdev_spa;
455 objset_t *mos = spa->spa_meta_objset;
456 uint64_t objid;
457 int err;
458
459 if (vd->vdev_root_zap != 0) {
460 objid = vd->vdev_root_zap;
461 } else if (vd->vdev_top_zap != 0) {
462 objid = vd->vdev_top_zap;
463 } else if (vd->vdev_leaf_zap != 0) {
464 objid = vd->vdev_leaf_zap;
465 } else {
466 return (EINVAL);
467 }
468
469 err = zap_lookup(mos, objid, vdev_prop_to_name(prop),
470 sizeof (uint64_t), 1, value);
471
472 if (err == ENOENT)
473 *value = vdev_prop_default_numeric(prop);
474
475 return (err);
476 }
477
478 /*
479 * Get the number of data disks for a top-level vdev.
480 */
481 uint64_t
482 vdev_get_ndisks(vdev_t *vd)
483 {
484 uint64_t ndisks = 1;
485
486 if (vd->vdev_ops->vdev_op_ndisks != NULL)
487 ndisks = vd->vdev_ops->vdev_op_ndisks(vd);
488
489 return (ndisks);
490 }
491
492 vdev_t *
493 vdev_lookup_top(spa_t *spa, uint64_t vdev)
494 {
495 vdev_t *rvd = spa->spa_root_vdev;
496
497 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
498
499 if (vdev < rvd->vdev_children) {
500 ASSERT(rvd->vdev_child[vdev] != NULL);
501 return (rvd->vdev_child[vdev]);
502 }
503
504 return (NULL);
505 }
506
507 vdev_t *
508 vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
509 {
510 vdev_t *mvd;
511
512 if (vd->vdev_guid == guid)
513 return (vd);
514
515 for (int c = 0; c < vd->vdev_children; c++)
516 if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) !=
517 NULL)
518 return (mvd);
519
520 return (NULL);
521 }
522
523 static int
524 vdev_count_leaves_impl(vdev_t *vd)
525 {
526 int n = 0;
527
528 if (vd->vdev_ops->vdev_op_leaf)
529 return (1);
530
531 for (int c = 0; c < vd->vdev_children; c++)
532 n += vdev_count_leaves_impl(vd->vdev_child[c]);
533
534 return (n);
535 }
536
537 int
538 vdev_count_leaves(spa_t *spa)
539 {
540 int rc;
541
542 spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
543 rc = vdev_count_leaves_impl(spa->spa_root_vdev);
544 spa_config_exit(spa, SCL_VDEV, FTAG);
545
546 return (rc);
547 }
548
549 void
550 vdev_add_child(vdev_t *pvd, vdev_t *cvd)
551 {
552 size_t oldsize, newsize;
553 uint64_t id = cvd->vdev_id;
554 vdev_t **newchild;
555
556 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
557 ASSERT0P(cvd->vdev_parent);
558
559 cvd->vdev_parent = pvd;
560
561 if (pvd == NULL)
562 return;
563
564 ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL);
565
566 oldsize = pvd->vdev_children * sizeof (vdev_t *);
567 pvd->vdev_children = MAX(pvd->vdev_children, id + 1);
568 newsize = pvd->vdev_children * sizeof (vdev_t *);
569
570 newchild = kmem_alloc(newsize, KM_SLEEP);
571 if (pvd->vdev_child != NULL) {
572 memcpy(newchild, pvd->vdev_child, oldsize);
573 kmem_free(pvd->vdev_child, oldsize);
574 }
575
576 pvd->vdev_child = newchild;
577 pvd->vdev_child[id] = cvd;
578 pvd->vdev_nonrot &= cvd->vdev_nonrot;
579
580 cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd);
581 ASSERT0P(cvd->vdev_top->vdev_parent->vdev_parent);
582
583 /*
584 * Walk up all ancestors to update guid sum.
585 */
586 for (; pvd != NULL; pvd = pvd->vdev_parent)
587 pvd->vdev_guid_sum += cvd->vdev_guid_sum;
588
589 if (cvd->vdev_ops->vdev_op_leaf) {
590 list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd);
591 cvd->vdev_spa->spa_leaf_list_gen++;
592 }
593 }
594
595 void
596 vdev_remove_child(vdev_t *pvd, vdev_t *cvd)
597 {
598 int c;
599 uint_t id = cvd->vdev_id;
600
601 ASSERT(cvd->vdev_parent == pvd);
602
603 if (pvd == NULL)
604 return;
605
606 ASSERT(id < pvd->vdev_children);
607 ASSERT(pvd->vdev_child[id] == cvd);
608
609 pvd->vdev_child[id] = NULL;
610 cvd->vdev_parent = NULL;
611
612 for (c = 0; c < pvd->vdev_children; c++)
613 if (pvd->vdev_child[c])
614 break;
615
616 if (c == pvd->vdev_children) {
617 kmem_free(pvd->vdev_child, c * sizeof (vdev_t *));
618 pvd->vdev_child = NULL;
619 pvd->vdev_children = 0;
620 }
621
622 if (cvd->vdev_ops->vdev_op_leaf) {
623 spa_t *spa = cvd->vdev_spa;
624 list_remove(&spa->spa_leaf_list, cvd);
625 spa->spa_leaf_list_gen++;
626 }
627
628 /*
629 * Walk up all ancestors to update guid sum.
630 */
631 for (; pvd != NULL; pvd = pvd->vdev_parent)
632 pvd->vdev_guid_sum -= cvd->vdev_guid_sum;
633 }
634
635 /*
636 * Remove any holes in the child array.
637 */
638 void
639 vdev_compact_children(vdev_t *pvd)
640 {
641 vdev_t **newchild, *cvd;
642 int oldc = pvd->vdev_children;
643 int newc;
644
645 ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
646
647 if (oldc == 0)
648 return;
649
650 for (int c = newc = 0; c < oldc; c++)
651 if (pvd->vdev_child[c])
652 newc++;
653
654 if (newc > 0) {
655 newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP);
656
657 for (int c = newc = 0; c < oldc; c++) {
658 if ((cvd = pvd->vdev_child[c]) != NULL) {
659 newchild[newc] = cvd;
660 cvd->vdev_id = newc++;
661 }
662 }
663 } else {
664 newchild = NULL;
665 }
666
667 kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *));
668 pvd->vdev_child = newchild;
669 pvd->vdev_children = newc;
670 }
671
672 /*
673 * Allocate and minimally initialize a vdev_t.
674 */
675 vdev_t *
676 vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
677 {
678 vdev_t *vd;
679 vdev_indirect_config_t *vic;
680
681 vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP);
682 vic = &vd->vdev_indirect_config;
683
684 if (spa->spa_root_vdev == NULL) {
685 ASSERT(ops == &vdev_root_ops);
686 spa->spa_root_vdev = vd;
687 spa->spa_load_guid = spa_generate_load_guid();
688 }
689
690 if (guid == 0 && ops != &vdev_hole_ops) {
691 if (spa->spa_root_vdev == vd) {
692 /*
693 * The root vdev's guid will also be the pool guid,
694 * which must be unique among all pools.
695 */
696 guid = spa_generate_guid(NULL);
697 } else {
698 /*
699 * Any other vdev's guid must be unique within the pool.
700 */
701 guid = spa_generate_guid(spa);
702 }
703 ASSERT(!spa_guid_exists(spa_guid(spa), guid));
704 }
705
706 vd->vdev_spa = spa;
707 vd->vdev_id = id;
708 vd->vdev_guid = guid;
709 vd->vdev_guid_sum = guid;
710 vd->vdev_ops = ops;
711 vd->vdev_state = VDEV_STATE_CLOSED;
712 vd->vdev_ishole = (ops == &vdev_hole_ops);
713 vic->vic_prev_indirect_vdev = UINT64_MAX;
714
715 rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL);
716 mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL);
717 vd->vdev_obsolete_segments = zfs_range_tree_create_flags(
718 NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
719 ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_obsolete_segments"));
720
721 /*
722 * Initialize rate limit structs for events. We rate limit ZIO delay
723 * and checksum events so that we don't overwhelm ZED with thousands
724 * of events when a disk is acting up.
725 */
726 zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second,
727 1);
728 zfs_ratelimit_init(&vd->vdev_deadman_rl, &zfs_deadman_events_per_second,
729 1);
730 zfs_ratelimit_init(&vd->vdev_dio_verify_rl,
731 &zfs_dio_write_verify_events_per_second, 1);
732 zfs_ratelimit_init(&vd->vdev_checksum_rl,
733 &zfs_checksum_events_per_second, 1);
734
735 /*
736 * Default Thresholds for tuning ZED
737 */
738 vd->vdev_checksum_n = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N);
739 vd->vdev_checksum_t = vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T);
740 vd->vdev_io_n = vdev_prop_default_numeric(VDEV_PROP_IO_N);
741 vd->vdev_io_t = vdev_prop_default_numeric(VDEV_PROP_IO_T);
742 vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
743 vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);
744
745 list_link_init(&vd->vdev_config_dirty_node);
746 list_link_init(&vd->vdev_state_dirty_node);
747 list_link_init(&vd->vdev_initialize_node);
748 list_link_init(&vd->vdev_leaf_node);
749 list_link_init(&vd->vdev_trim_node);
750
751 mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL);
752 mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
753 mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
754 mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
755
756 mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
757 mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
758 cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
759 cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
760
761 mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
762 mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
763 mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
764 cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL);
765 cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
766 cv_init(&vd->vdev_autotrim_kick_cv, NULL, CV_DEFAULT, NULL);
767 cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
768
769 mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
770 cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
771
772 for (int t = 0; t < DTL_TYPES; t++) {
773 vd->vdev_dtl[t] = zfs_range_tree_create_flags(
774 NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
775 ZFS_RT_F_DYN_NAME, vdev_rt_name_dtl(vd, "vdev_dtl", t));
776 }
777
778 txg_list_create(&vd->vdev_ms_list, spa,
779 offsetof(struct metaslab, ms_txg_node));
780 txg_list_create(&vd->vdev_dtl_list, spa,
781 offsetof(struct vdev, vdev_dtl_node));
782 vd->vdev_stat.vs_timestamp = gethrtime();
783 vdev_queue_init(vd);
784
785 return (vd);
786 }
787
788 /*
789 * Allocate a new vdev. The 'alloctype' is used to control whether we are
790 * creating a new vdev or loading an existing one - the behavior is slightly
791 * different for each case.
792 */
793 int
794 vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
795 int alloctype)
796 {
797 vdev_ops_t *ops;
798 const char *type;
799 uint64_t guid = 0, islog;
800 vdev_t *vd;
801 vdev_indirect_config_t *vic;
802 const char *tmp = NULL;
803 int rc;
804 vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
805 boolean_t top_level = (parent && !parent->vdev_parent);
806
807 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
808
809 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0)
810 return (SET_ERROR(EINVAL));
811
812 if ((ops = vdev_getops(type)) == NULL)
813 return (SET_ERROR(EINVAL));
814
815 /*
816 * If this is a load, get the vdev guid from the nvlist.
817 * Otherwise, vdev_alloc_common() will generate one for us.
818 */
819 if (alloctype == VDEV_ALLOC_LOAD) {
820 uint64_t label_id;
821
822 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) ||
823 label_id != id)
824 return (SET_ERROR(EINVAL));
825
826 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
827 return (SET_ERROR(EINVAL));
828 } else if (alloctype == VDEV_ALLOC_SPARE) {
829 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
830 return (SET_ERROR(EINVAL));
831 } else if (alloctype == VDEV_ALLOC_L2CACHE) {
832 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
833 return (SET_ERROR(EINVAL));
834 } else if (alloctype == VDEV_ALLOC_ROOTPOOL) {
835 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
836 return (SET_ERROR(EINVAL));
837 }
838
839 /*
840 * The first allocated vdev must be of type 'root'.
841 */
842 if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL)
843 return (SET_ERROR(EINVAL));
844
845 /*
846 * Determine whether we're a log vdev.
847 */
848 islog = 0;
849 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog);
850 if (islog && spa_version(spa) < SPA_VERSION_SLOGS)
851 return (SET_ERROR(ENOTSUP));
852
853 if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES)
854 return (SET_ERROR(ENOTSUP));
855
856 if (top_level && alloctype == VDEV_ALLOC_ADD) {
857 const char *bias;
858
859 /*
860 * If creating a top-level vdev, check for allocation
861 * classes input.
862 */
863 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS,
864 &bias) == 0) {
865 alloc_bias = vdev_derive_alloc_bias(bias);
866
867 /* spa_vdev_add() expects feature to be enabled */
868 if (spa->spa_load_state != SPA_LOAD_CREATE &&
869 !spa_feature_is_enabled(spa,
870 SPA_FEATURE_ALLOCATION_CLASSES)) {
871 return (SET_ERROR(ENOTSUP));
872 }
873 }
874
875 /* spa_vdev_add() expects feature to be enabled */
876 if (ops == &vdev_draid_ops &&
877 spa->spa_load_state != SPA_LOAD_CREATE &&
878 !spa_feature_is_enabled(spa, SPA_FEATURE_DRAID)) {
879 return (SET_ERROR(ENOTSUP));
880 }
881 }
882
883 /*
884 * Initialize the vdev specific data. This is done before calling
885 * vdev_alloc_common() since it may fail and this simplifies the
886 * error reporting and cleanup code paths.
887 */
888 void *tsd = NULL;
889 if (ops->vdev_op_init != NULL) {
890 rc = ops->vdev_op_init(spa, nv, &tsd);
891 if (rc != 0) {
892 return (rc);
893 }
894 }
895
896 vd = vdev_alloc_common(spa, id, guid, ops);
897 vd->vdev_tsd = tsd;
898 vd->vdev_islog = islog;
899
900 if (top_level && alloc_bias != VDEV_BIAS_NONE)
901 vd->vdev_alloc_bias = alloc_bias;
902
903 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &tmp) == 0)
904 vd->vdev_path = spa_strdup(tmp);
905
906 /*
907 * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a
908 * fault on a vdev and want it to persist across imports (like with
909 * zpool offline -f).
910 */
911 rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp);
912 if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) {
913 vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
914 vd->vdev_faulted = 1;
915 vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
916 }
917
918 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &tmp) == 0)
919 vd->vdev_devid = spa_strdup(tmp);
920 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &tmp) == 0)
921 vd->vdev_physpath = spa_strdup(tmp);
922
923 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH,
924 &tmp) == 0)
925 vd->vdev_enc_sysfs_path = spa_strdup(tmp);
926
927 if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &tmp) == 0)
928 vd->vdev_fru = spa_strdup(tmp);
929
930 /*
931 * Set the whole_disk property. If it's not specified, leave the value
932 * as -1.
933 */
934 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
935 &vd->vdev_wholedisk) != 0)
936 vd->vdev_wholedisk = -1ULL;
937
938 vic = &vd->vdev_indirect_config;
939
940 ASSERT0(vic->vic_mapping_object);
941 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT,
942 &vic->vic_mapping_object);
943 ASSERT0(vic->vic_births_object);
944 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS,
945 &vic->vic_births_object);
946 ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX);
947 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV,
948 &vic->vic_prev_indirect_vdev);
949
950 /*
951 * Look for the 'not present' flag. This will only be set if the device
952 * was not present at the time of import.
953 */
954 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
955 &vd->vdev_not_present);
956
957 /*
958 * Get the alignment requirement. Ignore pool ashift for vdev
959 * attach case.
960 */
961 if (alloctype != VDEV_ALLOC_ATTACH) {
962 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT,
963 &vd->vdev_ashift);
964 } else {
965 vd->vdev_attaching = B_TRUE;
966 }
967
968 /*
969 * Retrieve the vdev creation time.
970 */
971 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
972 &vd->vdev_crtxg);
973
974 if (vd->vdev_ops == &vdev_root_ops &&
975 (alloctype == VDEV_ALLOC_LOAD ||
976 alloctype == VDEV_ALLOC_SPLIT ||
977 alloctype == VDEV_ALLOC_ROOTPOOL)) {
978 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_ROOT_ZAP,
979 &vd->vdev_root_zap);
980 }
981
982 /*
983 * If we're a top-level vdev, try to load the allocation parameters.
984 */
985 if (top_level &&
986 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
987 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
988 &vd->vdev_ms_array);
989 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
990 &vd->vdev_ms_shift);
991 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE,
992 &vd->vdev_asize);
993 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NONALLOCATING,
994 &vd->vdev_noalloc);
995 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING,
996 &vd->vdev_removing);
997 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
998 &vd->vdev_top_zap);
999 vd->vdev_rz_expanding = nvlist_exists(nv,
1000 ZPOOL_CONFIG_RAIDZ_EXPANDING);
1001 } else {
1002 ASSERT0(vd->vdev_top_zap);
1003 }
1004
1005 if (top_level && alloctype != VDEV_ALLOC_ATTACH) {
1006 ASSERT(alloctype == VDEV_ALLOC_LOAD ||
1007 alloctype == VDEV_ALLOC_ADD ||
1008 alloctype == VDEV_ALLOC_SPLIT ||
1009 alloctype == VDEV_ALLOC_ROOTPOOL);
1010 /* Note: metaslab_group_create() is now deferred */
1011 }
1012
1013 if (vd->vdev_ops->vdev_op_leaf &&
1014 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) {
1015 (void) nvlist_lookup_uint64(nv,
1016 ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap);
1017 } else {
1018 ASSERT0(vd->vdev_leaf_zap);
1019 }
1020
1021 /*
1022 * If we're a leaf vdev, try to load the DTL object and other state.
1023 */
1024
1025 if (vd->vdev_ops->vdev_op_leaf &&
1026 (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE ||
1027 alloctype == VDEV_ALLOC_ROOTPOOL)) {
1028 if (alloctype == VDEV_ALLOC_LOAD) {
1029 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
1030 &vd->vdev_dtl_object);
1031 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
1032 &vd->vdev_unspare);
1033 }
1034
1035 if (alloctype == VDEV_ALLOC_ROOTPOOL) {
1036 uint64_t spare = 0;
1037
1038 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE,
1039 &spare) == 0 && spare)
1040 spa_spare_add(vd);
1041 }
1042
1043 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
1044 &vd->vdev_offline);
1045
1046 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
1047 &vd->vdev_resilver_txg);
1048
1049 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
1050 &vd->vdev_rebuild_txg);
1051
1052 if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
1053 vdev_defer_resilver(vd);
1054
1055 /*
1056 * In general, when importing a pool we want to ignore the
1057 * persistent fault state, as the diagnosis made on another
1058 * system may not be valid in the current context. The only
1059 * exception is if we forced a vdev to a persistently faulted
1060 * state with 'zpool offline -f'. The persistent fault will
1061 * remain across imports until cleared.
1062 *
1063 * Local vdevs will remain in the faulted state.
1064 */
1065 if (spa_load_state(spa) == SPA_LOAD_OPEN ||
1066 spa_load_state(spa) == SPA_LOAD_IMPORT) {
1067 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED,
1068 &vd->vdev_faulted);
1069 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED,
1070 &vd->vdev_degraded);
1071 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED,
1072 &vd->vdev_removed);
1073
1074 if (vd->vdev_faulted || vd->vdev_degraded) {
1075 const char *aux;
1076
1077 vd->vdev_label_aux =
1078 VDEV_AUX_ERR_EXCEEDED;
1079 if (nvlist_lookup_string(nv,
1080 ZPOOL_CONFIG_AUX_STATE, &aux) == 0 &&
1081 strcmp(aux, "external") == 0)
1082 vd->vdev_label_aux = VDEV_AUX_EXTERNAL;
1083 else
1084 vd->vdev_faulted = 0ULL;
1085 }
1086 }
1087 }
1088
1089 if (top_level && (ops == &vdev_raidz_ops || ops == &vdev_draid_ops))
1090 vd->vdev_autosit =
1091 vdev_prop_default_numeric(VDEV_PROP_AUTOSIT);
1092
1093 /*
1094 * Add ourselves to the parent's list of children.
1095 */
1096 vdev_add_child(parent, vd);
1097
1098 *vdp = vd;
1099
1100 return (0);
1101 }
1102
1103 void
1104 vdev_free(vdev_t *vd)
1105 {
1106 spa_t *spa = vd->vdev_spa;
1107
1108 ASSERT0P(vd->vdev_initialize_thread);
1109 ASSERT0P(vd->vdev_trim_thread);
1110 ASSERT0P(vd->vdev_autotrim_thread);
1111 ASSERT0P(vd->vdev_rebuild_thread);
1112
1113 /*
1114 * Scan queues are normally destroyed at the end of a scan. If the
1115 * queue exists here, that implies the vdev is being removed while
1116 * the scan is still running.
1117 */
1118 if (vd->vdev_scan_io_queue != NULL) {
1119 mutex_enter(&vd->vdev_scan_io_queue_lock);
1120 dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
1121 vd->vdev_scan_io_queue = NULL;
1122 mutex_exit(&vd->vdev_scan_io_queue_lock);
1123 }
1124
1125 /*
1126 * vdev_free() implies closing the vdev first. This is simpler than
1127 * trying to ensure complicated semantics for all callers.
1128 */
1129 vdev_close(vd);
1130
1131 ASSERT(!list_link_active(&vd->vdev_config_dirty_node));
1132 ASSERT(!list_link_active(&vd->vdev_state_dirty_node));
1133
1134 /*
1135 * Free all children.
1136 */
1137 for (int c = 0; c < vd->vdev_children; c++)
1138 vdev_free(vd->vdev_child[c]);
1139
1140 ASSERT0P(vd->vdev_child);
1141 ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
1142
1143 if (vd->vdev_ops->vdev_op_fini != NULL)
1144 vd->vdev_ops->vdev_op_fini(vd);
1145
1146 /*
1147 * Discard allocation state.
1148 */
1149 if (vd->vdev_mg != NULL) {
1150 vdev_metaslab_fini(vd);
1151 metaslab_group_destroy(vd->vdev_mg);
1152 vd->vdev_mg = NULL;
1153 }
1154 if (vd->vdev_log_mg != NULL) {
1155 ASSERT0(vd->vdev_ms_count);
1156 metaslab_group_destroy(vd->vdev_log_mg);
1157 vd->vdev_log_mg = NULL;
1158 }
1159
1160 ASSERT0(vd->vdev_stat.vs_space);
1161 ASSERT0(vd->vdev_stat.vs_dspace);
1162 ASSERT0(vd->vdev_stat.vs_alloc);
1163
1164 /*
1165 * Remove this vdev from its parent's child list.
1166 */
1167 vdev_remove_child(vd->vdev_parent, vd);
1168
1169 ASSERT0P(vd->vdev_parent);
1170 ASSERT(!list_link_active(&vd->vdev_leaf_node));
1171
1172 /*
1173 * Clean up vdev structure.
1174 */
1175 vdev_queue_fini(vd);
1176
1177 if (vd->vdev_path)
1178 spa_strfree(vd->vdev_path);
1179 if (vd->vdev_devid)
1180 spa_strfree(vd->vdev_devid);
1181 if (vd->vdev_physpath)
1182 spa_strfree(vd->vdev_physpath);
1183
1184 if (vd->vdev_enc_sysfs_path)
1185 spa_strfree(vd->vdev_enc_sysfs_path);
1186
1187 if (vd->vdev_fru)
1188 spa_strfree(vd->vdev_fru);
1189
1190 if (vd->vdev_isspare)
1191 spa_spare_remove(vd);
1192 if (vd->vdev_isl2cache)
1193 spa_l2cache_remove(vd);
1194 if (vd->vdev_prev_histo)
1195 kmem_free(vd->vdev_prev_histo,
1196 sizeof (uint64_t) * VDEV_L_HISTO_BUCKETS);
1197
1198 txg_list_destroy(&vd->vdev_ms_list);
1199 txg_list_destroy(&vd->vdev_dtl_list);
1200
1201 mutex_enter(&vd->vdev_dtl_lock);
1202 space_map_close(vd->vdev_dtl_sm);
1203 for (int t = 0; t < DTL_TYPES; t++) {
1204 zfs_range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
1205 zfs_range_tree_destroy(vd->vdev_dtl[t]);
1206 }
1207 mutex_exit(&vd->vdev_dtl_lock);
1208
1209 EQUIV(vd->vdev_indirect_births != NULL,
1210 vd->vdev_indirect_mapping != NULL);
1211 if (vd->vdev_indirect_births != NULL) {
1212 vdev_indirect_mapping_close(vd->vdev_indirect_mapping);
1213 vdev_indirect_births_close(vd->vdev_indirect_births);
1214 }
1215
1216 if (vd->vdev_obsolete_sm != NULL) {
1217 ASSERT(vd->vdev_removing ||
1218 vd->vdev_ops == &vdev_indirect_ops);
1219 space_map_close(vd->vdev_obsolete_sm);
1220 vd->vdev_obsolete_sm = NULL;
1221 }
1222 zfs_range_tree_destroy(vd->vdev_obsolete_segments);
1223 rw_destroy(&vd->vdev_indirect_rwlock);
1224 mutex_destroy(&vd->vdev_obsolete_lock);
1225
1226 mutex_destroy(&vd->vdev_dtl_lock);
1227 mutex_destroy(&vd->vdev_stat_lock);
1228 mutex_destroy(&vd->vdev_probe_lock);
1229 mutex_destroy(&vd->vdev_scan_io_queue_lock);
1230
1231 mutex_destroy(&vd->vdev_initialize_lock);
1232 mutex_destroy(&vd->vdev_initialize_io_lock);
1233 cv_destroy(&vd->vdev_initialize_io_cv);
1234 cv_destroy(&vd->vdev_initialize_cv);
1235
1236 mutex_destroy(&vd->vdev_trim_lock);
1237 mutex_destroy(&vd->vdev_autotrim_lock);
1238 mutex_destroy(&vd->vdev_trim_io_lock);
1239 cv_destroy(&vd->vdev_trim_cv);
1240 cv_destroy(&vd->vdev_autotrim_cv);
1241 cv_destroy(&vd->vdev_autotrim_kick_cv);
1242 cv_destroy(&vd->vdev_trim_io_cv);
1243
1244 mutex_destroy(&vd->vdev_rebuild_lock);
1245 cv_destroy(&vd->vdev_rebuild_cv);
1246
1247 zfs_ratelimit_fini(&vd->vdev_delay_rl);
1248 zfs_ratelimit_fini(&vd->vdev_deadman_rl);
1249 zfs_ratelimit_fini(&vd->vdev_dio_verify_rl);
1250 zfs_ratelimit_fini(&vd->vdev_checksum_rl);
1251
1252 if (vd == spa->spa_root_vdev)
1253 spa->spa_root_vdev = NULL;
1254
1255 kmem_free(vd, sizeof (vdev_t));
1256 }
1257
1258 /*
1259 * Transfer top-level vdev state from svd to tvd.
1260 */
1261 static void
1262 vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
1263 {
1264 spa_t *spa = svd->vdev_spa;
1265 metaslab_t *msp;
1266 vdev_t *vd;
1267 int t;
1268
1269 ASSERT(tvd == tvd->vdev_top);
1270
1271 tvd->vdev_ms_array = svd->vdev_ms_array;
1272 tvd->vdev_ms_shift = svd->vdev_ms_shift;
1273 tvd->vdev_ms_count = svd->vdev_ms_count;
1274 tvd->vdev_top_zap = svd->vdev_top_zap;
1275
1276 svd->vdev_ms_array = 0;
1277 svd->vdev_ms_shift = 0;
1278 svd->vdev_ms_count = 0;
1279 svd->vdev_top_zap = 0;
1280
1281 if (tvd->vdev_mg)
1282 ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg);
1283 if (tvd->vdev_log_mg)
1284 ASSERT3P(tvd->vdev_log_mg, ==, svd->vdev_log_mg);
1285 tvd->vdev_mg = svd->vdev_mg;
1286 tvd->vdev_log_mg = svd->vdev_log_mg;
1287 tvd->vdev_ms = svd->vdev_ms;
1288
1289 svd->vdev_mg = NULL;
1290 svd->vdev_log_mg = NULL;
1291 svd->vdev_ms = NULL;
1292
1293 if (tvd->vdev_mg != NULL)
1294 tvd->vdev_mg->mg_vd = tvd;
1295 if (tvd->vdev_log_mg != NULL)
1296 tvd->vdev_log_mg->mg_vd = tvd;
1297
1298 tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm;
1299 svd->vdev_checkpoint_sm = NULL;
1300
1301 tvd->vdev_alloc_bias = svd->vdev_alloc_bias;
1302 svd->vdev_alloc_bias = VDEV_BIAS_NONE;
1303
1304 tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc;
1305 tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space;
1306 tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace;
1307
1308 svd->vdev_stat.vs_alloc = 0;
1309 svd->vdev_stat.vs_space = 0;
1310 svd->vdev_stat.vs_dspace = 0;
1311
1312 /*
1313 * State which may be set on a top-level vdev that's in the
1314 * process of being removed.
1315 */
1316 ASSERT0(tvd->vdev_indirect_config.vic_births_object);
1317 ASSERT0(tvd->vdev_indirect_config.vic_mapping_object);
1318 ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL);
1319 ASSERT0P(tvd->vdev_indirect_mapping);
1320 ASSERT0P(tvd->vdev_indirect_births);
1321 ASSERT0P(tvd->vdev_obsolete_sm);
1322 ASSERT0(tvd->vdev_noalloc);
1323 ASSERT0(tvd->vdev_removing);
1324 ASSERT0(tvd->vdev_rebuilding);
1325 tvd->vdev_noalloc = svd->vdev_noalloc;
1326 tvd->vdev_removing = svd->vdev_removing;
1327 tvd->vdev_rebuilding = svd->vdev_rebuilding;
1328 tvd->vdev_rebuild_config = svd->vdev_rebuild_config;
1329 tvd->vdev_indirect_config = svd->vdev_indirect_config;
1330 tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
1331 tvd->vdev_indirect_births = svd->vdev_indirect_births;
1332 zfs_range_tree_swap(&svd->vdev_obsolete_segments,
1333 &tvd->vdev_obsolete_segments);
1334 tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm;
1335 svd->vdev_indirect_config.vic_mapping_object = 0;
1336 svd->vdev_indirect_config.vic_births_object = 0;
1337 svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL;
1338 svd->vdev_indirect_mapping = NULL;
1339 svd->vdev_indirect_births = NULL;
1340 svd->vdev_obsolete_sm = NULL;
1341 svd->vdev_noalloc = 0;
1342 svd->vdev_removing = 0;
1343 svd->vdev_rebuilding = 0;
1344
1345 for (t = 0; t < TXG_SIZE; t++) {
1346 while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
1347 (void) txg_list_add(&tvd->vdev_ms_list, msp, t);
1348 while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL)
1349 (void) txg_list_add(&tvd->vdev_dtl_list, vd, t);
1350 if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t))
1351 (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t);
1352 }
1353
1354 if (list_link_active(&svd->vdev_config_dirty_node)) {
1355 vdev_config_clean(svd);
1356 vdev_config_dirty(tvd);
1357 }
1358
1359 if (list_link_active(&svd->vdev_state_dirty_node)) {
1360 vdev_state_clean(svd);
1361 vdev_state_dirty(tvd);
1362 }
1363
1364 tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio;
1365 svd->vdev_deflate_ratio = 0;
1366
1367 tvd->vdev_islog = svd->vdev_islog;
1368 svd->vdev_islog = 0;
1369
1370 dsl_scan_io_queue_vdev_xfer(svd, tvd);
1371 }
1372
1373 static void
1374 vdev_top_update(vdev_t *tvd, vdev_t *vd)
1375 {
1376 if (vd == NULL)
1377 return;
1378
1379 vd->vdev_top = tvd;
1380
1381 for (int c = 0; c < vd->vdev_children; c++)
1382 vdev_top_update(tvd, vd->vdev_child[c]);
1383 }
1384
1385 /*
1386 * Add a mirror/replacing vdev above an existing vdev. There is no need to
1387 * call .vdev_op_init() since mirror/replacing vdevs do not have private state.
1388 */
1389 vdev_t *
1390 vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops)
1391 {
1392 spa_t *spa = cvd->vdev_spa;
1393 vdev_t *pvd = cvd->vdev_parent;
1394 vdev_t *mvd;
1395
1396 ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1397
1398 mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops);
1399
1400 mvd->vdev_asize = cvd->vdev_asize;
1401 mvd->vdev_min_asize = cvd->vdev_min_asize;
1402 mvd->vdev_max_asize = cvd->vdev_max_asize;
1403 mvd->vdev_psize = cvd->vdev_psize;
1404 mvd->vdev_ashift = cvd->vdev_ashift;
1405 mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
1406 mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
1407 mvd->vdev_state = cvd->vdev_state;
1408 mvd->vdev_crtxg = cvd->vdev_crtxg;
1409 mvd->vdev_nonrot = cvd->vdev_nonrot;
1410
1411 vdev_remove_child(pvd, cvd);
1412 vdev_add_child(pvd, mvd);
1413 cvd->vdev_id = mvd->vdev_children;
1414 vdev_add_child(mvd, cvd);
1415 vdev_top_update(cvd->vdev_top, cvd->vdev_top);
1416
1417 if (mvd == mvd->vdev_top)
1418 vdev_top_transfer(cvd, mvd);
1419
1420 return (mvd);
1421 }
1422
1423 /*
1424 * Remove a 1-way mirror/replacing vdev from the tree.
1425 */
1426 void
1427 vdev_remove_parent(vdev_t *cvd)
1428 {
1429 vdev_t *mvd = cvd->vdev_parent;
1430 vdev_t *pvd = mvd->vdev_parent;
1431
1432 ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1433
1434 ASSERT(mvd->vdev_children == 1);
1435 ASSERT(mvd->vdev_ops == &vdev_mirror_ops ||
1436 mvd->vdev_ops == &vdev_replacing_ops ||
1437 mvd->vdev_ops == &vdev_spare_ops);
1438 cvd->vdev_ashift = mvd->vdev_ashift;
1439 cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
1440 cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
1441 vdev_remove_child(mvd, cvd);
1442 vdev_remove_child(pvd, mvd);
1443
1444 /*
1445 * If cvd will replace mvd as a top-level vdev, preserve mvd's guid.
1446 * Otherwise, we could have detached an offline device, and when we
1447 * go to import the pool we'll think we have two top-level vdevs,
1448 * instead of a different version of the same top-level vdev.
1449 */
1450 if (mvd->vdev_top == mvd) {
1451 uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid;
1452 cvd->vdev_orig_guid = cvd->vdev_guid;
1453 cvd->vdev_guid += guid_delta;
1454 cvd->vdev_guid_sum += guid_delta;
1455
1456 /*
1457  * If the pool is not set to autoexpand, we also need to preserve
1458 * mvd's asize to prevent automatic expansion of cvd.
1459 * Otherwise if we are adjusting the mirror by attaching and
1460 * detaching children of non-uniform sizes, the mirror could
1461 * autoexpand, unexpectedly requiring larger devices to
1462 * re-establish the mirror.
1463 */
1464 if (!cvd->vdev_spa->spa_autoexpand)
1465 cvd->vdev_asize = mvd->vdev_asize;
1466 }
1467 cvd->vdev_id = mvd->vdev_id;
1468 vdev_add_child(pvd, cvd);
1469 vdev_top_update(cvd->vdev_top, cvd->vdev_top);
1470
1471 if (cvd == cvd->vdev_top)
1472 vdev_top_transfer(mvd, cvd);
1473
1474 ASSERT0(mvd->vdev_children);
1475 vdev_free(mvd);
1476 }
1477
1478 /*
1479 * Choose GCD for spa_gcd_alloc.
1480 */
1481 static uint64_t
1482 vdev_gcd(uint64_t a, uint64_t b)
1483 {
1484 while (b != 0) {
1485 uint64_t t = b;
1486 b = a % b;
1487 a = t;
1488 }
1489 return (a);
1490 }
1491
1492 /*
1493 * Set spa_min_alloc and spa_gcd_alloc.
1494 */
1495 static void
1496 vdev_spa_set_alloc(spa_t *spa, uint64_t min_alloc)
1497 {
1498 if (min_alloc < spa->spa_min_alloc)
1499 spa->spa_min_alloc = min_alloc;
1500 if (spa->spa_gcd_alloc == INT_MAX) {
1501 spa->spa_gcd_alloc = min_alloc;
1502 } else {
1503 spa->spa_gcd_alloc = vdev_gcd(min_alloc,
1504 spa->spa_gcd_alloc);
1505 }
1506 }
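
/*
 * Illustrative example (assuming spa_gcd_alloc starts at INT_MAX, as checked
 * above): if the first top-level vdev reports a minimum allocation of 4096
 * bytes and a second reports 12288 bytes, the first call sets both
 * spa_min_alloc and spa_gcd_alloc to 4096; the second leaves spa_min_alloc
 * alone and computes vdev_gcd(12288, 4096) = 4096, since 12288 % 4096 == 0.
 */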
1507
1508 void
1509 vdev_metaslab_group_create(vdev_t *vd)
1510 {
1511 spa_t *spa = vd->vdev_spa;
1512
1513 /*
1514 * metaslab_group_create was delayed until allocation bias was available
1515 */
1516 if (vd->vdev_mg == NULL) {
1517 metaslab_class_t *mc;
1518
1519 if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE)
1520 vd->vdev_alloc_bias = VDEV_BIAS_LOG;
1521
1522 ASSERT3U(vd->vdev_islog, ==,
1523 (vd->vdev_alloc_bias == VDEV_BIAS_LOG));
1524
1525 switch (vd->vdev_alloc_bias) {
1526 case VDEV_BIAS_LOG:
1527 mc = spa_log_class(spa);
1528 break;
1529 case VDEV_BIAS_SPECIAL:
1530 mc = spa_special_class(spa);
1531 break;
1532 case VDEV_BIAS_DEDUP:
1533 mc = spa_dedup_class(spa);
1534 break;
1535 default:
1536 mc = spa_normal_class(spa);
1537 }
1538
1539 vd->vdev_mg = metaslab_group_create(mc, vd);
1540
1541 if (!vd->vdev_islog) {
1542 if (mc == spa_special_class(spa)) {
1543 vd->vdev_log_mg = metaslab_group_create(
1544 spa_special_embedded_log_class(spa), vd);
1545 } else {
1546 vd->vdev_log_mg = metaslab_group_create(
1547 spa_embedded_log_class(spa), vd);
1548 }
1549 }
1550
1551 /*
1552 * The spa ashift min/max only apply for the normal metaslab
1553 * class. Class destination is late binding so ashift boundary
1554 * setting had to wait until now.
1555 */
1556 if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
1557 mc == spa_normal_class(spa) && vd->vdev_aux == NULL) {
1558 if (vd->vdev_ashift > spa->spa_max_ashift)
1559 spa->spa_max_ashift = vd->vdev_ashift;
1560 if (vd->vdev_ashift < spa->spa_min_ashift)
1561 spa->spa_min_ashift = vd->vdev_ashift;
1562
1563 uint64_t min_alloc = vdev_get_min_alloc(vd);
1564 vdev_spa_set_alloc(spa, min_alloc);
1565 }
1566 }
1567 }
1568
1569 void
1570 vdev_update_nonallocating_space(vdev_t *vd, boolean_t add)
1571 {
1572 spa_t *spa = vd->vdev_spa;
1573
1574 if (vd->vdev_mg->mg_class != spa_normal_class(spa))
1575 return;
1576
1577 uint64_t raw_space = metaslab_group_get_space(vd->vdev_mg);
1578 uint64_t dspace = spa_deflate(spa) ?
1579 vdev_deflated_space(vd, raw_space) : raw_space;
1580 if (add) {
1581 spa->spa_nonallocating_dspace += dspace;
1582 } else {
1583 ASSERT3U(spa->spa_nonallocating_dspace, >=, dspace);
1584 spa->spa_nonallocating_dspace -= dspace;
1585 }
1586 }
1587
1588 int
1589 vdev_metaslab_init(vdev_t *vd, uint64_t txg)
1590 {
1591 spa_t *spa = vd->vdev_spa;
1592 uint64_t oldc = vd->vdev_ms_count;
1593 uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
1594 metaslab_t **mspp;
1595 int error;
1596 boolean_t expanding = (oldc != 0);
1597
1598 ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1599
1600 /*
1601 * This vdev is not being allocated from yet or is a hole.
1602 */
1603 if (vd->vdev_ms_shift == 0)
1604 return (0);
1605
1606 ASSERT(!vd->vdev_ishole);
1607
1608 ASSERT(oldc <= newc);
1609
1610 mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP);
1611
1612 if (expanding) {
1613 memcpy(mspp, vd->vdev_ms, oldc * sizeof (*mspp));
1614 vmem_free(vd->vdev_ms, oldc * sizeof (*mspp));
1615 }
1616
1617 vd->vdev_ms = mspp;
1618 vd->vdev_ms_count = newc;
1619
1620 /*
1621 * Weighting algorithms can depend on the number of metaslabs in the
1622 * vdev. In order to ensure that all weights are correct at all times,
1623 * we need to recalculate here.
1624 */
1625 for (uint64_t m = 0; m < oldc; m++) {
1626 metaslab_t *msp = vd->vdev_ms[m];
1627 mutex_enter(&msp->ms_lock);
1628 metaslab_recalculate_weight_and_sort(msp);
1629 mutex_exit(&msp->ms_lock);
1630 }
1631
1632 for (uint64_t m = oldc; m < newc; m++) {
1633 uint64_t object = 0;
1634 /*
1635 * vdev_ms_array may be 0 if we are creating the "fake"
1636 * metaslabs for an indirect vdev for zdb's leak detection.
1637 * See zdb_leak_init().
1638 */
1639 if (txg == 0 && vd->vdev_ms_array != 0) {
1640 error = dmu_read(spa->spa_meta_objset,
1641 vd->vdev_ms_array,
1642 m * sizeof (uint64_t), sizeof (uint64_t), &object,
1643 DMU_READ_PREFETCH);
1644 if (error != 0) {
1645 vdev_dbgmsg(vd, "unable to read the metaslab "
1646 "array [error=%d]", error);
1647 return (error);
1648 }
1649 }
1650
1651 error = metaslab_init(vd->vdev_mg, m, object, txg,
1652 &(vd->vdev_ms[m]));
1653 if (error != 0) {
1654 vdev_dbgmsg(vd, "metaslab_init failed [error=%d]",
1655 error);
1656 return (error);
1657 }
1658 }
1659
1660 /*
1661 * Find the emptiest metaslab on the vdev and mark it for use for
1662 * embedded slog by moving it from the regular to the log metaslab
1663 * group. This works for normal and special vdevs.
1664 */
1665 if ((vd->vdev_mg->mg_class == spa_normal_class(spa) ||
1666 vd->vdev_mg->mg_class == spa_special_class(spa)) &&
1667 vd->vdev_ms_count > zfs_embedded_slog_min_ms &&
1668 avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) {
1669 uint64_t slog_msid = 0;
1670 uint64_t smallest = UINT64_MAX;
1671
1672 /*
1673 * Note, we only search the new metaslabs, because the old
1674 * (pre-existing) ones may be active (e.g. have non-empty
1675 * range_tree's), and we don't move them to the new
1676 * metaslab_t.
1677 */
1678 for (uint64_t m = oldc; m < newc; m++) {
1679 uint64_t alloc =
1680 space_map_allocated(vd->vdev_ms[m]->ms_sm);
1681 if (alloc < smallest) {
1682 slog_msid = m;
1683 smallest = alloc;
1684 }
1685 }
1686 metaslab_t *slog_ms = vd->vdev_ms[slog_msid];
1687 /*
1688 * The metaslab was marked as dirty at the end of
1689 * metaslab_init(). Remove it from the dirty list so that we
1690 * can uninitialize and reinitialize it to the new class.
1691 */
1692 if (txg != 0) {
1693 (void) txg_list_remove_this(&vd->vdev_ms_list,
1694 slog_ms, txg);
1695 }
1696 uint64_t sm_obj = space_map_object(slog_ms->ms_sm);
1697 metaslab_fini(slog_ms);
1698 VERIFY0(metaslab_init(vd->vdev_log_mg, slog_msid, sm_obj, txg,
1699 &vd->vdev_ms[slog_msid]));
1700 }
1701
1702 if (txg == 0)
1703 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER);
1704
1705 /*
1706 * If the vdev is marked as non-allocating then don't
1707 * activate the metaslabs since we want to ensure that
1708 * no allocations are performed on this device.
1709 */
1710 if (vd->vdev_noalloc) {
1711 /* track non-allocating vdev space */
1712 vdev_update_nonallocating_space(vd, B_TRUE);
1713 } else if (!expanding) {
1714 metaslab_group_activate(vd->vdev_mg);
1715 if (vd->vdev_log_mg != NULL)
1716 metaslab_group_activate(vd->vdev_log_mg);
1717 }
1718
1719 if (txg == 0)
1720 spa_config_exit(spa, SCL_ALLOC, FTAG);
1721
1722 return (0);
1723 }
1724
1725 void
1726 vdev_metaslab_fini(vdev_t *vd)
1727 {
1728 if (vd->vdev_checkpoint_sm != NULL) {
1729 ASSERT(spa_feature_is_active(vd->vdev_spa,
1730 SPA_FEATURE_POOL_CHECKPOINT));
1731 space_map_close(vd->vdev_checkpoint_sm);
1732 /*
1733 * Even though we close the space map, we need to set its
1734 * pointer to NULL. The reason is that vdev_metaslab_fini()
1735 * may be called multiple times for certain operations
1736 * (i.e. when destroying a pool) so we need to ensure that
1737 * this clause never executes twice. This logic is similar
1738 * to the one used for the vdev_ms clause below.
1739 */
1740 vd->vdev_checkpoint_sm = NULL;
1741 }
1742
1743 if (vd->vdev_ms != NULL) {
1744 metaslab_group_t *mg = vd->vdev_mg;
1745
1746 metaslab_group_passivate(mg);
1747 if (vd->vdev_log_mg != NULL) {
1748 ASSERT(!vd->vdev_islog);
1749 metaslab_group_passivate(vd->vdev_log_mg);
1750 }
1751
1752 uint64_t count = vd->vdev_ms_count;
1753 for (uint64_t m = 0; m < count; m++) {
1754 metaslab_t *msp = vd->vdev_ms[m];
1755 if (msp != NULL)
1756 metaslab_fini(msp);
1757 }
1758 vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
1759 vd->vdev_ms = NULL;
1760 vd->vdev_ms_count = 0;
1761
1762 for (int i = 0; i < ZFS_RANGE_TREE_HISTOGRAM_SIZE; i++) {
1763 ASSERT0(mg->mg_histogram[i]);
1764 if (vd->vdev_log_mg != NULL)
1765 ASSERT0(vd->vdev_log_mg->mg_histogram[i]);
1766 }
1767 }
1768 ASSERT0(vd->vdev_ms_count);
1769 }
1770
1771 typedef struct vdev_probe_stats {
1772 boolean_t vps_readable;
1773 boolean_t vps_writeable;
1774 boolean_t vps_zio_done_probe;
1775 int vps_flags;
1776 } vdev_probe_stats_t;
1777
1778 static void
1779 vdev_probe_done(zio_t *zio)
1780 {
1781 spa_t *spa = zio->io_spa;
1782 vdev_t *vd = zio->io_vd;
1783 vdev_probe_stats_t *vps = zio->io_private;
1784
1785 ASSERT(vd->vdev_probe_zio != NULL);
1786
1787 if (zio->io_type == ZIO_TYPE_READ) {
1788 if (zio->io_error == 0)
1789 vps->vps_readable = 1;
1790 if (zio->io_error == 0 && spa_writeable(spa)) {
1791 zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd,
1792 zio->io_offset, zio->io_size, zio->io_abd,
1793 ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
1794 ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE));
1795 } else {
1796 abd_free(zio->io_abd);
1797 }
1798 } else if (zio->io_type == ZIO_TYPE_WRITE) {
1799 if (zio->io_error == 0)
1800 vps->vps_writeable = 1;
1801 abd_free(zio->io_abd);
1802 } else if (zio->io_type == ZIO_TYPE_NULL) {
1803 zio_t *pio;
1804 zio_link_t *zl;
1805
1806 vd->vdev_cant_read |= !vps->vps_readable;
1807 vd->vdev_cant_write |= !vps->vps_writeable;
1808 vdev_dbgmsg(vd, "probe done, cant_read=%u cant_write=%u",
1809 vd->vdev_cant_read, vd->vdev_cant_write);
1810
1811 if (vdev_readable(vd) &&
1812 (vdev_writeable(vd) || !spa_writeable(spa))) {
1813 zio->io_error = 0;
1814 } else {
1815 ASSERT(zio->io_error != 0);
1816 vdev_dbgmsg(vd, "failed probe");
1817 (void) zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE,
1818 spa, vd, NULL, NULL, 0);
1819 zio->io_error = SET_ERROR(ENXIO);
1820
1821 /*
1822 * If this probe was initiated from zio pipeline, then
1823 * change the state in a spa_async_request. Probes that
1824 * were initiated from a vdev_open can change the state
1825 * as part of the open call.
1826 * Skip fault injection if this vdev is already removed
1827 * or a removal is pending.
1828 */
1829 if (vps->vps_zio_done_probe &&
1830 !vd->vdev_remove_wanted && !vd->vdev_removed) {
1831 vd->vdev_fault_wanted = B_TRUE;
1832 spa_async_request(spa, SPA_ASYNC_FAULT_VDEV);
1833 }
1834 }
1835
1836 mutex_enter(&vd->vdev_probe_lock);
1837 ASSERT(vd->vdev_probe_zio == zio);
1838 vd->vdev_probe_zio = NULL;
1839 mutex_exit(&vd->vdev_probe_lock);
1840
1841 zl = NULL;
1842 while ((pio = zio_walk_parents(zio, &zl)) != NULL)
1843 if (!vdev_accessible(vd, pio))
1844 pio->io_error = SET_ERROR(ENXIO);
1845
1846 kmem_free(vps, sizeof (*vps));
1847 }
1848 }
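/*
 * To summarize the callback above: each successful pad-region read is turned
 * into a write of the same data (when the pool is writeable), each write
 * completion records vps_writeable, and the final ZIO_TYPE_NULL parent folds
 * vps_readable/vps_writeable into vdev_cant_read/vdev_cant_write before
 * deciding whether the probe as a whole failed.
 */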
1849
1850 /*
1851 * Determine whether this device is accessible.
1852 *
1853 * Read and write to several known locations: the pad regions of each
1854 * vdev label but the first, which we leave alone in case it contains
1855 * a VTOC.
1856 */
1857 zio_t *
1858 vdev_probe(vdev_t *vd, zio_t *zio)
1859 {
1860 spa_t *spa = vd->vdev_spa;
1861 vdev_probe_stats_t *vps = NULL;
1862 zio_t *pio;
1863
1864 ASSERT(vd->vdev_ops->vdev_op_leaf);
1865
1866 /*
1867 * Don't probe the probe.
1868 */
1869 if (zio && (zio->io_flags & ZIO_FLAG_PROBE))
1870 return (NULL);
1871
1872 /*
1873 * To prevent 'probe storms' when a device fails, we create
1874 * just one probe i/o at a time. All zios that want to probe
1875 * this vdev will become parents of the probe io.
1876 */
1877 mutex_enter(&vd->vdev_probe_lock);
1878
1879 if ((pio = vd->vdev_probe_zio) == NULL) {
1880 vps = kmem_zalloc(sizeof (*vps), KM_SLEEP);
1881
1882 vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE |
1883 ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_TRYHARD;
1884 vps->vps_zio_done_probe = (zio != NULL);
1885
1886 if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) {
1887 /*
1888 * vdev_cant_read and vdev_cant_write can only
1889 * transition from TRUE to FALSE when we have the
1890 * SCL_ZIO lock as writer; otherwise they can only
1891 * transition from FALSE to TRUE. This ensures that
1892 * any zio looking at these values can assume that
1893 * failures persist for the life of the I/O. That's
1894 * important because when a device has intermittent
1895 * connectivity problems, we want to ensure that
1896 * they're ascribed to the device (ENXIO) and not
1897 * the zio (EIO).
1898 *
1899 * Since we hold SCL_ZIO as writer here, clear both
1900 * values so the probe can reevaluate from first
1901 * principles.
1902 */
1903 vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER;
1904 vd->vdev_cant_read = B_FALSE;
1905 vd->vdev_cant_write = B_FALSE;
1906 }
1907
1908 vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd,
1909 vdev_probe_done, vps,
1910 vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE);
1911 }
1912
1913 if (zio != NULL)
1914 zio_add_child(zio, pio);
1915
1916 mutex_exit(&vd->vdev_probe_lock);
1917
1918 if (vps == NULL) {
1919 ASSERT(zio != NULL);
1920 return (NULL);
1921 }
1922
1923 for (int l = 1; l < VDEV_LABELS; l++) {
1924 zio_nowait(zio_read_phys(pio, vd,
1925 vdev_label_offset(vd->vdev_psize, l,
1926 offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE,
1927 abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE),
1928 ZIO_CHECKSUM_OFF, vdev_probe_done, vps,
1929 ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE));
1930 }
1931
1932 if (zio == NULL)
1933 return (pio);
1934
1935 zio_nowait(pio);
1936 return (NULL);
1937 }
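/*
 * A note on usage: vdev_probe() is called in two ways. vdev_open() below
 * waits on it synchronously, roughly:
 *
 *	error = zio_wait(vdev_probe(vd, NULL));
 *
 * The zio pipeline instead passes in the failing zio, which becomes a parent
 * of the shared probe zio so that its error can be ascribed to the device.
 */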
1938
1939 static void
1940 vdev_load_child(void *arg)
1941 {
1942 vdev_t *vd = arg;
1943
1944 vd->vdev_load_error = vdev_load(vd);
1945 }
1946
1947 static void
1948 vdev_open_child(void *arg)
1949 {
1950 vdev_t *vd = arg;
1951
1952 vd->vdev_open_thread = curthread;
1953 vd->vdev_open_error = vdev_open(vd);
1954 vd->vdev_open_thread = NULL;
1955 }
1956
1957 static boolean_t
1958 vdev_uses_zvols(vdev_t *vd)
1959 {
1960 #ifdef _KERNEL
1961 if (zvol_is_zvol(vd->vdev_path))
1962 return (B_TRUE);
1963 #endif
1964
1965 for (int c = 0; c < vd->vdev_children; c++)
1966 if (vdev_uses_zvols(vd->vdev_child[c]))
1967 return (B_TRUE);
1968
1969 return (B_FALSE);
1970 }
1971
1972 /*
1973 * Returns B_TRUE if the passed child should be opened.
1974 */
1975 static boolean_t
1976 vdev_default_open_children_func(vdev_t *vd)
1977 {
1978 (void) vd;
1979 return (B_TRUE);
1980 }
1981
1982 /*
1983 * Open the requested child vdevs. If any of the leaf vdevs are using
1984 * a ZFS volume then do the opens in a single thread. This avoids a
1985 * deadlock when the current thread is holding the spa_namespace_lock.
1986 */
1987 static void
1988 vdev_open_children_impl(vdev_t *vd, vdev_open_children_func_t *open_func)
1989 {
1990 int children = vd->vdev_children;
1991
1992 taskq_t *tq = taskq_create("vdev_open", children, minclsyspri,
1993 children, children, TASKQ_PREPOPULATE);
1994 vd->vdev_nonrot = B_TRUE;
1995
1996 for (int c = 0; c < children; c++) {
1997 vdev_t *cvd = vd->vdev_child[c];
1998
1999 if (open_func(cvd) == B_FALSE)
2000 continue;
2001
2002 if (tq == NULL || vdev_uses_zvols(vd)) {
2003 cvd->vdev_open_error = vdev_open(cvd);
2004 } else {
2005 VERIFY(taskq_dispatch(tq, vdev_open_child,
2006 cvd, TQ_SLEEP) != TASKQID_INVALID);
2007 }
2008 }
2009
2010 if (tq != NULL)
2011 taskq_wait(tq);
2012 for (int c = 0; c < children; c++) {
2013 vdev_t *cvd = vd->vdev_child[c];
2014
2015 if (open_func(cvd) == B_FALSE ||
2016 cvd->vdev_state <= VDEV_STATE_FAULTED)
2017 continue;
2018 vd->vdev_nonrot &= cvd->vdev_nonrot;
2019 }
2020
2021 if (tq != NULL)
2022 taskq_destroy(tq);
2023 }
2024
2025 /*
2026 * Open all child vdevs.
2027 */
2028 void
2029 vdev_open_children(vdev_t *vd)
2030 {
2031 vdev_open_children_impl(vd, vdev_default_open_children_func);
2032 }
2033
2034 /*
2035 * Conditionally open a subset of child vdevs.
2036 */
2037 void
2038 vdev_open_children_subset(vdev_t *vd, vdev_open_children_func_t *open_func)
2039 {
2040 vdev_open_children_impl(vd, open_func);
2041 }
2042
2043 /*
2044 * Compute the raidz-deflation ratio. Note, we hard-code 128k (1 << 17)
2045 * because it is the "typical" blocksize. Even though SPA_MAXBLOCKSIZE has
2046 * changed, this algorithm cannot change; otherwise it would inconsistently
2047 * account for existing bp's. We also hard-code txg 0 for the same reason,
2048 * since expanded RAIDZ vdevs can use a different asize for different birth
2049 * txg's.
2050 */
2051 static void
2052 vdev_set_deflate_ratio(vdev_t *vd)
2053 {
2054 if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) {
2055 vd->vdev_deflate_ratio = (1 << 17) /
2056 (vdev_psize_to_asize_txg(vd, 1 << 17, 0) >>
2057 SPA_MINBLOCKSHIFT);
2058 }
2059 }
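/*
 * For example, on a top-level vdev where a 128k block's asize equals its
 * psize (e.g. a plain disk or mirror), the ratio works out to
 * (1 << 17) / ((1 << 17) >> SPA_MINBLOCKSHIFT) = 131072 / 256 = 512.
 * RAIDZ vdevs consume extra asize for parity and padding, so their ratio
 * is correspondingly smaller.
 */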
2060
2061 /*
2062 * Choose the best of two ashifts, preferring one between logical ashift
2063 * (absolute minimum) and administrator defined maximum, otherwise take
2064 * the biggest of the two.
2065 */
2066 uint64_t
2067 vdev_best_ashift(uint64_t logical, uint64_t a, uint64_t b)
2068 {
2069 if (a > logical && a <= zfs_vdev_max_auto_ashift) {
2070 if (b <= logical || b > zfs_vdev_max_auto_ashift)
2071 return (a);
2072 else
2073 return (MAX(a, b));
2074 } else if (b <= logical || b > zfs_vdev_max_auto_ashift)
2075 return (MAX(a, b));
2076 return (b);
2077 }
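/*
 * For example, with logical ashift 9 and zfs_vdev_max_auto_ashift at its
 * usual default of 14: best(9, 12, 16) is 12 (16 exceeds the administrative
 * maximum), best(9, 12, 13) is 13, and best(9, 9, 12) is 12. In short, a
 * value inside (logical, max] beats one outside it; if both or neither are
 * inside, the larger of the two is used.
 */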
2078
2079 /*
2080 * Maximize performance by inflating the configured ashift for top level
2081 * vdevs to be as close to the physical ashift as possible while maintaining
2082 * administrator defined limits and ensuring it doesn't go below the
2083 * logical ashift.
2084 */
2085 static void
2086 vdev_ashift_optimize(vdev_t *vd)
2087 {
2088 ASSERT(vd == vd->vdev_top);
2089
2090 if (vd->vdev_ashift < vd->vdev_physical_ashift &&
2091 vd->vdev_physical_ashift <= zfs_vdev_max_auto_ashift) {
2092 vd->vdev_ashift = MIN(
2093 MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
2094 MAX(zfs_vdev_min_auto_ashift,
2095 vd->vdev_physical_ashift));
2096 } else {
2097 /*
2098 * If the logical and physical ashifts are the same, then
2099 * we ensure that the top-level vdev's ashift is not smaller
2100 * than our minimum ashift value. For the unusual case
2101 * where logical ashift > physical ashift, we can't cap
2102 * the calculated ashift based on max ashift as that
2103 * would cause failures.
2104 * We still check if we need to increase it to match
2105 * the min ashift.
2106 */
2107 vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
2108 vd->vdev_ashift);
2109 }
2110 }
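/*
 * For example, a disk reporting logical ashift 9 and physical ashift 12,
 * with the tunables at their usual defaults (min 9, max 14), has its ashift
 * raised from 9 to 12 here. A device reporting a physical ashift above
 * zfs_vdev_max_auto_ashift is left at its logical ashift, subject only to
 * the configured minimum.
 */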
2111
2112 /*
2113 * Prepare a virtual device for access.
2114 */
2115 int
2116 vdev_open(vdev_t *vd)
2117 {
2118 spa_t *spa = vd->vdev_spa;
2119 int error;
2120 uint64_t osize = 0;
2121 uint64_t max_osize = 0;
2122 uint64_t asize, max_asize, psize;
2123 uint64_t logical_ashift = 0;
2124 uint64_t physical_ashift = 0;
2125
2126 ASSERT(vd->vdev_open_thread == curthread ||
2127 spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2128 ASSERT(vd->vdev_state == VDEV_STATE_CLOSED ||
2129 vd->vdev_state == VDEV_STATE_CANT_OPEN ||
2130 vd->vdev_state == VDEV_STATE_OFFLINE);
2131
2132 vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
2133 vd->vdev_cant_read = B_FALSE;
2134 vd->vdev_cant_write = B_FALSE;
2135 vd->vdev_fault_wanted = B_FALSE;
2136 vd->vdev_remove_wanted = B_FALSE;
2137 vd->vdev_min_asize = vdev_get_min_asize(vd);
2138
2139 /*
2140 * If this vdev is not removed, check its fault status. If it's
2141 * faulted, bail out of the open.
2142 */
2143 if (!vd->vdev_removed && vd->vdev_faulted) {
2144 ASSERT0(vd->vdev_children);
2145 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
2146 vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
2147 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
2148 vd->vdev_label_aux);
2149 return (SET_ERROR(ENXIO));
2150 } else if (vd->vdev_offline) {
2151 ASSERT0(vd->vdev_children);
2152 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
2153 return (SET_ERROR(ENXIO));
2154 }
2155
2156 error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
2157 &logical_ashift, &physical_ashift);
2158
2159 /* Keep the device in removed state if unplugged */
2160 if (error == ENOENT && vd->vdev_removed) {
2161 vdev_set_state(vd, B_TRUE, VDEV_STATE_REMOVED,
2162 VDEV_AUX_NONE);
2163 return (error);
2164 }
2165
2166 /*
2167 * Physical volume size should never be larger than its max size, unless
2168 * the disk has shrunk while we were reading it or the device is buggy
2169 * or damaged; either way it's not safe for use, so bail out of the open.
2170 */
2171 if (osize > max_osize) {
2172 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2173 VDEV_AUX_OPEN_FAILED);
2174 return (SET_ERROR(ENXIO));
2175 }
2176
2177 /*
2178 * Reset the vdev_reopening flag so that we actually close
2179 * the vdev on error.
2180 */
2181 vd->vdev_reopening = B_FALSE;
2182 if (zio_injection_enabled && error == 0)
2183 error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO));
2184
2185 if (error) {
2186 if (vd->vdev_removed &&
2187 vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED)
2188 vd->vdev_removed = B_FALSE;
2189
2190 if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) {
2191 vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE,
2192 vd->vdev_stat.vs_aux);
2193 } else {
2194 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2195 vd->vdev_stat.vs_aux);
2196 }
2197 return (error);
2198 }
2199
2200 vd->vdev_removed = B_FALSE;
2201
2202 /*
2203 * Recheck the faulted flag now that we have confirmed that
2204 * the vdev is accessible. If we're faulted, bail.
2205 */
2206 if (vd->vdev_faulted) {
2207 ASSERT0(vd->vdev_children);
2208 ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED ||
2209 vd->vdev_label_aux == VDEV_AUX_EXTERNAL);
2210 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
2211 vd->vdev_label_aux);
2212 return (SET_ERROR(ENXIO));
2213 }
2214
2215 if (vd->vdev_degraded) {
2216 ASSERT0(vd->vdev_children);
2217 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
2218 VDEV_AUX_ERR_EXCEEDED);
2219 } else {
2220 vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0);
2221 }
2222
2223 /*
2224 * For hole or missing vdevs we just return success.
2225 */
2226 if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
2227 return (0);
2228
2229 for (int c = 0; c < vd->vdev_children; c++) {
2230 if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
2231 vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
2232 VDEV_AUX_NONE);
2233 break;
2234 }
2235 }
2236
2237 osize = P2ALIGN_TYPED(osize, sizeof (vdev_label_t), uint64_t);
2238 max_osize = P2ALIGN_TYPED(max_osize, sizeof (vdev_label_t), uint64_t);
2239
2240 if (vd->vdev_children == 0) {
2241 if (osize < SPA_MINDEVSIZE) {
2242 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2243 VDEV_AUX_TOO_SMALL);
2244 return (SET_ERROR(EOVERFLOW));
2245 }
2246 psize = osize;
2247 asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE);
2248 max_asize = max_osize - (VDEV_LABEL_START_SIZE +
2249 VDEV_LABEL_END_SIZE);
2250 } else {
2251 if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE -
2252 (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
2253 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2254 VDEV_AUX_TOO_SMALL);
2255 return (SET_ERROR(EOVERFLOW));
2256 }
2257 psize = 0;
2258 asize = osize;
2259 max_asize = max_osize;
2260 }
2261
2262 /*
2263 * If the vdev was expanded, record this so that we can re-create the
2264 * uberblock rings in labels {2,3}, during the next sync.
2265 */
2266 if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0))
2267 vd->vdev_copy_uberblocks = B_TRUE;
2268
2269 vd->vdev_psize = psize;
2270
2271 /*
2272 * Make sure the allocatable size hasn't shrunk too much.
2273 */
2274 if (asize < vd->vdev_min_asize) {
2275 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2276 VDEV_AUX_BAD_LABEL);
2277 return (SET_ERROR(EINVAL));
2278 }
2279
2280 /*
2281 * We can always set the logical/physical ashift members since
2282 * their values are only used to calculate the vdev_ashift when
2283 * the device is first added to the config. These values should
2284 * not be used for anything else since they may change whenever
2285 * the device is reopened and we don't store them in the label.
2286 */
2287 vd->vdev_physical_ashift =
2288 MAX(physical_ashift, vd->vdev_physical_ashift);
2289 vd->vdev_logical_ashift = MAX(logical_ashift,
2290 vd->vdev_logical_ashift);
2291
2292 if (vd->vdev_asize == 0) {
2293 /*
2294 * This is the first-ever open, so use the computed values.
2295 * For compatibility, a different ashift can be requested.
2296 */
2297 vd->vdev_asize = asize;
2298 vd->vdev_max_asize = max_asize;
2299
2300 /*
2301 * If the vdev_ashift was not overridden at creation time
2302 * (0) or the override value is impossible for the device,
2303 * then set it to the logical ashift and optimize the ashift.
2304 */
2305 if (vd->vdev_ashift < vd->vdev_logical_ashift) {
2306 vd->vdev_ashift = vd->vdev_logical_ashift;
2307
2308 if (vd->vdev_logical_ashift > ASHIFT_MAX) {
2309 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2310 VDEV_AUX_ASHIFT_TOO_BIG);
2311 return (SET_ERROR(EDOM));
2312 }
2313
2314 if (vd->vdev_top == vd && vd->vdev_attaching == B_FALSE)
2315 vdev_ashift_optimize(vd);
2316 vd->vdev_attaching = B_FALSE;
2317 }
2318 if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN ||
2319 vd->vdev_ashift > ASHIFT_MAX)) {
2320 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2321 VDEV_AUX_BAD_ASHIFT);
2322 return (SET_ERROR(EDOM));
2323 }
2324 } else {
2325 /*
2326 * Make sure the alignment required hasn't increased.
2327 */
2328 if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
2329 vd->vdev_ops->vdev_op_leaf) {
2330 (void) zfs_ereport_post(
2331 FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT,
2332 spa, vd, NULL, NULL, 0);
2333 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
2334 VDEV_AUX_BAD_LABEL);
2335 return (SET_ERROR(EDOM));
2336 }
2337 vd->vdev_max_asize = max_asize;
2338 }
2339
2340 /*
2341 * If all children are healthy we update asize if either:
2342 * The asize has increased, due to a device expansion caused by dynamic
2343 * LUN growth or vdev replacement, and automatic expansion is enabled;
2344 * making the additional space available.
2345 *
2346 * The asize has decreased, due to a device shrink usually caused by a
2347 * vdev replace with a smaller device. This ensures that calculations
2348 * based on max_asize and asize, e.g. esize, are always valid. It's safe
2349 * to do this as we've already validated that asize is greater than
2350 * vdev_min_asize.
2351 */
2352 if (vd->vdev_state == VDEV_STATE_HEALTHY &&
2353 ((asize > vd->vdev_asize &&
2354 (vd->vdev_expanding || spa->spa_autoexpand)) ||
2355 (asize < vd->vdev_asize)))
2356 vd->vdev_asize = asize;
2357
2358 vdev_set_min_asize(vd);
2359
2360 /*
2361 * Ensure we can issue some IO before declaring the
2362 * vdev open for business.
2363 */
2364 if (vd->vdev_ops->vdev_op_leaf &&
2365 (error = zio_wait(vdev_probe(vd, NULL))) != 0) {
2366 vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED,
2367 VDEV_AUX_ERR_EXCEEDED);
2368 return (error);
2369 }
2370
2371 /*
2372 * Track the minimum allocation size.
2373 */
2374 if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
2375 vd->vdev_islog == 0 && vd->vdev_aux == NULL) {
2376 uint64_t min_alloc = vdev_get_min_alloc(vd);
2377 vdev_spa_set_alloc(spa, min_alloc);
2378 }
2379
2380 /*
2381 * If this is a leaf vdev, assess whether a resilver is needed.
2382 * But don't do this if we are doing a reopen for a scrub, since
2383 * this would just restart the scrub we are already doing.
2384 */
2385 if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen)
2386 dsl_scan_assess_vdev(spa->spa_dsl_pool, vd);
2387
2388 return (0);
2389 }
2390
2391 static void
2392 vdev_validate_child(void *arg)
2393 {
2394 vdev_t *vd = arg;
2395
2396 vd->vdev_validate_thread = curthread;
2397 vd->vdev_validate_error = vdev_validate(vd);
2398 vd->vdev_validate_thread = NULL;
2399 }
2400
2401 /*
2402 * Called once the vdevs are all opened, this routine validates the label
2403 * contents. This needs to be done before vdev_load() so that we don't
2404 * inadvertently do repair I/Os to the wrong device.
2405 *
2406 * This function will only return failure if one of the vdevs indicates that it
2407 * has since been destroyed or exported. This is only possible if
2408 * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state
2409 * will be updated but the function will return 0.
2410 */
2411 int
2412 vdev_validate(vdev_t *vd)
2413 {
2414 spa_t *spa = vd->vdev_spa;
2415 taskq_t *tq = NULL;
2416 nvlist_t *label;
2417 uint64_t guid = 0, aux_guid = 0, top_guid;
2418 uint64_t state;
2419 nvlist_t *nvl;
2420 uint64_t txg;
2421 int children = vd->vdev_children;
2422
2423 if (vdev_validate_skip)
2424 return (0);
2425
2426 if (children > 0) {
2427 tq = taskq_create("vdev_validate", children, minclsyspri,
2428 children, children, TASKQ_PREPOPULATE);
2429 }
2430
2431 for (uint64_t c = 0; c < children; c++) {
2432 vdev_t *cvd = vd->vdev_child[c];
2433
2434 if (tq == NULL || vdev_uses_zvols(cvd)) {
2435 vdev_validate_child(cvd);
2436 } else {
2437 VERIFY(taskq_dispatch(tq, vdev_validate_child, cvd,
2438 TQ_SLEEP) != TASKQID_INVALID);
2439 }
2440 }
2441 if (tq != NULL) {
2442 taskq_wait(tq);
2443 taskq_destroy(tq);
2444 }
2445 for (int c = 0; c < children; c++) {
2446 int error = vd->vdev_child[c]->vdev_validate_error;
2447
2448 if (error != 0)
2449 return (SET_ERROR(EBADF));
2450 }
2451
2452
2453 /*
2454 * If the device has already failed, or was marked offline, don't do
2455 * any further validation. Otherwise, label I/O will fail and we will
2456 * overwrite the previous state.
2457 */
2458 if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd))
2459 return (0);
2460
2461 /*
2462 * If we are performing an extreme rewind, we allow for a label that
2463 * was modified at a point after the current txg.
2464 * If config lock is not held do not check for the txg. spa_sync could
2465 * be updating the vdev's label before updating spa_last_synced_txg.
2466 */
2467 if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 ||
2468 spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG)
2469 txg = UINT64_MAX;
2470 else
2471 txg = spa_last_synced_txg(spa);
2472
2473 if ((label = vdev_label_read_config(vd, txg)) == NULL) {
2474 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2475 VDEV_AUX_BAD_LABEL);
2476 vdev_dbgmsg(vd, "vdev_validate: failed reading config for "
2477 "txg %llu", (u_longlong_t)txg);
2478 return (0);
2479 }
2480
2481 /*
2482 * Determine if this vdev has been split off into another
2483 * pool. If so, then refuse to open it.
2484 */
2485 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID,
2486 &aux_guid) == 0 && aux_guid == spa_guid(spa)) {
2487 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2488 VDEV_AUX_SPLIT_POOL);
2489 nvlist_free(label);
2490 vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool");
2491 return (0);
2492 }
2493
2494 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) {
2495 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2496 VDEV_AUX_CORRUPT_DATA);
2497 nvlist_free(label);
2498 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
2499 ZPOOL_CONFIG_POOL_GUID);
2500 return (0);
2501 }
2502
2503 /*
2504 * If config is not trusted then ignore the spa guid check. This is
2505 * necessary because if the machine crashed during a re-guid the new
2506 * guid might have been written to all of the vdev labels, but not the
2507 * cached config. The check will be performed again once we have the
2508 * trusted config from the MOS.
2509 */
2510 if (spa->spa_trust_config && guid != spa_guid(spa)) {
2511 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2512 VDEV_AUX_CORRUPT_DATA);
2513 nvlist_free(label);
2514 vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't "
2515 "match config (%llu != %llu)", (u_longlong_t)guid,
2516 (u_longlong_t)spa_guid(spa));
2517 return (0);
2518 }
2519
2520 if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl)
2521 != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID,
2522 &aux_guid) != 0)
2523 aux_guid = 0;
2524
2525 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) {
2526 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2527 VDEV_AUX_CORRUPT_DATA);
2528 nvlist_free(label);
2529 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
2530 ZPOOL_CONFIG_GUID);
2531 return (0);
2532 }
2533
2534 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid)
2535 != 0) {
2536 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2537 VDEV_AUX_CORRUPT_DATA);
2538 nvlist_free(label);
2539 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
2540 ZPOOL_CONFIG_TOP_GUID);
2541 return (0);
2542 }
2543
2544 /*
2545 * If this vdev just became a top-level vdev because its sibling was
2546 * detached, it will have adopted the parent's vdev guid -- but the
2547 * label may or may not be on disk yet. Fortunately, either version
2548 * of the label will have the same top guid, so if we're a top-level
2549 * vdev, we can safely compare to that instead.
2550 * However, if the config comes from a cachefile that failed to update
2551 * after the detach, a top-level vdev will appear as a non top-level
2552 * vdev in the config. Also relax the constraints if we perform an
2553 * extreme rewind.
2554 *
2555 * If we split this vdev off instead, then we also check the
2556 * original pool's guid. We don't want to consider the vdev
2557 * corrupt if it is partway through a split operation.
2558 */
2559 if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
2560 boolean_t mismatch = B_FALSE;
2561 if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
2562 if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
2563 mismatch = B_TRUE;
2564 } else {
2565 if (vd->vdev_guid != top_guid &&
2566 vd->vdev_top->vdev_guid != guid)
2567 mismatch = B_TRUE;
2568 }
2569
2570 if (mismatch) {
2571 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2572 VDEV_AUX_CORRUPT_DATA);
2573 nvlist_free(label);
2574 vdev_dbgmsg(vd, "vdev_validate: config guid "
2575 "doesn't match label guid");
2576 vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
2577 (u_longlong_t)vd->vdev_guid,
2578 (u_longlong_t)vd->vdev_top->vdev_guid);
2579 vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
2580 "aux_guid %llu", (u_longlong_t)guid,
2581 (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
2582 return (0);
2583 }
2584 }
2585
2586 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
2587 &state) != 0) {
2588 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2589 VDEV_AUX_CORRUPT_DATA);
2590 nvlist_free(label);
2591 vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
2592 ZPOOL_CONFIG_POOL_STATE);
2593 return (0);
2594 }
2595
2596 nvlist_free(label);
2597
2598 /*
2599 * If this is a verbatim import, no need to check the
2600 * state of the pool.
2601 */
2602 if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
2603 spa_load_state(spa) == SPA_LOAD_OPEN &&
2604 state != POOL_STATE_ACTIVE) {
2605 vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
2606 "for spa %s", (u_longlong_t)state, spa->spa_name);
2607 return (SET_ERROR(EBADF));
2608 }
2609
2610 /*
2611 * If we were able to open and validate a vdev that was
2612 * previously marked permanently unavailable, clear that state
2613 * now.
2614 */
2615 if (vd->vdev_not_present)
2616 vd->vdev_not_present = 0;
2617
2618 return (0);
2619 }
2620
2621 static void
2622 vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid)
2623 {
2624 if (svd != NULL && *dvd != NULL) {
2625 if (strcmp(svd, *dvd) != 0) {
2626 zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed "
2627 "from '%s' to '%s'", (u_longlong_t)guid, prefix,
2628 *dvd, svd);
2629 spa_strfree(*dvd);
2630 *dvd = spa_strdup(svd);
2631 }
2632 } else if (svd != NULL) {
2633 *dvd = spa_strdup(svd);
2634 zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
2635 (u_longlong_t)guid, *dvd);
2636 }
2637 }
2638
2639 static void
2640 vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
2641 {
2642 char *old, *new;
2643
2644 vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path,
2645 dvd->vdev_guid);
2646
2647 vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid,
2648 dvd->vdev_guid);
2649
2650 vdev_update_path("vdev_physpath", svd->vdev_physpath,
2651 &dvd->vdev_physpath, dvd->vdev_guid);
2652
2653 /*
2654 * Our enclosure sysfs path may have changed between imports
2655 */
2656 old = dvd->vdev_enc_sysfs_path;
2657 new = svd->vdev_enc_sysfs_path;
2658 if ((old != NULL && new == NULL) ||
2659 (old == NULL && new != NULL) ||
2660 ((old != NULL && new != NULL) && strcmp(new, old) != 0)) {
2661 zfs_dbgmsg("vdev_copy_path: vdev %llu: vdev_enc_sysfs_path "
2662 "changed from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
2663 old, new);
2664
2665 if (dvd->vdev_enc_sysfs_path)
2666 spa_strfree(dvd->vdev_enc_sysfs_path);
2667
2668 if (svd->vdev_enc_sysfs_path) {
2669 dvd->vdev_enc_sysfs_path = spa_strdup(
2670 svd->vdev_enc_sysfs_path);
2671 } else {
2672 dvd->vdev_enc_sysfs_path = NULL;
2673 }
2674 }
2675 }
2676
2677 /*
2678 * Recursively copy vdev paths from one vdev to another. Source and destination
2679 * vdev trees must have the same geometry, otherwise an error is returned.
2680 * Intended to copy paths from the userland config into the MOS config.
2681 */
2682 int
2683 vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd)
2684 {
2685 if ((svd->vdev_ops == &vdev_missing_ops) ||
2686 (svd->vdev_ishole && dvd->vdev_ishole) ||
2687 (dvd->vdev_ops == &vdev_indirect_ops))
2688 return (0);
2689
2690 if (svd->vdev_ops != dvd->vdev_ops) {
2691 vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s",
2692 svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type);
2693 return (SET_ERROR(EINVAL));
2694 }
2695
2696 if (svd->vdev_guid != dvd->vdev_guid) {
2697 vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != "
2698 "%llu)", (u_longlong_t)svd->vdev_guid,
2699 (u_longlong_t)dvd->vdev_guid);
2700 return (SET_ERROR(EINVAL));
2701 }
2702
2703 if (svd->vdev_children != dvd->vdev_children) {
2704 vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: "
2705 "%llu != %llu", (u_longlong_t)svd->vdev_children,
2706 (u_longlong_t)dvd->vdev_children);
2707 return (SET_ERROR(EINVAL));
2708 }
2709
2710 for (uint64_t i = 0; i < svd->vdev_children; i++) {
2711 int error = vdev_copy_path_strict(svd->vdev_child[i],
2712 dvd->vdev_child[i]);
2713 if (error != 0)
2714 return (error);
2715 }
2716
2717 if (svd->vdev_ops->vdev_op_leaf)
2718 vdev_copy_path_impl(svd, dvd);
2719
2720 return (0);
2721 }
2722
2723 static void
2724 vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd)
2725 {
2726 ASSERT(stvd->vdev_top == stvd);
2727 ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id);
2728
2729 for (uint64_t i = 0; i < dvd->vdev_children; i++) {
2730 vdev_copy_path_search(stvd, dvd->vdev_child[i]);
2731 }
2732
2733 if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd))
2734 return;
2735
2736 /*
2737 * The idea here is that while a vdev can shift positions within
2738 * a top vdev (when replacing, attaching mirror, etc.) it cannot
2739 * step outside of it.
2740 */
2741 vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid);
2742
2743 if (vd == NULL || vd->vdev_ops != dvd->vdev_ops)
2744 return;
2745
2746 ASSERT(vd->vdev_ops->vdev_op_leaf);
2747
2748 vdev_copy_path_impl(vd, dvd);
2749 }
2750
2751 /*
2752 * Recursively copy vdev paths from one root vdev to another. Source and
2753 * destination vdev trees may differ in geometry. For each destination leaf
2754 * vdev, search a vdev with the same guid and top vdev id in the source.
2755 * Intended to copy paths from userland config into MOS config.
2756 */
2757 void
2758 vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd)
2759 {
2760 uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children);
2761 ASSERT(srvd->vdev_ops == &vdev_root_ops);
2762 ASSERT(drvd->vdev_ops == &vdev_root_ops);
2763
2764 for (uint64_t i = 0; i < children; i++) {
2765 vdev_copy_path_search(srvd->vdev_child[i],
2766 drvd->vdev_child[i]);
2767 }
2768 }
2769
2770 /*
2771 * Close a virtual device.
2772 */
2773 void
2774 vdev_close(vdev_t *vd)
2775 {
2776 vdev_t *pvd = vd->vdev_parent;
2777 spa_t *spa __maybe_unused = vd->vdev_spa;
2778
2779 ASSERT(vd != NULL);
2780 ASSERT(vd->vdev_open_thread == curthread ||
2781 spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2782
2783 /*
2784 * If our parent is reopening, then we are as well, unless we are
2785 * going offline.
2786 */
2787 if (pvd != NULL && pvd->vdev_reopening)
2788 vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline);
2789
2790 vd->vdev_ops->vdev_op_close(vd);
2791
2792 /*
2793 * We record the previous state before we close it, so that if we are
2794 * doing a reopen(), we don't generate FMA ereports if we notice that
2795 * it's still faulted.
2796 */
2797 vd->vdev_prevstate = vd->vdev_state;
2798
2799 if (vd->vdev_offline)
2800 vd->vdev_state = VDEV_STATE_OFFLINE;
2801 else
2802 vd->vdev_state = VDEV_STATE_CLOSED;
2803 vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
2804 }
2805
2806 void
2807 vdev_hold(vdev_t *vd)
2808 {
2809 spa_t *spa = vd->vdev_spa;
2810
2811 ASSERT(spa_is_root(spa));
2812 if (spa->spa_state == POOL_STATE_UNINITIALIZED)
2813 return;
2814
2815 for (int c = 0; c < vd->vdev_children; c++)
2816 vdev_hold(vd->vdev_child[c]);
2817
2818 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_hold != NULL)
2819 vd->vdev_ops->vdev_op_hold(vd);
2820 }
2821
2822 void
2823 vdev_rele(vdev_t *vd)
2824 {
2825 ASSERT(spa_is_root(vd->vdev_spa));
2826 for (int c = 0; c < vd->vdev_children; c++)
2827 vdev_rele(vd->vdev_child[c]);
2828
2829 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_ops->vdev_op_rele != NULL)
2830 vd->vdev_ops->vdev_op_rele(vd);
2831 }
2832
2833 /*
2834 * Reopen all interior vdevs and any unopened leaves. We don't actually
2835 * reopen leaf vdevs which had previously been opened as they might deadlock
2836 * on the spa_config_lock. Instead we only obtain the leaf's physical size.
2837 * If the leaf has never been opened then open it, as usual.
2838 */
2839 void
2840 vdev_reopen(vdev_t *vd)
2841 {
2842 spa_t *spa = vd->vdev_spa;
2843
2844 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
2845
2846 /* set the reopening flag unless we're taking the vdev offline */
2847 vd->vdev_reopening = !vd->vdev_offline;
2848 vdev_close(vd);
2849 (void) vdev_open(vd);
2850
2851 /*
2852 * Call vdev_validate() here to make sure we have the same device.
2853 * Otherwise, a device with an invalid label could be successfully
2854 * opened in response to vdev_reopen().
2855 */
2856 if (vd->vdev_aux) {
2857 (void) vdev_validate_aux(vd);
2858 if (vdev_readable(vd) && vdev_writeable(vd) &&
2859 vd->vdev_aux == &spa->spa_l2cache) {
2860 /*
2861 * In case the vdev is present we should evict all ARC
2862 * buffers and pointers to log blocks and reclaim their
2863 * space before restoring its contents to L2ARC.
2864 */
2865 if (l2arc_vdev_present(vd)) {
2866 l2arc_rebuild_vdev(vd, B_TRUE);
2867 } else {
2868 l2arc_add_vdev(spa, vd);
2869 }
2870 spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
2871 spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
2872 }
2873 } else {
2874 (void) vdev_validate(vd);
2875 }
2876
2877 /*
2878 * Recheck if resilver is still needed and cancel any
2879 * scheduled resilver if resilver is unneeded.
2880 */
2881 if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL) &&
2882 spa->spa_async_tasks & SPA_ASYNC_RESILVER) {
2883 mutex_enter(&spa->spa_async_lock);
2884 spa->spa_async_tasks &= ~SPA_ASYNC_RESILVER;
2885 mutex_exit(&spa->spa_async_lock);
2886 }
2887
2888 /*
2889 * Reassess parent vdev's health.
2890 */
2891 vdev_propagate_state(vd);
2892 }
2893
2894 int
2895 vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing)
2896 {
2897 int error;
2898
2899 /*
2900 * Normally, partial opens (e.g. of a mirror) are allowed.
2901 * For a create, however, we want to fail the request if
2902 * there are any components we can't open.
2903 */
2904 error = vdev_open(vd);
2905
2906 if (error || vd->vdev_state != VDEV_STATE_HEALTHY) {
2907 vdev_close(vd);
2908 return (error ? error : SET_ERROR(ENXIO));
2909 }
2910
2911 /*
2912 * Recursively load DTLs and initialize all labels.
2913 */
2914 if ((error = vdev_dtl_load(vd)) != 0 ||
2915 (error = vdev_label_init(vd, txg, isreplacing ?
2916 VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
2917 vdev_close(vd);
2918 return (error);
2919 }
2920
2921 return (0);
2922 }
2923
2924 void
2925 vdev_metaslab_set_size(vdev_t *vd)
2926 {
2927 uint64_t asize = vd->vdev_asize;
2928 uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
2929 uint64_t ms_shift;
2930
2931 /*
2932 * There are two dimensions to the metaslab sizing calculation:
2933 * the size of the metaslab and the count of metaslabs per vdev.
2934 *
2935 * The default values used below are a good balance between memory
2936 * usage (larger metaslab size means more memory needed for loaded
2937 * metaslabs; more metaslabs means more memory needed for the
2938 * metaslab_t structs), metaslab load time (larger metaslabs take
2939 * longer to load), and metaslab sync time (more metaslabs means
2940 * more time spent syncing all of them).
2941 *
2942 * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
2943 * The range of the dimensions are as follows:
2944 *
2945 * 2^29 <= ms_size <= 2^34
2946 * 16 <= ms_count <= 131,072
2947 *
2948 * On the lower end of vdev sizes, we aim for metaslabs sizes of
2949 * at least 512MB (2^29) to minimize fragmentation effects when
2950 * testing with smaller devices. However, the count constraint
2951 * of at least 16 metaslabs will override this minimum size goal.
2952 *
2953 * On the upper end of vdev sizes, we aim for a maximum metaslab
2954 * size of 16GB. However, we will cap the total count to 2^17
2955 * metaslabs to keep our memory footprint in check and let the
2956 * metaslab size grow from there if that limit is hit.
2957 *
2958 * The net effect of applying the above constraints is summarized below.
2959 *
2960 * vdev size metaslab count
2961 * --------------|-----------------
2962 * < 8GB ~16
2963 * 8GB - 100GB one per 512MB
2964 * 100GB - 3TB ~200
2965 * 3TB - 2PB one per 16GB
2966 * > 2PB ~131,072
2967 * --------------------------------
2968 *
2969 * Finally, note that all of the above calculate the initial
2970 * number of metaslabs. Expanding a top-level vdev will result
2971 * in additional metaslabs being allocated making it possible
2972 * to exceed the zfs_vdev_ms_count_limit.
2973 */
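/*
 * Two worked examples with the default tunables: a 100GB vdev yields
 * roughly 200 candidate metaslabs at the default 512MB size, which is
 * within the [16, 200] target, so the 512MB size is kept. A 1TB vdev
 * would yield ~2048 candidates, so ms_shift is recomputed as
 * highbit64(asize / 200) = 33, i.e. 8GB metaslabs, leaving it with ~128
 * of them; the power-of-two rounding of the metaslab size is why the
 * resulting count can land somewhat below the 200 target.
 */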
2974
2975 if (ms_count < zfs_vdev_min_ms_count)
2976 ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
2977 else if (ms_count > zfs_vdev_default_ms_count)
2978 ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
2979 else
2980 ms_shift = zfs_vdev_default_ms_shift;
2981
2982 if (ms_shift < SPA_MAXBLOCKSHIFT) {
2983 ms_shift = SPA_MAXBLOCKSHIFT;
2984 } else if (ms_shift > zfs_vdev_max_ms_shift) {
2985 ms_shift = zfs_vdev_max_ms_shift;
2986 /* cap the total count to constrain memory footprint */
2987 if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
2988 ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
2989 }
2990
2991 vd->vdev_ms_shift = ms_shift;
2992 ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
2993 }
2994
2995 void
2996 vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
2997 {
2998 ASSERT(vd == vd->vdev_top);
2999 /* indirect vdevs don't have metaslabs or dtls */
3000 ASSERT(vdev_is_concrete(vd) || flags == 0);
3001 ASSERT(ISP2(flags));
3002 ASSERT(spa_writeable(vd->vdev_spa));
3003
3004 if (flags & VDD_METASLAB)
3005 (void) txg_list_add(&vd->vdev_ms_list, arg, txg);
3006
3007 if (flags & VDD_DTL)
3008 (void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
3009
3010 (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
3011 }
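/*
 * Typical usage, as seen in vdev_dtl_reassess_impl() below:
 *
 *	vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
 *
 * which queues the leaf on its top-level vdev's per-txg DTL list so its
 * DTL can be rewritten (see vdev_dtl_sync()) during that txg's sync.
 */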
3012
3013 void
3014 vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
3015 {
3016 for (int c = 0; c < vd->vdev_children; c++)
3017 vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
3018
3019 if (vd->vdev_ops->vdev_op_leaf)
3020 vdev_dirty(vd->vdev_top, flags, vd, txg);
3021 }
3022
3023 /*
3024 * DTLs.
3025 *
3026 * A vdev's DTL (dirty time log) is the set of transaction groups for which
3027 * the vdev has less than perfect replication. There are four kinds of DTL:
3028 *
3029 * DTL_MISSING: txgs for which the vdev has no valid copies of the data
3030 *
3031 * DTL_PARTIAL: txgs for which data is available, but not fully replicated
3032 *
3033 * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
3034 * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
3035 * txgs that was scrubbed.
3036 *
3037 * DTL_OUTAGE: txgs which cannot currently be read, whether due to
3038 * persistent errors or just some device being offline.
3039 * Unlike the other three, the DTL_OUTAGE map is not generally
3040 * maintained; it's only computed when needed, typically to
3041 * determine whether a device can be detached.
3042 *
3043 * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
3044 * either has the data or it doesn't.
3045 *
3046 * For interior vdevs such as mirror and RAID-Z the picture is more complex.
3047 * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
3048 * if any child is less than fully replicated, then so is its parent.
3049 * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
3050 * comprising only those txgs which appear in more than 'maxfaults' children;
3051 * those are the txgs we don't have enough replication to read. For example,
3052 * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
3053 * thus, its DTL_MISSING consists of the set of txgs that appear in more than
3054 * two child DTL_MISSING maps.
3055 *
3056 * It should be clear from the above that to compute the DTLs and outage maps
3057 * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
3058 * Therefore, that is all we keep on disk. When loading the pool, or after
3059 * a configuration change, we generate all other DTLs from first principles.
3060 */
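/*
 * For example, in a 3-way mirror whose children are missing txgs 100-300,
 * 150-350, and 200-250 respectively, the mirror's DTL_PARTIAL covers
 * 100-350 (at least one child is missing those txgs), while its
 * DTL_MISSING covers only 200-250 (more children than the mirror's
 * maxfaults of 2 are missing those txgs, so they cannot be read).
 */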
3061 void
3062 vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
3063 {
3064 zfs_range_tree_t *rt = vd->vdev_dtl[t];
3065
3066 ASSERT(t < DTL_TYPES);
3067 ASSERT(vd != vd->vdev_spa->spa_root_vdev);
3068 ASSERT(spa_writeable(vd->vdev_spa));
3069
3070 mutex_enter(&vd->vdev_dtl_lock);
3071 if (!zfs_range_tree_contains(rt, txg, size))
3072 zfs_range_tree_add(rt, txg, size);
3073 mutex_exit(&vd->vdev_dtl_lock);
3074 }
3075
3076 boolean_t
3077 vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
3078 {
3079 zfs_range_tree_t *rt = vd->vdev_dtl[t];
3080 boolean_t dirty = B_FALSE;
3081
3082 ASSERT(t < DTL_TYPES);
3083 ASSERT(vd != vd->vdev_spa->spa_root_vdev);
3084
3085 /*
3086 * While we are loading the pool, the DTLs have not been loaded yet.
3087 * This isn't a problem but it can result in devices being tried
3088 * which are known to not have the data, in which case the import
3089 * relies on the checksum to ensure that we get the right data.
3090 * Note that while importing we are only reading the MOS, which is
3091 * always checksummed.
3092 */
3093 mutex_enter(&vd->vdev_dtl_lock);
3094 if (!zfs_range_tree_is_empty(rt))
3095 dirty = zfs_range_tree_contains(rt, txg, size);
3096 mutex_exit(&vd->vdev_dtl_lock);
3097
3098 return (dirty);
3099 }
3100
3101 boolean_t
3102 vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
3103 {
3104 zfs_range_tree_t *rt = vd->vdev_dtl[t];
3105 boolean_t empty;
3106
3107 mutex_enter(&vd->vdev_dtl_lock);
3108 empty = zfs_range_tree_is_empty(rt);
3109 mutex_exit(&vd->vdev_dtl_lock);
3110
3111 return (empty);
3112 }
3113
3114 /*
3115 * Check if the txg falls within the range which must be
3116 * resilvered. DVAs outside this range can always be skipped.
3117 */
3118 boolean_t
3119 vdev_default_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3120 uint64_t phys_birth)
3121 {
3122 (void) dva, (void) psize;
3123
3124 /* Set by sequential resilver. */
3125 if (phys_birth == TXG_UNKNOWN)
3126 return (B_TRUE);
3127
3128 return (vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1));
3129 }
3130
3131 /*
3132 * Returns B_TRUE if the vdev determines the DVA needs to be resilvered.
3133 */
3134 boolean_t
3135 vdev_dtl_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3136 uint64_t phys_birth)
3137 {
3138 ASSERT(vd != vd->vdev_spa->spa_root_vdev);
3139
3140 if (vd->vdev_ops->vdev_op_need_resilver == NULL ||
3141 vd->vdev_ops->vdev_op_leaf)
3142 return (B_TRUE);
3143
3144 return (vd->vdev_ops->vdev_op_need_resilver(vd, dva, psize,
3145 phys_birth));
3146 }
3147
3148 /*
3149 * Returns the lowest txg in the DTL range.
3150 */
3151 static uint64_t
3152 vdev_dtl_min(vdev_t *vd)
3153 {
3154 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
3155 ASSERT3U(zfs_range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
3156 ASSERT0(vd->vdev_children);
3157
3158 return (zfs_range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1);
3159 }
3160
3161 /*
3162 * Returns the highest txg in the DTL.
3163 */
3164 static uint64_t
3165 vdev_dtl_max(vdev_t *vd)
3166 {
3167 ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
3168 ASSERT3U(zfs_range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
3169 ASSERT0(vd->vdev_children);
3170
3171 return (zfs_range_tree_max(vd->vdev_dtl[DTL_MISSING]));
3172 }
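/*
 * Note that vdev_dtl_min() returns one txg before the first missing txg;
 * together with vdev_dtl_max() this gives an exclusive-lower-bound range,
 * (vdev_dtl_min(), vdev_dtl_max()], matching the (scn_min_txg, scn_max_txg]
 * convention referenced in vdev_dtl_should_excise() below.
 */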
3173
3174 /*
3175 * Determine if a resilvering vdev should remove any DTL entries from
3176 * its range. If the vdev was resilvering for the entire duration of the
3177 * scan then it should excise that range from its DTLs. Otherwise, this
3178 * vdev is considered partially resilvered and should leave its DTL
3179 * entries intact. The comment in vdev_dtl_reassess() describes how we
3180 * excise the DTLs.
3181 */
3182 static boolean_t
3183 vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done)
3184 {
3185 ASSERT0(vd->vdev_children);
3186
3187 if (vd->vdev_state < VDEV_STATE_DEGRADED)
3188 return (B_FALSE);
3189
3190 if (vd->vdev_resilver_deferred)
3191 return (B_FALSE);
3192
3193 if (zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
3194 return (B_TRUE);
3195
3196 if (rebuild_done) {
3197 vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
3198 vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
3199
3200 /* Rebuild not initiated by attach */
3201 if (vd->vdev_rebuild_txg == 0)
3202 return (B_TRUE);
3203
3204 /*
3205 * When a rebuild completes without error then all missing data
3206 * up to the rebuild max txg has been reconstructed and the DTL
3207 * is eligible for excision.
3208 */
3209 if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE &&
3210 vdev_dtl_max(vd) <= vrp->vrp_max_txg) {
3211 ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd));
3212 ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg);
3213 ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg);
3214 return (B_TRUE);
3215 }
3216 } else {
3217 dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
3218 dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys;
3219
3220 /* Resilver not initiated by attach */
3221 if (vd->vdev_resilver_txg == 0)
3222 return (B_TRUE);
3223
3224 /*
3225 * When a resilver is initiated the scan will assign the
3226 * scn_max_txg value to the highest txg value that exists
3227 * in all DTLs. If this device's max DTL is not part of this
3228 * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg])
3229 * then it is not eligible for excision.
3230 */
3231 if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
3232 ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd));
3233 ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg);
3234 ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg);
3235 return (B_TRUE);
3236 }
3237 }
3238
3239 return (B_FALSE);
3240 }
3241
3242 /*
3243 * Reassess DTLs after a config change or scrub completion. If txg == 0 no
3244 * write operations will be issued to the pool.
3245 */
3246 static void
3247 vdev_dtl_reassess_impl(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
3248 boolean_t scrub_done, boolean_t rebuild_done, boolean_t faulting)
3249 {
3250 spa_t *spa = vd->vdev_spa;
3251 avl_tree_t reftree;
3252 int minref;
3253
3254 ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
3255
3256 for (int c = 0; c < vd->vdev_children; c++)
3257 vdev_dtl_reassess_impl(vd->vdev_child[c], txg,
3258 scrub_txg, scrub_done, rebuild_done, faulting);
3259
3260 if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
3261 return;
3262
3263 if (vd->vdev_ops->vdev_op_leaf) {
3264 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
3265 vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
3266 boolean_t check_excise = B_FALSE;
3267 boolean_t wasempty = B_TRUE;
3268
3269 mutex_enter(&vd->vdev_dtl_lock);
3270
3271 /*
3272 * If requested, pretend the scan or rebuild completed cleanly.
3273 */
3274 if (zfs_scan_ignore_errors) {
3275 if (scn != NULL)
3276 scn->scn_phys.scn_errors = 0;
3277 if (vr != NULL)
3278 vr->vr_rebuild_phys.vrp_errors = 0;
3279 }
3280
3281 if (scrub_txg != 0 &&
3282 !zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
3283 wasempty = B_FALSE;
3284 zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d "
3285 "dtl:%llu/%llu errors:%llu",
3286 (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg,
3287 (u_longlong_t)scrub_txg, spa->spa_scrub_started,
3288 (u_longlong_t)vdev_dtl_min(vd),
3289 (u_longlong_t)vdev_dtl_max(vd),
3290 (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0));
3291 }
3292
3293 /*
3294 * If we've completed a scrub/resilver or a rebuild cleanly
3295 * then determine if this vdev should remove any DTLs. We
3296 * only want to excise regions on vdevs that were available
3297 * during the entire duration of this scan.
3298 */
3299 if (rebuild_done &&
3300 vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) {
3301 check_excise = B_TRUE;
3302 } else {
3303 if (spa->spa_scrub_started ||
3304 (scn != NULL && scn->scn_phys.scn_errors == 0)) {
3305 check_excise = B_TRUE;
3306 }
3307 }
3308
3309 if (scrub_txg && check_excise &&
3310 vdev_dtl_should_excise(vd, rebuild_done)) {
3311 /*
3312 * We completed a scrub, resilver or rebuild up to
3313 * scrub_txg. If we did it without rebooting, then
3314 * the scrub dtl will be valid, so excise the old
3315 * region and fold in the scrub dtl. Otherwise,
3316 * leave the dtl as-is if there was an error.
3317 *
3318 * There's a little trick here: to excise the beginning
3319 * of the DTL_MISSING map, we put it into a reference
3320 * tree and then add a segment with refcnt -1 that
3321 * covers the range [0, scrub_txg). This means
3322 * that each txg in that range has refcnt -1 or 0.
3323 * We then add DTL_SCRUB with a refcnt of 2, so that
3324 * entries in the range [0, scrub_txg) will have a
3325 * positive refcnt -- either 1 or 2. We then convert
3326 * the reference tree into the new DTL_MISSING map.
3327 */
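/*
 * Worked example: with DTL_MISSING covering txgs 20-80, DTL_SCRUB
 * covering 40-50, and scrub_txg 60, the refcounts come out as 0 for
 * 20-39 and 51-59 (excised), 2 for 40-50 (still missing, the scrub
 * could not repair it), and 1 for 60-80 (past the scrub, still
 * missing), so the regenerated DTL_MISSING is {40-50, 60-80}.
 */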
3328 space_reftree_create(&reftree);
3329 space_reftree_add_map(&reftree,
3330 vd->vdev_dtl[DTL_MISSING], 1);
3331 space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
3332 space_reftree_add_map(&reftree,
3333 vd->vdev_dtl[DTL_SCRUB], 2);
3334 space_reftree_generate_map(&reftree,
3335 vd->vdev_dtl[DTL_MISSING], 1);
3336 space_reftree_destroy(&reftree);
3337
3338 if (!zfs_range_tree_is_empty(
3339 vd->vdev_dtl[DTL_MISSING])) {
3340 zfs_dbgmsg("update DTL_MISSING:%llu/%llu",
3341 (u_longlong_t)vdev_dtl_min(vd),
3342 (u_longlong_t)vdev_dtl_max(vd));
3343 } else if (!wasempty) {
3344 zfs_dbgmsg("DTL_MISSING is now empty");
3345 }
3346 }
3347 zfs_range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
3348 zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING],
3349 zfs_range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
3350 if (scrub_done)
3351 zfs_range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL,
3352 NULL);
3353 zfs_range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
3354
3355 /*
3356 * For the faulting case, treat members of a replacing vdev
3357 * as if they are not available. It's more likely than not that
3358 * a vdev in a replacing vdev could encounter read errors so
3359 * treat it as not being able to contribute.
3360 */
3361 if (!vdev_readable(vd) ||
3362 (faulting && vd->vdev_parent != NULL &&
3363 vd->vdev_parent->vdev_ops == &vdev_replacing_ops)) {
3364 zfs_range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
3365 } else {
3366 zfs_range_tree_walk(vd->vdev_dtl[DTL_MISSING],
3367 zfs_range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
3368 }
3369
3370 /*
3371 * If the vdev was resilvering or rebuilding and no longer
3372 * has any DTLs then reset the appropriate flag and dirty
3373 * the top level so that we persist the change.
3374 */
3375 if (txg != 0 &&
3376 zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
3377 zfs_range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
3378 if (vd->vdev_rebuild_txg != 0) {
3379 vd->vdev_rebuild_txg = 0;
3380 vdev_config_dirty(vd->vdev_top);
3381 } else if (vd->vdev_resilver_txg != 0) {
3382 vd->vdev_resilver_txg = 0;
3383 vdev_config_dirty(vd->vdev_top);
3384 }
3385 }
3386
3387 mutex_exit(&vd->vdev_dtl_lock);
3388
3389 if (txg != 0)
3390 vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg);
3391 } else {
3392 mutex_enter(&vd->vdev_dtl_lock);
3393 for (int t = 0; t < DTL_TYPES; t++) {
3394 /* account for child's outage in parent's missing map */
3395 int s = (t == DTL_MISSING) ? DTL_OUTAGE: t;
3396 if (t == DTL_SCRUB) {
3397 /* leaf vdevs only */
3398 continue;
3399 }
3400 if (t == DTL_PARTIAL) {
3401 /* i.e. non-zero */
3402 minref = 1;
3403 } else if (vdev_get_nparity(vd) != 0) {
3404 /* RAIDZ, DRAID */
3405 minref = vdev_get_nparity(vd) + 1;
3406 } else {
3407 /* any kind of mirror */
3408 minref = vd->vdev_children;
3409 }
3410 space_reftree_create(&reftree);
3411 for (int c = 0; c < vd->vdev_children; c++) {
3412 vdev_t *cvd = vd->vdev_child[c];
3413 mutex_enter(&cvd->vdev_dtl_lock);
3414 space_reftree_add_map(&reftree,
3415 cvd->vdev_dtl[s], 1);
3416 mutex_exit(&cvd->vdev_dtl_lock);
3417 }
3418 space_reftree_generate_map(&reftree,
3419 vd->vdev_dtl[t], minref);
3420 space_reftree_destroy(&reftree);
3421 }
3422 mutex_exit(&vd->vdev_dtl_lock);
3423 }
3424
3425 if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) {
3426 raidz_dtl_reassessed(vd);
3427 }
3428 }
3429
3430 void
3431 vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
3432 boolean_t scrub_done, boolean_t rebuild_done)
3433 {
3434 return (vdev_dtl_reassess_impl(vd, txg, scrub_txg, scrub_done,
3435 rebuild_done, B_FALSE));
3436 }
3437
3438 /*
3439 * Iterate over all the vdevs except spare, and post kobj events
3440 */
3441 void
3442 vdev_post_kobj_evt(vdev_t *vd)
3443 {
3444 if (vd->vdev_ops->vdev_op_kobj_evt_post &&
3445 vd->vdev_kobj_flag == B_FALSE) {
3446 vd->vdev_kobj_flag = B_TRUE;
3447 vd->vdev_ops->vdev_op_kobj_evt_post(vd);
3448 }
3449
3450 for (int c = 0; c < vd->vdev_children; c++)
3451 vdev_post_kobj_evt(vd->vdev_child[c]);
3452 }
3453
3454 /*
3455 * Iterate over all the vdevs except spare, and clear kobj events
3456 */
3457 void
3458 vdev_clear_kobj_evt(vdev_t *vd)
3459 {
3460 vd->vdev_kobj_flag = B_FALSE;
3461
3462 for (int c = 0; c < vd->vdev_children; c++)
3463 vdev_clear_kobj_evt(vd->vdev_child[c]);
3464 }
3465
3466 int
3467 vdev_dtl_load(vdev_t *vd)
3468 {
3469 spa_t *spa = vd->vdev_spa;
3470 objset_t *mos = spa->spa_meta_objset;
3471 zfs_range_tree_t *rt;
3472 int error = 0;
3473
3474 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
3475 ASSERT(vdev_is_concrete(vd));
3476
3477 /*
3478 * If the dtl cannot be sync'd there is no need to open it.
3479 */
3480 if (spa->spa_mode == SPA_MODE_READ && !spa->spa_read_spacemaps)
3481 return (0);
3482
3483 error = space_map_open(&vd->vdev_dtl_sm, mos,
3484 vd->vdev_dtl_object, 0, -1ULL, 0);
3485 if (error)
3486 return (error);
3487 ASSERT(vd->vdev_dtl_sm != NULL);
3488
3489 rt = zfs_range_tree_create_flags(
3490 NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
3491 ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "vdev_dtl_load:rt"));
3492 error = space_map_load(vd->vdev_dtl_sm, rt, SM_ALLOC);
3493 if (error == 0) {
3494 mutex_enter(&vd->vdev_dtl_lock);
3495 zfs_range_tree_walk(rt, zfs_range_tree_add,
3496 vd->vdev_dtl[DTL_MISSING]);
3497 mutex_exit(&vd->vdev_dtl_lock);
3498 }
3499
3500 zfs_range_tree_vacate(rt, NULL, NULL);
3501 zfs_range_tree_destroy(rt);
3502
3503 return (error);
3504 }
3505
3506 for (int c = 0; c < vd->vdev_children; c++) {
3507 error = vdev_dtl_load(vd->vdev_child[c]);
3508 if (error != 0)
3509 break;
3510 }
3511
3512 return (error);
3513 }
3514
3515 static void
3516 vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx)
3517 {
3518 spa_t *spa = vd->vdev_spa;
3519 objset_t *mos = spa->spa_meta_objset;
3520 vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias;
3521 const char *string;
3522
3523 ASSERT(alloc_bias != VDEV_BIAS_NONE);
3524
3525 string =
3526 (alloc_bias == VDEV_BIAS_LOG) ? VDEV_ALLOC_BIAS_LOG :
3527 (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL :
3528 (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL;
3529
3530 ASSERT(string != NULL);
3531 VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS,
3532 1, strlen(string) + 1, string, tx));
3533
3534 if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) {
3535 spa_activate_allocation_classes(spa, tx);
3536 }
3537 }
3538
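/*
 * Destroy a per-vdev ZAP and drop its entry from the pool-wide
 * all-vdev ZAP directory.
 */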
3539 void
3540 vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx)
3541 {
3542 spa_t *spa = vd->vdev_spa;
3543
3544 VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx));
3545 VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
3546 zapobj, tx));
3547 }
3548
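/*
 * Create a new per-vdev ZAP, link it into the pool-wide all-vdev ZAP
 * directory, and return its object number.
 */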
3549 uint64_t
3550 vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx)
3551 {
3552 spa_t *spa = vd->vdev_spa;
3553 uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA,
3554 DMU_OT_NONE, 0, tx);
3555
3556 ASSERT(zap != 0);
3557 VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps,
3558 zap, tx));
3559
3560 return (zap);
3561 }
3562
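/*
 * Recursively create any missing leaf, top-level, and root ZAPs for
 * this vdev subtree.
 */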
3563 void
3564 vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx)
3565 {
3566 if (vd->vdev_ops != &vdev_hole_ops &&
3567 vd->vdev_ops != &vdev_missing_ops &&
3568 vd->vdev_ops != &vdev_root_ops &&
3569 !vd->vdev_top->vdev_removing) {
3570 if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) {
3571 vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx);
3572 }
3573 if (vd == vd->vdev_top && vd->vdev_top_zap == 0) {
3574 vd->vdev_top_zap = vdev_create_link_zap(vd, tx);
3575 if (vd->vdev_alloc_bias != VDEV_BIAS_NONE)
3576 vdev_zap_allocation_data(vd, tx);
3577 }
3578 }
3579 if (vd->vdev_ops == &vdev_root_ops && vd->vdev_root_zap == 0 &&
3580 spa_feature_is_enabled(vd->vdev_spa, SPA_FEATURE_AVZ_V2)) {
3581 if (!spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_AVZ_V2))
3582 spa_feature_incr(vd->vdev_spa, SPA_FEATURE_AVZ_V2, tx);
3583 vd->vdev_root_zap = vdev_create_link_zap(vd, tx);
3584 }
3585
3586 for (uint64_t i = 0; i < vd->vdev_children; i++) {
3587 vdev_construct_zaps(vd->vdev_child[i], tx);
3588 }
3589 }
3590
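/*
 * Sync a leaf vdev's DTL_MISSING range tree to its on-disk space map for
 * the given txg, or free the space map entirely if the vdev has been
 * detached or its top-level vdev is being removed.
 */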
3591 static void
3592 vdev_dtl_sync(vdev_t *vd, uint64_t txg)
3593 {
3594 spa_t *spa = vd->vdev_spa;
3595 zfs_range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
3596 objset_t *mos = spa->spa_meta_objset;
3597 zfs_range_tree_t *rtsync;
3598 dmu_tx_t *tx;
3599 uint64_t object = space_map_object(vd->vdev_dtl_sm);
3600
3601 ASSERT(vdev_is_concrete(vd));
3602 ASSERT(vd->vdev_ops->vdev_op_leaf);
3603
3604 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
3605
3606 if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
3607 mutex_enter(&vd->vdev_dtl_lock);
3608 space_map_free(vd->vdev_dtl_sm, tx);
3609 space_map_close(vd->vdev_dtl_sm);
3610 vd->vdev_dtl_sm = NULL;
3611 mutex_exit(&vd->vdev_dtl_lock);
3612
3613 /*
3614 * We only destroy the leaf ZAP for detached leaves or for
3615 * removed log devices. Removed data devices handle leaf ZAP
3616 * cleanup later, once cancellation is no longer possible.
3617 */
3618 if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached ||
3619 vd->vdev_top->vdev_islog)) {
3620 vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx);
3621 vd->vdev_leaf_zap = 0;
3622 }
3623
3624 dmu_tx_commit(tx);
3625 return;
3626 }
3627
3628 if (vd->vdev_dtl_sm == NULL) {
3629 uint64_t new_object;
3630
3631 new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx);
3632 VERIFY3U(new_object, !=, 0);
3633
3634 VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
3635 0, -1ULL, 0));
3636 ASSERT(vd->vdev_dtl_sm != NULL);
3637 }
3638
3639 rtsync = zfs_range_tree_create_flags(NULL, ZFS_RANGE_SEG64, NULL, 0, 0,
3640 ZFS_RT_F_DYN_NAME, vdev_rt_name(vd, "rtsync"));
3641
3642 mutex_enter(&vd->vdev_dtl_lock);
3643 zfs_range_tree_walk(rt, zfs_range_tree_add, rtsync);
3644 mutex_exit(&vd->vdev_dtl_lock);
3645
3646 space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx);
3647 space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx);
3648 zfs_range_tree_vacate(rtsync, NULL, NULL);
3649
3650 zfs_range_tree_destroy(rtsync);
3651
3652 /*
3653 * If the object for the space map has changed then dirty
3654 * the top level so that we update the config.
3655 */
3656 if (object != space_map_object(vd->vdev_dtl_sm)) {
3657 vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, "
3658 "new object %llu", (u_longlong_t)txg, spa_name(spa),
3659 (u_longlong_t)object,
3660 (u_longlong_t)space_map_object(vd->vdev_dtl_sm));
3661 vdev_config_dirty(vd->vdev_top);
3662 }
3663
3664 dmu_tx_commit(tx);
3665 }
3666
3667 /*
3668 * Determine whether the specified vdev can be
3669 * - offlined
3670 * - detached
3671 * - removed
3672 * - faulted
3673 * without losing data.
3674 */
3675 boolean_t
3676 vdev_dtl_required(vdev_t *vd)
3677 {
3678 spa_t *spa = vd->vdev_spa;
3679 vdev_t *tvd = vd->vdev_top;
3680 uint8_t cant_read = vd->vdev_cant_read;
3681 boolean_t required;
3682 boolean_t faulting = vd->vdev_state == VDEV_STATE_FAULTED;
3683
3684 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
3685
3686 if (vd == spa->spa_root_vdev || vd == tvd)
3687 return (B_TRUE);
3688
3689 /*
3690 * Temporarily mark the device as unreadable, and then determine
3691 * whether this results in any DTL outages in the top-level vdev.
3692 * If not, we can safely offline/detach/remove the device.
3693 */
3694 vd->vdev_cant_read = B_TRUE;
3695 vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting);
3696 required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
3697 vd->vdev_cant_read = cant_read;
3698 vdev_dtl_reassess_impl(tvd, 0, 0, B_FALSE, B_FALSE, faulting);
3699
3700 if (!required && zio_injection_enabled) {
3701 required = !!zio_handle_device_injection(vd, NULL,
3702 SET_ERROR(ECHILD));
3703 }
3704
3705 return (required);
3706 }
3707
3708 /*
3709 * Determine if resilver is needed, and if so the txg range.
3710 */
3711 boolean_t
3712 vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp)
3713 {
3714 boolean_t needed = B_FALSE;
3715 uint64_t thismin = UINT64_MAX;
3716 uint64_t thismax = 0;
3717
3718 if (vd->vdev_children == 0) {
3719 mutex_enter(&vd->vdev_dtl_lock);
3720 if (!zfs_range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
3721 vdev_writeable(vd)) {
3722
3723 thismin = vdev_dtl_min(vd);
3724 thismax = vdev_dtl_max(vd);
3725 needed = B_TRUE;
3726 }
3727 mutex_exit(&vd->vdev_dtl_lock);
3728 } else {
3729 for (int c = 0; c < vd->vdev_children; c++) {
3730 vdev_t *cvd = vd->vdev_child[c];
3731 uint64_t cmin, cmax;
3732
3733 if (vdev_resilver_needed(cvd, &cmin, &cmax)) {
3734 thismin = MIN(thismin, cmin);
3735 thismax = MAX(thismax, cmax);
3736 needed = B_TRUE;
3737 }
3738 }
3739 }
3740
3741 if (needed && minp) {
3742 *minp = thismin;
3743 *maxp = thismax;
3744 }
3745 return (needed);
3746 }
3747
3748 /*
3749 * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj
3750 * will contain either the checkpoint spacemap object or zero if none exists.
3751 * All other errors are returned to the caller.
3752 */
3753 int
3754 vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj)
3755 {
3756 ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
3757
3758 if (vd->vdev_top_zap == 0) {
3759 *sm_obj = 0;
3760 return (0);
3761 }
3762
3763 int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap,
3764 VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj);
3765 if (error == ENOENT) {
3766 *sm_obj = 0;
3767 error = 0;
3768 }
3769
3770 return (error);
3771 }
3772
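/*
 * Load the persistent state for this vdev subtree: children (dispatched
 * to a taskq under the root vdev), allocation bias and per-vdev
 * properties from the ZAPs, metaslabs and the checkpoint space map for
 * top-level vdevs, and the DTL for leaves.
 */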
3773 int
3774 vdev_load(vdev_t *vd)
3775 {
3776 int children = vd->vdev_children;
3777 int error = 0;
3778 taskq_t *tq = NULL;
3779
3780 /*
3781 * It's only worthwhile to use the taskq for the root vdev, because the
3782 * slow part is metaslab_init, and that only happens for top-level
3783 * vdevs.
3784 */
3785 if (vd->vdev_ops == &vdev_root_ops && vd->vdev_children > 0) {
3786 tq = taskq_create("vdev_load", children, minclsyspri,
3787 children, children, TASKQ_PREPOPULATE);
3788 }
3789
3790 /*
3791 * Recursively load all children.
3792 */
3793 for (int c = 0; c < vd->vdev_children; c++) {
3794 vdev_t *cvd = vd->vdev_child[c];
3795
3796 if (tq == NULL || vdev_uses_zvols(cvd)) {
3797 cvd->vdev_load_error = vdev_load(cvd);
3798 } else {
3799 VERIFY(taskq_dispatch(tq, vdev_load_child,
3800 cvd, TQ_SLEEP) != TASKQID_INVALID);
3801 }
3802 }
3803
3804 if (tq != NULL) {
3805 taskq_wait(tq);
3806 taskq_destroy(tq);
3807 }
3808
3809 for (int c = 0; c < vd->vdev_children; c++) {
3810 int error = vd->vdev_child[c]->vdev_load_error;
3811
3812 if (error != 0)
3813 return (error);
3814 }
3815
3816 vdev_set_deflate_ratio(vd);
3817
3818 if (vd->vdev_ops == &vdev_raidz_ops) {
3819 error = vdev_raidz_load(vd);
3820 if (error != 0)
3821 return (error);
3822 }
3823
3824 /*
3825 * On spa_load path, grab the allocation bias from our zap
3826 */
3827 if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
3828 spa_t *spa = vd->vdev_spa;
3829 char bias_str[64];
3830
3831 error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
3832 VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
3833 bias_str);
3834 if (error == 0) {
3835 ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
3836 vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
3837 } else if (error != ENOENT) {
3838 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3839 VDEV_AUX_CORRUPT_DATA);
3840 vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) "
3841 "failed [error=%d]",
3842 (u_longlong_t)vd->vdev_top_zap, error);
3843 return (error);
3844 }
3845 }
3846
3847 if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
3848 spa_t *spa = vd->vdev_spa;
3849 uint64_t failfast;
3850
3851 error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
3852 vdev_prop_to_name(VDEV_PROP_FAILFAST), sizeof (failfast),
3853 1, &failfast);
3854 if (error == 0) {
3855 vd->vdev_failfast = failfast & 1;
3856 } else if (error == ENOENT) {
3857 vd->vdev_failfast = vdev_prop_default_numeric(
3858 VDEV_PROP_FAILFAST);
3859 } else {
3860 vdev_dbgmsg(vd,
3861 "vdev_load: zap_lookup(top_zap=%llu) "
3862 "failed [error=%d]",
3863 (u_longlong_t)vd->vdev_top_zap, error);
3864 }
3865 }
3866
3867 if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
3868 spa_t *spa = vd->vdev_spa;
3869 uint64_t autosit;
3870
3871 error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
3872 vdev_prop_to_name(VDEV_PROP_AUTOSIT), sizeof (autosit),
3873 1, &autosit);
3874 if (error == 0) {
3875 vd->vdev_autosit = autosit == 1;
3876 } else if (error == ENOENT) {
3877 vd->vdev_autosit = vdev_prop_default_numeric(
3878 VDEV_PROP_AUTOSIT);
3879 } else {
3880 vdev_dbgmsg(vd,
3881 "vdev_load: zap_lookup(top_zap=%llu) "
3882 "failed [error=%d]",
3883 (u_longlong_t)vd->vdev_top_zap, error);
3884 }
3885 }
3886
3887 /*
3888 * Load any rebuild state from the top-level vdev zap.
3889 */
3890 if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
3891 error = vdev_rebuild_load(vd);
3892 if (error && error != ENOTSUP) {
3893 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3894 VDEV_AUX_CORRUPT_DATA);
3895 vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load "
3896 "failed [error=%d]", error);
3897 return (error);
3898 }
3899 }
3900
3901 if (vd->vdev_top_zap != 0 || vd->vdev_leaf_zap != 0) {
3902 uint64_t zapobj;
3903
3904 if (vd->vdev_top_zap != 0)
3905 zapobj = vd->vdev_top_zap;
3906 else
3907 zapobj = vd->vdev_leaf_zap;
3908
3909 error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_N,
3910 &vd->vdev_checksum_n);
3911 if (error && error != ENOENT)
3912 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3913 "failed [error=%d]", (u_longlong_t)zapobj, error);
3914
3915 error = vdev_prop_get_int(vd, VDEV_PROP_CHECKSUM_T,
3916 &vd->vdev_checksum_t);
3917 if (error && error != ENOENT)
3918 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3919 "failed [error=%d]", (u_longlong_t)zapobj, error);
3920
3921 error = vdev_prop_get_int(vd, VDEV_PROP_IO_N,
3922 &vd->vdev_io_n);
3923 if (error && error != ENOENT)
3924 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3925 "failed [error=%d]", (u_longlong_t)zapobj, error);
3926
3927 error = vdev_prop_get_int(vd, VDEV_PROP_IO_T,
3928 &vd->vdev_io_t);
3929 if (error && error != ENOENT)
3930 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3931 "failed [error=%d]", (u_longlong_t)zapobj, error);
3932
3933 error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_N,
3934 &vd->vdev_slow_io_n);
3935 if (error && error != ENOENT)
3936 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3937 "failed [error=%d]", (u_longlong_t)zapobj, error);
3938
3939 error = vdev_prop_get_int(vd, VDEV_PROP_SLOW_IO_T,
3940 &vd->vdev_slow_io_t);
3941 if (error && error != ENOENT)
3942 vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3943 "failed [error=%d]", (u_longlong_t)zapobj, error);
3944 }
3945
3946 /*
3947 * If this is a top-level vdev, initialize its metaslabs.
3948 */
3949 if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
3950 vdev_metaslab_group_create(vd);
3951
3952 if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
3953 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3954 VDEV_AUX_CORRUPT_DATA);
3955 vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
3956 "asize=%llu", (u_longlong_t)vd->vdev_ashift,
3957 (u_longlong_t)vd->vdev_asize);
3958 return (SET_ERROR(ENXIO));
3959 }
3960
3961 error = vdev_metaslab_init(vd, 0);
3962 if (error != 0) {
3963 vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
3964 "[error=%d]", error);
3965 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3966 VDEV_AUX_CORRUPT_DATA);
3967 return (error);
3968 }
3969
3970 uint64_t checkpoint_sm_obj;
3971 error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj);
3972 if (error == 0 && checkpoint_sm_obj != 0) {
3973 objset_t *mos = spa_meta_objset(vd->vdev_spa);
3974 ASSERT(vd->vdev_asize != 0);
3975 ASSERT0P(vd->vdev_checkpoint_sm);
3976
3977 error = space_map_open(&vd->vdev_checkpoint_sm,
3978 mos, checkpoint_sm_obj, 0, vd->vdev_asize,
3979 vd->vdev_ashift);
3980 if (error != 0) {
3981 vdev_dbgmsg(vd, "vdev_load: space_map_open "
3982 "failed for checkpoint spacemap (obj %llu) "
3983 "[error=%d]",
3984 (u_longlong_t)checkpoint_sm_obj, error);
3985 return (error);
3986 }
3987 ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
3988
3989 /*
3990 * Since the checkpoint_sm contains free entries
3991 * exclusively we can use space_map_allocated() to
3992 * indicate the cumulative checkpointed space that
3993 * has been freed.
3994 */
3995 vd->vdev_stat.vs_checkpoint_space =
3996 -space_map_allocated(vd->vdev_checkpoint_sm);
3997 vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
3998 vd->vdev_stat.vs_checkpoint_space;
3999 } else if (error != 0) {
4000 vdev_dbgmsg(vd, "vdev_load: failed to retrieve "
4001 "checkpoint space map object from vdev ZAP "
4002 "[error=%d]", error);
4003 return (error);
4004 }
4005 }
4006
4007 /*
4008 * If this is a leaf vdev, load its DTL.
4009 */
4010 if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
4011 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
4012 VDEV_AUX_CORRUPT_DATA);
4013 vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
4014 "[error=%d]", error);
4015 return (error);
4016 }
4017
4018 uint64_t obsolete_sm_object;
4019 error = vdev_obsolete_sm_object(vd, &obsolete_sm_object);
4020 if (error == 0 && obsolete_sm_object != 0) {
4021 objset_t *mos = vd->vdev_spa->spa_meta_objset;
4022 ASSERT(vd->vdev_asize != 0);
4023 ASSERT0P(vd->vdev_obsolete_sm);
4024
4025 if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
4026 obsolete_sm_object, 0, vd->vdev_asize, 0))) {
4027 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
4028 VDEV_AUX_CORRUPT_DATA);
4029 vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
4030 "obsolete spacemap (obj %llu) [error=%d]",
4031 (u_longlong_t)obsolete_sm_object, error);
4032 return (error);
4033 }
4034 } else if (error != 0) {
4035 vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete "
4036 "space map object from vdev ZAP [error=%d]", error);
4037 return (error);
4038 }
4039
4040 return (0);
4041 }
4042
4043 /*
4044 * The special vdev case is used for hot spares and l2cache devices. Its
4045 * sole purpose is to set the vdev state for the associated vdev. To do this,
4046 * we make sure that we can open the underlying device, then try to read the
4047 * label, and make sure that the label is sane and that it hasn't been
4048 * repurposed to another pool.
4049 */
4050 int
4051 vdev_validate_aux(vdev_t *vd)
4052 {
4053 nvlist_t *label;
4054 uint64_t guid, version;
4055 uint64_t state;
4056
4057 if (!vdev_readable(vd))
4058 return (0);
4059
4060 if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
4061 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
4062 VDEV_AUX_CORRUPT_DATA);
4063 return (-1);
4064 }
4065
4066 if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
4067 !SPA_VERSION_IS_SUPPORTED(version) ||
4068 nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
4069 guid != vd->vdev_guid ||
4070 nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
4071 vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
4072 VDEV_AUX_CORRUPT_DATA);
4073 nvlist_free(label);
4074 return (-1);
4075 }
4076
4077 /*
4078 * We don't actually check the pool state here. If it's in fact in
4079 * use by another pool, we update this fact on the fly when requested.
4080 */
4081 nvlist_free(label);
4082 return (0);
4083 }
4084
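/*
 * Free the object recording per-metaslab unflushed txgs, if present, and
 * remove its entry from the top-level vdev ZAP.
 */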
4085 static void
4086 vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx)
4087 {
4088 objset_t *mos = spa_meta_objset(vd->vdev_spa);
4089
4090 if (vd->vdev_top_zap == 0)
4091 return;
4092
4093 uint64_t object = 0;
4094 int err = zap_lookup(mos, vd->vdev_top_zap,
4095 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object);
4096 if (err == ENOENT)
4097 return;
4098 VERIFY0(err);
4099
4100 VERIFY0(dmu_object_free(mos, object, tx));
4101 VERIFY0(zap_remove(mos, vd->vdev_top_zap,
4102 VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx));
4103 }
4104
4105 /*
4106 * Free the objects used to store this vdev's spacemaps, and the array
4107 * that points to them.
4108 */
4109 void
4110 vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx)
4111 {
4112 if (vd->vdev_ms_array == 0)
4113 return;
4114
4115 objset_t *mos = vd->vdev_spa->spa_meta_objset;
4116 uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift;
4117 size_t array_bytes = array_count * sizeof (uint64_t);
4118 uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP);
4119 VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0,
4120 array_bytes, smobj_array, 0));
4121
4122 for (uint64_t i = 0; i < array_count; i++) {
4123 uint64_t smobj = smobj_array[i];
4124 if (smobj == 0)
4125 continue;
4126
4127 space_map_free_obj(mos, smobj, tx);
4128 }
4129
4130 kmem_free(smobj_array, array_bytes);
4131 VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx));
4132 vdev_destroy_ms_flush_data(vd, tx);
4133 vd->vdev_ms_array = 0;
4134 }
4135
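/*
 * Destroy the space maps and top-level ZAP of an empty log device that
 * is being removed.
 */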
4136 static void
4137 vdev_remove_empty_log(vdev_t *vd, uint64_t txg)
4138 {
4139 spa_t *spa = vd->vdev_spa;
4140
4141 ASSERT(vd->vdev_islog);
4142 ASSERT(vd == vd->vdev_top);
4143 ASSERT3U(txg, ==, spa_syncing_txg(spa));
4144
4145 dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
4146
4147 vdev_destroy_spacemaps(vd, tx);
4148 if (vd->vdev_top_zap != 0) {
4149 vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx);
4150 vd->vdev_top_zap = 0;
4151 }
4152
4153 dmu_tx_commit(tx);
4154 }
4155
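/*
 * Called once a txg has synced: finish syncing any metaslabs that were
 * dirty in this txg and, if there were any, reassess the metaslab groups.
 */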
4156 void
4157 vdev_sync_done(vdev_t *vd, uint64_t txg)
4158 {
4159 metaslab_t *msp;
4160 boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg));
4161
4162 ASSERT(vdev_is_concrete(vd));
4163
4164 while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
4165 != NULL)
4166 metaslab_sync_done(msp, txg);
4167
4168 if (reassess) {
4169 metaslab_sync_reassess(vd->vdev_mg);
4170 if (vd->vdev_log_mg != NULL)
4171 metaslab_sync_reassess(vd->vdev_log_mg);
4172 }
4173 }
4174
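/*
 * Sync a top-level vdev for the given txg: record obsolete segments,
 * allocate the metaslab array on first use, sync dirty metaslabs and
 * DTLs, and clean up empty log devices that are being removed.
 */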
4175 void
4176 vdev_sync(vdev_t *vd, uint64_t txg)
4177 {
4178 spa_t *spa = vd->vdev_spa;
4179 vdev_t *lvd;
4180 metaslab_t *msp;
4181
4182 ASSERT3U(txg, ==, spa->spa_syncing_txg);
4183 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
4184 if (zfs_range_tree_space(vd->vdev_obsolete_segments) > 0) {
4185 ASSERT(vd->vdev_removing ||
4186 vd->vdev_ops == &vdev_indirect_ops);
4187
4188 vdev_indirect_sync_obsolete(vd, tx);
4189
4190 /*
4191 * If the vdev is indirect, it can't have dirty
4192 * metaslabs or DTLs.
4193 */
4194 if (vd->vdev_ops == &vdev_indirect_ops) {
4195 ASSERT(txg_list_empty(&vd->vdev_ms_list, txg));
4196 ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg));
4197 dmu_tx_commit(tx);
4198 return;
4199 }
4200 }
4201
4202 ASSERT(vdev_is_concrete(vd));
4203
4204 if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 &&
4205 !vd->vdev_removing) {
4206 ASSERT(vd == vd->vdev_top);
4207 ASSERT0(vd->vdev_indirect_config.vic_mapping_object);
4208 vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset,
4209 DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx);
4210 ASSERT(vd->vdev_ms_array != 0);
4211 vdev_config_dirty(vd);
4212 }
4213
4214 while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) {
4215 metaslab_sync(msp, txg);
4216 (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg));
4217 }
4218
4219 while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL)
4220 vdev_dtl_sync(lvd, txg);
4221
4222 /*
4223 * If this is an empty log device being removed, destroy the
4224 * metadata associated with it.
4225 */
4226 if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing)
4227 vdev_remove_empty_log(vd, txg);
4228
4229 (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg));
4230 dmu_tx_commit(tx);
4231 }
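
/*
 * Convert an allocated size (asize) back to the corresponding psize for
 * the layout in effect at the given TXG; the inverse of
 * vdev_psize_to_asize_txg() below.
 */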
4232 uint64_t
4233 vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize, uint64_t txg)
4234 {
4235 return (vd->vdev_ops->vdev_op_asize_to_psize(vd, asize, txg));
4236 }
4237
4238 /*
4239 * Return the amount of space that should be (or was) allocated for the given
4240 * psize (compressed block size) in the given TXG. Note that for expanded
4241 * RAIDZ vdevs, the size allocated for older BPs may be larger. See
4242 * vdev_raidz_psize_to_asize().
4243 */
4244 uint64_t
4245 vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize, uint64_t txg)
4246 {
4247 return (vd->vdev_ops->vdev_op_psize_to_asize(vd, psize, txg));
4248 }
4249
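/*
 * Convenience wrapper for vdev_psize_to_asize_txg() with txg 0.
 */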
4250 uint64_t
4251 vdev_psize_to_asize(vdev_t *vd, uint64_t psize)
4252 {
4253 return (vdev_psize_to_asize_txg(vd, psize, 0));
4254 }
4255
4256 /*
4257 * Mark the given vdev faulted. A faulted vdev behaves as if the device could
4258 * not be opened, and no I/O is attempted.
4259 */
4260 int
4261 vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux)
4262 {
4263 vdev_t *vd, *tvd;
4264
4265 spa_vdev_state_enter(spa, SCL_NONE);
4266
4267 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4268 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
4269
4270 if (!vd->vdev_ops->vdev_op_leaf)
4271 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
4272
4273 tvd = vd->vdev_top;
4274
4275 /*
4276 * If user did a 'zpool offline -f' then make the fault persist across
4277 * reboots.
4278 */
4279 if (aux == VDEV_AUX_EXTERNAL_PERSIST) {
4280 /*
4281 * There are two kinds of forced faults: temporary and
4282 * persistent. Temporary faults go away at pool import, while
4283 * persistent faults stay set. Both types of faults can be
4284 * cleared with a zpool clear.
4285 *
4286 * We tell if a vdev is persistently faulted by looking at the
4287 * ZPOOL_CONFIG_AUX_STATE nvpair. If it's set to "external" at
4288 * import then it's a persistent fault. Otherwise, it's
4289 * temporary. We get ZPOOL_CONFIG_AUX_STATE set to "external"
4290 * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL. This
4291 * tells vdev_config_generate() (which gets run later) to set
4292 * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist.
4293 */
4294 vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL;
4295 vd->vdev_tmpoffline = B_FALSE;
4296 aux = VDEV_AUX_EXTERNAL;
4297 } else {
4298 vd->vdev_tmpoffline = B_TRUE;
4299 }
4300
4301 /*
4302 * We don't directly use the aux state here, but if we do a
4303 * vdev_reopen(), we need this value to be present to remember why we
4304 * were faulted.
4305 */
4306 vd->vdev_label_aux = aux;
4307
4308 /*
4309 * Faulted state takes precedence over degraded.
4310 */
4311 vd->vdev_delayed_close = B_FALSE;
4312 vd->vdev_faulted = 1ULL;
4313 vd->vdev_degraded = 0ULL;
4314 vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux);
4315
4316 /*
4317 * If this device has the only valid copy of the data, then
4318 * back off and simply mark the vdev as degraded instead.
4319 */
4320 if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) {
4321 vd->vdev_degraded = 1ULL;
4322 vd->vdev_faulted = 0ULL;
4323
4324 /*
4325 * If we reopen the device and it's not dead, only then do we
4326 * mark it degraded.
4327 */
4328 vdev_reopen(tvd);
4329
4330 if (vdev_readable(vd))
4331 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux);
4332 }
4333
4334 return (spa_vdev_state_exit(spa, vd, 0));
4335 }
4336
4337 /*
4338 * Mark the given vdev degraded. A degraded vdev is purely an indication to the
4339 * user that something is wrong. The vdev continues to operate as normal as far
4340 * as I/O is concerned.
4341 */
4342 int
4343 vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
4344 {
4345 vdev_t *vd;
4346
4347 spa_vdev_state_enter(spa, SCL_NONE);
4348
4349 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4350 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
4351
4352 if (!vd->vdev_ops->vdev_op_leaf)
4353 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
4354
4355 /*
4356 * If the vdev is already faulted, then don't do anything.
4357 */
4358 if (vd->vdev_faulted || vd->vdev_degraded)
4359 return (spa_vdev_state_exit(spa, NULL, 0));
4360
4361 vd->vdev_degraded = 1ULL;
4362 if (!vdev_is_dead(vd))
4363 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
4364 aux);
4365
4366 return (spa_vdev_state_exit(spa, vd, 0));
4367 }
4368
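/*
 * Mark the given vdev as wanting removal (e.g. after a hotplug removal
 * event) and request the asynchronous removal task, after first probing
 * leaf vdevs to confirm the device is really gone.
 */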
4369 int
4370 vdev_remove_wanted(spa_t *spa, uint64_t guid)
4371 {
4372 vdev_t *vd;
4373
4374 spa_vdev_state_enter(spa, SCL_NONE);
4375
4376 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4377 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
4378
4379 /*
4380 * If the vdev is already removed, or expanding which can trigger
4381 * repartition add/remove events, then don't do anything.
4382 */
4383 if (vd->vdev_removed || vd->vdev_expanding)
4384 return (spa_vdev_state_exit(spa, NULL, 0));
4385
4386 /*
4387 * Confirm the vdev has been removed, otherwise don't do anything.
4388 */
4389 if (vd->vdev_ops->vdev_op_leaf && !zio_wait(vdev_probe(vd, NULL)))
4390 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(EEXIST)));
4391
4392 vd->vdev_remove_wanted = B_TRUE;
4393 spa_async_request(spa, SPA_ASYNC_REMOVE_BY_USER);
4394
4395 return (spa_vdev_state_exit(spa, vd, 0));
4396 }
4397
4398
4399 /*
4400 * Online the given vdev.
4401 *
4402 * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached
4403 * spare device should be detached when the device finishes resilvering.
4404 * Second, the online should be treated like a 'test' online case, so no FMA
4405 * events are generated if the device fails to open.
4406 */
4407 int
4408 vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
4409 {
4410 vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
4411 boolean_t wasoffline;
4412 vdev_state_t oldstate;
4413
4414 spa_vdev_state_enter(spa, SCL_NONE);
4415
4416 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4417 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
4418
4419 wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline);
4420 oldstate = vd->vdev_state;
4421
4422 tvd = vd->vdev_top;
4423 vd->vdev_offline = B_FALSE;
4424 vd->vdev_tmpoffline = B_FALSE;
4425 vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE);
4426 vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT);
4427
4428 /* XXX - L2ARC 1.0 does not support expansion */
4429 if (!vd->vdev_aux) {
4430 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
4431 pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) ||
4432 spa->spa_autoexpand);
4433 vd->vdev_expansion_time = gethrestime_sec();
4434 }
4435
4436 vdev_reopen(tvd);
4437 vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE;
4438
4439 if (!vd->vdev_aux) {
4440 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
4441 pvd->vdev_expanding = B_FALSE;
4442 }
4443
4444 if (newstate)
4445 *newstate = vd->vdev_state;
4446 if ((flags & ZFS_ONLINE_UNSPARE) &&
4447 !vdev_is_dead(vd) && vd->vdev_parent &&
4448 vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
4449 vd->vdev_parent->vdev_child[0] == vd)
4450 vd->vdev_unspare = B_TRUE;
4451
4452 if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) {
4453
4454 /* XXX - L2ARC 1.0 does not support expansion */
4455 if (vd->vdev_aux)
4456 return (spa_vdev_state_exit(spa, vd, ENOTSUP));
4457 spa->spa_ccw_fail_time = 0;
4458 spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
4459 }
4460
4461 /* Restart initializing if necessary */
4462 mutex_enter(&vd->vdev_initialize_lock);
4463 if (vdev_writeable(vd) &&
4464 vd->vdev_initialize_thread == NULL &&
4465 vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
4466 (void) vdev_initialize(vd);
4467 }
4468 mutex_exit(&vd->vdev_initialize_lock);
4469
4470 /*
4471 * Restart trimming if necessary. We do not restart trimming for cache
4472 * devices here. This is triggered by l2arc_rebuild_vdev()
4473 * asynchronously for the whole device or in l2arc_evict() as it evicts
4474 * space for upcoming writes.
4475 */
4476 mutex_enter(&vd->vdev_trim_lock);
4477 if (vdev_writeable(vd) && !vd->vdev_isl2cache &&
4478 vd->vdev_trim_thread == NULL &&
4479 vd->vdev_trim_state == VDEV_TRIM_ACTIVE) {
4480 (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial,
4481 vd->vdev_trim_secure);
4482 }
4483 mutex_exit(&vd->vdev_trim_lock);
4484
4485 if (wasoffline ||
4486 (oldstate < VDEV_STATE_DEGRADED &&
4487 vd->vdev_state >= VDEV_STATE_DEGRADED)) {
4488 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE);
4489
4490 /*
4491 * Asynchronously detach spare vdev if resilver or
4492 * rebuild is not required
4493 */
4494 if (vd->vdev_unspare &&
4495 !dsl_scan_resilvering(spa->spa_dsl_pool) &&
4496 !dsl_scan_resilver_scheduled(spa->spa_dsl_pool) &&
4497 !vdev_rebuild_active(tvd))
4498 spa_async_request(spa, SPA_ASYNC_DETACH_SPARE);
4499 }
4500 return (spa_vdev_state_exit(spa, vd, 0));
4501 }
4502
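/*
 * Take the given leaf vdev offline, failing with EBUSY if it holds the
 * only valid copy of some data. Log devices are evacuated via
 * spa_reset_logs() before being offlined.
 */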
4503 static int
4504 vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags)
4505 {
4506 vdev_t *vd, *tvd;
4507 int error = 0;
4508 uint64_t generation;
4509 metaslab_group_t *mg;
4510
4511 top:
4512 spa_vdev_state_enter(spa, SCL_ALLOC);
4513
4514 if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4515 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV)));
4516
4517 if (!vd->vdev_ops->vdev_op_leaf)
4518 return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP)));
4519
4520 if (vd->vdev_ops == &vdev_draid_spare_ops)
4521 return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
4522
4523 tvd = vd->vdev_top;
4524 mg = tvd->vdev_mg;
4525 generation = spa->spa_config_generation + 1;
4526
4527 /*
4528 * If the device isn't already offline, try to offline it.
4529 */
4530 if (!vd->vdev_offline) {
4531 /*
4532 * If this device has the only valid copy of some data,
4533 * don't allow it to be offlined. Log devices are always
4534 * expendable.
4535 */
4536 if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
4537 vdev_dtl_required(vd))
4538 return (spa_vdev_state_exit(spa, NULL,
4539 SET_ERROR(EBUSY)));
4540
4541 /*
4542 * If the top-level is a slog and it has had allocations
4543 * then proceed. We check that the vdev's metaslab group
4544 * is not NULL since it's possible that we may have just
4545 * added this vdev but not yet initialized its metaslabs.
4546 */
4547 if (tvd->vdev_islog && mg != NULL) {
4548 /*
4549 * Prevent any future allocations.
4550 */
4551 ASSERT0P(tvd->vdev_log_mg);
4552 metaslab_group_passivate(mg);
4553 (void) spa_vdev_state_exit(spa, vd, 0);
4554
4555 error = spa_reset_logs(spa);
4556
4557 /*
4558 * If the log device was successfully reset but has
4559 * checkpointed data, do not offline it.
4560 */
4561 if (error == 0 &&
4562 tvd->vdev_checkpoint_sm != NULL) {
4563 ASSERT3U(space_map_allocated(
4564 tvd->vdev_checkpoint_sm), !=, 0);
4565 error = ZFS_ERR_CHECKPOINT_EXISTS;
4566 }
4567
4568 spa_vdev_state_enter(spa, SCL_ALLOC);
4569
4570 /*
4571 * Check to see if the config has changed.
4572 */
4573 if (error || generation != spa->spa_config_generation) {
4574 metaslab_group_activate(mg);
4575 if (error)
4576 return (spa_vdev_state_exit(spa,
4577 vd, error));
4578 (void) spa_vdev_state_exit(spa, vd, 0);
4579 goto top;
4580 }
4581 ASSERT0(tvd->vdev_stat.vs_alloc);
4582 }
4583
4584 /*
4585 * Offline this device and reopen its top-level vdev.
4586 * If the top-level vdev is a log device then just offline
4587 * it. Otherwise, if this action results in the top-level
4588 * vdev becoming unusable, undo it and fail the request.
4589 */
4590 vd->vdev_offline = B_TRUE;
4591 vdev_reopen(tvd);
4592
4593 if (!tvd->vdev_islog && vd->vdev_aux == NULL &&
4594 vdev_is_dead(tvd)) {
4595 vd->vdev_offline = B_FALSE;
4596 vdev_reopen(tvd);
4597 return (spa_vdev_state_exit(spa, NULL,
4598 SET_ERROR(EBUSY)));
4599 }
4600
4601 /*
4602 * Add the device back into the metaslab rotor so that
4603 * once we online the device it's open for business.
4604 */
4605 if (tvd->vdev_islog && mg != NULL)
4606 metaslab_group_activate(mg);
4607 }
4608
4609 vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY);
4610
4611 return (spa_vdev_state_exit(spa, vd, 0));
4612 }
4613
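/*
 * Offline the given vdev, serialized by spa_vdev_top_lock.
 */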
4614 int
4615 vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags)
4616 {
4617 int error;
4618
4619 mutex_enter(&spa->spa_vdev_top_lock);
4620 error = vdev_offline_locked(spa, guid, flags);
4621 mutex_exit(&spa->spa_vdev_top_lock);
4622
4623 return (error);
4624 }
4625
4626 /*
4627 * Clear the error counts associated with this vdev. Unlike vdev_online() and
4628 * vdev_offline(), we assume the spa config is locked. We also clear all
4629 * children. If 'vd' is NULL, then the user wants to clear all vdevs.
4630 */
4631 void
4632 vdev_clear(spa_t *spa, vdev_t *vd)
4633 {
4634 vdev_t *rvd = spa->spa_root_vdev;
4635
4636 ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
4637
4638 if (vd == NULL)
4639 vd = rvd;
4640
4641 vd->vdev_stat.vs_read_errors = 0;
4642 vd->vdev_stat.vs_write_errors = 0;
4643 vd->vdev_stat.vs_checksum_errors = 0;
4644 vd->vdev_stat.vs_dio_verify_errors = 0;
4645 vd->vdev_stat.vs_slow_ios = 0;
4646 atomic_store_64(&vd->vdev_outlier_count, 0);
4647 vd->vdev_read_sit_out_expire = 0;
4648
4649 for (int c = 0; c < vd->vdev_children; c++)
4650 vdev_clear(spa, vd->vdev_child[c]);
4651
4652 /*
4653 * It makes no sense to "clear" an indirect or removed vdev.
4654 */
4655 if (!vdev_is_concrete(vd) || vd->vdev_removed)
4656 return;
4657
4658 /*
4659 * If we're in the FAULTED state or have experienced failed I/O, then
4660 * clear the persistent state and attempt to reopen the device. We
4661 * also mark the vdev config dirty, so that the new faulted state is
4662 * written out to disk.
4663 */
4664 if (vd->vdev_faulted || vd->vdev_degraded ||
4665 !vdev_readable(vd) || !vdev_writeable(vd)) {
4666 /*
4667 * When reopening in response to a clear event, it may be due to
4668 * an fmadm repair request. In this case, if the device is
4669 * still broken, we want to post the ereport again.
4670 */
4671 vd->vdev_forcefault = B_TRUE;
4672
4673 vd->vdev_faulted = vd->vdev_degraded = 0ULL;
4674 vd->vdev_cant_read = B_FALSE;
4675 vd->vdev_cant_write = B_FALSE;
4676 vd->vdev_stat.vs_aux = 0;
4677
4678 vdev_reopen(vd == rvd ? rvd : vd->vdev_top);
4679
4680 vd->vdev_forcefault = B_FALSE;
4681
4682 if (vd != rvd && vdev_writeable(vd->vdev_top))
4683 vdev_state_dirty(vd->vdev_top);
4684
4685 /* If a resilver isn't required, check if vdevs can be culled */
4686 if (vd->vdev_aux == NULL && !vdev_is_dead(vd) &&
4687 !dsl_scan_resilvering(spa->spa_dsl_pool) &&
4688 !dsl_scan_resilver_scheduled(spa->spa_dsl_pool))
4689 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
4690
4691 spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
4692 }
4693
4694 /*
4695 * When clearing a FMA-diagnosed fault, we always want to
4696 * unspare the device, as we assume that the original spare was
4697 * done in response to the FMA fault.
4698 */
4699 if (!vdev_is_dead(vd) && vd->vdev_parent != NULL &&
4700 vd->vdev_parent->vdev_ops == &vdev_spare_ops &&
4701 vd->vdev_parent->vdev_child[0] == vd)
4702 vd->vdev_unspare = B_TRUE;
4703
4704 /* Clear recent error events cache (i.e. duplicate events tracking) */
4705 zfs_ereport_clear(spa, vd);
4706 }
4707
4708 boolean_t
4709 vdev_is_dead(vdev_t *vd)
4710 {
4711 /*
4712 * Holes and missing devices are always considered "dead".
4713 * This simplifies the code since we don't have to check for
4714 * these types of devices in the various code paths.
4715 * Instead we rely on the fact that we skip over dead devices
4716 * before issuing I/O to them.
4717 */
4718 return (vd->vdev_state < VDEV_STATE_DEGRADED ||
4719 vd->vdev_ops == &vdev_hole_ops ||
4720 vd->vdev_ops == &vdev_missing_ops);
4721 }
4722
4723 boolean_t
4724 vdev_readable(vdev_t *vd)
4725 {
4726 return (!vdev_is_dead(vd) && !vd->vdev_cant_read);
4727 }
4728
4729 boolean_t
4730 vdev_writeable(vdev_t *vd)
4731 {
4732 return (!vdev_is_dead(vd) && !vd->vdev_cant_write &&
4733 vdev_is_concrete(vd));
4734 }
4735
4736 boolean_t
4737 vdev_allocatable(vdev_t *vd)
4738 {
4739 uint64_t state = vd->vdev_state;
4740
4741 /*
4742 * We currently allow allocations from vdevs which may be in the
4743 * process of reopening (i.e. VDEV_STATE_CLOSED). If the device
4744 * fails to reopen then we'll catch it later when we're holding
4745 * the proper locks. Note that we have to get the vdev state
4746 * in a local variable because although it changes atomically,
4747 * we're asking two separate questions about it.
4748 */
4749 return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
4750 !vd->vdev_cant_write && vdev_is_concrete(vd) &&
4751 vd->vdev_mg->mg_initialized);
4752 }
4753
4754 boolean_t
4755 vdev_accessible(vdev_t *vd, zio_t *zio)
4756 {
4757 ASSERT(zio->io_vd == vd);
4758
4759 if (vdev_is_dead(vd) || vd->vdev_remove_wanted)
4760 return (B_FALSE);
4761
4762 if (zio->io_type == ZIO_TYPE_READ)
4763 return (!vd->vdev_cant_read);
4764
4765 if (zio->io_type == ZIO_TYPE_WRITE)
4766 return (!vd->vdev_cant_write);
4767
4768 return (B_TRUE);
4769 }
4770
4771 static void
4772 vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs)
4773 {
4774 /*
4775 * Exclude the dRAID spare when aggregating to avoid double counting
4776 * the ops and bytes. These IOs are counted by the physical leaves.
4777 */
4778 if (cvd->vdev_ops == &vdev_draid_spare_ops)
4779 return;
4780
4781 for (int t = 0; t < VS_ZIO_TYPES; t++) {
4782 vs->vs_ops[t] += cvs->vs_ops[t];
4783 vs->vs_bytes[t] += cvs->vs_bytes[t];
4784 }
4785
4786 cvs->vs_scan_removing = cvd->vdev_removing;
4787 }
4788
4789 /*
4790 * Get extended stats
4791 */
4792 static void
4793 vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx)
4794 {
4795 (void) cvd;
4796
4797 int t, b;
4798 for (t = 0; t < ZIO_TYPES; t++) {
4799 for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++)
4800 vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b];
4801
4802 for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) {
4803 vsx->vsx_total_histo[t][b] +=
4804 cvsx->vsx_total_histo[t][b];
4805 }
4806 }
4807
4808 for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
4809 for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) {
4810 vsx->vsx_queue_histo[t][b] +=
4811 cvsx->vsx_queue_histo[t][b];
4812 }
4813 vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t];
4814 vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t];
4815
4816 for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++)
4817 vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b];
4818
4819 for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++)
4820 vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b];
4821 }
4822
4823 }
4824
4825 boolean_t
4826 vdev_is_spacemap_addressable(vdev_t *vd)
4827 {
4828 if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
4829 return (B_TRUE);
4830
4831 /*
4832 * If double-word space map entries are not enabled we assume
4833 * 47 bits of the space map entry are dedicated to the entry's
4834 * offset (see SM_OFFSET_BITS in space_map.h). We then use that
4835 * to calculate the maximum address that can be described by a
4836 * space map entry for the given device.
4837 */
4838 uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
4839
4840 if (shift >= 63) /* detect potential overflow */
4841 return (B_TRUE);
4842
4843 return (vd->vdev_asize < (1ULL << shift));
4844 }
4845
4846 /*
4847 * Get statistics for the given vdev.
4848 */
4849 static void
4850 vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
4851 {
4852 int t;
4853 /*
4854 * If we're getting stats on the root vdev, aggregate the I/O counts
4855 * over all top-level vdevs (i.e. the direct children of the root).
4856 */
4857 if (!vd->vdev_ops->vdev_op_leaf) {
4858 if (vs) {
4859 memset(vs->vs_ops, 0, sizeof (vs->vs_ops));
4860 memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes));
4861 }
4862 if (vsx)
4863 memset(vsx, 0, sizeof (*vsx));
4864
4865 for (int c = 0; c < vd->vdev_children; c++) {
4866 vdev_t *cvd = vd->vdev_child[c];
4867 vdev_stat_t *cvs = &cvd->vdev_stat;
4868 vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex;
4869
4870 vdev_get_stats_ex_impl(cvd, cvs, cvsx);
4871 if (vs)
4872 vdev_get_child_stat(cvd, vs, cvs);
4873 if (vsx)
4874 vdev_get_child_stat_ex(cvd, vsx, cvsx);
4875 }
4876 } else {
4877 /*
4878 * We're a leaf. Just copy our ZIO active queue stats in. The
4879 * other leaf stats are updated in vdev_stat_update().
4880 */
4881 if (!vsx)
4882 return;
4883
4884 memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
4885
4886 for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
4887 vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
4888 vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
4889 }
4890 }
4891 }
4892
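/*
 * Snapshot this vdev's statistics (and extended statistics) under
 * vdev_stat_lock, filling in derived fields such as state, expandable
 * space, ashift values, and initialize/TRIM progress.
 */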
4893 void
4894 vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
4895 {
4896 vdev_t *tvd = vd->vdev_top;
4897 mutex_enter(&vd->vdev_stat_lock);
4898 if (vs) {
4899 memcpy(vs, &vd->vdev_stat, sizeof (*vs));
4900 vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
4901 vs->vs_state = vd->vdev_state;
4902 vs->vs_rsize = vdev_get_min_asize(vd);
4903
4904 if (vd->vdev_ops->vdev_op_leaf) {
4905 vs->vs_pspace = vd->vdev_psize;
4906 vs->vs_rsize += VDEV_LABEL_START_SIZE +
4907 VDEV_LABEL_END_SIZE;
4908 /*
4909 * Report initializing progress. Since we don't
4910 * have the initializing locks held, this is only
4911 * an estimate (although a fairly accurate one).
4912 */
4913 vs->vs_initialize_bytes_done =
4914 vd->vdev_initialize_bytes_done;
4915 vs->vs_initialize_bytes_est =
4916 vd->vdev_initialize_bytes_est;
4917 vs->vs_initialize_state = vd->vdev_initialize_state;
4918 vs->vs_initialize_action_time =
4919 vd->vdev_initialize_action_time;
4920
4921 /*
4922 * Report manual TRIM progress. Since we don't have
4923 * the manual TRIM locks held, this is only an
4924 * estimate (although a fairly accurate one).
4925 */
4926 vs->vs_trim_notsup = !vd->vdev_has_trim;
4927 vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done;
4928 vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
4929 vs->vs_trim_state = vd->vdev_trim_state;
4930 vs->vs_trim_action_time = vd->vdev_trim_action_time;
4931
4932 /* Set when there is a deferred resilver. */
4933 vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
4934 }
4935
4936 /*
4937 * Report expandable space on top-level, non-auxiliary devices
4938 * only. The expandable space is reported in terms of metaslab
4939 * sized units since that determines how much space the pool
4940 * can expand.
4941 */
4942 if (vd->vdev_aux == NULL && tvd != NULL) {
4943 vs->vs_esize = P2ALIGN_TYPED(
4944 vd->vdev_max_asize - vd->vdev_asize,
4945 1ULL << tvd->vdev_ms_shift, uint64_t);
4946 }
4947
4948 vs->vs_configured_ashift = vd->vdev_top != NULL
4949 ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
4950 vs->vs_logical_ashift = vd->vdev_logical_ashift;
4951 if (vd->vdev_physical_ashift <= ASHIFT_MAX)
4952 vs->vs_physical_ashift = vd->vdev_physical_ashift;
4953 else
4954 vs->vs_physical_ashift = 0;
4955
4956 /*
4957 * Report fragmentation and rebuild progress for top-level,
4958 * non-auxiliary, concrete devices.
4959 */
4960 if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
4961 vdev_is_concrete(vd)) {
4962 /*
4963 * The vdev fragmentation rating doesn't take into
4964 * account the embedded slog metaslab (vdev_log_mg).
4965 * Since it's only one metaslab, it would have a tiny
4966 * impact on the overall fragmentation.
4967 */
4968 vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
4969 vd->vdev_mg->mg_fragmentation : 0;
4970 }
4971 vs->vs_noalloc = MAX(vd->vdev_noalloc,
4972 tvd ? tvd->vdev_noalloc : 0);
4973 }
4974
4975 vdev_get_stats_ex_impl(vd, vs, vsx);
4976 mutex_exit(&vd->vdev_stat_lock);
4977 }
4978
4979 void
4980 vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
4981 {
4982 return (vdev_get_stats_ex(vd, vs, NULL));
4983 }
4984
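/*
 * Reset the in-core space accounting (space, dspace, alloc) for this vdev.
 */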
4985 void
4986 vdev_clear_stats(vdev_t *vd)
4987 {
4988 mutex_enter(&vd->vdev_stat_lock);
4989 vd->vdev_stat.vs_space = 0;
4990 vd->vdev_stat.vs_dspace = 0;
4991 vd->vdev_stat.vs_alloc = 0;
4992 mutex_exit(&vd->vdev_stat_lock);
4993 }
4994
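/*
 * Recursively zero each vdev's scan-processed counter.
 */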
4995 void
4996 vdev_scan_stat_init(vdev_t *vd)
4997 {
4998 vdev_stat_t *vs = &vd->vdev_stat;
4999
5000 for (int c = 0; c < vd->vdev_children; c++)
5001 vdev_scan_stat_init(vd->vdev_child[c]);
5002
5003 mutex_enter(&vd->vdev_stat_lock);
5004 vs->vs_scan_processed = 0;
5005 mutex_exit(&vd->vdev_stat_lock);
5006 }
5007
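/*
 * Update per-vdev I/O statistics for a completed zio. Successful leaf
 * I/Os update the ops, bytes, and latency histograms; failed writes
 * dirty the DTLs so the affected ranges can be repaired later.
 */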
5008 void
5009 vdev_stat_update(zio_t *zio, uint64_t psize)
5010 {
5011 spa_t *spa = zio->io_spa;
5012 vdev_t *rvd = spa->spa_root_vdev;
5013 vdev_t *vd = zio->io_vd ? zio->io_vd : rvd;
5014 vdev_t *pvd;
5015 uint64_t txg = zio->io_txg;
5016 /* Suppress ASAN false positive */
5017 #ifdef __SANITIZE_ADDRESS__
5018 vdev_stat_t *vs = vd ? &vd->vdev_stat : NULL;
5019 vdev_stat_ex_t *vsx = vd ? &vd->vdev_stat_ex : NULL;
5020 #else
5021 vdev_stat_t *vs = &vd->vdev_stat;
5022 vdev_stat_ex_t *vsx = &vd->vdev_stat_ex;
5023 #endif
5024 zio_type_t type = zio->io_type;
5025 int flags = zio->io_flags;
5026
5027 /*
5028 * If this i/o is a gang leader, it didn't do any actual work.
5029 */
5030 if (zio->io_gang_tree)
5031 return;
5032
5033 if (zio->io_error == 0) {
5034 /*
5035 * If this is a root i/o, don't count it -- we've already
5036 * counted the top-level vdevs, and vdev_get_stats() will
5037 * aggregate them when asked. This reduces contention on
5038 * the root vdev_stat_lock and implicitly handles blocks
5039 * that compress away to holes, for which there is no i/o.
5040 * (Holes never create vdev children, so all the counters
5041 * remain zero, which is what we want.)
5042 *
5043 * Note: this only applies to successful i/o (io_error == 0)
5044 * because unlike i/o counts, errors are not additive.
5045 * When reading a ditto block, for example, failure of
5046 * one top-level vdev does not imply a root-level error.
5047 */
5048 if (vd == rvd)
5049 return;
5050
5051 ASSERT(vd == zio->io_vd);
5052
5053 if (flags & ZIO_FLAG_IO_BYPASS)
5054 return;
5055
5056 mutex_enter(&vd->vdev_stat_lock);
5057
5058 if (flags & ZIO_FLAG_IO_REPAIR) {
5059 /*
5060 * Repair is the result of a resilver issued by the
5061 * scan thread (spa_sync).
5062 */
5063 if (flags & ZIO_FLAG_SCAN_THREAD) {
5064 dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
5065 dsl_scan_phys_t *scn_phys = &scn->scn_phys;
5066 uint64_t *processed = &scn_phys->scn_processed;
5067
5068 if (vd->vdev_ops->vdev_op_leaf)
5069 atomic_add_64(processed, psize);
5070 vs->vs_scan_processed += psize;
5071 }
5072
5073 /*
5074 * Repair is the result of a rebuild issued by the
5075 * rebuild thread (vdev_rebuild_thread). To avoid
5076 * double counting repaired bytes the virtual dRAID
5077 * spare vdev is excluded from the processed bytes.
5078 */
5079 if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
5080 vdev_t *tvd = vd->vdev_top;
5081 vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
5082 vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
5083 uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
5084
5085 if (vd->vdev_ops->vdev_op_leaf &&
5086 vd->vdev_ops != &vdev_draid_spare_ops) {
5087 atomic_add_64(rebuilt, psize);
5088 }
5089 vs->vs_rebuild_processed += psize;
5090 }
5091
5092 if (flags & ZIO_FLAG_SELF_HEAL)
5093 vs->vs_self_healed += psize;
5094 }
5095
5096 /*
5097 * The bytes/ops/histograms are recorded at the leaf level and
5098 * aggregated into the higher level vdevs in vdev_get_stats().
5099 */
5100 if (vd->vdev_ops->vdev_op_leaf &&
5101 (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
5102 zio_type_t vs_type = type;
5103 zio_priority_t priority = zio->io_priority;
5104
5105 /*
5106 * TRIM ops and bytes are reported to user space as
5107 * ZIO_TYPE_FLUSH. This is done to preserve the
5108 * vdev_stat_t structure layout for user space.
5109 */
5110 if (type == ZIO_TYPE_TRIM)
5111 vs_type = ZIO_TYPE_FLUSH;
5112
5113 /*
5114 * Solely for the purposes of 'zpool iostat -lqrw'
5115 * reporting, use the priority to categorize the IO.
5116 * Only the following are reported to user space:
5117 *
5118 * ZIO_PRIORITY_SYNC_READ,
5119 * ZIO_PRIORITY_SYNC_WRITE,
5120 * ZIO_PRIORITY_ASYNC_READ,
5121 * ZIO_PRIORITY_ASYNC_WRITE,
5122 * ZIO_PRIORITY_SCRUB,
5123 * ZIO_PRIORITY_TRIM,
5124 * ZIO_PRIORITY_REBUILD.
5125 */
5126 if (priority == ZIO_PRIORITY_INITIALIZING) {
5127 ASSERT3U(type, ==, ZIO_TYPE_WRITE);
5128 priority = ZIO_PRIORITY_ASYNC_WRITE;
5129 } else if (priority == ZIO_PRIORITY_REMOVAL) {
5130 priority = ((type == ZIO_TYPE_WRITE) ?
5131 ZIO_PRIORITY_ASYNC_WRITE :
5132 ZIO_PRIORITY_ASYNC_READ);
5133 }
5134
5135 vs->vs_ops[vs_type]++;
5136 vs->vs_bytes[vs_type] += psize;
5137
5138 if (flags & ZIO_FLAG_DELEGATED) {
5139 vsx->vsx_agg_histo[priority]
5140 [RQ_HISTO(zio->io_size)]++;
5141 } else {
5142 vsx->vsx_ind_histo[priority]
5143 [RQ_HISTO(zio->io_size)]++;
5144 }
5145
5146 if (zio->io_delta && zio->io_delay) {
5147 vsx->vsx_queue_histo[priority]
5148 [L_HISTO(zio->io_delta - zio->io_delay)]++;
5149 vsx->vsx_disk_histo[type]
5150 [L_HISTO(zio->io_delay)]++;
5151 vsx->vsx_total_histo[type]
5152 [L_HISTO(zio->io_delta)]++;
5153 }
5154 }
5155
5156 mutex_exit(&vd->vdev_stat_lock);
5157 return;
5158 }
5159
5160 if (flags & ZIO_FLAG_SPECULATIVE)
5161 return;
5162
5163 /*
5164 * If this is an I/O error that is going to be retried, then ignore the
5165 * error. Otherwise, the user may interpret B_FAILFAST I/O errors as
5166 * hard errors, when in reality they can happen for any number of
5167 * innocuous reasons (bus resets, MPxIO link failure, etc).
5168 */
5169 if (zio->io_error == EIO &&
5170 !(zio->io_flags & ZIO_FLAG_IO_RETRY))
5171 return;
5172
5173 /*
5174 * Intent log writes won't propagate their error to the root
5175 * I/O so don't mark these types of failures as pool-level
5176 * errors.
5177 */
5178 if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
5179 return;
5180
5181 if (type == ZIO_TYPE_WRITE && txg != 0 &&
5182 (!(flags & ZIO_FLAG_IO_REPAIR) ||
5183 (flags & ZIO_FLAG_SCAN_THREAD) ||
5184 spa->spa_claiming)) {
5185 /*
5186 * This is either a normal write (not a repair), or it's
5187 * a repair induced by the scrub thread, or it's a repair
5188 * made by zil_claim() during spa_load() in the first txg.
5189 * In the normal case, we commit the DTL change in the same
5190 * txg as the block was born. In the scrub-induced repair
5191 * case, we know that scrubs run in first-pass syncing context,
5192 * so we commit the DTL change in spa_syncing_txg(spa).
5193 * In the zil_claim() case, we commit in spa_first_txg(spa).
5194 *
5195 * We currently do not make DTL entries for failed spontaneous
5196 * self-healing writes triggered by normal (non-scrubbing)
5197 * reads, because we have no transactional context in which to
5198 * do so -- and it's not clear that it'd be desirable anyway.
5199 */
5200 if (vd->vdev_ops->vdev_op_leaf) {
5201 uint64_t commit_txg = txg;
5202 if (flags & ZIO_FLAG_SCAN_THREAD) {
5203 ASSERT(flags & ZIO_FLAG_IO_REPAIR);
5204 ASSERT(spa_sync_pass(spa) == 1);
5205 vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1);
5206 commit_txg = spa_syncing_txg(spa);
5207 } else if (spa->spa_claiming) {
5208 ASSERT(flags & ZIO_FLAG_IO_REPAIR);
5209 commit_txg = spa_first_txg(spa);
5210 }
5211 ASSERT(commit_txg >= spa_syncing_txg(spa));
5212 if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1))
5213 return;
5214 for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent)
5215 vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1);
5216 vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg);
5217 }
5218 if (vd != rvd)
5219 vdev_dtl_dirty(vd, DTL_MISSING, txg, 1);
5220 }
5221 }
5222
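/*
 * Apply this vdev's deflate ratio to a space delta, yielding the
 * deflated (dspace) equivalent.
 */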
5223 int64_t
5224 vdev_deflated_space(vdev_t *vd, int64_t space)
5225 {
5226 ASSERT0((space & (SPA_MINBLOCKSIZE-1)));
5227 ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache);
5228
5229 return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio);
5230 }
5231
5232 /*
5233 * Update the in-core space usage stats for this vdev, its metaslab class,
5234 * and the root vdev.
5235 */
5236 void
5237 vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta,
5238 int64_t space_delta)
5239 {
5240 (void) defer_delta;
5241 int64_t dspace_delta;
5242 spa_t *spa = vd->vdev_spa;
5243 vdev_t *rvd = spa->spa_root_vdev;
5244
5245 ASSERT(vd == vd->vdev_top);
5246
5247 /*
5248 * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
5249 * factor. We must calculate this here and not at the root vdev
5250 * because the root vdev's psize-to-asize is simply the max of its
5251 * children's, thus not accurate enough for us.
5252 */
5253 dspace_delta = vdev_deflated_space(vd, space_delta);
5254
5255 mutex_enter(&vd->vdev_stat_lock);
5256 /* ensure we won't underflow */
5257 if (alloc_delta < 0) {
5258 ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta);
5259 }
5260
5261 vd->vdev_stat.vs_alloc += alloc_delta;
5262 vd->vdev_stat.vs_space += space_delta;
5263 vd->vdev_stat.vs_dspace += dspace_delta;
5264 mutex_exit(&vd->vdev_stat_lock);
5265
5266 /* every class but log contributes to root space stats */
5267 if (vd->vdev_mg != NULL && !vd->vdev_islog) {
5268 ASSERT(!vd->vdev_isl2cache);
5269 mutex_enter(&rvd->vdev_stat_lock);
5270 rvd->vdev_stat.vs_alloc += alloc_delta;
5271 rvd->vdev_stat.vs_space += space_delta;
5272 rvd->vdev_stat.vs_dspace += dspace_delta;
5273 mutex_exit(&rvd->vdev_stat_lock);
5274 }
5275 /* Note: metaslab_class_space_update moved to metaslab_space_update */
5276 }
5277
5278 /*
5279 * Mark a top-level vdev's config as dirty, placing it on the dirty list
5280 * so that it will be written out next time the vdev configuration is synced.
5281 * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs.
5282 */
5283 void
5284 vdev_config_dirty(vdev_t *vd)
5285 {
5286 spa_t *spa = vd->vdev_spa;
5287 vdev_t *rvd = spa->spa_root_vdev;
5288 int c;
5289
5290 ASSERT(spa_writeable(spa));
5291
5292 /*
5293 * If this is an aux vdev (as with l2cache and spare devices), then we
5294 * update the vdev config manually and set the sync flag.
5295 */
5296 if (vd->vdev_aux != NULL) {
5297 spa_aux_vdev_t *sav = vd->vdev_aux;
5298 nvlist_t **aux;
5299 uint_t naux;
5300
5301 for (c = 0; c < sav->sav_count; c++) {
5302 if (sav->sav_vdevs[c] == vd)
5303 break;
5304 }
5305
5306 if (c == sav->sav_count) {
5307 /*
5308 * We're being removed. There's nothing more to do.
5309 */
5310 ASSERT(sav->sav_sync == B_TRUE);
5311 return;
5312 }
5313
5314 sav->sav_sync = B_TRUE;
5315
5316 if (nvlist_lookup_nvlist_array(sav->sav_config,
5317 ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) {
5318 VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config,
5319 ZPOOL_CONFIG_SPARES, &aux, &naux));
5320 }
5321
5322 ASSERT(c < naux);
5323
5324 /*
5325 * Setting the nvlist in the middle of the array is a little
5326 * sketchy, but it will work.
5327 */
5328 nvlist_free(aux[c]);
5329 aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0);
5330
5331 return;
5332 }
5333
5334 /*
5335 * The dirty list is protected by the SCL_CONFIG lock. The caller
5336 * must either hold SCL_CONFIG as writer, or must be the sync thread
5337 * (which holds SCL_CONFIG as reader). There's only one sync thread,
5338 * so this is sufficient to ensure mutual exclusion.
5339 */
5340 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
5341 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
5342 spa_config_held(spa, SCL_CONFIG, RW_READER)));
5343
5344 if (vd == rvd) {
5345 for (c = 0; c < rvd->vdev_children; c++)
5346 vdev_config_dirty(rvd->vdev_child[c]);
5347 } else {
5348 ASSERT(vd == vd->vdev_top);
5349
5350 if (!list_link_active(&vd->vdev_config_dirty_node) &&
5351 vdev_is_concrete(vd)) {
5352 list_insert_head(&spa->spa_config_dirty_list, vd);
5353 }
5354 }
5355 }
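
/*
 * Illustrative usage sketch (not part of this file): a caller that is not
 * the sync thread takes SCL_CONFIG as writer around the dirtying, per the
 * locking comment above.
 *
 *	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER);
 *	vdev_config_dirty(vd->vdev_top);
 *	spa_config_exit(spa, SCL_CONFIG, FTAG);
 *
 * The next configuration sync is then expected to rewrite the dirtied
 * top-level vdev's labels and remove it from spa_config_dirty_list via
 * vdev_config_clean().
 */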
5356
5357 void
5358 vdev_config_clean(vdev_t *vd)
5359 {
5360 spa_t *spa = vd->vdev_spa;
5361
5362 ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) ||
5363 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
5364 spa_config_held(spa, SCL_CONFIG, RW_READER)));
5365
5366 ASSERT(list_link_active(&vd->vdev_config_dirty_node));
5367 list_remove(&spa->spa_config_dirty_list, vd);
5368 }
5369
5370 /*
5371 * Mark a top-level vdev's state as dirty, so that the next pass of
5372 * spa_sync() can convert this into vdev_config_dirty(). We distinguish
5373 * the state changes from larger config changes because they require
5374 * much less locking, and are often needed for administrative actions.
5375 */
5376 void
5377 vdev_state_dirty(vdev_t *vd)
5378 {
5379 spa_t *spa = vd->vdev_spa;
5380
5381 ASSERT(spa_writeable(spa));
5382 ASSERT(vd == vd->vdev_top);
5383
5384 /*
5385 * The state list is protected by the SCL_STATE lock. The caller
5386 * must either hold SCL_STATE as writer, or must be the sync thread
5387 * (which holds SCL_STATE as reader). There's only one sync thread,
5388 * so this is sufficient to ensure mutual exclusion.
5389 */
5390 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
5391 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
5392 spa_config_held(spa, SCL_STATE, RW_READER)));
5393
5394 if (!list_link_active(&vd->vdev_state_dirty_node) &&
5395 vdev_is_concrete(vd))
5396 list_insert_head(&spa->spa_state_dirty_list, vd);
5397 }
5398
5399 void
5400 vdev_state_clean(vdev_t *vd)
5401 {
5402 spa_t *spa = vd->vdev_spa;
5403
5404 ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) ||
5405 (dsl_pool_sync_context(spa_get_dsl(spa)) &&
5406 spa_config_held(spa, SCL_STATE, RW_READER)));
5407
5408 ASSERT(list_link_active(&vd->vdev_state_dirty_node));
5409 list_remove(&spa->spa_state_dirty_list, vd);
5410 }
5411
5412 /*
5413 * Propagate vdev state up from children to parent.
5414 */
5415 void
5416 vdev_propagate_state(vdev_t *vd)
5417 {
5418 spa_t *spa = vd->vdev_spa;
5419 vdev_t *rvd = spa->spa_root_vdev;
5420 int degraded = 0, faulted = 0;
5421 int corrupted = 0;
5422 vdev_t *child;
5423
5424 if (vd->vdev_children > 0) {
5425 for (int c = 0; c < vd->vdev_children; c++) {
5426 child = vd->vdev_child[c];
5427
5428 /*
5429 * Don't factor holes or indirect vdevs into the
5430 * decision.
5431 */
5432 if (!vdev_is_concrete(child))
5433 continue;
5434
5435 if (!vdev_readable(child) ||
5436 (!vdev_writeable(child) && spa_writeable(spa))) {
5437 /*
5438 * Root special: if there is a top-level log
5439 * device, treat the root vdev as if it were
5440 * degraded.
5441 */
5442 if (child->vdev_islog && vd == rvd)
5443 degraded++;
5444 else
5445 faulted++;
5446 } else if (child->vdev_state <= VDEV_STATE_DEGRADED) {
5447 degraded++;
5448 }
5449
5450 if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
5451 corrupted++;
5452 }
5453
5454 vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded);
5455
5456 /*
5457 * Root special: if there is a top-level vdev that cannot be
5458 * opened due to corrupted metadata, then propagate the root
5459 * vdev's aux state as 'corrupt' rather than 'insufficient
5460 * replicas'.
5461 */
5462 if (corrupted && vd == rvd &&
5463 rvd->vdev_state == VDEV_STATE_CANT_OPEN)
5464 vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN,
5465 VDEV_AUX_CORRUPT_DATA);
5466 }
5467
5468 if (vd->vdev_parent)
5469 vdev_propagate_state(vd->vdev_parent);
5470 }
5471
5472 /*
5473 * Set a vdev's state. If this is during an open, we don't update the parent
5474 * state, because we're in the process of opening children depth-first.
5475 * Otherwise, we propagate the change to the parent.
5476 *
5477 * If this routine places a device in a faulted state, an appropriate ereport is
5478 * generated.
5479 */
5480 void
5481 vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
5482 {
5483 uint64_t save_state;
5484 spa_t *spa = vd->vdev_spa;
5485
5486 if (state == vd->vdev_state) {
5487 /*
5488 * Since the vdev may already be offline when we get here via
5489 * the vdev_offline() code path, we can miss a statechange
5490 * event to OFFLINE. Check the previous state to catch this condition.
5491 */
5492 if (vd->vdev_ops->vdev_op_leaf &&
5493 (state == VDEV_STATE_OFFLINE) &&
5494 (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) {
5495 /* post an offline state change */
5496 zfs_post_state_change(spa, vd, vd->vdev_prevstate);
5497 }
5498 vd->vdev_stat.vs_aux = aux;
5499 return;
5500 }
5501
5502 save_state = vd->vdev_state;
5503
5504 vd->vdev_state = state;
5505 vd->vdev_stat.vs_aux = aux;
5506
5507 /*
5508 * If we are setting the vdev state to anything but an open state, then
5509 * always close the underlying device unless the device has requested
5510 * a delayed close (i.e. we're about to remove or fault the device).
5511 * Otherwise, we keep accessible but invalid devices open forever.
5512 * We don't call vdev_close() itself, because that implies some extra
5513 * checks (offline, etc) that we don't want here. This is limited to
5514 * leaf devices, because otherwise closing the device will affect other
5515 * children.
5516 */
5517 if (!vd->vdev_delayed_close && vdev_is_dead(vd) &&
5518 vd->vdev_ops->vdev_op_leaf)
5519 vd->vdev_ops->vdev_op_close(vd);
5520
5521 if (vd->vdev_removed &&
5522 state == VDEV_STATE_CANT_OPEN &&
5523 (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) {
5524 /*
5525 * If the previous state is set to VDEV_STATE_REMOVED, then this
5526 * device was previously marked removed and someone attempted to
5527 * reopen it. If this failed due to a nonexistent device, then
5528 * keep the device in the REMOVED state. We also let this be if
5529 * it is one of our special test online cases, which is only
5530 * attempting to online the device and shouldn't generate an FMA
5531 * fault.
5532 */
5533 vd->vdev_state = VDEV_STATE_REMOVED;
5534 vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
5535 } else if (state == VDEV_STATE_REMOVED) {
5536 vd->vdev_removed = B_TRUE;
5537 } else if (state == VDEV_STATE_CANT_OPEN) {
5538 /*
5539 * If we fail to open a vdev during an import or recovery, we
5540 * mark it as "not available", which signifies that it was
5541 * never there to begin with. Failure to open such a device
5542 * is not considered an error.
5543 */
5544 if ((spa_load_state(spa) == SPA_LOAD_IMPORT ||
5545 spa_load_state(spa) == SPA_LOAD_RECOVER) &&
5546 vd->vdev_ops->vdev_op_leaf)
5547 vd->vdev_not_present = 1;
5548
5549 /*
5550 * Post the appropriate ereport. If the 'prevstate' field is
5551 * set to something other than VDEV_STATE_UNKNOWN, it indicates
5552 * that this is part of a vdev_reopen(). In this case, we don't
5553 * want to post the ereport if the device was already in the
5554 * CANT_OPEN state beforehand.
5555 *
5556 * If the 'checkremove' flag is set, then this is an attempt to
5557 * online the device in response to an insertion event. If we
5558 * hit this case, then we have detected an insertion event for a
5559 * faulted or offline device that wasn't in the removed state.
5560 * In this scenario, we don't post an ereport because we are
5561 * about to replace the device, or attempt an online with
5562 * vdev_forcefault, which will generate the fault for us.
5563 */
5564 if ((vd->vdev_prevstate != state || vd->vdev_forcefault) &&
5565 !vd->vdev_not_present && !vd->vdev_checkremove &&
5566 vd != spa->spa_root_vdev) {
5567 const char *class;
5568
5569 switch (aux) {
5570 case VDEV_AUX_OPEN_FAILED:
5571 class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
5572 break;
5573 case VDEV_AUX_CORRUPT_DATA:
5574 class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
5575 break;
5576 case VDEV_AUX_NO_REPLICAS:
5577 class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
5578 break;
5579 case VDEV_AUX_BAD_GUID_SUM:
5580 class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
5581 break;
5582 case VDEV_AUX_TOO_SMALL:
5583 class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
5584 break;
5585 case VDEV_AUX_BAD_LABEL:
5586 class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
5587 break;
5588 case VDEV_AUX_BAD_ASHIFT:
5589 class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT;
5590 break;
5591 default:
5592 class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
5593 }
5594
5595 (void) zfs_ereport_post(class, spa, vd, NULL, NULL,
5596 save_state);
5597 }
5598
5599 /* Erase any notion of persistent removed state */
5600 vd->vdev_removed = B_FALSE;
5601 } else {
5602 vd->vdev_removed = B_FALSE;
5603 }
5604
5605 /*
5606 * Notify ZED of any significant state-change on a leaf vdev.
5607 *
5608 */
5609 if (vd->vdev_ops->vdev_op_leaf) {
5610 /* preserve original state from a vdev_reopen() */
5611 if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) &&
5612 (vd->vdev_prevstate != vd->vdev_state) &&
5613 (save_state <= VDEV_STATE_CLOSED))
5614 save_state = vd->vdev_prevstate;
5615
5616 /* filter out state change due to initial vdev_open */
5617 if (save_state > VDEV_STATE_CLOSED)
5618 zfs_post_state_change(spa, vd, save_state);
5619 }
5620
5621 if (!isopen && vd->vdev_parent)
5622 vdev_propagate_state(vd->vdev_parent);
5623 }
5624
5625 boolean_t
5626 vdev_children_are_offline(vdev_t *vd)
5627 {
5628 ASSERT(!vd->vdev_ops->vdev_op_leaf);
5629
5630 for (uint64_t i = 0; i < vd->vdev_children; i++) {
5631 if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE)
5632 return (B_FALSE);
5633 }
5634
5635 return (B_TRUE);
5636 }
5637
5638 /*
5639 * Check the vdev configuration to ensure that it's capable of supporting
5640 * a root pool. We do not support partial configuration.
5641 */
5642 boolean_t
5643 vdev_is_bootable(vdev_t *vd)
5644 {
5645 if (!vd->vdev_ops->vdev_op_leaf) {
5646 const char *vdev_type = vd->vdev_ops->vdev_op_type;
5647
5648 if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0)
5649 return (B_FALSE);
5650 }
5651
5652 for (int c = 0; c < vd->vdev_children; c++) {
5653 if (!vdev_is_bootable(vd->vdev_child[c]))
5654 return (B_FALSE);
5655 }
5656 return (B_TRUE);
5657 }
5658
5659 boolean_t
5660 vdev_is_concrete(vdev_t *vd)
5661 {
5662 vdev_ops_t *ops = vd->vdev_ops;
5663 if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops ||
5664 ops == &vdev_missing_ops || ops == &vdev_root_ops) {
5665 return (B_FALSE);
5666 } else {
5667 return (B_TRUE);
5668 }
5669 }
5670
5671 /*
5672 * Determine if a log device has valid content. If the vdev was
5673 * removed or faulted in the MOS config then we know that
5674 * the content on the log device has already been written to the pool.
5675 */
5676 boolean_t
5677 vdev_log_state_valid(vdev_t *vd)
5678 {
5679 if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted &&
5680 !vd->vdev_removed)
5681 return (B_TRUE);
5682
5683 for (int c = 0; c < vd->vdev_children; c++)
5684 if (vdev_log_state_valid(vd->vdev_child[c]))
5685 return (B_TRUE);
5686
5687 return (B_FALSE);
5688 }
5689
5690 /*
5691 * Expand a vdev if possible.
5692 */
5693 void
5694 vdev_expand(vdev_t *vd, uint64_t txg)
5695 {
5696 ASSERT(vd->vdev_top == vd);
5697 ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
5698 ASSERT(vdev_is_concrete(vd));
5699
5700 vdev_set_deflate_ratio(vd);
5701
5702 if ((vd->vdev_spa->spa_raidz_expand == NULL ||
5703 vd->vdev_spa->spa_raidz_expand->vre_vdev_id != vd->vdev_id) &&
5704 (vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count &&
5705 vdev_is_concrete(vd)) {
5706 vdev_metaslab_group_create(vd);
5707 VERIFY0(vdev_metaslab_init(vd, txg));
5708 vdev_config_dirty(vd);
5709 }
5710 }
5711
5712 /*
5713 * Split a vdev.
5714 */
5715 void
5716 vdev_split(vdev_t *vd)
5717 {
5718 vdev_t *cvd, *pvd = vd->vdev_parent;
5719
5720 VERIFY3U(pvd->vdev_children, >, 1);
5721
5722 vdev_remove_child(pvd, vd);
5723 vdev_compact_children(pvd);
5724
5725 ASSERT3P(pvd->vdev_child, !=, NULL);
5726
5727 cvd = pvd->vdev_child[0];
5728 if (pvd->vdev_children == 1) {
5729 vdev_remove_parent(cvd);
5730 cvd->vdev_splitting = B_TRUE;
5731 }
5732 vdev_propagate_state(cvd);
5733 }
5734
5735 void
5736 vdev_deadman(vdev_t *vd, const char *tag)
5737 {
5738 for (int c = 0; c < vd->vdev_children; c++) {
5739 vdev_t *cvd = vd->vdev_child[c];
5740
5741 vdev_deadman(cvd, tag);
5742 }
5743
5744 if (vd->vdev_ops->vdev_op_leaf) {
5745 vdev_queue_t *vq = &vd->vdev_queue;
5746
5747 mutex_enter(&vq->vq_lock);
5748 if (vq->vq_active > 0) {
5749 spa_t *spa = vd->vdev_spa;
5750 zio_t *fio;
5751 uint64_t delta;
5752
5753 zfs_dbgmsg("slow vdev: %s has %u active IOs",
5754 vd->vdev_path, vq->vq_active);
5755
5756 /*
5757 * Look at the head of all the pending queues;
5758 * if any I/O has been outstanding for longer than
5759 * spa_deadman_synctime, invoke the deadman logic.
5760 */
5761 fio = list_head(&vq->vq_active_list);
5762 delta = gethrtime() - fio->io_timestamp;
5763 if (delta > spa_deadman_synctime(spa))
5764 zio_deadman(fio, tag);
5765 }
5766 mutex_exit(&vq->vq_lock);
5767 }
5768 }
5769
5770 void
5771 vdev_defer_resilver(vdev_t *vd)
5772 {
5773 ASSERT(vd->vdev_ops->vdev_op_leaf);
5774
5775 vd->vdev_resilver_deferred = B_TRUE;
5776 vd->vdev_spa->spa_resilver_deferred = B_TRUE;
5777 }
5778
5779 /*
5780 * Clears the resilver deferred flag on all leaf devs under vd. Returns
5781 * B_TRUE if we have devices that need to be resilvered and are available to
5782 * accept resilver I/Os.
5783 */
5784 boolean_t
5785 vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx)
5786 {
5787 boolean_t resilver_needed = B_FALSE;
5788 spa_t *spa = vd->vdev_spa;
5789
5790 for (int c = 0; c < vd->vdev_children; c++) {
5791 vdev_t *cvd = vd->vdev_child[c];
5792 resilver_needed |= vdev_clear_resilver_deferred(cvd, tx);
5793 }
5794
5795 if (vd == spa->spa_root_vdev &&
5796 spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
5797 spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
5798 vdev_config_dirty(vd);
5799 spa->spa_resilver_deferred = B_FALSE;
5800 return (resilver_needed);
5801 }
5802
5803 if (!vdev_is_concrete(vd) || vd->vdev_aux ||
5804 !vd->vdev_ops->vdev_op_leaf)
5805 return (resilver_needed);
5806
5807 vd->vdev_resilver_deferred = B_FALSE;
5808
5809 return (!vdev_is_dead(vd) && !vd->vdev_offline &&
5810 vdev_resilver_needed(vd, NULL, NULL));
5811 }
5812
5813 boolean_t
5814 vdev_xlate_is_empty(zfs_range_seg64_t *rs)
5815 {
5816 return (rs->rs_start == rs->rs_end);
5817 }
5818
5819 /*
5820 * Translate a logical range to the first contiguous physical range for the
5821 * specified vdev_t. This function is initially called with a leaf vdev and
5822 * will walk each parent vdev until it reaches a top-level vdev. Once the
5823 * top-level is reached the physical range is initialized and the recursive
5824 * function begins to unwind. As it unwinds it calls the parent's vdev
5825 * specific translation function to do the real conversion.
5826 */
5827 void
5828 vdev_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
5829 zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs)
5830 {
5831 /*
5832 * Walk up the vdev tree
5833 */
5834 if (vd != vd->vdev_top) {
5835 vdev_xlate(vd->vdev_parent, logical_rs, physical_rs,
5836 remain_rs);
5837 } else {
5838 /*
5839 * We've reached the top-level vdev, initialize the physical
5840 * range to the logical range and set an empty remaining
5841 * range then start to unwind.
5842 */
5843 physical_rs->rs_start = logical_rs->rs_start;
5844 physical_rs->rs_end = logical_rs->rs_end;
5845
5846 remain_rs->rs_start = logical_rs->rs_start;
5847 remain_rs->rs_end = logical_rs->rs_start;
5848
5849 return;
5850 }
5851
5852 vdev_t *pvd = vd->vdev_parent;
5853 ASSERT3P(pvd, !=, NULL);
5854 ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL);
5855
5856 /*
5857 * As this recursive function unwinds, translate the logical
5858 * range into its physical and any remaining components by calling
5859 * the vdev specific translate function.
5860 */
5861 zfs_range_seg64_t intermediate = { 0 };
5862 pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate, remain_rs);
5863
5864 physical_rs->rs_start = intermediate.rs_start;
5865 physical_rs->rs_end = intermediate.rs_end;
5866 }
5867
5868 void
5869 vdev_xlate_walk(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
5870 vdev_xlate_func_t *func, void *arg)
5871 {
5872 zfs_range_seg64_t iter_rs = *logical_rs;
5873 zfs_range_seg64_t physical_rs;
5874 zfs_range_seg64_t remain_rs;
5875
5876 while (!vdev_xlate_is_empty(&iter_rs)) {
5877
5878 vdev_xlate(vd, &iter_rs, &physical_rs, &remain_rs);
5879
5880 /*
5881 * With raidz and dRAID, it's possible that the logical range
5882 * does not live on this leaf vdev. Only when there is a non-
5883 * zero physical size call the provided function.
5884 */
5885 if (!vdev_xlate_is_empty(&physical_rs))
5886 func(arg, &physical_rs);
5887
5888 iter_rs = remain_rs;
5889 }
5890 }
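
/*
 * Illustrative usage sketch (hypothetical helper and variable names): a
 * caller that wants the total physical size backing a logical range on a
 * leaf vdev can pass a small accumulator callback.
 *
 *	static void
 *	xlate_sum_cb(void *arg, zfs_range_seg64_t *physical_rs)
 *	{
 *		uint64_t *sum = arg;
 *		*sum += physical_rs->rs_end - physical_rs->rs_start;
 *	}
 *
 *	uint64_t sum = 0;
 *	zfs_range_seg64_t logical = {
 *		.rs_start = offset,
 *		.rs_end = offset + size,
 *	};
 *	vdev_xlate_walk(leaf_vd, &logical, xlate_sum_cb, &sum);
 *
 * Each callback invocation receives one contiguous physical range; ranges
 * that do not map onto this particular leaf (possible for raidz and dRAID)
 * are skipped, as noted in the loop above.
 */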
5891
5892 static char *
5893 vdev_name(vdev_t *vd, char *buf, int buflen)
5894 {
5895 if (vd->vdev_path == NULL) {
5896 if (strcmp(vd->vdev_ops->vdev_op_type, "root") == 0) {
5897 strlcpy(buf, vd->vdev_spa->spa_name, buflen);
5898 } else if (!vd->vdev_ops->vdev_op_leaf) {
5899 snprintf(buf, buflen, "%s-%llu",
5900 vd->vdev_ops->vdev_op_type,
5901 (u_longlong_t)vd->vdev_id);
5902 }
5903 } else {
5904 strlcpy(buf, vd->vdev_path, buflen);
5905 }
5906 return (buf);
5907 }
5908
5909 /*
5910 * Look at the vdev tree and determine whether any devices are currently being
5911 * replaced.
5912 */
5913 boolean_t
5914 vdev_replace_in_progress(vdev_t *vdev)
5915 {
5916 ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0);
5917
5918 if (vdev->vdev_ops == &vdev_replacing_ops)
5919 return (B_TRUE);
5920
5921 /*
5922 * A 'spare' vdev indicates that we have a replace in progress, unless
5923 * it has exactly two children, and the second, the hot spare, has
5924 * finished being resilvered.
5925 */
5926 if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 ||
5927 !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING)))
5928 return (B_TRUE);
5929
5930 for (int i = 0; i < vdev->vdev_children; i++) {
5931 if (vdev_replace_in_progress(vdev->vdev_child[i]))
5932 return (B_TRUE);
5933 }
5934
5935 return (B_FALSE);
5936 }
5937
5938 /*
5939 * Add a (source=src, propname=propval) list to an nvlist.
5940 */
5941 static void
5942 vdev_prop_add_list(nvlist_t *nvl, const char *propname, const char *strval,
5943 uint64_t intval, zprop_source_t src)
5944 {
5945 nvlist_t *propval;
5946
5947 propval = fnvlist_alloc();
5948 fnvlist_add_uint64(propval, ZPROP_SOURCE, src);
5949
5950 if (strval != NULL)
5951 fnvlist_add_string(propval, ZPROP_VALUE, strval);
5952 else
5953 fnvlist_add_uint64(propval, ZPROP_VALUE, intval);
5954
5955 fnvlist_add_nvlist(nvl, propname, propval);
5956 nvlist_free(propval);
5957 }
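
/*
 * For reference, the nvlist built above nests one nvlist per property,
 * keyed by the standard ZPROP_SOURCE/ZPROP_VALUE names, roughly:
 *
 *	nvl = {
 *		"<propname>" = {
 *			"source" = <src>			(uint64)
 *			"value"  = <strval or intval>		(string or uint64)
 *		}
 *	}
 *
 * This is the shape callers of vdev_prop_get() receive in outnvl.
 */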
5958
5959 static void
5960 vdev_props_set_sync(void *arg, dmu_tx_t *tx)
5961 {
5962 vdev_t *vd;
5963 nvlist_t *nvp = arg;
5964 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
5965 objset_t *mos = spa->spa_meta_objset;
5966 nvpair_t *elem = NULL;
5967 uint64_t vdev_guid;
5968 uint64_t objid;
5969 nvlist_t *nvprops;
5970
5971 vdev_guid = fnvlist_lookup_uint64(nvp, ZPOOL_VDEV_PROPS_SET_VDEV);
5972 nvprops = fnvlist_lookup_nvlist(nvp, ZPOOL_VDEV_PROPS_SET_PROPS);
5973 vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
5974
5975 /* this vdev could get removed while waiting for this sync task */
5976 if (vd == NULL)
5977 return;
5978
5979 /*
5980 * Set vdev property values in the vdev props mos object.
5981 */
5982 if (vd->vdev_root_zap != 0) {
5983 objid = vd->vdev_root_zap;
5984 } else if (vd->vdev_top_zap != 0) {
5985 objid = vd->vdev_top_zap;
5986 } else if (vd->vdev_leaf_zap != 0) {
5987 objid = vd->vdev_leaf_zap;
5988 } else {
5989 panic("unexpected vdev type");
5990 }
5991
5992 mutex_enter(&spa->spa_props_lock);
5993
5994 while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
5995 uint64_t intval;
5996 const char *strval;
5997 vdev_prop_t prop;
5998 const char *propname = nvpair_name(elem);
5999 zprop_type_t proptype;
6000
6001 switch (prop = vdev_name_to_prop(propname)) {
6002 case VDEV_PROP_USERPROP:
6003 if (vdev_prop_user(propname)) {
6004 strval = fnvpair_value_string(elem);
6005 if (strlen(strval) == 0) {
6006 /* remove the property if value == "" */
6007 (void) zap_remove(mos, objid, propname,
6008 tx);
6009 } else {
6010 VERIFY0(zap_update(mos, objid, propname,
6011 1, strlen(strval) + 1, strval, tx));
6012 }
6013 spa_history_log_internal(spa, "vdev set", tx,
6014 "vdev_guid=%llu: %s=%s",
6015 (u_longlong_t)vdev_guid, nvpair_name(elem),
6016 strval);
6017 }
6018 break;
6019 default:
6020 /* normalize the property name */
6021 propname = vdev_prop_to_name(prop);
6022 proptype = vdev_prop_get_type(prop);
6023
6024 if (nvpair_type(elem) == DATA_TYPE_STRING) {
6025 ASSERT(proptype == PROP_TYPE_STRING);
6026 strval = fnvpair_value_string(elem);
6027 VERIFY0(zap_update(mos, objid, propname,
6028 1, strlen(strval) + 1, strval, tx));
6029 spa_history_log_internal(spa, "vdev set", tx,
6030 "vdev_guid=%llu: %s=%s",
6031 (u_longlong_t)vdev_guid, nvpair_name(elem),
6032 strval);
6033 } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
6034 intval = fnvpair_value_uint64(elem);
6035
6036 if (proptype == PROP_TYPE_INDEX) {
6037 const char *unused;
6038 VERIFY0(vdev_prop_index_to_string(
6039 prop, intval, &unused));
6040 }
6041 VERIFY0(zap_update(mos, objid, propname,
6042 sizeof (uint64_t), 1, &intval, tx));
6043 spa_history_log_internal(spa, "vdev set", tx,
6044 "vdev_guid=%llu: %s=%lld",
6045 (u_longlong_t)vdev_guid,
6046 nvpair_name(elem), (longlong_t)intval);
6047 } else {
6048 panic("invalid vdev property type %u",
6049 nvpair_type(elem));
6050 }
6051 }
6052
6053 }
6054
6055 mutex_exit(&spa->spa_props_lock);
6056 }
6057
6058 int
6059 vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
6060 {
6061 spa_t *spa = vd->vdev_spa;
6062 nvpair_t *elem = NULL;
6063 uint64_t vdev_guid;
6064 nvlist_t *nvprops;
6065 int error = 0;
6066
6067 ASSERT(vd != NULL);
6068
6069 /* Check that vdev has a zap we can use */
6070 if (vd->vdev_root_zap == 0 &&
6071 vd->vdev_top_zap == 0 &&
6072 vd->vdev_leaf_zap == 0)
6073 return (SET_ERROR(EINVAL));
6074
6075 if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV,
6076 &vdev_guid) != 0)
6077 return (SET_ERROR(EINVAL));
6078
6079 if (nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS,
6080 &nvprops) != 0)
6081 return (SET_ERROR(EINVAL));
6082
6083 if ((vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE)) == NULL)
6084 return (SET_ERROR(EINVAL));
6085
6086 while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
6087 const char *propname = nvpair_name(elem);
6088 vdev_prop_t prop = vdev_name_to_prop(propname);
6089 uint64_t intval = 0;
6090 const char *strval = NULL;
6091
6092 if (prop == VDEV_PROP_USERPROP && !vdev_prop_user(propname)) {
6093 error = EINVAL;
6094 goto end;
6095 }
6096
6097 if (prop != VDEV_PROP_USERPROP && vdev_prop_readonly(prop)) {
6098 error = EROFS;
6099 goto end;
6100 }
6101
6102 /* Special Processing */
6103 switch (prop) {
6104 case VDEV_PROP_PATH:
6105 if (vd->vdev_path == NULL) {
6106 error = EROFS;
6107 break;
6108 }
6109 if (nvpair_value_string(elem, &strval) != 0) {
6110 error = EINVAL;
6111 break;
6112 }
6113 /* New path must start with /dev/ */
6114 if (strncmp(strval, "/dev/", 5)) {
6115 error = EINVAL;
6116 break;
6117 }
6118 error = spa_vdev_setpath(spa, vdev_guid, strval);
6119 break;
6120 case VDEV_PROP_ALLOCATING:
6121 if (nvpair_value_uint64(elem, &intval) != 0) {
6122 error = EINVAL;
6123 break;
6124 }
6125 if (intval != vd->vdev_noalloc)
6126 break;
6127 if (intval == 0)
6128 error = spa_vdev_noalloc(spa, vdev_guid);
6129 else
6130 error = spa_vdev_alloc(spa, vdev_guid);
6131 break;
6132 case VDEV_PROP_FAILFAST:
6133 if (nvpair_value_uint64(elem, &intval) != 0) {
6134 error = EINVAL;
6135 break;
6136 }
6137 vd->vdev_failfast = intval & 1;
6138 break;
6139 case VDEV_PROP_SIT_OUT:
6140 /* Only expose this for a draid or raidz leaf */
6141 if (!vd->vdev_ops->vdev_op_leaf ||
6142 vd->vdev_top == NULL ||
6143 (vd->vdev_top->vdev_ops != &vdev_raidz_ops &&
6144 vd->vdev_top->vdev_ops != &vdev_draid_ops)) {
6145 error = ENOTSUP;
6146 break;
6147 }
6148 if (nvpair_value_uint64(elem, &intval) != 0) {
6149 error = EINVAL;
6150 break;
6151 }
6152 if (intval == 1) {
6153 vdev_t *ancestor = vd;
6154 while (ancestor->vdev_parent != vd->vdev_top)
6155 ancestor = ancestor->vdev_parent;
6156 vdev_t *pvd = vd->vdev_top;
6157 uint_t sitouts = 0;
6158 for (int i = 0; i < pvd->vdev_children; i++) {
6159 if (pvd->vdev_child[i] == ancestor)
6160 continue;
6161 if (vdev_sit_out_reads(
6162 pvd->vdev_child[i], 0)) {
6163 sitouts++;
6164 }
6165 }
6166 if (sitouts >= vdev_get_nparity(pvd)) {
6167 error = ZFS_ERR_TOO_MANY_SITOUTS;
6168 break;
6169 }
6170 if (error == 0)
6171 vdev_raidz_sit_child(vd,
6172 INT64_MAX - gethrestime_sec());
6173 } else {
6174 vdev_raidz_unsit_child(vd);
6175 }
6176 break;
6177 case VDEV_PROP_AUTOSIT:
6178 if (vd->vdev_ops != &vdev_raidz_ops &&
6179 vd->vdev_ops != &vdev_draid_ops) {
6180 error = ENOTSUP;
6181 break;
6182 }
6183 if (nvpair_value_uint64(elem, &intval) != 0) {
6184 error = EINVAL;
6185 break;
6186 }
6187 vd->vdev_autosit = intval == 1;
6188 break;
6189 case VDEV_PROP_CHECKSUM_N:
6190 if (nvpair_value_uint64(elem, &intval) != 0) {
6191 error = EINVAL;
6192 break;
6193 }
6194 vd->vdev_checksum_n = intval;
6195 break;
6196 case VDEV_PROP_CHECKSUM_T:
6197 if (nvpair_value_uint64(elem, &intval) != 0) {
6198 error = EINVAL;
6199 break;
6200 }
6201 vd->vdev_checksum_t = intval;
6202 break;
6203 case VDEV_PROP_IO_N:
6204 if (nvpair_value_uint64(elem, &intval) != 0) {
6205 error = EINVAL;
6206 break;
6207 }
6208 vd->vdev_io_n = intval;
6209 break;
6210 case VDEV_PROP_IO_T:
6211 if (nvpair_value_uint64(elem, &intval) != 0) {
6212 error = EINVAL;
6213 break;
6214 }
6215 vd->vdev_io_t = intval;
6216 break;
6217 case VDEV_PROP_SLOW_IO_N:
6218 if (nvpair_value_uint64(elem, &intval) != 0) {
6219 error = EINVAL;
6220 break;
6221 }
6222 vd->vdev_slow_io_n = intval;
6223 break;
6224 case VDEV_PROP_SLOW_IO_T:
6225 if (nvpair_value_uint64(elem, &intval) != 0) {
6226 error = EINVAL;
6227 break;
6228 }
6229 vd->vdev_slow_io_t = intval;
6230 break;
6231 default:
6232 /* Most processing is done in vdev_props_set_sync */
6233 break;
6234 }
6235 end:
6236 if (error != 0) {
6237 intval = error;
6238 vdev_prop_add_list(outnvl, propname, strval, intval, 0);
6239 return (error);
6240 }
6241 }
6242
6243 return (dsl_sync_task(spa->spa_name, NULL, vdev_props_set_sync,
6244 innvl, 6, ZFS_SPACE_CHECK_EXTRA_RESERVED));
6245 }
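
/*
 * Illustrative sketch of the innvl layout consumed above (the property and
 * its value are made up; in practice user space assembles this nvlist and
 * passes it down through the vdev property ioctl):
 *
 *	nvlist_t *props = fnvlist_alloc();
 *	fnvlist_add_string(props, "comment", "rack 2, bay 14");
 *
 *	nvlist_t *innvl = fnvlist_alloc();
 *	fnvlist_add_uint64(innvl, ZPOOL_VDEV_PROPS_SET_VDEV, vdev_guid);
 *	fnvlist_add_nvlist(innvl, ZPOOL_VDEV_PROPS_SET_PROPS, props);
 *
 *	error = vdev_prop_set(vd, innvl, outnvl);
 *
 * On failure the offending property is echoed into outnvl with the error
 * code as its value, as done at the "end:" label above.
 */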
6246
6247 int
6248 vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
6249 {
6250 spa_t *spa = vd->vdev_spa;
6251 objset_t *mos = spa->spa_meta_objset;
6252 int err = 0;
6253 uint64_t objid;
6254 uint64_t vdev_guid;
6255 nvpair_t *elem = NULL;
6256 nvlist_t *nvprops = NULL;
6257 uint64_t intval = 0;
6258 char *strval = NULL;
6259 const char *propname = NULL;
6260 vdev_prop_t prop;
6261
6262 ASSERT(vd != NULL);
6263 ASSERT(mos != NULL);
6264
6265 if (nvlist_lookup_uint64(innvl, ZPOOL_VDEV_PROPS_GET_VDEV,
6266 &vdev_guid) != 0)
6267 return (SET_ERROR(EINVAL));
6268
6269 nvlist_lookup_nvlist(innvl, ZPOOL_VDEV_PROPS_GET_PROPS, &nvprops);
6270
6271 if (vd->vdev_root_zap != 0) {
6272 objid = vd->vdev_root_zap;
6273 } else if (vd->vdev_top_zap != 0) {
6274 objid = vd->vdev_top_zap;
6275 } else if (vd->vdev_leaf_zap != 0) {
6276 objid = vd->vdev_leaf_zap;
6277 } else {
6278 return (SET_ERROR(EINVAL));
6279 }
6280 ASSERT(objid != 0);
6281
6282 mutex_enter(&spa->spa_props_lock);
6283
6284 if (nvprops != NULL) {
6285 char namebuf[64] = { 0 };
6286
6287 while ((elem = nvlist_next_nvpair(nvprops, elem)) != NULL) {
6288 intval = 0;
6289 strval = NULL;
6290 propname = nvpair_name(elem);
6291 prop = vdev_name_to_prop(propname);
6292 zprop_source_t src = ZPROP_SRC_DEFAULT;
6293 uint64_t integer_size, num_integers;
6294
6295 switch (prop) {
6296 /* Special Read-only Properties */
6297 case VDEV_PROP_NAME:
6298 strval = vdev_name(vd, namebuf,
6299 sizeof (namebuf));
6300 if (strval == NULL)
6301 continue;
6302 vdev_prop_add_list(outnvl, propname, strval, 0,
6303 ZPROP_SRC_NONE);
6304 continue;
6305 case VDEV_PROP_CAPACITY:
6306 /* percent used */
6307 intval = (vd->vdev_stat.vs_dspace == 0) ? 0 :
6308 (vd->vdev_stat.vs_alloc * 100 /
6309 vd->vdev_stat.vs_dspace);
6310 vdev_prop_add_list(outnvl, propname, NULL,
6311 intval, ZPROP_SRC_NONE);
6312 continue;
6313 case VDEV_PROP_STATE:
6314 vdev_prop_add_list(outnvl, propname, NULL,
6315 vd->vdev_state, ZPROP_SRC_NONE);
6316 continue;
6317 case VDEV_PROP_GUID:
6318 vdev_prop_add_list(outnvl, propname, NULL,
6319 vd->vdev_guid, ZPROP_SRC_NONE);
6320 continue;
6321 case VDEV_PROP_ASIZE:
6322 vdev_prop_add_list(outnvl, propname, NULL,
6323 vd->vdev_asize, ZPROP_SRC_NONE);
6324 continue;
6325 case VDEV_PROP_PSIZE:
6326 vdev_prop_add_list(outnvl, propname, NULL,
6327 vd->vdev_psize, ZPROP_SRC_NONE);
6328 continue;
6329 case VDEV_PROP_ASHIFT:
6330 vdev_prop_add_list(outnvl, propname, NULL,
6331 vd->vdev_ashift, ZPROP_SRC_NONE);
6332 continue;
6333 case VDEV_PROP_SIZE:
6334 vdev_prop_add_list(outnvl, propname, NULL,
6335 vd->vdev_stat.vs_dspace, ZPROP_SRC_NONE);
6336 continue;
6337 case VDEV_PROP_FREE:
6338 vdev_prop_add_list(outnvl, propname, NULL,
6339 vd->vdev_stat.vs_dspace -
6340 vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE);
6341 continue;
6342 case VDEV_PROP_ALLOCATED:
6343 vdev_prop_add_list(outnvl, propname, NULL,
6344 vd->vdev_stat.vs_alloc, ZPROP_SRC_NONE);
6345 continue;
6346 case VDEV_PROP_EXPANDSZ:
6347 vdev_prop_add_list(outnvl, propname, NULL,
6348 vd->vdev_stat.vs_esize, ZPROP_SRC_NONE);
6349 continue;
6350 case VDEV_PROP_FRAGMENTATION:
6351 vdev_prop_add_list(outnvl, propname, NULL,
6352 vd->vdev_stat.vs_fragmentation,
6353 ZPROP_SRC_NONE);
6354 continue;
6355 case VDEV_PROP_PARITY:
6356 vdev_prop_add_list(outnvl, propname, NULL,
6357 vdev_get_nparity(vd), ZPROP_SRC_NONE);
6358 continue;
6359 case VDEV_PROP_PATH:
6360 if (vd->vdev_path == NULL)
6361 continue;
6362 vdev_prop_add_list(outnvl, propname,
6363 vd->vdev_path, 0, ZPROP_SRC_NONE);
6364 continue;
6365 case VDEV_PROP_DEVID:
6366 if (vd->vdev_devid == NULL)
6367 continue;
6368 vdev_prop_add_list(outnvl, propname,
6369 vd->vdev_devid, 0, ZPROP_SRC_NONE);
6370 continue;
6371 case VDEV_PROP_PHYS_PATH:
6372 if (vd->vdev_physpath == NULL)
6373 continue;
6374 vdev_prop_add_list(outnvl, propname,
6375 vd->vdev_physpath, 0, ZPROP_SRC_NONE);
6376 continue;
6377 case VDEV_PROP_ENC_PATH:
6378 if (vd->vdev_enc_sysfs_path == NULL)
6379 continue;
6380 vdev_prop_add_list(outnvl, propname,
6381 vd->vdev_enc_sysfs_path, 0, ZPROP_SRC_NONE);
6382 continue;
6383 case VDEV_PROP_FRU:
6384 if (vd->vdev_fru == NULL)
6385 continue;
6386 vdev_prop_add_list(outnvl, propname,
6387 vd->vdev_fru, 0, ZPROP_SRC_NONE);
6388 continue;
6389 case VDEV_PROP_PARENT:
6390 if (vd->vdev_parent != NULL) {
6391 strval = vdev_name(vd->vdev_parent,
6392 namebuf, sizeof (namebuf));
6393 vdev_prop_add_list(outnvl, propname,
6394 strval, 0, ZPROP_SRC_NONE);
6395 }
6396 continue;
6397 case VDEV_PROP_CHILDREN:
6398 if (vd->vdev_children > 0)
6399 strval = kmem_zalloc(ZAP_MAXVALUELEN,
6400 KM_SLEEP);
6401 for (uint64_t i = 0; i < vd->vdev_children;
6402 i++) {
6403 const char *vname;
6404
6405 vname = vdev_name(vd->vdev_child[i],
6406 namebuf, sizeof (namebuf));
6407 if (vname == NULL)
6408 vname = "(unknown)";
6409 if (strlen(strval) > 0)
6410 strlcat(strval, ",",
6411 ZAP_MAXVALUELEN);
6412 strlcat(strval, vname, ZAP_MAXVALUELEN);
6413 }
6414 if (strval != NULL) {
6415 vdev_prop_add_list(outnvl, propname,
6416 strval, 0, ZPROP_SRC_NONE);
6417 kmem_free(strval, ZAP_MAXVALUELEN);
6418 }
6419 continue;
6420 case VDEV_PROP_NUMCHILDREN:
6421 vdev_prop_add_list(outnvl, propname, NULL,
6422 vd->vdev_children, ZPROP_SRC_NONE);
6423 continue;
6424 case VDEV_PROP_READ_ERRORS:
6425 vdev_prop_add_list(outnvl, propname, NULL,
6426 vd->vdev_stat.vs_read_errors,
6427 ZPROP_SRC_NONE);
6428 continue;
6429 case VDEV_PROP_WRITE_ERRORS:
6430 vdev_prop_add_list(outnvl, propname, NULL,
6431 vd->vdev_stat.vs_write_errors,
6432 ZPROP_SRC_NONE);
6433 continue;
6434 case VDEV_PROP_CHECKSUM_ERRORS:
6435 vdev_prop_add_list(outnvl, propname, NULL,
6436 vd->vdev_stat.vs_checksum_errors,
6437 ZPROP_SRC_NONE);
6438 continue;
6439 case VDEV_PROP_INITIALIZE_ERRORS:
6440 vdev_prop_add_list(outnvl, propname, NULL,
6441 vd->vdev_stat.vs_initialize_errors,
6442 ZPROP_SRC_NONE);
6443 continue;
6444 case VDEV_PROP_TRIM_ERRORS:
6445 vdev_prop_add_list(outnvl, propname, NULL,
6446 vd->vdev_stat.vs_trim_errors,
6447 ZPROP_SRC_NONE);
6448 continue;
6449 case VDEV_PROP_SLOW_IOS:
6450 vdev_prop_add_list(outnvl, propname, NULL,
6451 vd->vdev_stat.vs_slow_ios,
6452 ZPROP_SRC_NONE);
6453 continue;
6454 case VDEV_PROP_OPS_NULL:
6455 vdev_prop_add_list(outnvl, propname, NULL,
6456 vd->vdev_stat.vs_ops[ZIO_TYPE_NULL],
6457 ZPROP_SRC_NONE);
6458 continue;
6459 case VDEV_PROP_OPS_READ:
6460 vdev_prop_add_list(outnvl, propname, NULL,
6461 vd->vdev_stat.vs_ops[ZIO_TYPE_READ],
6462 ZPROP_SRC_NONE);
6463 continue;
6464 case VDEV_PROP_OPS_WRITE:
6465 vdev_prop_add_list(outnvl, propname, NULL,
6466 vd->vdev_stat.vs_ops[ZIO_TYPE_WRITE],
6467 ZPROP_SRC_NONE);
6468 continue;
6469 case VDEV_PROP_OPS_FREE:
6470 vdev_prop_add_list(outnvl, propname, NULL,
6471 vd->vdev_stat.vs_ops[ZIO_TYPE_FREE],
6472 ZPROP_SRC_NONE);
6473 continue;
6474 case VDEV_PROP_OPS_CLAIM:
6475 vdev_prop_add_list(outnvl, propname, NULL,
6476 vd->vdev_stat.vs_ops[ZIO_TYPE_CLAIM],
6477 ZPROP_SRC_NONE);
6478 continue;
6479 case VDEV_PROP_OPS_TRIM:
6480 /*
6481 * TRIM ops and bytes are reported to user
6482 * space as ZIO_TYPE_FLUSH. This is done to
6483 * preserve the vdev_stat_t structure layout
6484 * for user space.
6485 */
6486 vdev_prop_add_list(outnvl, propname, NULL,
6487 vd->vdev_stat.vs_ops[ZIO_TYPE_FLUSH],
6488 ZPROP_SRC_NONE);
6489 continue;
6490 case VDEV_PROP_BYTES_NULL:
6491 vdev_prop_add_list(outnvl, propname, NULL,
6492 vd->vdev_stat.vs_bytes[ZIO_TYPE_NULL],
6493 ZPROP_SRC_NONE);
6494 continue;
6495 case VDEV_PROP_BYTES_READ:
6496 vdev_prop_add_list(outnvl, propname, NULL,
6497 vd->vdev_stat.vs_bytes[ZIO_TYPE_READ],
6498 ZPROP_SRC_NONE);
6499 continue;
6500 case VDEV_PROP_BYTES_WRITE:
6501 vdev_prop_add_list(outnvl, propname, NULL,
6502 vd->vdev_stat.vs_bytes[ZIO_TYPE_WRITE],
6503 ZPROP_SRC_NONE);
6504 continue;
6505 case VDEV_PROP_BYTES_FREE:
6506 vdev_prop_add_list(outnvl, propname, NULL,
6507 vd->vdev_stat.vs_bytes[ZIO_TYPE_FREE],
6508 ZPROP_SRC_NONE);
6509 continue;
6510 case VDEV_PROP_BYTES_CLAIM:
6511 vdev_prop_add_list(outnvl, propname, NULL,
6512 vd->vdev_stat.vs_bytes[ZIO_TYPE_CLAIM],
6513 ZPROP_SRC_NONE);
6514 continue;
6515 case VDEV_PROP_BYTES_TRIM:
6516 /*
6517 * TRIM ops and bytes are reported to user
6518 * space as ZIO_TYPE_FLUSH. This is done to
6519 * preserve the vdev_stat_t structure layout
6520 * for user space.
6521 */
6522 vdev_prop_add_list(outnvl, propname, NULL,
6523 vd->vdev_stat.vs_bytes[ZIO_TYPE_FLUSH],
6524 ZPROP_SRC_NONE);
6525 continue;
6526 case VDEV_PROP_REMOVING:
6527 vdev_prop_add_list(outnvl, propname, NULL,
6528 vd->vdev_removing, ZPROP_SRC_NONE);
6529 continue;
6530 case VDEV_PROP_RAIDZ_EXPANDING:
6531 /* Only expose this for raidz */
6532 if (vd->vdev_ops == &vdev_raidz_ops) {
6533 vdev_prop_add_list(outnvl, propname,
6534 NULL, vd->vdev_rz_expanding,
6535 ZPROP_SRC_NONE);
6536 }
6537 continue;
6538 case VDEV_PROP_SIT_OUT:
6539 /* Only expose this for a draid or raidz leaf */
6540 if (vd->vdev_ops->vdev_op_leaf &&
6541 vd->vdev_top != NULL &&
6542 (vd->vdev_top->vdev_ops ==
6543 &vdev_raidz_ops ||
6544 vd->vdev_top->vdev_ops ==
6545 &vdev_draid_ops)) {
6546 vdev_prop_add_list(outnvl, propname,
6547 NULL, vdev_sit_out_reads(vd, 0),
6548 ZPROP_SRC_NONE);
6549 }
6550 continue;
6551 case VDEV_PROP_TRIM_SUPPORT:
6552 /* only valid for leaf vdevs */
6553 if (vd->vdev_ops->vdev_op_leaf) {
6554 vdev_prop_add_list(outnvl, propname,
6555 NULL, vd->vdev_has_trim,
6556 ZPROP_SRC_NONE);
6557 }
6558 continue;
6559 /* Numeric Properties */
6560 case VDEV_PROP_ALLOCATING:
6561 /* Leaf vdevs cannot have this property */
6562 if (vd->vdev_mg == NULL &&
6563 vd->vdev_top != NULL) {
6564 src = ZPROP_SRC_NONE;
6565 intval = ZPROP_BOOLEAN_NA;
6566 } else {
6567 err = vdev_prop_get_int(vd, prop,
6568 &intval);
6569 if (err && err != ENOENT)
6570 break;
6571
6572 if (intval ==
6573 vdev_prop_default_numeric(prop))
6574 src = ZPROP_SRC_DEFAULT;
6575 else
6576 src = ZPROP_SRC_LOCAL;
6577 }
6578
6579 vdev_prop_add_list(outnvl, propname, NULL,
6580 intval, src);
6581 break;
6582 case VDEV_PROP_FAILFAST:
6583 src = ZPROP_SRC_LOCAL;
6584 strval = NULL;
6585
6586 err = zap_lookup(mos, objid, nvpair_name(elem),
6587 sizeof (uint64_t), 1, &intval);
6588 if (err == ENOENT) {
6589 intval = vdev_prop_default_numeric(
6590 prop);
6591 err = 0;
6592 } else if (err) {
6593 break;
6594 }
6595 if (intval == vdev_prop_default_numeric(prop))
6596 src = ZPROP_SRC_DEFAULT;
6597
6598 vdev_prop_add_list(outnvl, propname, strval,
6599 intval, src);
6600 break;
6601 case VDEV_PROP_AUTOSIT:
6602 /* Only raidz and dRAID vdevs have this property */
6603 if (vd->vdev_ops != &vdev_raidz_ops &&
6604 vd->vdev_ops != &vdev_draid_ops) {
6605 src = ZPROP_SRC_NONE;
6606 intval = ZPROP_BOOLEAN_NA;
6607 } else {
6608 err = vdev_prop_get_int(vd, prop,
6609 &intval);
6610 if (err && err != ENOENT)
6611 break;
6612
6613 if (intval ==
6614 vdev_prop_default_numeric(prop))
6615 src = ZPROP_SRC_DEFAULT;
6616 else
6617 src = ZPROP_SRC_LOCAL;
6618 }
6619
6620 vdev_prop_add_list(outnvl, propname, NULL,
6621 intval, src);
6622 break;
6623
6624 case VDEV_PROP_CHECKSUM_N:
6625 case VDEV_PROP_CHECKSUM_T:
6626 case VDEV_PROP_IO_N:
6627 case VDEV_PROP_IO_T:
6628 case VDEV_PROP_SLOW_IO_N:
6629 case VDEV_PROP_SLOW_IO_T:
6630 err = vdev_prop_get_int(vd, prop, &intval);
6631 if (err && err != ENOENT)
6632 break;
6633
6634 if (intval == vdev_prop_default_numeric(prop))
6635 src = ZPROP_SRC_DEFAULT;
6636 else
6637 src = ZPROP_SRC_LOCAL;
6638
6639 vdev_prop_add_list(outnvl, propname, NULL,
6640 intval, src);
6641 break;
6642 /* Text Properties */
6643 case VDEV_PROP_COMMENT:
6644 /* Exists in the ZAP below */
6645 /* FALLTHRU */
6646 case VDEV_PROP_USERPROP:
6647 /* User Properties */
6648 src = ZPROP_SRC_LOCAL;
6649
6650 err = zap_length(mos, objid, nvpair_name(elem),
6651 &integer_size, &num_integers);
6652 if (err)
6653 break;
6654
6655 switch (integer_size) {
6656 case 8:
6657 /* User properties cannot be integers */
6658 err = EINVAL;
6659 break;
6660 case 1:
6661 /* string property */
6662 strval = kmem_alloc(num_integers,
6663 KM_SLEEP);
6664 err = zap_lookup(mos, objid,
6665 nvpair_name(elem), 1,
6666 num_integers, strval);
6667 if (err) {
6668 kmem_free(strval,
6669 num_integers);
6670 break;
6671 }
6672 vdev_prop_add_list(outnvl, propname,
6673 strval, 0, src);
6674 kmem_free(strval, num_integers);
6675 break;
6676 }
6677 break;
6678 default:
6679 err = ENOENT;
6680 break;
6681 }
6682 if (err)
6683 break;
6684 }
6685 } else {
6686 /*
6687 * Get all properties from the MOS vdev property object.
6688 */
6689 zap_cursor_t zc;
6690 zap_attribute_t *za = zap_attribute_alloc();
6691 for (zap_cursor_init(&zc, mos, objid);
6692 (err = zap_cursor_retrieve(&zc, za)) == 0;
6693 zap_cursor_advance(&zc)) {
6694 intval = 0;
6695 strval = NULL;
6696 zprop_source_t src = ZPROP_SRC_DEFAULT;
6697 propname = za->za_name;
6698
6699 switch (za->za_integer_length) {
6700 case 8:
6701 /* We do not allow integer user properties */
6702 /* This is likely an internal value */
6703 break;
6704 case 1:
6705 /* string property */
6706 strval = kmem_alloc(za->za_num_integers,
6707 KM_SLEEP);
6708 err = zap_lookup(mos, objid, za->za_name, 1,
6709 za->za_num_integers, strval);
6710 if (err) {
6711 kmem_free(strval, za->za_num_integers);
6712 break;
6713 }
6714 vdev_prop_add_list(outnvl, propname, strval, 0,
6715 src);
6716 kmem_free(strval, za->za_num_integers);
6717 break;
6718
6719 default:
6720 break;
6721 }
6722 }
6723 zap_cursor_fini(&zc);
6724 zap_attribute_free(za);
6725 }
6726
6727 mutex_exit(&spa->spa_props_lock);
6728 if (err && err != ENOENT) {
6729 return (err);
6730 }
6731
6732 return (0);
6733 }
6734
6735 EXPORT_SYMBOL(vdev_fault);
6736 EXPORT_SYMBOL(vdev_degrade);
6737 EXPORT_SYMBOL(vdev_online);
6738 EXPORT_SYMBOL(vdev_offline);
6739 EXPORT_SYMBOL(vdev_clear);
6740
6741 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, UINT, ZMOD_RW,
6742 "Target number of metaslabs per top-level vdev");
6743
6744 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, UINT, ZMOD_RW,
6745 "Default lower limit for metaslab size");
6746
6747 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_ms_shift, UINT, ZMOD_RW,
6748 "Default upper limit for metaslab size");
6749
6750 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, UINT, ZMOD_RW,
6751 "Minimum number of metaslabs per top-level vdev");
6752
6753 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, UINT, ZMOD_RW,
6754 "Practical upper limit of total metaslabs per top-level vdev");
6755
6756 ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW,
6757 "Rate limit slow IO (delay) events to this many per second");
6758
6759 ZFS_MODULE_PARAM(zfs, zfs_, deadman_events_per_second, UINT, ZMOD_RW,
6760 "Rate limit hung IO (deadman) events to this many per second");
6761
6762 ZFS_MODULE_PARAM(zfs, zfs_, dio_write_verify_events_per_second, UINT, ZMOD_RW,
6763 "Rate limit Direct I/O write verify events to this many per second");
6764
6765 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, direct_write_verify, UINT, ZMOD_RW,
6766 "Direct I/O writes will perform checksum verification before "
6767 "committing the write");
6768
6769 ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW,
6770 "Rate limit checksum events to this many checksum errors per second "
6771 "(do not set below ZED threshold).");
6772
6773 ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW,
6774 "Ignore errors during resilver/scrub");
6775
6776 ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW,
6777 "Bypass vdev_validate()");
6778
6779 ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW,
6780 "Disable cache flushes");
6781
6782 ZFS_MODULE_PARAM(zfs, zfs_, embedded_slog_min_ms, UINT, ZMOD_RW,
6783 "Minimum number of metaslabs required to dedicate one for log blocks");
6784
6785 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift,
6786 param_set_min_auto_ashift, param_get_uint, ZMOD_RW,
6787 "Minimum ashift used when creating new top-level vdevs");
6788
6789 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift,
6790 param_set_max_auto_ashift, param_get_uint, ZMOD_RW,
6791 "Maximum ashift used when optimizing for logical -> physical sector "
6792 "size on new top-level vdevs");
6793
6794 ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, raidz_impl,
6795 param_set_raidz_impl, param_get_raidz_impl, ZMOD_RW,
6796 "RAIDZ implementation");
6797